diff --git a/.gitignore b/.gitignore index 4165b58..50e4eb1 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ /.vscode *.egg-info __pycache__ -*.log \ No newline at end of file +*.log +/powerisa-instructions.xml \ No newline at end of file diff --git a/README.md b/README.md index 6c474c3..f8fae5d 100644 --- a/README.md +++ b/README.md @@ -1 +1,39 @@ parser for the OPF PowerISA 3.1C pdf to attempt to extract all instructions' pseudo-code including subscripts/superscripts and other formatting + +Usage: +* Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from the OpenPOWER Foundation (OPF) website +* Obtain CPython 3.11 (the default `python3` in [Debian Bookworm](https://www.debian.org/releases/bookworm/)) + + On Debian Bookworm you can do: + + ```bash + sudo apt update + sudo apt install python3-venv + ``` + +* Create a venv: + + ```bash + python3.11 -m venv --upgrade-deps path/to/your/new/.venv + ``` + +* Activate the venv: + + ```bash + . path/to/your/new/.venv/bin/activate + ``` + +* Install this project in the venv: + + ```bash + pip install -e . + ``` + +* Run it: + + ```bash + parse_powerisa_pdf path/to/downloaded/OPF_PowerISA_v3.1C.pdf > out.log + ``` + +* This will spit out lots of errors and then successfully create + the output file -- `powerisa-instructions.xml` in the current directory. 
diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index 338e6e5..527c5cb 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -10,6 +10,8 @@ from typing import ClassVar, TypeVar, assert_never from xml.etree import ElementTree import enum import traceback +from copy import deepcopy +from pathlib import Path from pdfminer.high_level import extract_pages from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox @@ -256,6 +258,39 @@ class ParsedTextLine: def __str__(self) -> str: return "\n" * self.preceding_blank_lines + ElementTree.tostring(self.element, encoding="unicode") + def write_xml(self, parent: ElementTree.Element, trailing_nl: bool): + for _ in range(self.preceding_blank_lines): + ElementTree.SubElement(parent, "br").tail = "\n" + if self.element.text is not None: + if len(parent) == 0: + parent.text = (parent.text or "") + self.element.text + else: + parent[-1].tail = (parent[-1].tail or "") + self.element.text + for element in self.element: + parent.append(deepcopy(element)) + if trailing_nl: + ElementTree.SubElement(parent, "br").tail = "\n" + + @staticmethod + def write_xml_lines( + lines: Iterable[ParsedTextLine], + parent: ElementTree.Element, + trailing_nl: bool, + preceding_nl: bool=False, + ): + if preceding_nl: + ElementTree.SubElement(parent, "br").tail = "\n" + first = True + for line in lines: + if first: + first = False + else: + ElementTree.SubElement(parent, "br").tail = "\n" + line.write_xml(parent, trailing_nl=False) + if trailing_nl: + ElementTree.SubElement(parent, "br").tail = "\n" + + _T = TypeVar("_T") class BaselinePos(enum.Enum): @@ -535,6 +570,17 @@ class InsnBitField: def __str__(self) -> str: return f"" + def write_xml(self, parent: ElementTree.Element): + field = ElementTree.SubElement(parent, "field") + field.text = "\n" + field.tail = "\n" + name = ElementTree.SubElement(field, "name") + name.tail = "\n" + 
self.name.write_xml(name, trailing_nl=False) + bit_number = ElementTree.SubElement(field, "bit-number") + bit_number.tail = "\n" + self.bit_number.write_xml(bit_number, trailing_nl=False) + @dataclass(unsafe_hash=True, frozen=True) class InsnBitFieldsPrefix: box_min_x: float @@ -555,6 +601,18 @@ class InsnBitFieldsPrefix: f" ]\n" f" suffix_text={self.suffix_text}>") + def write_xml(self, parent: ElementTree.Element): + prefix_elm = ElementTree.SubElement(parent, "prefix") + prefix_elm.text = "\n" + prefix_elm.tail = "\n" + prefix_text = ElementTree.SubElement(prefix_elm, "prefix-text") + prefix_text.tail = "\n" + self.prefix_text.write_xml(prefix_text, trailing_nl=False) + InsnBitFields.write_xml_fields(self.fields, prefix_elm) + suffix_text = ElementTree.SubElement(prefix_elm, "suffix-text") + suffix_text.tail = "\n" + self.suffix_text.write_xml(suffix_text, trailing_nl=False) + @dataclass(unsafe_hash=True, frozen=True) class InsnBitFields: prefix: None | InsnBitFieldsPrefix @@ -573,6 +631,23 @@ class InsnBitFields: f"({self.box_max_x},{self.box_max_y}) [\n" f" {sep.join(map(str, self.fields))},\n]>") + @staticmethod + def write_xml_fields(fields: Iterable[InsnBitField], parent: ElementTree.Element): + fields_elm = ElementTree.SubElement(parent, "fields") + fields_elm.text = "\n" + fields_elm.tail = "\n" + for field in fields: + field.write_xml(fields_elm) + + def write_xml(self, parent: ElementTree.Element): + bit_fields = ElementTree.SubElement(parent, "bit-fields") + bit_fields.text = "\n" + bit_fields.tail = "\n" + if self.prefix is not None: + self.prefix.write_xml(bit_fields) + InsnBitFields.write_xml_fields(self.fields, bit_fields) + + @dataclass(unsafe_hash=True, frozen=True) class InsnSpRegsAlteredEntry: reg: ParsedTextLine @@ -600,6 +675,20 @@ class InsnSpRegsAlteredEntry: f"{indent} conds={conds},\n" f"{indent})") + def write_xml(self, parent: ElementTree.Element): + entry = ElementTree.SubElement(parent, "entry") + entry.text = "\n" + entry.tail = "\n" 
+ reg = ElementTree.SubElement(entry, "register") + reg.tail = "\n" + self.reg.write_xml(reg, trailing_nl=False) + fields = ElementTree.SubElement(entry, "fields") + fields.tail = "\n" + ParsedTextLine.write_xml_lines(self.fields, fields, trailing_nl=False) + conds = ElementTree.SubElement(entry, "conditions") + conds.tail = "\n" + ParsedTextLine.write_xml_lines(self.conds, conds, trailing_nl=False) + @dataclass(unsafe_hash=True, frozen=True) class InsnSpRegsAltered: sp_regs_altered_text: ParsedTextLine @@ -631,6 +720,28 @@ class InsnSpRegsAltered: lines.append(f")") return "\n".join(lines) + def write_xml(self, parent: ElementTree.Element): + sp_regs_altered = ElementTree.SubElement(parent, "special-registers-altered") + sp_regs_altered.text = "\n" + sp_regs_altered.tail = "\n" + title = ElementTree.SubElement(sp_regs_altered, "title") + title.tail = "\n" + self.sp_regs_altered_text.write_xml(title, trailing_nl=False) + if self.special_text is not None: + special_text = ElementTree.SubElement(sp_regs_altered, "special-text") + special_text.tail = "\n" + self.special_text.write_xml(special_text, trailing_nl=False) + if self.table_header_reg is not None: + table_header_reg = ElementTree.SubElement(sp_regs_altered, "table-header-register") + table_header_reg.tail = "\n" + self.table_header_reg.write_xml(table_header_reg, trailing_nl=False) + if self.table_header_fields is not None: + table_header_fields = ElementTree.SubElement(sp_regs_altered, "table-header-fields") + table_header_fields.tail = "\n" + self.table_header_fields.write_xml(table_header_fields, trailing_nl=False) + for entry in self.entries: + entry.write_xml(sp_regs_altered) + class _InsnParseSection(enum.Enum): CODE = "code" HEADER = "header" @@ -687,8 +798,9 @@ class Page: ) if text_section is None: if PAGE_BODY_MIN_Y <= element.y0 <= PAGE_BODY_MAX_Y: - raise AssertionError( - f"char not in text section: {element}\npage_num={page_num}") + if page_num != 1072: # page 1072 has characters in the margins 
+ raise AssertionError( + f"char not in text section: {element}\npage_num={page_num}") continue char = Char( text=element.get_text(), @@ -1003,10 +1115,57 @@ class TextSection: return i return None +@dataclass(frozen=True) +class InsnHeader: + header_lines: tuple[ParsedTextLine, ...] + mnemonic_lines: tuple[ParsedTextLine, ...] + bit_fields: InsnBitFields + + @property + def min_y(self) -> float: + return self.bit_fields.box_min_y + + def write_xml(self, parent: ElementTree.Element): + header = ElementTree.SubElement(parent, "header") + header.text = "\n" + header.tail = "\n" + title = ElementTree.SubElement(header, "title") + title.tail = "\n" + ParsedTextLine.write_xml_lines(self.header_lines, title, trailing_nl=False) + mnemonics = ElementTree.SubElement(header, "mnemonics") + mnemonics.tail = "\n" + ParsedTextLine.write_xml_lines(self.mnemonic_lines, mnemonics, trailing_nl=False) + self.bit_fields.write_xml(header) + +@dataclass(frozen=True) +class Insn: + headers: tuple[InsnHeader, ...] + code_lines: tuple[ParsedTextLine, ...] + desc_lines: tuple[ParsedTextLine, ...] 
+ sp_regs_altered: None | InsnSpRegsAltered + + def write_xml(self, parent: ElementTree.Element): + insn = ElementTree.SubElement(parent, "instruction") + insn.text = "\n" + insn.tail = "\n" + for header in self.headers: + header.write_xml(insn) + if len(self.code_lines) != 0: + code = ElementTree.SubElement(insn, "code") + code.tail = "\n" + ParsedTextLine.write_xml_lines(self.code_lines, code, trailing_nl=False) + if len(self.desc_lines) != 0: + desc = ElementTree.SubElement(insn, "description") + desc.tail = "\n" + ParsedTextLine.write_xml_lines(self.desc_lines, desc, trailing_nl=False) + if self.sp_regs_altered is not None: + self.sp_regs_altered.write_xml(insn) + @dataclass() class Parser: pages: Pages = field(default_factory=Pages) text_section: TextSection = TextSection.first() + insns: list[Insn] = field(default_factory=list) @property def page(self) -> Page: @@ -1017,7 +1176,7 @@ class Parser: return self.pages[self.text_section.page_num].unprocessed_chars[self.text_section] @staticmethod - def __pages_gen(file: str, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: + def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: if page_numbers is not None: page_numbers = sorted(i - 1 for i in page_numbers) for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): @@ -1028,7 +1187,7 @@ class Parser: print(f"page {page_num}") yield Page.from_lt_page(page_num=page_num, page=page) - def parse_pdf(self, file: str, page_numbers: Iterable[int] | None = None): + def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None): self.pages = Pages(pages_gen=Parser.__pages_gen( file=file, page_numbers=page_numbers)) self.text_section = TextSection.first() @@ -1059,7 +1218,7 @@ class Parser: try: with self.note_text_section(): self.extract_insns() - except InsnParseError as e: + except (InsnParseError, PageParseError) as e: print("".join(traceback.format_exception_only(e)), flush=True) 
traceback.print_exc() @@ -1200,10 +1359,11 @@ class Parser: self.unprocessed_chars[char.font].remove(char) if allowed_start_min_y_error is None: allowed_start_min_y_error = 0.01 - assert abs(start_min_y - retval.regular_min_y) < allowed_start_min_y_error, ( - f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n" - f"start_min_y error: {start_min_y - retval.regular_min_y}\n" - f"allowed_start_min_y_error={allowed_start_min_y_error}") + if abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error: + raise PageParseError( + f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n" + f"start_min_y error: {start_min_y - retval.regular_min_y}\n" + f"allowed_start_min_y_error={allowed_start_min_y_error}") return retval def extract_following_text_lines( @@ -1403,7 +1563,7 @@ class Parser: self, start_min_y: float, header_start_char: None | Char = None, - ) -> None | tuple[list[ParsedTextLine], list[ParsedTextLine], InsnBitFields]: + ) -> None | InsnHeader: assert header_start_char is None or \ header_start_char.font == Font.INSN_HEADER header_line = self.extract_text_line( @@ -1458,7 +1618,11 @@ class Parser: print(insn_bit_fields) if insn_bit_fields is None: raise InsnParseError("can't find insn bit fields") - return header_lines, mnemonic_lines, insn_bit_fields + return InsnHeader( + header_lines=tuple(header_lines), + mnemonic_lines=tuple(mnemonic_lines), + bit_fields=insn_bit_fields, + ) def extract_insn_sp_regs_altered( self, @@ -1474,12 +1638,14 @@ class Parser: max_y=sp_regs_altered_text.regular_min_y - 5, allow_processed=False, ) - assert table_header_reg_char is not None, \ - "can't find special registers altered table's register-column's header" + if table_header_reg_char is None: + raise InsnParseError( + "can't find special registers altered table's register-column's header") KNOWN_SPECIAL_TEXTS = ( "None", "Dependent on the system service", "See above.", + "See Table 5.1", ) match table_header_reg_char.text: case "R": @@ 
-1611,7 +1777,7 @@ class Parser: final_regular_min_y=regular_min_y, ) - def extract_insn(self, header_start_char: Char): + def extract_insn(self, header_start_char: Char) -> Insn: assert header_start_char.font == Font.INSN_HEADER print(header_start_char) header = self.extract_insn_header_mnemonics_and_bit_fields( @@ -1620,7 +1786,7 @@ class Parser: ) if header is None: raise PageParseError("can't find header text line") - next_start_min_y = header[2].box_min_y - 5 + next_start_min_y = header.min_y - 5 headers = [header] code_lines: list[ParsedTextLine] = [] desc_lines: list[ParsedTextLine] = [] @@ -1687,7 +1853,7 @@ class Parser: if header is None: raise InsnParseError("can't find header text line") headers.append(header) - next_start_min_y = header[2].box_min_y - 5 + next_start_min_y = header.min_y - 5 case _InsnParseSection.DESC: desc_line = self.extract_text_line( start_char=next_char, @@ -1706,7 +1872,7 @@ class Parser: first_text_line=desc_line, min_x=desc_line.chars[0].min_x, max_x=self.text_section.max_x, - allowed_start_min_y_error=3, + allowed_start_min_y_error=3.5, ) print("more insn desc lines:") print("\n".join(map(str, more_desc_lines))) @@ -1729,6 +1895,12 @@ class Parser: print("sp_regs_altered:") print(sp_regs_altered) # TODO: finish + return Insn( + headers=tuple(headers), + code_lines=tuple(code_lines), + desc_lines=tuple(desc_lines), + sp_regs_altered=sp_regs_altered, + ) def extract_insns(self): while True: @@ -1737,7 +1909,7 @@ class Parser: self.unprocessed_chars[Font.INSN_HEADER])) except StopIteration: break - self.extract_insn(header_start_char=header_start_char) + self.insns.append(self.extract_insn(header_start_char=header_start_char)) def main(): if 2 < len(sys.argv): @@ -1747,4 +1919,19 @@ def main(): page_numbers = tuple(int(i) for i in sys.argv[2].split(",")) else: page_numbers = None - Parser().parse_pdf(sys.argv[1], page_numbers=page_numbers) \ No newline at end of file + parser = Parser() + file_name = Path(sys.argv[1]) + 
parser.parse_pdf(file_name, page_numbers=page_numbers) + insns = ElementTree.Element("instructions", attrib={"is-subset": str(page_numbers is not None)}) + insns.text = "\n" + insns.tail = "\n" + comment = ElementTree.Comment(f" Automatically generated from {file_name.name} ") + comment.tail = "\n" + insns.append(comment) + for insn in parser.insns: + insn.write_xml(insns) + ElementTree.ElementTree(insns).write( + "powerisa-instructions.xml", + encoding="utf-8", + xml_declaration=True, + ) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 547a7e3..c2ec3e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ version = "0.0.0" dependencies = [ "pdfminer.six == 20240706" ] +requires-python = ">= 3.11" [project.scripts] parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main" \ No newline at end of file