diff --git a/.gitignore b/.gitignore index 50e4eb1..4165b58 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,4 @@ /.vscode *.egg-info __pycache__ -*.log -/powerisa-instructions.xml \ No newline at end of file +*.log \ No newline at end of file diff --git a/README.md b/README.md index f8fae5d..6c474c3 100644 --- a/README.md +++ b/README.md @@ -1,39 +1 @@ parser for the OPF PowerISA 3.1C pdf to attempt to extract all instructions' pseudo-code including subscripts/superscripts and other formatting - -Usage: -* Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from -* Obtain CPython 3.11 (the default `python3` in [Debian Bookworm](https://www.debian.org/releases/bookworm/)) - - On Debian Bookworm you can do: - - ```bash - sudo apt update - sudo apt install python3-venv - ``` - -* Create a venv: - - ```bash - python3.11 -m venv --upgrade-deps path/to/your/new/.venv - ``` - -* Activate the venv: - - ```bash - . path/to/your/new/.venv/bin/activate - ``` - -* Install this project in the venv: - - ```bash - pip install -e . - ``` - -* Run it: - - ```bash - parse_powerisa_pdf path/to/downloaded/OPF_PowerISA_v3.1C.pdf > out.log - ``` - -* This will spit out lots of errors and then successfully create - the output file -- `powerisa-instructions.xml` in the current directory. diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index 527c5cb..338e6e5 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -10,8 +10,6 @@ from typing import ClassVar, TypeVar, assert_never from xml.etree import ElementTree import enum import traceback -from copy import deepcopy -from pathlib import Path from pdfminer.high_level import extract_pages from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox @@ -258,39 +256,6 @@ class ParsedTextLine: def __str__(self) -> str: return "\n" * self.preceding_blank_lines + ElementTree.tostring(self.element, encoding="unicode") - def write_xml(self, parent: ElementTree.Element, trailing_nl: bool): - for _ in range(self.preceding_blank_lines): - ElementTree.SubElement(parent, "br").tail = "\n" - if self.element.text is not None: - if len(parent) == 0: - parent.text = (parent.text or "") + self.element.text - else: - parent[-1].tail = (parent[-1].tail or "") + self.element.text - for element in self.element: - parent.append(deepcopy(element)) - if trailing_nl: - ElementTree.SubElement(parent, "br").tail = "\n" - - @staticmethod - def write_xml_lines( - lines: Iterable[ParsedTextLine], - parent: ElementTree.Element, - trailing_nl: bool, - preceding_nl: bool=False, - ): - if preceding_nl: - ElementTree.SubElement(parent, "br").tail = "\n" - first = True - for line in lines: - if first: - first = False - else: - ElementTree.SubElement(parent, "br").tail = "\n" - line.write_xml(parent, trailing_nl=False) - if trailing_nl: - ElementTree.SubElement(parent, "br").tail = "\n" - - _T = TypeVar("_T") class BaselinePos(enum.Enum): @@ -570,17 +535,6 @@ class InsnBitField: def __str__(self) -> str: return f"" - def write_xml(self, parent: ElementTree.Element): - field = ElementTree.SubElement(parent, "field") - field.text = "\n" - field.tail = "\n" - name = ElementTree.SubElement(field, "name") - name.tail = "\n" - self.name.write_xml(name, trailing_nl=False) - bit_number = ElementTree.SubElement(field, "bit-number") - bit_number.tail = "\n" - self.bit_number.write_xml(bit_number, trailing_nl=False) - @dataclass(unsafe_hash=True, frozen=True) class InsnBitFieldsPrefix: box_min_x: float @@ -601,18 +555,6 @@ class InsnBitFieldsPrefix: f" ]\n" f" suffix_text={self.suffix_text}>") - def write_xml(self, parent: ElementTree.Element): - prefix_elm = ElementTree.SubElement(parent, "prefix") - prefix_elm.text = "\n" - prefix_elm.tail = "\n" - prefix_text = ElementTree.SubElement(prefix_elm, "prefix-text") - prefix_text.tail = "\n" - self.prefix_text.write_xml(prefix_text, trailing_nl=False) - InsnBitFields.write_xml_fields(self.fields, prefix_elm) - suffix_text = ElementTree.SubElement(prefix_elm, "suffix-text") - suffix_text.tail = "\n" - self.suffix_text.write_xml(suffix_text, trailing_nl=False) - @dataclass(unsafe_hash=True, frozen=True) class InsnBitFields: prefix: None | InsnBitFieldsPrefix @@ -631,23 +573,6 @@ class InsnBitFields: f"({self.box_max_x},{self.box_max_y}) [\n" f" {sep.join(map(str, self.fields))},\n]>") - @staticmethod - def write_xml_fields(fields: Iterable[InsnBitField], parent: ElementTree.Element): - fields_elm = ElementTree.SubElement(parent, "fields") - fields_elm.text = "\n" - fields_elm.tail = "\n" - for field in fields: - field.write_xml(fields_elm) - - def write_xml(self, parent: ElementTree.Element): - bit_fields = ElementTree.SubElement(parent, "bit-fields") - bit_fields.text = "\n" - bit_fields.tail = "\n" - if self.prefix is not None: - self.prefix.write_xml(bit_fields) - InsnBitFields.write_xml_fields(self.fields, bit_fields) - - @dataclass(unsafe_hash=True, frozen=True) class InsnSpRegsAlteredEntry: reg: ParsedTextLine @@ -675,20 +600,6 @@ class InsnSpRegsAlteredEntry: f"{indent} conds={conds},\n" f"{indent})") - def write_xml(self, parent: ElementTree.Element): - entry = ElementTree.SubElement(parent, "entry") - entry.text = "\n" - entry.tail = "\n" - reg = ElementTree.SubElement(entry, "register") - reg.tail = "\n" - self.reg.write_xml(reg, trailing_nl=False) - fields = ElementTree.SubElement(entry, "fields") - fields.tail = "\n" - ParsedTextLine.write_xml_lines(self.fields, fields, trailing_nl=False) - conds = ElementTree.SubElement(entry, "conditions") - conds.tail = "\n" - ParsedTextLine.write_xml_lines(self.conds, conds, trailing_nl=False) - @dataclass(unsafe_hash=True, frozen=True) class InsnSpRegsAltered: sp_regs_altered_text: ParsedTextLine @@ -720,28 +631,6 @@ class InsnSpRegsAltered: lines.append(f")") return "\n".join(lines) - def write_xml(self, parent: ElementTree.Element): - sp_regs_altered = ElementTree.SubElement(parent, "special-registers-altered") - sp_regs_altered.text = "\n" - sp_regs_altered.tail = "\n" - title = ElementTree.SubElement(sp_regs_altered, "title") - title.tail = "\n" - self.sp_regs_altered_text.write_xml(title, trailing_nl=False) - if self.special_text is not None: - special_text = ElementTree.SubElement(sp_regs_altered, "special-text") - special_text.tail = "\n" - self.special_text.write_xml(special_text, trailing_nl=False) - if self.table_header_reg is not None: - table_header_reg = ElementTree.SubElement(sp_regs_altered, "table-header-register") - table_header_reg.tail = "\n" - self.table_header_reg.write_xml(table_header_reg, trailing_nl=False) - if self.table_header_fields is not None: - table_header_fields = ElementTree.SubElement(sp_regs_altered, "table-header-fields") - table_header_fields.tail = "\n" - self.table_header_fields.write_xml(table_header_fields, trailing_nl=False) - for entry in self.entries: - entry.write_xml(sp_regs_altered) - class _InsnParseSection(enum.Enum): CODE = "code" HEADER = "header" @@ -798,9 +687,8 @@ class Page: ) if text_section is None: if PAGE_BODY_MIN_Y <= element.y0 <= PAGE_BODY_MAX_Y: - if page_num != 1072: # page 1072 has characters in the margins - raise AssertionError( - f"char not in text section: {element}\npage_num={page_num}") + raise AssertionError( + f"char not in text section: {element}\npage_num={page_num}") continue char = Char( text=element.get_text(), @@ -1115,57 +1003,10 @@ class TextSection: return i return None -@dataclass(frozen=True) -class InsnHeader: - header_lines: tuple[ParsedTextLine, ...] - mnemonic_lines: tuple[ParsedTextLine, ...] - bit_fields: InsnBitFields - - @property - def min_y(self) -> float: - return self.bit_fields.box_min_y - - def write_xml(self, parent: ElementTree.Element): - header = ElementTree.SubElement(parent, "header") - header.text = "\n" - header.tail = "\n" - title = ElementTree.SubElement(header, "title") - title.tail = "\n" - ParsedTextLine.write_xml_lines(self.header_lines, title, trailing_nl=False) - mnemonics = ElementTree.SubElement(header, "mnemonics") - mnemonics.tail = "\n" - ParsedTextLine.write_xml_lines(self.mnemonic_lines, mnemonics, trailing_nl=False) - self.bit_fields.write_xml(header) - -@dataclass(frozen=True) -class Insn: - headers: tuple[InsnHeader, ...] - code_lines: tuple[ParsedTextLine, ...] - desc_lines: tuple[ParsedTextLine, ...] - sp_regs_altered: None | InsnSpRegsAltered - - def write_xml(self, parent: ElementTree.Element): - insn = ElementTree.SubElement(parent, "instruction") - insn.text = "\n" - insn.tail = "\n" - for header in self.headers: - header.write_xml(insn) - if len(self.code_lines) != 0: - code = ElementTree.SubElement(insn, "code") - code.tail = "\n" - ParsedTextLine.write_xml_lines(self.code_lines, code, trailing_nl=False) - if len(self.desc_lines) != 0: - desc = ElementTree.SubElement(insn, "description") - desc.tail = "\n" - ParsedTextLine.write_xml_lines(self.desc_lines, desc, trailing_nl=False) - if self.sp_regs_altered is not None: - self.sp_regs_altered.write_xml(insn) - @dataclass() class Parser: pages: Pages = field(default_factory=Pages) text_section: TextSection = TextSection.first() - insns: list[Insn] = field(default_factory=list) @property def page(self) -> Page: @@ -1176,7 +1017,7 @@ class Parser: return self.pages[self.text_section.page_num].unprocessed_chars[self.text_section] @staticmethod - def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: + def __pages_gen(file: str, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: if page_numbers is not None: page_numbers = sorted(i - 1 for i in page_numbers) for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): @@ -1187,7 +1028,7 @@ class Parser: print(f"page {page_num}") yield Page.from_lt_page(page_num=page_num, page=page) - def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None): + def parse_pdf(self, file: str, page_numbers: Iterable[int] | None = None): self.pages = Pages(pages_gen=Parser.__pages_gen( file=file, page_numbers=page_numbers)) self.text_section = TextSection.first() @@ -1218,7 +1059,7 @@ class Parser: try: with self.note_text_section(): self.extract_insns() - except (InsnParseError, PageParseError) as e: + except InsnParseError as e: print("".join(traceback.format_exception_only(e)), flush=True) traceback.print_exc() @@ -1359,11 +1200,10 @@ class Parser: self.unprocessed_chars[char.font].remove(char) if allowed_start_min_y_error is None: allowed_start_min_y_error = 0.01 - if abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error: - raise PageParseError( - f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n" - f"start_min_y error: {start_min_y - retval.regular_min_y}\n" - f"allowed_start_min_y_error={allowed_start_min_y_error}") + assert abs(start_min_y - retval.regular_min_y) < allowed_start_min_y_error, ( + f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n" + f"start_min_y error: {start_min_y - retval.regular_min_y}\n" + f"allowed_start_min_y_error={allowed_start_min_y_error}") return retval def extract_following_text_lines( @@ -1563,7 +1403,7 @@ class Parser: self, start_min_y: float, header_start_char: None | Char = None, - ) -> None | InsnHeader: + ) -> None | tuple[list[ParsedTextLine], list[ParsedTextLine], InsnBitFields]: assert header_start_char is None or \ header_start_char.font == Font.INSN_HEADER header_line = self.extract_text_line( @@ -1618,11 +1458,7 @@ class Parser: print(insn_bit_fields) if insn_bit_fields is None: raise InsnParseError("can't find insn bit fields") - return InsnHeader( - header_lines=tuple(header_lines), - mnemonic_lines=tuple(mnemonic_lines), - bit_fields=insn_bit_fields, - ) + return header_lines, mnemonic_lines, insn_bit_fields def extract_insn_sp_regs_altered( self, @@ -1638,14 +1474,12 @@ class Parser: max_y=sp_regs_altered_text.regular_min_y - 5, allow_processed=False, ) - if table_header_reg_char is None: - raise InsnParseError( - "can't find special registers altered table's register-column's header") + assert table_header_reg_char is not None, \ + "can't find special registers altered table's register-column's header" KNOWN_SPECIAL_TEXTS = ( "None", "Dependent on the system service", "See above.", - "See Table 5.1", ) match table_header_reg_char.text: case "R": @@ -1777,7 +1611,7 @@ class Parser: final_regular_min_y=regular_min_y, ) - def extract_insn(self, header_start_char: Char) -> Insn: + def extract_insn(self, header_start_char: Char): assert header_start_char.font == Font.INSN_HEADER print(header_start_char) header = self.extract_insn_header_mnemonics_and_bit_fields( @@ -1786,7 +1620,7 @@ class Parser: ) if header is None: raise PageParseError("can't find header text line") - next_start_min_y = header.min_y - 5 + next_start_min_y = header[2].box_min_y - 5 headers = [header] code_lines: list[ParsedTextLine] = [] desc_lines: list[ParsedTextLine] = [] @@ -1853,7 +1687,7 @@ class Parser: if header is None: raise InsnParseError("can't find header text line") headers.append(header) - next_start_min_y = header.min_y - 5 + next_start_min_y = header[2].box_min_y - 5 case _InsnParseSection.DESC: desc_line = self.extract_text_line( start_char=next_char, @@ -1872,7 +1706,7 @@ class Parser: first_text_line=desc_line, min_x=desc_line.chars[0].min_x, max_x=self.text_section.max_x, - allowed_start_min_y_error=3.5, + allowed_start_min_y_error=3, ) print("more insn desc lines:") print("\n".join(map(str, more_desc_lines))) @@ -1895,12 +1729,6 @@ class Parser: print("sp_regs_altered:") print(sp_regs_altered) # TODO: finish - return Insn( - headers=tuple(headers), - code_lines=tuple(code_lines), - desc_lines=tuple(desc_lines), - sp_regs_altered=sp_regs_altered, - ) def extract_insns(self): while True: @@ -1909,7 +1737,7 @@ class Parser: self.unprocessed_chars[Font.INSN_HEADER])) except StopIteration: break - self.insns.append(self.extract_insn(header_start_char=header_start_char)) + self.extract_insn(header_start_char=header_start_char) def main(): if 2 < len(sys.argv): @@ -1919,19 +1747,4 @@ def main(): page_numbers = tuple(int(i) for i in sys.argv[2].split(",")) else: page_numbers = None - parser = Parser() - file_name = Path(sys.argv[1]) - parser.parse_pdf(file_name, page_numbers=page_numbers) - insns = ElementTree.Element("instructions", attrib={"is-subset": str(page_numbers is not None)}) - insns.text = "\n" - insns.tail = "\n" - comment = ElementTree.Comment(f" Automatically generated from {file_name.name} ") - comment.tail = "\n" - insns.append(comment) - for insn in parser.insns: - insn.write_xml(insns) - ElementTree.ElementTree(insns).write( - "powerisa-instructions.xml", - encoding="utf-8", - xml_declaration=True, - ) \ No newline at end of file + Parser().parse_pdf(sys.argv[1], page_numbers=page_numbers) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c2ec3e0..547a7e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ version = "0.0.0" dependencies = [ "pdfminer.six == 20240706" ] -requires-python = ">= 3.11" [project.scripts] parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main" \ No newline at end of file