From 0da4efc597b436246a7c29e16df9a9d02c82c5bd Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Sun, 27 Oct 2024 01:54:14 -0700 Subject: [PATCH] WIP --- .gitignore | 3 +- parse_powerisa_pdf/parse_powerisa_pdf.py | 1000 +++++++++++++++------- 2 files changed, 686 insertions(+), 317 deletions(-) diff --git a/.gitignore b/.gitignore index b4f223b..4165b58 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /.venv /.vscode *.egg-info -__pycache__ \ No newline at end of file +__pycache__ +*.log \ No newline at end of file diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index bd96947..c830da1 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -4,7 +4,7 @@ from dataclasses import dataclass, field import dataclasses from functools import cached_property import sys -from typing import ClassVar, Container, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload +from typing import Callable, ClassVar, Iterable, Iterator, TypeAlias, TypeVar, assert_never from xml.etree import ElementTree import enum import traceback @@ -23,28 +23,30 @@ class Font: @cached_property def space_width(self) -> float: - return 3.985 * self.size / Font.INSTR_CODE[0].size + return 3.985 * self.size / Font.INSN_CODE[0].size @cached_property def line_height(self) -> float: match self.font_name: - case _ if any(self.font_name == i.font_name for i in Font.INSTR_CODE): - return 9.464 * self.size / Font.INSTR_CODE[0].size - case Font.INSTR_DESC_BOLD.font_name | \ - Font.INSTR_DESC_ITALIC.font_name | \ - Font.INSTR_DESC_BOLD_ITALIC.font_name | \ - Font.NOTATION_PAGE_SUBSCRIPT.font_name | \ - Font.NOTATION_PAGE_SUBSCRIPT_SYM.font_name: - return 10.959 * self.size / Font.INSTR_DESC[0].size - case _ if self in Font.INSTR_DESC or self.font_name == Font.INSTR_DESC[0].font_name: - return 10.959 * self.size / Font.INSTR_DESC[0].size + case _ if any(self.font_name == f.font_name for f in Font.insn_code_fonts()): + return 9.464 * self.size / Font.INSN_CODE[0].size + case Font.INSN_DESC_BOLD.font_name | \ + Font.INSN_DESC_ITALIC.font_name | \ + Font.INSN_DESC_BOLD_ITALIC.font_name | \ + Font.NOTATION_PAGE_SUBSCRIPT.font_name: + return 10.959 * self.size / Font.INSN_DESC[0].size + case _ if self in Font.INSN_DESC or self.font_name == Font.INSN_DESC[0].font_name: + return 10.959 * self.size / Font.INSN_DESC[0].size case _ if self in Font.MATH_MISC: - return 10.959 * self.size / Font.INSTR_DESC[0].size - case _ if self in Font.NOTATION_PAGE_SYM: - return 10.959 * self.size / Font.INSTR_DESC[0].size + return 10.959 * self.size / Font.INSN_DESC[0].size case _: raise AssertionError(f"no line height: {self}") + @classmethod + def insn_code_fonts(cls) -> Iterator[Font]: + yield from cls.INSN_CODE + yield from cls.INSN_CODE_SUBSCRIPT + @classmethod def known_fonts(cls) -> Iterator[Font]: return iter(cls.__KNOWN_NAMES.keys()) @@ -55,122 +57,66 @@ class Font: @classmethod def _register_known_fonts(cls) -> None: - cls.INSTR_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963) + cls.INSN_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963) cls.RTL_FN_HEADER = Font(font_name='APUYSQ+zcoN-Regular', size=9.963) cls.PAGE_HEADER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=9.963) cls.PAGE_FOOTER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.981) - cls.INSTR_DESC = ( + cls.INSN_DESC = ( Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966), + Font(font_name='FZTIYT+CMMI9', size=8.966), + Font(font_name='ONUAYC+CMSSI9', size=8.966), + Font(font_name='TNGBFZ+CMSY9', size=8.966), Font(font_name='WHMZPU+CMEX9', size=8.966), + Font(font_name='ZJTMSG+CMSS9', size=8.966), ) - cls.INSTR_DESC_MISC = ( - Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.377), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.561), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.492), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.641), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.772), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.864), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.925), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.097), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.123), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.131), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.516), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.604), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.634), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.906), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.033), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.068), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.213), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.252), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.962), - Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.977), + cls.INSN_DESC_MISC = tuple( + Font(font_name='MJBFWM+DejaVuSansCondensed', size=i) + for i in [ + 2.377, 2.561, 4.492, 4.641, 4.772, 4.864, 4.925, + 5.097, 5.123, 5.131, 5.516, 5.604, 5.634, 5.906, + 6.033, 6.068, 6.213, 6.252, 6.962, 7.977, + ] ) - cls.INSTR_DESC_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=6.974) - cls.INSTR_DESC_CODE_MISC = ( + cls.INSN_DESC_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=6.974) + cls.INSN_DESC_CODE_MISC = ( Font(font_name='APUYSQ+zcoN-Regular', size=3.587), Font(font_name='APUYSQ+zcoN-Regular', size=4.483), ) - cls.INSTR_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966) - cls.INSTR_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966) - cls.INSTR_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966) - cls.INSTR_DESC_SMALL = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97) - cls.INSTR_DESC_SMALL_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=7.97) - cls.INSTR_DESC_SMALL_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=7.97) - cls.INSTR_DESC_SMALL_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=7.97) - cls.INSTR_DESC_BOLD_MISC = ( - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.21), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.399), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.763), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.946), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.949), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.999), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.065), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.086), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.183), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.686), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.744), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.825), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.842), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.857), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.979), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.032), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.112), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.161), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.206), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.353), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.378), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.434), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.595), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.619), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.647), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.68), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.693), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.736), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.781), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.802), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.995), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.201), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.258), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.363), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.442), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.473), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.485), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.512), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.543), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.613), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.744), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.774), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.809), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.849), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.911), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.92), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.962), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.981), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.146), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.213), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.221), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.243), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.55), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.62), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.699), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.725), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.751), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.856), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.029), - Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.406), + cls.INSN_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966) + cls.INSN_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966) + cls.INSN_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966) + cls.INSN_DESC_SMALL = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97) + cls.INSN_DESC_SMALL_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=7.97) + cls.INSN_DESC_SMALL_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=7.97) + cls.INSN_DESC_SMALL_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=7.97) + cls.INSN_DESC_BOLD_MISC = tuple( + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=i) + for i in [ + 2.21, 2.399, 2.763, 2.946, 2.949, 2.999, + 3.065, 3.086, 3.183, 3.686, 3.744, 3.825, 3.842, 3.857, 3.979, + 4.032, 4.112, 4.161, 4.206, 4.353, 4.378, 4.434, + 4.595, 4.619, 4.647, 4.68, 4.693, 4.736, 4.781, 4.802, 4.995, + 5.201, 5.258, 5.363, 5.442, 5.473, 5.485, + 5.512, 5.543, 5.613, 5.744, 5.774, 5.809, 5.849, 5.911, 5.92, 5.962, 5.981, + 6.146, 6.213, 6.221, 6.243, 6.55, 6.62, 6.699, 6.725, 6.751, 6.856, + 8.029, 8.406, + ] ) - cls.INSTR_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978) - cls.INSTR_DESC_BOLD_SUBSCRIPT = \ + cls.INSN_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978) + cls.INSN_DESC_BOLD_SUBSCRIPT = \ Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.978) - cls.INSTR_DESC_BOLD_ITALIC_SUBSCRIPT = \ + cls.INSN_DESC_BOLD_ITALIC_SUBSCRIPT = \ Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=5.978) - cls.INSTR_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966) - cls.INSTR_CODE = ( + cls.INSN_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966) + cls.INSN_CODE = ( Font(font_name='APUYSQ+zcoN-Regular', size=7.97), Font(font_name='RRFUNA+CMSY8', size=7.97), Font(font_name='HPXOZC+CMSS8', size=7.97), ) - cls.INSTR_CODE_SUBSCRIPT = Font(font_name='APUYSQ+zcoN-Regular', size=5.978) + cls.INSN_CODE_SUBSCRIPT = ( + Font(font_name='APUYSQ+zcoN-Regular', size=5.978), + Font(font_name='DBQTKF+CMSY6', size=5.978), + ) cls.TITLE_PAGE_BIG = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=24.787) cls.TITLE_PAGE_VERSION = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=9.963) cls.TITLE_PAGE_TM = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.974) @@ -179,13 +125,6 @@ class Font: cls.LEGAL_PAGE_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=9.963) cls.CHANGE_SUMMARY_PAGE_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=11.955) cls.CHAPTER_TITLE = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=17.215) - cls.NOTATION_PAGE_SYM = ( - Font(font_name='FZTIYT+CMMI9', size=8.966), - Font(font_name='ONUAYC+CMSSI9', size=8.966), - Font(font_name='TNGBFZ+CMSY9', size=8.966), - Font(font_name='ZJTMSG+CMSS9', size=8.966), - ) - cls.NOTATION_PAGE_SUBSCRIPT_SYM = Font(font_name='DBQTKF+CMSY6', size=5.978) cls.NOTATION_PAGE_SUBSCRIPT = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=5.978) cls.MATH_MISC = ( Font(font_name='AAJMKT+CMMI6', size=5.978), @@ -251,32 +190,64 @@ class Char: @dataclass() class Parser: - def parse_pdf(self, file: str, page_numbers: Container[int] | None = None): - for page in extract_pages(file, page_numbers=page_numbers): + def parse_pdf(self, file: str, page_numbers: Iterable[int] | None = None): + if page_numbers is not None: + page_numbers = sorted(i - 1 for i in page_numbers) + for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): + if page_numbers is not None: + page_num = page_numbers[i] + 1 + else: + page_num = i + 1 + print(f"page {page_num}") try: - PageParser(parser=self, page_id=page.pageid).parse_page(page) + PageParser(parser=self, page_num=page_num).parse_page(page) except Exception as e: - e.add_note(f"pageid={page.pageid}") + e.add_note(f"page_num={page_num}") raise COLUMN_SPLIT_X = 300.0 -INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT = 29.938 -INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT = 9.278 -INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT = 20.971 -INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 20.175 -INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2 = 14.694 -INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317 -INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT = 24.657 + 1.586 -INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT = 32.927 + 2.1519216 -INSTR_MNEMONIC_TOP_PAD_HEIGHT = 15.75 +PAGE_BODY_MAX_Y = 780.0 +PAGE_BODY_MIN_Y = 45.0 +INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT = 29.938 +INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT = 9.278 +INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT = 20.971 +INSN_BIT_FIELDS_TOP_PAD_HEIGHT = 20.175 +INSN_BIT_FIELDS_TOP_PAD_HEIGHT2 = 14.694 +INSN_BIT_FIELDS_BOX_HEIGHT = 22.317 +INSN_SP_REGS_ALTERED_REGISTER_COLUMN_X = 34.405 +INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X = 86.692 +INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X = 188.74 @dataclass() class ParsedTextLine: element: ElementTree.Element regular_min_y: float + regular_max_y: float fonts: TextLineFonts chars: list[Char] + preceding_blank_lines: int + + @property + def regular_height(self) -> float: + return self.regular_max_y - self.regular_min_y + + def get_header_text(self) -> None | str: + assert self.fonts == TextLineFonts.INSN_DESC_FONTS + if (self.element.text or "").strip() != "": + return None + if (self.element.tail or "").strip() != "": + return None + if len(self.element) != 1: + return None + if self.element[0].tag != "b": + return None + if len(self.element[0]) != 0: + return None + text = "".join(self.element.itertext()) + if text.endswith(":") and text[0].istitle(): + return text + return None def __repr__(self) -> str: fields = [] @@ -295,13 +266,12 @@ class ParsedTextLine: return f"{__class__.__name__}({sep.join(fields)})" def __str__(self) -> str: - return ElementTree.tostring(self.element, encoding="unicode") + return "\n" * self.preceding_blank_lines + ElementTree.tostring(self.element, encoding="unicode") _T = TypeVar("_T") class BaselinePos(enum.Enum): ABOVE = "above" - BASELINE = "baseline" BELOW = "below" @dataclass(unsafe_hash=True, frozen=True) @@ -311,27 +281,76 @@ class TextLineFonts: bold: tuple[Font, ...] | None = None bold_italic: tuple[Font, ...] | None = None subscript: tuple[Font, ...] | None = None + code: tuple[Font, ...] | None = None + code_subscript: tuple[Font, ...] | None = None - def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | tuple[tuple[Font, ...], BaselinePos]: + @classmethod + def _define_fonts(cls): + cls.INSN_MNEMONIC_FONTS = cls( + regular=Font.INSN_DESC, + ) + cls.INSN_HEADER_FONTS = cls( + regular=(Font.INSN_HEADER,), + ) + cls.INSN_BIT_FIELD_BIT_NUMBER_FONTS = cls( + regular=(Font.INSN_DESC_SMALL,), + ) + cls.INSN_BIT_FIELD_NAME_FONTS = cls( + regular=Font.INSN_DESC, + ) + cls.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS = cls( + regular=(Font.INSN_DESC_SMALL,), + bold=(Font.INSN_DESC_SMALL_BOLD,), + ) + cls.INSN_CODE_FONTS = cls( + regular=Font.INSN_CODE, + subscript=Font.INSN_CODE_SUBSCRIPT, + ) + cls.INSN_DESC_FONTS = cls( + regular=Font.INSN_DESC, + bold=(Font.INSN_DESC_BOLD,), + italic=(Font.INSN_DESC_ITALIC,), + bold_italic=(Font.INSN_DESC_BOLD_ITALIC,), + subscript=(Font.INSN_DESC_SUBSCRIPT,), + code=(Font.INSN_DESC_CODE, Font.INSN_EXT_MNEMONIC), + code_subscript=Font.INSN_CODE_SUBSCRIPT, + ) + + del cls._define_fonts + + def get_font( + self, + part_kind: TextLineFontKind, + default: _T=None, + ) -> _T | tuple[tuple[Font, ...], None | BaselinePos]: match part_kind: case TextLineFontKind.REGULAR: font = self.regular - baseline_pos = BaselinePos.BASELINE + baseline_pos = None case TextLineFontKind.ITALIC: font = self.italic - baseline_pos = BaselinePos.BASELINE + baseline_pos = None case TextLineFontKind.BOLD: font = self.bold - baseline_pos = BaselinePos.BASELINE + baseline_pos = None case TextLineFontKind.BOLD_ITALIC: font = self.bold_italic - baseline_pos = BaselinePos.BASELINE + baseline_pos = None case TextLineFontKind.SUBSCRIPT: font = self.subscript baseline_pos = BaselinePos.BELOW case TextLineFontKind.SUPERSCRIPT: font = self.subscript baseline_pos = BaselinePos.ABOVE + case TextLineFontKind.CODE: + font = self.code + baseline_pos = None + case TextLineFontKind.CODE_SUBSCRIPT: + font = self.code_subscript + baseline_pos = BaselinePos.BELOW + case TextLineFontKind.CODE_SUPERSCRIPT: + font = self.code_subscript + baseline_pos = BaselinePos.ABOVE case _: assert_never(part_kind) if font is None: @@ -339,8 +358,8 @@ class TextLineFonts: return font, baseline_pos @cached_property - def __font_to_kind_map(self) -> dict[tuple[Font, BaselinePos], TextLineFontKind]: - retval: dict[tuple[Font, BaselinePos], TextLineFontKind] = {} + def __font_to_kind_map(self) -> dict[tuple[Font, None | BaselinePos], TextLineFontKind]: + retval: dict[tuple[Font, None | BaselinePos], TextLineFontKind] = {} for kind in TextLineFontKind: fonts = self.get_font(kind) if fonts is None: @@ -352,8 +371,26 @@ class TextLineFonts: retval[font, baseline_pos] = kind return retval + @cached_property + def fonts(self) -> frozenset[Font]: + retval: set[Font] = set() + for kind in TextLineFontKind: + fonts = self.get_font(kind) + if fonts is None: + continue + fonts, baseline_pos = fonts + retval.update(fonts) + return frozenset(retval) + def get_kind(self, font: Font, baseline_pos: BaselinePos, default: _T=None) -> _T | TextLineFontKind: - return self.__font_to_kind_map.get((font, baseline_pos), default) + retval = self.__font_to_kind_map.get((font, baseline_pos)) + if retval is None: + retval = self.__font_to_kind_map.get((font, None)) + if retval is None: + return default + return retval + +TextLineFonts._define_fonts() class TextLineFontKind(enum.Enum): REGULAR = "regular" @@ -362,6 +399,9 @@ class TextLineFontKind(enum.Enum): BOLD_ITALIC = "bold_italic" SUBSCRIPT = "subscript" SUPERSCRIPT = "superscript" + CODE = "code" + CODE_SUBSCRIPT = "code_subscript" + CODE_SUPERSCRIPT = "code_superscript" @cached_property def text_line_tags(self) -> tuple[str, ...]: @@ -378,13 +418,19 @@ class TextLineFontKind(enum.Enum): return "sub", case TextLineFontKind.SUPERSCRIPT: return "sup", + case TextLineFontKind.CODE: + return "code", + case TextLineFontKind.CODE_SUBSCRIPT: + return "code", "sub" + case TextLineFontKind.CODE_SUPERSCRIPT: + return "code", "sup" case _: assert_never(self) -class PageParseFailed(Exception): +class PageParseError(Exception): pass -class InstrParseFailed(Exception): +class InsnParseError(Exception): pass class ElementBodyBuilder: @@ -440,28 +486,28 @@ class ElementBodyBuilder: self.__flush_text_buffer() @dataclass(unsafe_hash=True, frozen=True) -class InstrBitField: +class InsnBitField: box_min_x: float box_max_x: float name: None | ParsedTextLine - start_bit: None | ParsedTextLine + bit_number: None | ParsedTextLine def __str__(self) -> str: - return f"" + return f"" @dataclass(unsafe_hash=True, frozen=True) -class InstrBitFieldsPrefix: +class InsnBitFieldsPrefix: box_min_x: float box_min_y: float box_max_x: float box_max_y: float prefix_text: ParsedTextLine - fields: tuple[InstrBitField, ...] + fields: tuple[InsnBitField, ...] suffix_text: ParsedTextLine def __str__(self): sep = ",\n " - return (f"") @dataclass(unsafe_hash=True, frozen=True) -class InstrBitFields: - prefix: None | InstrBitFieldsPrefix +class InsnBitFields: + prefix: None | InsnBitFieldsPrefix box_min_x: float box_min_y: float box_max_x: float box_max_y: float - fields: tuple[InstrBitField, ...] + fields: tuple[InsnBitField, ...] def __str__(self): sep = ",\n " prefix_str = "" if self.prefix is not None: prefix_str = f"{self.prefix}\n" - return (f"{prefix_str}") +@dataclass(unsafe_hash=True, frozen=True) +class InsnSpRegsAlteredEntry: + reg: ParsedTextLine + fields: tuple[ParsedTextLine, ...] + conds: tuple[ParsedTextLine, ...] + + def __str__(self, indent="") -> str: + fields = "\n".join([ + "(", + *(f"{indent} {i}," for i in self.fields), + f"{indent} )", + ]) + if self.fields == (): + fields = "()" + conds = "\n".join([ + "(", + *(f"{indent} {i}," for i in self.conds), + f"{indent} )", + ]) + if self.conds == (): + conds = "()" + return (f"Entry(\n" + f"{indent} reg={self.reg},\n" + f"{indent} fields={fields},\n" + f"{indent} conds={conds},\n" + f"{indent})") + +@dataclass(unsafe_hash=True, frozen=True) +class InsnSpRegsAltered: + sp_regs_altered_text: ParsedTextLine + special_text: None | ParsedTextLine + table_header_reg: None | ParsedTextLine + table_header_fields: None | ParsedTextLine + entries: tuple[InsnSpRegsAlteredEntry, ...] + final_regular_min_y: float + + def __str__(self) -> str: + lines = [ + "InsnSpRegsAltered(", + f" sp_regs_altered_text={self.sp_regs_altered_text}," + ] + if self.special_text is not None: + lines.append(f" special_text={self.special_text},") + if self.table_header_reg is not None: + lines.append(f" table_header_reg={self.table_header_reg},") + if self.table_header_fields is not None: + lines.append(f" table_header_fields={self.table_header_fields},") + if len(self.entries) == 0: + lines.append(" entries=(),") + else: + lines.append(" entries=(") + for entry in self.entries: + lines.append(f" {entry.__str__(' ')},") + lines.append(" ),") + lines.append(f" final_regular_min_y={self.final_regular_min_y},") + lines.append(f")") + return "\n".join(lines) + CHAR_TO_EXPANDED = { "\ufb00": "ff", "\ufb01": "fi", @@ -498,7 +602,7 @@ CHAR_TO_EXPANDED = { @dataclass() class PageParser: parser: Parser - page_id: int + page_num: int qt: QuadTree[Char | LTLine | LTRect] = field(default_factory=QuadTree) unprocessed_chars: defaultdict[Font, SetById[Char]] = field( default_factory=lambda: defaultdict(SetById[Char])) @@ -508,6 +612,8 @@ class PageParser: def parse_page(self, page: LTPage): for component in page: if isinstance(component, (LTLine, LTRect)): + if isinstance(component, LTRect): + print(component) self.qt.insert(component.x0, component.y0, component) continue if not isinstance(component, LTTextBox): @@ -545,10 +651,39 @@ class PageParser: raise AssertionError("\nunknown fonts:\n" + "\n".join(unknown_fonts) + "\n\n" + "\n".join(unknown_font_errors)) try: - self.extract_instrs() - except InstrParseFailed: + self.extract_insns() + except InsnParseError as e: + e.add_note(f"page_num={self.page_num}") + print("".join(traceback.format_exception_only(e)), flush=True) traceback.print_exc() + def find_top_left_char_in_range(self, *, + min_x: float, + max_x: float, + min_y: float, + max_y: float, + allow_processed: bool, + pred: None | Callable[[Char], bool] = None, + ) -> None | Char: + retval = None + for x, y, char in self.qt.range( + min_x=min_x, + max_x=max_x, + min_y=min_y, + max_y=max_y, + ): + if not isinstance(char, Char): + continue + if not allow_processed and char not in self.unprocessed_chars[char.font]: + continue + if retval is None: + if pred is None or pred(char): + retval = char + elif char.min_x - char.min_y < retval.min_x - retval.min_y: + if pred is None or pred(char): + retval = char + return retval + def extract_text_line( self, *, start_char: None | Char = None, @@ -556,6 +691,7 @@ class PageParser: min_x: float, max_x: float, fonts: TextLineFonts, + preceding_blank_lines: int = 0, skip_initial_spaces=False, allowed_start_min_y_error=None, ) -> None | ParsedTextLine: @@ -582,23 +718,30 @@ class PageParser: retval = ParsedTextLine( element=ElementTree.Element("text-line"), regular_min_y=chars[0].min_y, + regular_max_y=chars[0].max_y, fonts=fonts, chars=chars, + preceding_blank_lines=preceding_blank_lines, ) text_and_tag_stacks: list[tuple[str, tuple[str, ...]]] = [] last_max_x = min_x last_kind = None last_char = None for char in chars: - if char.min_y - retval.regular_min_y < -0.2: - baseline_pos = BaselinePos.BELOW - elif char.min_y - retval.regular_min_y > 1: + if (char.max_y + char.min_y) * 0.5 > (retval.regular_max_y + retval.regular_min_y) * 0.5: baseline_pos = BaselinePos.ABOVE else: - baseline_pos = BaselinePos.BASELINE + baseline_pos = BaselinePos.BELOW kind = fonts.get_kind(font=char.font, baseline_pos=baseline_pos) if kind is None: - print(f"font kind is None:\nfonts={fonts}\nchar={char}\nbaseline_pos={baseline_pos}") + print( + f"font kind is None:\n" + f"regular_min_y={retval.regular_min_y}\n" + f"fonts={fonts}\n" + f"char={char}\n" + f"baseline_pos={baseline_pos}\n" + f"chars[0]={chars[0]}" + ) return None if last_kind is None: space_kind = kind @@ -606,16 +749,22 @@ class PageParser: space_kind = TextLineFontKind.REGULAR else: space_kind = kind - space_font, _ = fonts.get_font(space_kind, (fonts.regular, BaselinePos.BASELINE)) + space_font, _ = fonts.get_font(space_kind, (fonts.regular, None)) space_width = char.min_x - last_max_x space_count_f = space_width / space_font[0].space_width space_count = round(space_count_f) + if space_count == 0 and space_count_f > 0.4: + space_count = 1 if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15: print(f"spaces: space_count_f={space_count_f} space_width={space_width}") if space_count > 0 and not skip_initial_spaces: text_and_tag_stacks.append((" " * space_count, space_kind.text_line_tags)) skip_initial_spaces = False - if (char.text == "\u0338" and last_char is not None and last_char.text == "=" and abs(char.min_x - last_char.min_x) < 0.01 and abs(char.min_y - last_char.min_y) < 0.01): + if (char.text == "\u0338" + and last_char is not None + and last_char.text == "=" + and abs(char.min_x - last_char.min_x) < 0.01 + and abs(char.min_y - last_char.min_y) < 0.01): text_and_tag_stacks[-1] = "\u2260", () last_max_x = last_char.max_x else: @@ -658,19 +807,19 @@ class PageParser: ) return retval - def extract_instr_bit_fields( + def extract_insn_bit_fields( self, min_x: float, max_x: float, mnemonic_lines: list[ParsedTextLine], - ) -> None | InstrBitFields: + ) -> None | InsnBitFields: found_non_affix_line = False if len(mnemonic_lines) > 1: expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y - - INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2) + - INSN_BIT_FIELDS_TOP_PAD_HEIGHT2) else: expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y - - INSTR_BIT_FIELDS_TOP_PAD_HEIGHT) + - INSN_BIT_FIELDS_TOP_PAD_HEIGHT) for x, y, line in self.qt.range( min_x=min_x - 5, max_x=max_x + 5, @@ -683,63 +832,57 @@ class PageParser: found_non_affix_line = True break if found_non_affix_line: - return self.extract_instr_bit_fields_box( + return self.extract_insn_bit_fields_box( min_x=min_x, max_x=max_x, expected_box_max_y=expected_non_affix_line_y, ) prefix_text = self.extract_text_line( start_min_y=mnemonic_lines[-1].regular_min_y - - INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT, + - INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT, min_x=min_x, max_x=max_x, - fonts=TextLineFonts( - regular=(Font.INSTR_DESC_SMALL,), - bold=(Font.INSTR_DESC_SMALL_BOLD,), - ), + fonts=TextLineFonts.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS, allowed_start_min_y_error=2, ) if prefix_text is None: - raise InstrParseFailed("can't find instr prefix bit fields title") + raise InsnParseError("can't find insn prefix bit fields title") prefix_text_str = "".join(prefix_text.element.itertext()) if prefix_text_str != "Prefix:": - raise InstrParseFailed( - f"instr prefix bit fields title is not as expected: {prefix_text_str!r}") - prefix_bit_fields = self.extract_instr_bit_fields_box( + raise InsnParseError( + f"insn prefix bit fields title is not as expected: {prefix_text_str!r}") + prefix_bit_fields = self.extract_insn_bit_fields_box( min_x=min_x, max_x=max_x, expected_box_max_y=prefix_text.regular_min_y - - INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, + - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, ) if prefix_bit_fields is None: - raise InstrParseFailed("can't find instr prefix bit fields") + raise InsnParseError("can't find insn prefix bit fields") suffix_text = self.extract_text_line( start_min_y=prefix_bit_fields.box_min_y - - INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT, + - INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT, min_x=min_x, max_x=max_x, - fonts=TextLineFonts( - regular=(Font.INSTR_DESC_SMALL,), - bold=(Font.INSTR_DESC_SMALL_BOLD,), - ), + fonts=TextLineFonts.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS, allowed_start_min_y_error=2, ) if suffix_text is None: - raise InstrParseFailed("can't find instr suffix bit fields title") + raise InsnParseError("can't find insn suffix bit fields title") suffix_text_str = "".join(suffix_text.element.itertext()) if suffix_text_str != "Suffix:": - raise InstrParseFailed( - f"instr suffix bit fields title is not as expected: {suffix_text_str!r}") - suffix_bit_fields = self.extract_instr_bit_fields_box( + raise InsnParseError( + f"insn suffix bit fields title is not as expected: {suffix_text_str!r}") + suffix_bit_fields = self.extract_insn_bit_fields_box( min_x=min_x, max_x=max_x, expected_box_max_y=suffix_text.regular_min_y - - INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, + - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, ) if suffix_bit_fields is None: - raise InstrParseFailed("can't find instr suffix bit fields") - return InstrBitFields( - prefix=InstrBitFieldsPrefix( + raise InsnParseError("can't find insn suffix bit fields") + return InsnBitFields( + prefix=InsnBitFieldsPrefix( box_min_x=prefix_bit_fields.box_min_x, box_min_y=prefix_bit_fields.box_min_y, box_max_x=prefix_bit_fields.box_max_x, @@ -755,18 +898,18 @@ class PageParser: fields=suffix_bit_fields.fields, ) - def extract_instr_bit_fields_box( + def extract_insn_bit_fields_box( self, min_x: float, max_x: float, expected_box_max_y: float, - ) -> None | InstrBitFields: + ) -> None | InsnBitFields: h_lines: list[LTLine] = [] v_lines: list[LTLine] = [] for x, y, line in self.qt.range( min_x=min_x - 5, max_x=max_x + 5, - min_y=expected_box_max_y - INSTR_BIT_FIELDS_BOX_HEIGHT - 5, + min_y=expected_box_max_y - INSN_BIT_FIELDS_BOX_HEIGHT - 5, max_y=expected_box_max_y + 5, ): if not isinstance(line, LTLine): @@ -783,10 +926,10 @@ class PageParser: if len(h_lines) == 0 and len(v_lines) == 0: return None if len(h_lines) != 2: - raise InstrParseFailed( + raise InsnParseError( f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}") if len(v_lines) < 2: - raise InstrParseFailed( + raise InsnParseError( f"instruction bit fields box has too few vertical lines:\n{h_lines}") bottom_line, top_line = h_lines box_min_x = v_lines[0].x0 @@ -797,35 +940,31 @@ class PageParser: print(f"bottom_line={bottom_line}") print(f"top_line={top_line}") print(v_lines) - fields: list[InstrBitField] = [] + fields: list[InsnBitField] = [] for i in range(len(v_lines) - 1): left_line = v_lines[i] right_line = v_lines[i + 1] field_box_min_x = left_line.x1 field_box_max_x = right_line.x0 - fields.append(InstrBitField( + fields.append(InsnBitField( box_min_x=field_box_min_x, box_max_x=field_box_max_x, name=self.extract_text_line( start_min_y=box_mid_y + 3.288, min_x=field_box_min_x, max_x=field_box_max_x, - fonts=TextLineFonts( - regular=Font.INSTR_DESC, - ), + fonts=TextLineFonts.INSN_BIT_FIELD_NAME_FONTS, skip_initial_spaces=True, ), - start_bit=self.extract_text_line( + bit_number=self.extract_text_line( start_min_y=box_min_y + 3.487, min_x=field_box_min_x, max_x=field_box_max_x, - fonts=TextLineFonts( - regular=(Font.INSTR_DESC_SMALL,), - ), + fonts=TextLineFonts.INSN_BIT_FIELD_BIT_NUMBER_FONTS, skip_initial_spaces=True, ), )) - return InstrBitFields( + return InsnBitFields( prefix=None, box_min_x=box_min_x, box_min_y=box_min_y, @@ -834,146 +973,375 @@ class PageParser: fields=tuple(fields), ) - def extract_instr(self, header_start_char: Char): - assert header_start_char.font == Font.INSTR_HEADER - column_min_x = header_start_char.min_x - if column_min_x < COLUMN_SPLIT_X: - column_max_x = COLUMN_SPLIT_X - else: - column_max_x = 1000 + def extract_insn_header_mnemonics_and_bit_fields( + self, + column_min_x: float, + column_max_x: float, + start_min_y: float, + header_start_char: None | Char = None, + ) -> None | tuple[list[ParsedTextLine], list[ParsedTextLine], InsnBitFields]: + assert header_start_char is None or \ + header_start_char.font == Font.INSN_HEADER header_line = self.extract_text_line( start_char=header_start_char, - start_min_y=header_start_char.min_y, + start_min_y=start_min_y, min_x=column_min_x, max_x=column_max_x, - fonts=TextLineFonts(regular=(Font.INSTR_HEADER,)), + fonts=TextLineFonts.INSN_HEADER_FONTS, + skip_initial_spaces=True, + allowed_start_min_y_error=6, ) if header_line is None: - raise PageParseFailed("can't find header text line") + return None + print(f"found header line:\n{header_line}") header_lines = self.extract_following_text_lines( first_text_line=header_line, min_x=column_min_x, max_x=column_max_x, allowed_start_min_y_error=1.5, ) - print("instr header lines:") + print("insn header lines:") print("\n".join(map(str, header_lines))) + mnemonic_start_char = self.find_top_left_char_in_range( + min_x=column_min_x - 5, + max_x=column_max_x + 5, + min_y=header_lines[-1].regular_min_y - 50, + max_y=header_lines[-1].regular_min_y - 5, + allow_processed=False, + ) + if mnemonic_start_char is None: + raise InsnParseError("can't find insn mnemonic text line") mnemonic_line = self.extract_text_line( - start_min_y=header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT, + start_char=mnemonic_start_char, + start_min_y=mnemonic_start_char.min_y, min_x=column_min_x, max_x=column_max_x, - fonts=TextLineFonts( - regular=Font.INSTR_DESC, - ), + fonts=TextLineFonts.INSN_MNEMONIC_FONTS, skip_initial_spaces=True, - allowed_start_min_y_error=3, ) if mnemonic_line is None: - raise InstrParseFailed("can't find instr mnemonic text line") + raise InsnParseError("can't find insn mnemonic text line") mnemonic_lines = self.extract_following_text_lines( first_text_line=mnemonic_line, min_x=mnemonic_line.chars[0].min_x, max_x=column_max_x, ) - print("instr mnemonic lines:") + print("insn mnemonic lines:") print("\n".join(map(str, mnemonic_lines))) - instr_bit_fields = self.extract_instr_bit_fields( + insn_bit_fields = self.extract_insn_bit_fields( min_x=column_min_x, max_x=column_max_x, mnemonic_lines=mnemonic_lines, ) - print(instr_bit_fields) - if instr_bit_fields is None: - raise InstrParseFailed("can't find instr bit fields") - alt_header_line = self.extract_text_line( - start_min_y=instr_bit_fields.box_min_y - - INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT, - min_x=column_min_x, - max_x=column_max_x, - fonts=TextLineFonts( - regular=(Font.INSTR_HEADER,), - ), - skip_initial_spaces=True, - allowed_start_min_y_error=6, - ) - if alt_header_line is not None: - print(f"found alt header line:\n{alt_header_line}") - alt_header_lines = self.extract_following_text_lines( - first_text_line=alt_header_line, - min_x=column_min_x, - max_x=column_max_x, - allowed_start_min_y_error=1.5, - ) - print("instr alt header lines:") - print("\n".join(map(str, alt_header_lines))) - alt_mnemonic_line = self.extract_text_line( - start_min_y=alt_header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT, - min_x=column_min_x, - max_x=column_max_x, - fonts=TextLineFonts( - regular=Font.INSTR_DESC, - ), - skip_initial_spaces=True, - allowed_start_min_y_error=1.5, - ) - if alt_mnemonic_line is None: - raise InstrParseFailed("can't find instr alt mnemonic text line") - alt_mnemonic_lines = self.extract_following_text_lines( - first_text_line=alt_mnemonic_line, - min_x=alt_mnemonic_line.chars[0].min_x, - max_x=column_max_x, - ) - print("instr alt mnemonic lines:") - print("\n".join(map(str, alt_mnemonic_lines))) - alt_instr_bit_fields = self.extract_instr_bit_fields( - min_x=column_min_x, - max_x=column_max_x, - mnemonic_lines=alt_mnemonic_lines, - ) - print(alt_instr_bit_fields) - if alt_instr_bit_fields is None: - raise InstrParseFailed("can't find instr alt bit fields") - last_instr_bit_fields = alt_instr_bit_fields - else: - print("no alt header line") - alt_header_lines = None - alt_mnemonic_lines = None - alt_instr_bit_fields = None - last_instr_bit_fields = instr_bit_fields + print(insn_bit_fields) + if insn_bit_fields is None: + raise InsnParseError("can't find insn bit fields") + return header_lines, mnemonic_lines, insn_bit_fields - code_line = None - for y_offset in reversed(range(4)): - code_line = self.extract_text_line( - start_min_y=last_instr_bit_fields.box_min_y - - INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT - + y_offset * 0.5 * Font.INSTR_CODE[0].line_height, - min_x=column_min_x, - max_x=column_max_x, - fonts=TextLineFonts( - regular=Font.INSTR_CODE, - subscript=(Font.INSTR_CODE_SUBSCRIPT,), - ), - skip_initial_spaces=True, - allowed_start_min_y_error=1, - ) - if code_line is not None: - break - if code_line is None: - raise InstrParseFailed("can't find instr code text line") - code_lines = self.extract_following_text_lines( - first_text_line=code_line, - min_x=code_line.chars[0].min_x, - max_x=column_max_x, - allowed_start_min_y_error=0.05, + def extract_insn_sp_regs_altered( + self, + sp_regs_altered_text: ParsedTextLine, + column_min_x: float, + column_max_x: float, + ) -> InsnSpRegsAltered: + sp_regs_altered_text.preceding_blank_lines = 0 + fonts = TextLineFonts.INSN_DESC_FONTS + table_header_reg_char = self.find_top_left_char_in_range( + min_x=column_min_x - 1, + max_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 1, + min_y=sp_regs_altered_text.regular_min_y - 30, + max_y=sp_regs_altered_text.regular_min_y - 5, + allow_processed=False, ) - print("instr code lines:") + assert table_header_reg_char is not None, \ + "can't find special registers altered table's register-column's header" + match table_header_reg_char.text: + case "N": + none_text = self.extract_text_line( + start_char=table_header_reg_char, + start_min_y=table_header_reg_char.min_y, + min_x=column_min_x, + max_x=column_max_x, + fonts=fonts, + skip_initial_spaces=True, + ) + assert none_text is not None and none_text.element.text == "None", \ + f"can't find special-registers-altered None: none_text={none_text}" + return InsnSpRegsAltered( + sp_regs_altered_text=sp_regs_altered_text, + special_text=none_text, + table_header_reg=None, + table_header_fields=None, + entries=(), + final_regular_min_y=none_text.regular_min_y, + ) + case "D": + special_text = self.extract_text_line( + start_char=table_header_reg_char, + start_min_y=table_header_reg_char.min_y, + min_x=column_min_x, + max_x=column_max_x, + fonts=fonts, + skip_initial_spaces=True, + ) + assert special_text is not None \ + and special_text.element.text == "Dependent on the system service", \ + f"can't find special-registers-altered special-text:\n{special_text}" + return InsnSpRegsAltered( + sp_regs_altered_text=sp_regs_altered_text, + special_text=special_text, + table_header_reg=None, + table_header_fields=None, + entries=(), + final_regular_min_y=special_text.regular_min_y, + ) + case "R": + pass + case text: + raise AssertionError( + f"unknown special-registers-altered special-text start character: {text!r}") + table_header_fields_char = self.find_top_left_char_in_range( + min_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 10, + max_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, + min_y=table_header_reg_char.min_y - 5, + max_y=table_header_reg_char.min_y + 5, + allow_processed=False, + ) + assert table_header_fields_char is not None, \ + "can't find special registers altered table's fields-column's header" + assert table_header_fields_char.text == "F", ( + f"can't find special registers altered table's fields-column's header:\n" + f"table_header_fields_char={table_header_fields_char}") + columns_x_bounds = ( + (table_header_reg_char.min_x, table_header_fields_char.min_x - 1), + (table_header_fields_char.min_x, + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X), + (column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, column_max_x), + ) + table_header_reg = self.extract_text_line( + start_char=table_header_reg_char, + start_min_y=table_header_reg_char.min_y, + min_x=columns_x_bounds[0][0], + max_x=columns_x_bounds[0][1], + fonts=fonts, + ) + assert table_header_reg is not None, \ + "can't find special registers altered table's register-column's header" + table_header_reg_text = "".join(table_header_reg.element.itertext()) + assert table_header_reg_text == "Register", ( + f"can't find special registers altered table's register-column's header:\n" + f"table_header_reg_text={table_header_reg_text!r}") + table_header_fields = self.extract_text_line( + start_char=table_header_fields_char, + start_min_y=table_header_fields_char.min_y, + min_x=columns_x_bounds[1][0], + max_x=columns_x_bounds[1][1], + fonts=fonts, + ) + assert table_header_fields is not None, \ + "can't find special registers altered table's fields-column's header" + table_header_fields_text = "".join(table_header_fields.element.itertext()) + assert table_header_fields_text == "Field(s)", ( + f"can't find special registers altered table's fields-column's header:\n" + f"table_header_fields_text={table_header_fields_text!r}") + regular_min_y = table_header_reg.regular_min_y + entries: list[InsnSpRegsAlteredEntry] = [] + row: list[None | ParsedTextLine] = [None, None, None] + cur_reg: None | ParsedTextLine = None + cur_fields: list[ParsedTextLine] = [] + cur_conds: list[ParsedTextLine] = [] + while True: + next_regular_min_y = None + for i, (min_x, max_x) in enumerate(columns_x_bounds): + row[i] = cell = self.extract_text_line( + start_min_y=regular_min_y - fonts.regular[0].line_height, + min_x=min_x, + max_x=max_x, + fonts=fonts, + skip_initial_spaces=True, + allowed_start_min_y_error=2, + ) + if cell is not None and next_regular_min_y is None: + next_regular_min_y = cell.regular_min_y + if next_regular_min_y is None: + break + regular_min_y = next_regular_min_y + cur_reg_cell, cur_fields_cell, cur_conds_cell = row + if cur_reg_cell is None: + assert cur_reg is not None, \ + "can't find special registers altered table's first register" + if cur_fields_cell is not None: + cur_fields.append(cur_fields_cell) + if cur_conds_cell is not None: + cur_conds.append(cur_conds_cell) + continue + if cur_reg is not None: + entries.append(InsnSpRegsAlteredEntry( + reg=cur_reg, + fields=tuple(cur_fields), + conds=tuple(cur_conds), + )) + cur_fields.clear() + cur_conds.clear() + cur_reg = cur_reg_cell + if cur_fields_cell is not None: + cur_fields.append(cur_fields_cell) + if cur_conds_cell is not None: + cur_conds.append(cur_conds_cell) + assert cur_reg is not None, \ + "can't find special registers altered table's first register" + entries.append(InsnSpRegsAlteredEntry( + reg=cur_reg, + fields=tuple(cur_fields), + conds=tuple(cur_conds), + )) + return InsnSpRegsAltered( + sp_regs_altered_text=sp_regs_altered_text, + special_text=None, + table_header_reg=table_header_reg, + table_header_fields=table_header_fields, + entries=tuple(entries), + final_regular_min_y=regular_min_y, + ) + + def extract_insn(self, header_start_char: Char): + assert header_start_char.font == Font.INSN_HEADER + print(header_start_char) + column_min_x = header_start_char.min_x + if column_min_x < COLUMN_SPLIT_X: + column_max_x = COLUMN_SPLIT_X + else: + column_max_x = 1000 + header = self.extract_insn_header_mnemonics_and_bit_fields( + column_min_x=column_min_x, + column_max_x=column_max_x, + start_min_y=header_start_char.min_y, + header_start_char=header_start_char, + ) + if header is None: + raise PageParseError("can't find header text line") + next_start_min_y = header[2].box_min_y - 5 + headers = [header] + code_lines: list[ParsedTextLine] = [] + desc_lines: list[ParsedTextLine] = [] + sp_regs_altered = None + while True: + search_min_y = next_start_min_y - 70 + next_char = self.find_top_left_char_in_range( + min_x=column_min_x - 5, + max_x=column_max_x + 5, + min_y=max(search_min_y, PAGE_BODY_MIN_Y), + max_y=next_start_min_y, + allow_processed=False, + ) + if next_char is None: + if column_max_x == COLUMN_SPLIT_X and search_min_y <= PAGE_BODY_MIN_Y: + # go to other column + column_min_x = COLUMN_SPLIT_X + column_max_x = 1000 + next_start_min_y = PAGE_BODY_MAX_Y + continue + else: + raise InsnParseError("can't find insn code or description text") + match next_char.font: + case font if font in TextLineFonts.INSN_CODE_FONTS.fonts: + next_section = "code" + case font if font in TextLineFonts.INSN_DESC_FONTS.fonts: + next_section = "desc" + case Font.INSN_HEADER: + next_section = "header" + case font: + raise InsnParseError(f"can't find insn code or description text\nfont={font}") + match next_section: + case "code": + if len(desc_lines) != 0: + break + code_line = self.extract_text_line( + start_char=next_char, + start_min_y=next_char.min_y, + min_x=next_char.min_x, + max_x=column_max_x, + fonts=TextLineFonts.INSN_CODE_FONTS, + preceding_blank_lines=0 if len(code_lines) == 0 else 1, + ) + if code_line is None: + raise InsnParseError("can't find insn code text line") + more_code_lines = self.extract_following_text_lines( + first_text_line=code_line, + min_x=code_line.chars[0].min_x, + max_x=column_max_x, + allowed_start_min_y_error=0.05, + ) + print("more insn code lines:") + print("\n".join(map(str, more_code_lines))) + code_lines.extend(more_code_lines) + next_start_min_y = code_lines[-1].regular_min_y - 5 + case "header": + if len(code_lines) != 0 or len(desc_lines) != 0: + break + header = self.extract_insn_header_mnemonics_and_bit_fields( + column_min_x=column_min_x, + column_max_x=column_max_x, + start_min_y=next_char.min_y, + header_start_char=next_char, + ) + if header is None: + raise InsnParseError("can't find header text line") + headers.append(header) + next_start_min_y = header[2].box_min_y - 5 + case "desc": + desc_line = self.extract_text_line( + start_min_y=next_char.min_y, + min_x=next_char.min_x, + max_x=column_max_x, + fonts=TextLineFonts.INSN_DESC_FONTS, + preceding_blank_lines=0 if len(desc_lines) == 0 else 1, + ) + if desc_line is None: + raise InsnParseError("can't find insn desc text line") + match desc_line.get_header_text(): + case None: + more_desc_lines = self.extract_following_text_lines( + first_text_line=desc_line, + min_x=desc_line.chars[0].min_x, + max_x=column_max_x, + allowed_start_min_y_error=3, + ) + print("more insn desc lines:") + print("\n".join(map(str, more_desc_lines))) + desc_lines.extend(more_desc_lines) + next_start_min_y = desc_lines[-1].regular_min_y - 5 + case "Special Registers Altered:": + sp_regs_altered = self.extract_insn_sp_regs_altered( + sp_regs_altered_text=desc_line, + column_min_x=column_min_x, + column_max_x=column_max_x, + ) + next_start_min_y = sp_regs_altered.final_regular_min_y + break + case header_text: + raise AssertionError(f"unhandled header text: {header_text!r}\n{desc_line}") + case _: + assert_never(next_section) + print("insn code lines:") print("\n".join(map(str, code_lines))) + print("insn desc lines:") + print("\n".join(map(str, desc_lines))) + print("sp_regs_altered:") + print(sp_regs_altered) # TODO: finish - def extract_instrs(self): - unprocessed_header_chars = self.unprocessed_chars[Font.INSTR_HEADER] + def extract_insns(self): + unprocessed_header_chars = self.unprocessed_chars[Font.INSN_HEADER] while len(unprocessed_header_chars) != 0: - self.extract_instr(next(iter(unprocessed_header_chars))) + self.extract_insn(next(iter(unprocessed_header_chars))) def main(): - Parser().parse_pdf(sys.argv[1], page_numbers=range(1495)) \ No newline at end of file + if 2 < len(sys.argv): + if ":" in sys.argv[2]: + page_numbers = range(*map(int, sys.argv[2].split(":"))) + else: + page_numbers = tuple(int(i) for i in sys.argv[2].split(",")) + else: + page_numbers = None + Parser().parse_pdf(sys.argv[1], page_numbers=page_numbers) \ No newline at end of file