From b9f47e5ae1496b22a0a33b6c42f3c1cc1906c964 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Fri, 25 Oct 2024 17:45:41 -0700 Subject: [PATCH] WIP... all fonts included --- parse_powerisa_pdf/parse_powerisa_pdf.py | 646 +++++++++++++++++++---- 1 file changed, 539 insertions(+), 107 deletions(-) diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index c42118e..bd96947 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -4,9 +4,10 @@ from dataclasses import dataclass, field import dataclasses from functools import cached_property import sys -from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload +from typing import ClassVar, Container, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload from xml.etree import ElementTree import enum +import traceback from pdfminer.high_level import extract_pages from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox @@ -22,22 +23,30 @@ class Font: @cached_property def space_width(self) -> float: - match self: - case Font.INSTR_HEADER: - return 3.12 - case _: - return self.size * 0.31 + return 3.985 * self.size / Font.INSTR_CODE[0].size @cached_property def line_height(self) -> float: - match self: - case Font.INSTR_HEADER: - return 10.961 + match self.font_name: + case _ if any(self.font_name == i.font_name for i in Font.INSTR_CODE): + return 9.464 * self.size / Font.INSTR_CODE[0].size + case Font.INSTR_DESC_BOLD.font_name | \ + Font.INSTR_DESC_ITALIC.font_name | \ + Font.INSTR_DESC_BOLD_ITALIC.font_name | \ + Font.NOTATION_PAGE_SUBSCRIPT.font_name | \ + Font.NOTATION_PAGE_SUBSCRIPT_SYM.font_name: + return 10.959 * self.size / Font.INSTR_DESC[0].size + case _ if self in Font.INSTR_DESC or self.font_name == Font.INSTR_DESC[0].font_name: + return 10.959 * self.size / Font.INSTR_DESC[0].size + case _ if self in Font.MATH_MISC: + return 10.959 * self.size / Font.INSTR_DESC[0].size + case _ if self in Font.NOTATION_PAGE_SYM: + return 10.959 * self.size / Font.INSTR_DESC[0].size case _: - return self.size * 1.1 + raise AssertionError(f"no line height: {self}") @classmethod - def __iter__(cls) -> Iterator[Font]: + def known_fonts(cls) -> Iterator[Font]: return iter(cls.__KNOWN_NAMES.keys()) @property @@ -47,25 +56,160 @@ class Font: @classmethod def _register_known_fonts(cls) -> None: cls.INSTR_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963) + cls.RTL_FN_HEADER = Font(font_name='APUYSQ+zcoN-Regular', size=9.963) cls.PAGE_HEADER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=9.963) cls.PAGE_FOOTER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.981) - cls.INSTR_DESC = Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966) + cls.INSTR_DESC = ( + Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966), + Font(font_name='WHMZPU+CMEX9', size=8.966), + ) + cls.INSTR_DESC_MISC = ( + Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.377), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.561), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.492), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.641), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.772), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.864), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.925), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.097), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.123), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.131), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.516), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.604), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.634), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.906), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.033), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.068), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.213), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.252), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.962), + Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.977), + ) + cls.INSTR_DESC_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=6.974) + cls.INSTR_DESC_CODE_MISC = ( + Font(font_name='APUYSQ+zcoN-Regular', size=3.587), + Font(font_name='APUYSQ+zcoN-Regular', size=4.483), + ) cls.INSTR_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966) cls.INSTR_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966) cls.INSTR_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966) + cls.INSTR_DESC_SMALL = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97) + cls.INSTR_DESC_SMALL_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=7.97) + cls.INSTR_DESC_SMALL_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=7.97) + cls.INSTR_DESC_SMALL_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=7.97) + cls.INSTR_DESC_BOLD_MISC = ( + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.21), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.399), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.763), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.946), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.949), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.999), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.065), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.086), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.183), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.686), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.744), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.825), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.842), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.857), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.979), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.032), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.112), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.161), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.206), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.353), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.378), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.434), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.595), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.619), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.647), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.68), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.693), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.736), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.781), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.802), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.995), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.201), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.258), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.363), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.442), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.473), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.485), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.512), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.543), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.613), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.744), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.774), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.809), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.849), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.911), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.92), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.962), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.981), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.146), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.213), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.221), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.243), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.55), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.62), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.699), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.725), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.751), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.856), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.029), + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.406), + ) cls.INSTR_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978) - cls.INSTR_FIELD_BIT_NUMS = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97) + cls.INSTR_DESC_BOLD_SUBSCRIPT = \ + Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.978) + cls.INSTR_DESC_BOLD_ITALIC_SUBSCRIPT = \ + Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=5.978) cls.INSTR_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966) - cls.INSTR_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=7.97) - cls.INSTR_CODE_SYM = Font(font_name='RRFUNA+CMSY8', size=7.97) - cls.INSTR_CODE_NE_EQ_SIGN = Font(font_name='HPXOZC+CMSS8', size=7.97) + cls.INSTR_CODE = ( + Font(font_name='APUYSQ+zcoN-Regular', size=7.97), + Font(font_name='RRFUNA+CMSY8', size=7.97), + Font(font_name='HPXOZC+CMSS8', size=7.97), + ) cls.INSTR_CODE_SUBSCRIPT = Font(font_name='APUYSQ+zcoN-Regular', size=5.978) + cls.TITLE_PAGE_BIG = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=24.787) + cls.TITLE_PAGE_VERSION = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=9.963) + cls.TITLE_PAGE_TM = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.974) + cls.TITLE_PAGE_REV = Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.974) + cls.TITLE_PAGE_BOOK = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=20.663) + cls.LEGAL_PAGE_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=9.963) + cls.CHANGE_SUMMARY_PAGE_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=11.955) + cls.CHAPTER_TITLE = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=17.215) + cls.NOTATION_PAGE_SYM = ( + Font(font_name='FZTIYT+CMMI9', size=8.966), + Font(font_name='ONUAYC+CMSSI9', size=8.966), + Font(font_name='TNGBFZ+CMSY9', size=8.966), + Font(font_name='ZJTMSG+CMSS9', size=8.966), + ) + cls.NOTATION_PAGE_SUBSCRIPT_SYM = Font(font_name='DBQTKF+CMSY6', size=5.978) + cls.NOTATION_PAGE_SUBSCRIPT = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=5.978) + cls.MATH_MISC = ( + Font(font_name='AAJMKT+CMMI6', size=5.978), + Font(font_name='CUTMFD+CMSSI8', size=5.978), + Font(font_name='CUTMFD+CMSSI8', size=7.97), + Font(font_name='FZTIYT+CMMI9', size=5.734), + Font(font_name='FZTIYT+CMMI9', size=7.168), + Font(font_name='HONFQS+CMMI8', size=7.97), + Font(font_name='HPXOZC+CMSS8', size=5.978), + Font(font_name='LLVRDD+CMSY10', size=11.955), + Font(font_name='ZJTMSG+CMSS9', size=7.168), + ) cls.__KNOWN_NAMES = {} for name, value in cls.__dict__.items(): - if name[0].isupper() and isinstance(value, cls): - assert value not in cls.__KNOWN_NAMES, f"duplicate known font: {value}" - cls.__KNOWN_NAMES[value] = name + if name[0].isupper(): + if isinstance(value, cls): + assert value not in cls.__KNOWN_NAMES, f"duplicate known font: {value}" + cls.__KNOWN_NAMES[value] = name + elif isinstance(value, tuple) and all(isinstance(i, cls) for i in value): + for i, font in enumerate(value): + assert isinstance(font, cls) + assert font not in cls.__KNOWN_NAMES, f"duplicate known font: {font}" + cls.__KNOWN_NAMES[font] = f"{name}[{i}]" old_repr = cls.__repr__ def __repr__(self: cls) -> str: @@ -77,6 +221,10 @@ class Font: del cls._register_known_fonts + for font in Font.known_fonts(): + font.space_width # initialize + font.line_height # initialize + Font._register_known_fonts() @dataclass(unsafe_hash=True, frozen=True) @@ -103,17 +251,25 @@ class Char: @dataclass() class Parser: - def parse_pdf(self, file: str, page_numbers: range | None = None): + def parse_pdf(self, file: str, page_numbers: Container[int] | None = None): for page in extract_pages(file, page_numbers=page_numbers): - PageParser(parser=self, page_id=page.pageid).parse_page(page) + try: + PageParser(parser=self, page_id=page.pageid).parse_page(page) + except Exception as e: + e.add_note(f"pageid={page.pageid}") + raise COLUMN_SPLIT_X = 300.0 -INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 14.694 +INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT = 29.938 +INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT = 9.278 +INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT = 20.971 +INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 20.175 +INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2 = 14.694 INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317 -INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT = 24.657 -INSTR_BIT_FIELDS_PADDED_HEIGHT = (INSTR_BIT_FIELDS_TOP_PAD_HEIGHT - + INSTR_BIT_FIELDS_BOX_HEIGHT + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT) +INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT = 24.657 + 1.586 +INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT = 32.927 + 2.1519216 +INSTR_MNEMONIC_TOP_PAD_HEIGHT = 15.75 @dataclass() class ParsedTextLine: @@ -143,49 +299,69 @@ class ParsedTextLine: _T = TypeVar("_T") +class BaselinePos(enum.Enum): + ABOVE = "above" + BASELINE = "baseline" + BELOW = "below" + @dataclass(unsafe_hash=True, frozen=True) class TextLineFonts: - regular: Font - italic: Font | None = None - bold: Font | None = None - bold_italic: Font | None = None + regular: tuple[Font, ...] + italic: tuple[Font, ...] | None = None + bold: tuple[Font, ...] | None = None + bold_italic: tuple[Font, ...] | None = None + subscript: tuple[Font, ...] | None = None - def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | Font: + def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | tuple[tuple[Font, ...], BaselinePos]: match part_kind: case TextLineFontKind.REGULAR: - retval = self.regular + font = self.regular + baseline_pos = BaselinePos.BASELINE case TextLineFontKind.ITALIC: - retval = self.italic + font = self.italic + baseline_pos = BaselinePos.BASELINE case TextLineFontKind.BOLD: - retval = self.bold + font = self.bold + baseline_pos = BaselinePos.BASELINE case TextLineFontKind.BOLD_ITALIC: - retval = self.bold_italic + font = self.bold_italic + baseline_pos = BaselinePos.BASELINE + case TextLineFontKind.SUBSCRIPT: + font = self.subscript + baseline_pos = BaselinePos.BELOW + case TextLineFontKind.SUPERSCRIPT: + font = self.subscript + baseline_pos = BaselinePos.ABOVE case _: assert_never(part_kind) - if retval is None: + if font is None: return default - return retval + return font, baseline_pos @cached_property - def __font_to_kind_map(self) -> dict[Font, TextLineFontKind]: - retval = {} + def __font_to_kind_map(self) -> dict[tuple[Font, BaselinePos], TextLineFontKind]: + retval: dict[tuple[Font, BaselinePos], TextLineFontKind] = {} for kind in TextLineFontKind: - font = self.get_font(kind) - if font is None: + fonts = self.get_font(kind) + if fonts is None: continue - assert font not in retval, \ - f"duplicate font: kind={kind} old_kind={retval[font]} font={font}" - retval[font] = kind + fonts, baseline_pos = fonts + for font in fonts: + assert font not in retval, \ + f"duplicate font: kind={kind} old_kind={retval[font]} font={font}" + retval[font, baseline_pos] = kind return retval - def get_kind(self, font: Font, default: _T=None) -> _T | TextLineFontKind: - return self.__font_to_kind_map.get(font, default) + def get_kind(self, font: Font, baseline_pos: BaselinePos, default: _T=None) -> _T | TextLineFontKind: + return self.__font_to_kind_map.get((font, baseline_pos), default) class TextLineFontKind(enum.Enum): REGULAR = "regular" ITALIC = "italic" BOLD = "bold" BOLD_ITALIC = "bold_italic" + SUBSCRIPT = "subscript" + SUPERSCRIPT = "superscript" @cached_property def text_line_tags(self) -> tuple[str, ...]: @@ -198,12 +374,19 @@ class TextLineFontKind(enum.Enum): return "b", case TextLineFontKind.BOLD_ITALIC: return "b", "i" + case TextLineFontKind.SUBSCRIPT: + return "sub", + case TextLineFontKind.SUPERSCRIPT: + return "sup", case _: assert_never(self) class PageParseFailed(Exception): pass +class InstrParseFailed(Exception): + pass + class ElementBodyBuilder: def __init__(self, containing_element: ElementTree.Element): self.__containing_element = containing_element @@ -266,8 +449,29 @@ class InstrBitField: def __str__(self) -> str: return f"" +@dataclass(unsafe_hash=True, frozen=True) +class InstrBitFieldsPrefix: + box_min_x: float + box_min_y: float + box_max_x: float + box_max_y: float + prefix_text: ParsedTextLine + fields: tuple[InstrBitField, ...] + suffix_text: ParsedTextLine + + def __str__(self): + sep = ",\n " + return (f"") + @dataclass(unsafe_hash=True, frozen=True) class InstrBitFields: + prefix: None | InstrBitFieldsPrefix box_min_x: float box_min_y: float box_max_x: float @@ -276,9 +480,20 @@ class InstrBitFields: def __str__(self): sep = ",\n " - return (f"") + f" {sep.join(map(str, self.fields))},\n]>") + +CHAR_TO_EXPANDED = { + "\ufb00": "ff", + "\ufb01": "fi", + "\ufb02": "fl", + "\ufb03": "ffi", + "\ufb04": "ffl", +} @dataclass() class PageParser: @@ -315,16 +530,24 @@ class PageParser: self.unprocessed_chars[char.font].add(char) for i in self.unprocessed_chars.values(): i.sort(key=Char.top_down_left_to_right_sort_key) + unknown_fonts=[] + unknown_font_errors=[] for font, chars in self.unprocessed_chars.items(): - print() - print(font) - text = "" - char = None - for char in chars: - text += char.text - print(repr(text)) - assert font.known_name is not None, f"unknown font {font}\nlast char: {char}" - self.extract_instrs() + if font.known_name is None: + text = "" + char = None + for char in chars: + text += char.text + unknown_fonts.append(repr(font) + ",") + unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}") + unknown_fonts.sort() + if len(unknown_fonts) != 0: + raise AssertionError("\nunknown fonts:\n" + "\n".join(unknown_fonts) + + "\n\n" + "\n".join(unknown_font_errors)) + try: + self.extract_instrs() + except InstrParseFailed: + traceback.print_exc() def extract_text_line( self, *, @@ -334,60 +557,85 @@ class PageParser: max_x: float, fonts: TextLineFonts, skip_initial_spaces=False, + allowed_start_min_y_error=None, ) -> None | ParsedTextLine: chars: list[Char] = [] + chars_set: SetById[Char] = SetById() if start_char is not None: chars.append(start_char) - self.unprocessed_chars[start_char.font].remove(start_char) + chars_set.add(start_char) for x, y, char in self.qt.range( - min_x=min_x, + min_x=min_x - fonts.regular[0].size * 0.5, max_x=max_x, - min_y=start_min_y - fonts.regular.size * 0.5, - max_y=start_min_y + fonts.regular.size * 0.5, + min_y=start_min_y - fonts.regular[0].size * 0.4, + max_y=start_min_y + fonts.regular[0].size * 0.6, ): if not isinstance(char, Char): continue - if char not in self.unprocessed_chars[char.font]: + if char not in self.unprocessed_chars[char.font] or char in chars_set: continue - self.unprocessed_chars[char.font].remove(char) + chars_set.add(char) chars.append(char) if len(chars) == 0: return None - chars.sort(key=Char.top_down_left_to_right_sort_key) + chars.sort(key=lambda char: (char.min_x, char.text)) retval = ParsedTextLine( element=ElementTree.Element("text-line"), regular_min_y=chars[0].min_y, fonts=fonts, chars=chars, ) - with ElementBodyBuilder(retval.element) as body_builder: - last_max_x = min_x - last_kind = None - for char in chars: - kind = fonts.get_kind(char.font) - if kind is None: - print(f"font kind is None:\nfonts={fonts}\nchar={char}") - return None - if last_kind is None: - space_kind = kind - elif last_kind != kind: - space_kind = TextLineFontKind.REGULAR - else: - space_kind = kind - space_font = fonts.get_font(space_kind, fonts.regular) - space_width = char.min_x - last_max_x - space_count_f = space_width / space_font.space_width - space_count = round(space_count_f) - if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15: - print(f"spaces: space_count_f={space_count_f} space_width={space_width}") - if space_count > 0 and not skip_initial_spaces: - body_builder.set_tag_stack(space_kind.text_line_tags) - body_builder.write_text(" " * space_count) - skip_initial_spaces = False - body_builder.set_tag_stack(kind.text_line_tags) - body_builder.write_text(char.text) + text_and_tag_stacks: list[tuple[str, tuple[str, ...]]] = [] + last_max_x = min_x + last_kind = None + last_char = None + for char in chars: + if char.min_y - retval.regular_min_y < -0.2: + baseline_pos = BaselinePos.BELOW + elif char.min_y - retval.regular_min_y > 1: + baseline_pos = BaselinePos.ABOVE + else: + baseline_pos = BaselinePos.BASELINE + kind = fonts.get_kind(font=char.font, baseline_pos=baseline_pos) + if kind is None: + print(f"font kind is None:\nfonts={fonts}\nchar={char}\nbaseline_pos={baseline_pos}") + return None + if last_kind is None: + space_kind = kind + elif last_kind != kind: + space_kind = TextLineFontKind.REGULAR + else: + space_kind = kind + space_font, _ = fonts.get_font(space_kind, (fonts.regular, BaselinePos.BASELINE)) + space_width = char.min_x - last_max_x + space_count_f = space_width / space_font[0].space_width + space_count = round(space_count_f) + if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15: + print(f"spaces: space_count_f={space_count_f} space_width={space_width}") + if space_count > 0 and not skip_initial_spaces: + text_and_tag_stacks.append((" " * space_count, space_kind.text_line_tags)) + skip_initial_spaces = False + if (char.text == "\u0338" and last_char is not None and last_char.text == "=" and abs(char.min_x - last_char.min_x) < 0.01 and abs(char.min_y - last_char.min_y) < 0.01): + text_and_tag_stacks[-1] = "\u2260", () + last_max_x = last_char.max_x + else: + char_text = CHAR_TO_EXPANDED.get(char.text, char.text) + text_and_tag_stacks.append((char_text, kind.text_line_tags)) last_max_x = char.max_x - last_kind = kind + last_kind = kind + last_char = char + with ElementBodyBuilder(retval.element) as body_builder: + for text, tag_stack in text_and_tag_stacks: + body_builder.set_tag_stack(tag_stack) + body_builder.write_text(text) + for char in chars: + self.unprocessed_chars[char.font].remove(char) + if allowed_start_min_y_error is None: + allowed_start_min_y_error = 0.01 + assert abs(start_min_y - chars[0].min_y) < allowed_start_min_y_error, ( + f"start_min_y={start_min_y} regular_min_y={chars[0].min_y}\n" + f"start_min_y error: {start_min_y - chars[0].min_y}\n" + f"allowed_start_min_y_error={allowed_start_min_y_error}") return retval def extract_following_text_lines( @@ -395,16 +643,18 @@ class PageParser: first_text_line: ParsedTextLine, min_x: float, max_x: float, + allowed_start_min_y_error=None, ) -> list[ParsedTextLine]: retval: list[ParsedTextLine] = [] line = first_text_line while line is not None: retval.append(line) line = self.extract_text_line( - start_min_y=line.regular_min_y - first_text_line.fonts.regular.line_height, + start_min_y=line.regular_min_y - first_text_line.fonts.regular[0].line_height, min_x=min_x, max_x=max_x, fonts=first_text_line.fonts, + allowed_start_min_y_error=allowed_start_min_y_error, ) return retval @@ -412,18 +662,112 @@ class PageParser: self, min_x: float, max_x: float, - last_mnemonic_line_min_y: float, + mnemonic_lines: list[ParsedTextLine], + ) -> None | InstrBitFields: + found_non_affix_line = False + if len(mnemonic_lines) > 1: + expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y + - INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2) + else: + expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y + - INSTR_BIT_FIELDS_TOP_PAD_HEIGHT) + for x, y, line in self.qt.range( + min_x=min_x - 5, + max_x=max_x + 5, + min_y=expected_non_affix_line_y - 5, + max_y=expected_non_affix_line_y + 5, + ): + if not isinstance(line, LTLine): + continue + if line.width > line.height: + found_non_affix_line = True + break + if found_non_affix_line: + return self.extract_instr_bit_fields_box( + min_x=min_x, + max_x=max_x, + expected_box_max_y=expected_non_affix_line_y, + ) + prefix_text = self.extract_text_line( + start_min_y=mnemonic_lines[-1].regular_min_y + - INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT, + min_x=min_x, + max_x=max_x, + fonts=TextLineFonts( + regular=(Font.INSTR_DESC_SMALL,), + bold=(Font.INSTR_DESC_SMALL_BOLD,), + ), + allowed_start_min_y_error=2, + ) + if prefix_text is None: + raise InstrParseFailed("can't find instr prefix bit fields title") + prefix_text_str = "".join(prefix_text.element.itertext()) + if prefix_text_str != "Prefix:": + raise InstrParseFailed( + f"instr prefix bit fields title is not as expected: {prefix_text_str!r}") + prefix_bit_fields = self.extract_instr_bit_fields_box( + min_x=min_x, + max_x=max_x, + expected_box_max_y=prefix_text.regular_min_y + - INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, + ) + if prefix_bit_fields is None: + raise InstrParseFailed("can't find instr prefix bit fields") + suffix_text = self.extract_text_line( + start_min_y=prefix_bit_fields.box_min_y + - INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT, + min_x=min_x, + max_x=max_x, + fonts=TextLineFonts( + regular=(Font.INSTR_DESC_SMALL,), + bold=(Font.INSTR_DESC_SMALL_BOLD,), + ), + allowed_start_min_y_error=2, + ) + if suffix_text is None: + raise InstrParseFailed("can't find instr suffix bit fields title") + suffix_text_str = "".join(suffix_text.element.itertext()) + if suffix_text_str != "Suffix:": + raise InstrParseFailed( + f"instr suffix bit fields title is not as expected: {suffix_text_str!r}") + suffix_bit_fields = self.extract_instr_bit_fields_box( + min_x=min_x, + max_x=max_x, + expected_box_max_y=suffix_text.regular_min_y + - INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, + ) + if suffix_bit_fields is None: + raise InstrParseFailed("can't find instr suffix bit fields") + return InstrBitFields( + prefix=InstrBitFieldsPrefix( + box_min_x=prefix_bit_fields.box_min_x, + box_min_y=prefix_bit_fields.box_min_y, + box_max_x=prefix_bit_fields.box_max_x, + box_max_y=prefix_bit_fields.box_max_y, + prefix_text=prefix_text, + fields=prefix_bit_fields.fields, + suffix_text=suffix_text, + ), + box_min_x=suffix_bit_fields.box_min_x, + box_min_y=suffix_bit_fields.box_min_y, + box_max_x=suffix_bit_fields.box_max_x, + box_max_y=suffix_bit_fields.box_max_y, + fields=suffix_bit_fields.fields, + ) + + def extract_instr_bit_fields_box( + self, + min_x: float, + max_x: float, + expected_box_max_y: float, ) -> None | InstrBitFields: h_lines: list[LTLine] = [] v_lines: list[LTLine] = [] for x, y, line in self.qt.range( min_x=min_x - 5, max_x=max_x + 5, - min_y=last_mnemonic_line_min_y - - INSTR_BIT_FIELDS_PADDED_HEIGHT - + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT / 2, - max_y=last_mnemonic_line_min_y - - INSTR_BIT_FIELDS_TOP_PAD_HEIGHT / 2, + min_y=expected_box_max_y - INSTR_BIT_FIELDS_BOX_HEIGHT - 5, + max_y=expected_box_max_y + 5, ): if not isinstance(line, LTLine): continue @@ -439,10 +783,10 @@ class PageParser: if len(h_lines) == 0 and len(v_lines) == 0: return None if len(h_lines) != 2: - raise PageParseFailed( + raise InstrParseFailed( f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}") if len(v_lines) < 2: - raise PageParseFailed( + raise InstrParseFailed( f"instruction bit fields box has too few vertical lines:\n{h_lines}") bottom_line, top_line = h_lines box_min_x = v_lines[0].x0 @@ -463,7 +807,7 @@ class PageParser: box_min_x=field_box_min_x, box_max_x=field_box_max_x, name=self.extract_text_line( - start_min_y=box_mid_y + 3, + start_min_y=box_mid_y + 3.288, min_x=field_box_min_x, max_x=field_box_max_x, fonts=TextLineFonts( @@ -472,16 +816,17 @@ class PageParser: skip_initial_spaces=True, ), start_bit=self.extract_text_line( - start_min_y=box_min_y + 3, + start_min_y=box_min_y + 3.487, min_x=field_box_min_x, max_x=field_box_max_x, fonts=TextLineFonts( - regular=Font.INSTR_FIELD_BIT_NUMS, + regular=(Font.INSTR_DESC_SMALL,), ), skip_initial_spaces=True, ), )) return InstrBitFields( + prefix=None, box_min_x=box_min_x, box_min_y=box_min_y, box_max_x=box_max_x, @@ -501,7 +846,7 @@ class PageParser: start_min_y=header_start_char.min_y, min_x=column_min_x, max_x=column_max_x, - fonts=TextLineFonts(regular=Font.INSTR_HEADER), + fonts=TextLineFonts(regular=(Font.INSTR_HEADER,)), ) if header_line is None: raise PageParseFailed("can't find header text line") @@ -509,20 +854,22 @@ class PageParser: first_text_line=header_line, min_x=column_min_x, max_x=column_max_x, + allowed_start_min_y_error=1.5, ) print("instr header lines:") print("\n".join(map(str, header_lines))) mnemonic_line = self.extract_text_line( - start_min_y=header_lines[-1].regular_min_y - 18.788, + start_min_y=header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT, min_x=column_min_x, max_x=column_max_x, fonts=TextLineFonts( regular=Font.INSTR_DESC, ), skip_initial_spaces=True, + allowed_start_min_y_error=3, ) if mnemonic_line is None: - raise PageParseFailed("can't find instr mnemonic text line") + raise InstrParseFailed("can't find instr mnemonic text line") mnemonic_lines = self.extract_following_text_lines( first_text_line=mnemonic_line, min_x=mnemonic_line.chars[0].min_x, @@ -533,9 +880,94 @@ class PageParser: instr_bit_fields = self.extract_instr_bit_fields( min_x=column_min_x, max_x=column_max_x, - last_mnemonic_line_min_y=mnemonic_lines[-1].regular_min_y, + mnemonic_lines=mnemonic_lines, ) print(instr_bit_fields) + if instr_bit_fields is None: + raise InstrParseFailed("can't find instr bit fields") + alt_header_line = self.extract_text_line( + start_min_y=instr_bit_fields.box_min_y + - INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT, + min_x=column_min_x, + max_x=column_max_x, + fonts=TextLineFonts( + regular=(Font.INSTR_HEADER,), + ), + skip_initial_spaces=True, + allowed_start_min_y_error=6, + ) + if alt_header_line is not None: + print(f"found alt header line:\n{alt_header_line}") + alt_header_lines = self.extract_following_text_lines( + first_text_line=alt_header_line, + min_x=column_min_x, + max_x=column_max_x, + allowed_start_min_y_error=1.5, + ) + print("instr alt header lines:") + print("\n".join(map(str, alt_header_lines))) + alt_mnemonic_line = self.extract_text_line( + start_min_y=alt_header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT, + min_x=column_min_x, + max_x=column_max_x, + fonts=TextLineFonts( + regular=Font.INSTR_DESC, + ), + skip_initial_spaces=True, + allowed_start_min_y_error=1.5, + ) + if alt_mnemonic_line is None: + raise InstrParseFailed("can't find instr alt mnemonic text line") + alt_mnemonic_lines = self.extract_following_text_lines( + first_text_line=alt_mnemonic_line, + min_x=alt_mnemonic_line.chars[0].min_x, + max_x=column_max_x, + ) + print("instr alt mnemonic lines:") + print("\n".join(map(str, alt_mnemonic_lines))) + alt_instr_bit_fields = self.extract_instr_bit_fields( + min_x=column_min_x, + max_x=column_max_x, + mnemonic_lines=alt_mnemonic_lines, + ) + print(alt_instr_bit_fields) + if alt_instr_bit_fields is None: + raise InstrParseFailed("can't find instr alt bit fields") + last_instr_bit_fields = alt_instr_bit_fields + else: + print("no alt header line") + alt_header_lines = None + alt_mnemonic_lines = None + alt_instr_bit_fields = None + last_instr_bit_fields = instr_bit_fields + + code_line = None + for y_offset in reversed(range(4)): + code_line = self.extract_text_line( + start_min_y=last_instr_bit_fields.box_min_y + - INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT + + y_offset * 0.5 * Font.INSTR_CODE[0].line_height, + min_x=column_min_x, + max_x=column_max_x, + fonts=TextLineFonts( + regular=Font.INSTR_CODE, + subscript=(Font.INSTR_CODE_SUBSCRIPT,), + ), + skip_initial_spaces=True, + allowed_start_min_y_error=1, + ) + if code_line is not None: + break + if code_line is None: + raise InstrParseFailed("can't find instr code text line") + code_lines = self.extract_following_text_lines( + first_text_line=code_line, + min_x=code_line.chars[0].min_x, + max_x=column_max_x, + allowed_start_min_y_error=0.05, + ) + print("instr code lines:") + print("\n".join(map(str, code_lines))) # TODO: finish def extract_instrs(self): @@ -544,4 +976,4 @@ class PageParser: self.extract_instr(next(iter(unprocessed_header_chars))) def main(): - Parser().parse_pdf(sys.argv[1], page_numbers=range(76, 78)) \ No newline at end of file + Parser().parse_pdf(sys.argv[1], page_numbers=range(1495)) \ No newline at end of file