WIP... all fonts included

This commit is contained in:
Jacob Lifshay 2024-10-25 17:45:41 -07:00
parent 08141ce560
commit b9f47e5ae1
Signed by: programmerjake
SSH key fingerprint: SHA256:B1iRVvUJkvd7upMIiMqn6OyxvD2SgJkAH3ZnUOj6z+c

View file

@ -4,9 +4,10 @@ from dataclasses import dataclass, field
import dataclasses import dataclasses
from functools import cached_property from functools import cached_property
import sys import sys
from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload from typing import ClassVar, Container, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
from xml.etree import ElementTree from xml.etree import ElementTree
import enum import enum
import traceback
from pdfminer.high_level import extract_pages from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
@ -22,22 +23,30 @@ class Font:
@cached_property @cached_property
def space_width(self) -> float: def space_width(self) -> float:
match self: return 3.985 * self.size / Font.INSTR_CODE[0].size
case Font.INSTR_HEADER:
return 3.12
case _:
return self.size * 0.31
@cached_property @cached_property
def line_height(self) -> float: def line_height(self) -> float:
match self: match self.font_name:
case Font.INSTR_HEADER: case _ if any(self.font_name == i.font_name for i in Font.INSTR_CODE):
return 10.961 return 9.464 * self.size / Font.INSTR_CODE[0].size
case Font.INSTR_DESC_BOLD.font_name | \
Font.INSTR_DESC_ITALIC.font_name | \
Font.INSTR_DESC_BOLD_ITALIC.font_name | \
Font.NOTATION_PAGE_SUBSCRIPT.font_name | \
Font.NOTATION_PAGE_SUBSCRIPT_SYM.font_name:
return 10.959 * self.size / Font.INSTR_DESC[0].size
case _ if self in Font.INSTR_DESC or self.font_name == Font.INSTR_DESC[0].font_name:
return 10.959 * self.size / Font.INSTR_DESC[0].size
case _ if self in Font.MATH_MISC:
return 10.959 * self.size / Font.INSTR_DESC[0].size
case _ if self in Font.NOTATION_PAGE_SYM:
return 10.959 * self.size / Font.INSTR_DESC[0].size
case _: case _:
return self.size * 1.1 raise AssertionError(f"no line height: {self}")
@classmethod @classmethod
def __iter__(cls) -> Iterator[Font]: def known_fonts(cls) -> Iterator[Font]:
return iter(cls.__KNOWN_NAMES.keys()) return iter(cls.__KNOWN_NAMES.keys())
@property @property
@ -47,25 +56,160 @@ class Font:
@classmethod @classmethod
def _register_known_fonts(cls) -> None: def _register_known_fonts(cls) -> None:
cls.INSTR_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963) cls.INSTR_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963)
cls.RTL_FN_HEADER = Font(font_name='APUYSQ+zcoN-Regular', size=9.963)
cls.PAGE_HEADER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=9.963) cls.PAGE_HEADER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=9.963)
cls.PAGE_FOOTER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.981) cls.PAGE_FOOTER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.981)
cls.INSTR_DESC = Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966) cls.INSTR_DESC = (
Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966),
Font(font_name='WHMZPU+CMEX9', size=8.966),
)
cls.INSTR_DESC_MISC = (
Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.377),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.561),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.492),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.641),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.772),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.864),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.925),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.097),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.123),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.131),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.516),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.604),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.634),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.906),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.033),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.068),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.213),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.252),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.962),
Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.977),
)
cls.INSTR_DESC_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=6.974)
cls.INSTR_DESC_CODE_MISC = (
Font(font_name='APUYSQ+zcoN-Regular', size=3.587),
Font(font_name='APUYSQ+zcoN-Regular', size=4.483),
)
cls.INSTR_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966) cls.INSTR_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966)
cls.INSTR_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966) cls.INSTR_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966)
cls.INSTR_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966) cls.INSTR_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966)
cls.INSTR_DESC_SMALL = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97)
cls.INSTR_DESC_SMALL_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=7.97)
cls.INSTR_DESC_SMALL_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=7.97)
cls.INSTR_DESC_SMALL_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=7.97)
cls.INSTR_DESC_BOLD_MISC = (
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.21),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.399),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.763),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.946),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.949),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.999),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.065),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.086),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.183),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.686),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.744),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.825),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.842),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.857),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.979),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.032),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.112),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.161),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.206),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.353),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.378),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.434),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.595),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.619),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.647),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.68),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.693),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.736),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.781),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.802),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.995),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.201),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.258),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.363),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.442),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.473),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.485),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.512),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.543),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.613),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.744),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.774),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.809),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.849),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.911),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.92),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.962),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.981),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.146),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.213),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.221),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.243),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.55),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.62),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.699),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.725),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.751),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.856),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.029),
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.406),
)
cls.INSTR_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978) cls.INSTR_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978)
cls.INSTR_FIELD_BIT_NUMS = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97) cls.INSTR_DESC_BOLD_SUBSCRIPT = \
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.978)
cls.INSTR_DESC_BOLD_ITALIC_SUBSCRIPT = \
Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=5.978)
cls.INSTR_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966) cls.INSTR_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966)
cls.INSTR_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=7.97) cls.INSTR_CODE = (
cls.INSTR_CODE_SYM = Font(font_name='RRFUNA+CMSY8', size=7.97) Font(font_name='APUYSQ+zcoN-Regular', size=7.97),
cls.INSTR_CODE_NE_EQ_SIGN = Font(font_name='HPXOZC+CMSS8', size=7.97) Font(font_name='RRFUNA+CMSY8', size=7.97),
Font(font_name='HPXOZC+CMSS8', size=7.97),
)
cls.INSTR_CODE_SUBSCRIPT = Font(font_name='APUYSQ+zcoN-Regular', size=5.978) cls.INSTR_CODE_SUBSCRIPT = Font(font_name='APUYSQ+zcoN-Regular', size=5.978)
cls.TITLE_PAGE_BIG = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=24.787)
cls.TITLE_PAGE_VERSION = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=9.963)
cls.TITLE_PAGE_TM = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.974)
cls.TITLE_PAGE_REV = Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.974)
cls.TITLE_PAGE_BOOK = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=20.663)
cls.LEGAL_PAGE_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=9.963)
cls.CHANGE_SUMMARY_PAGE_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=11.955)
cls.CHAPTER_TITLE = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=17.215)
cls.NOTATION_PAGE_SYM = (
Font(font_name='FZTIYT+CMMI9', size=8.966),
Font(font_name='ONUAYC+CMSSI9', size=8.966),
Font(font_name='TNGBFZ+CMSY9', size=8.966),
Font(font_name='ZJTMSG+CMSS9', size=8.966),
)
cls.NOTATION_PAGE_SUBSCRIPT_SYM = Font(font_name='DBQTKF+CMSY6', size=5.978)
cls.NOTATION_PAGE_SUBSCRIPT = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=5.978)
cls.MATH_MISC = (
Font(font_name='AAJMKT+CMMI6', size=5.978),
Font(font_name='CUTMFD+CMSSI8', size=5.978),
Font(font_name='CUTMFD+CMSSI8', size=7.97),
Font(font_name='FZTIYT+CMMI9', size=5.734),
Font(font_name='FZTIYT+CMMI9', size=7.168),
Font(font_name='HONFQS+CMMI8', size=7.97),
Font(font_name='HPXOZC+CMSS8', size=5.978),
Font(font_name='LLVRDD+CMSY10', size=11.955),
Font(font_name='ZJTMSG+CMSS9', size=7.168),
)
cls.__KNOWN_NAMES = {} cls.__KNOWN_NAMES = {}
for name, value in cls.__dict__.items(): for name, value in cls.__dict__.items():
if name[0].isupper() and isinstance(value, cls): if name[0].isupper():
assert value not in cls.__KNOWN_NAMES, f"duplicate known font: {value}" if isinstance(value, cls):
cls.__KNOWN_NAMES[value] = name assert value not in cls.__KNOWN_NAMES, f"duplicate known font: {value}"
cls.__KNOWN_NAMES[value] = name
elif isinstance(value, tuple) and all(isinstance(i, cls) for i in value):
for i, font in enumerate(value):
assert isinstance(font, cls)
assert font not in cls.__KNOWN_NAMES, f"duplicate known font: {font}"
cls.__KNOWN_NAMES[font] = f"{name}[{i}]"
old_repr = cls.__repr__ old_repr = cls.__repr__
def __repr__(self: cls) -> str: def __repr__(self: cls) -> str:
@ -77,6 +221,10 @@ class Font:
del cls._register_known_fonts del cls._register_known_fonts
for font in Font.known_fonts():
font.space_width # initialize
font.line_height # initialize
Font._register_known_fonts() Font._register_known_fonts()
@dataclass(unsafe_hash=True, frozen=True) @dataclass(unsafe_hash=True, frozen=True)
@ -103,17 +251,25 @@ class Char:
@dataclass() @dataclass()
class Parser: class Parser:
def parse_pdf(self, file: str, page_numbers: range | None = None): def parse_pdf(self, file: str, page_numbers: Container[int] | None = None):
for page in extract_pages(file, page_numbers=page_numbers): for page in extract_pages(file, page_numbers=page_numbers):
PageParser(parser=self, page_id=page.pageid).parse_page(page) try:
PageParser(parser=self, page_id=page.pageid).parse_page(page)
except Exception as e:
e.add_note(f"pageid={page.pageid}")
raise
COLUMN_SPLIT_X = 300.0 COLUMN_SPLIT_X = 300.0
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 14.694 INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT = 29.938
INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT = 9.278
INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT = 20.971
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 20.175
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2 = 14.694
INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317 INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317
INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT = 24.657 INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT = 24.657 + 1.586
INSTR_BIT_FIELDS_PADDED_HEIGHT = (INSTR_BIT_FIELDS_TOP_PAD_HEIGHT INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT = 32.927 + 2.1519216
+ INSTR_BIT_FIELDS_BOX_HEIGHT + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT) INSTR_MNEMONIC_TOP_PAD_HEIGHT = 15.75
@dataclass() @dataclass()
class ParsedTextLine: class ParsedTextLine:
@ -143,49 +299,69 @@ class ParsedTextLine:
_T = TypeVar("_T") _T = TypeVar("_T")
class BaselinePos(enum.Enum):
ABOVE = "above"
BASELINE = "baseline"
BELOW = "below"
@dataclass(unsafe_hash=True, frozen=True) @dataclass(unsafe_hash=True, frozen=True)
class TextLineFonts: class TextLineFonts:
regular: Font regular: tuple[Font, ...]
italic: Font | None = None italic: tuple[Font, ...] | None = None
bold: Font | None = None bold: tuple[Font, ...] | None = None
bold_italic: Font | None = None bold_italic: tuple[Font, ...] | None = None
subscript: tuple[Font, ...] | None = None
def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | Font: def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | tuple[tuple[Font, ...], BaselinePos]:
match part_kind: match part_kind:
case TextLineFontKind.REGULAR: case TextLineFontKind.REGULAR:
retval = self.regular font = self.regular
baseline_pos = BaselinePos.BASELINE
case TextLineFontKind.ITALIC: case TextLineFontKind.ITALIC:
retval = self.italic font = self.italic
baseline_pos = BaselinePos.BASELINE
case TextLineFontKind.BOLD: case TextLineFontKind.BOLD:
retval = self.bold font = self.bold
baseline_pos = BaselinePos.BASELINE
case TextLineFontKind.BOLD_ITALIC: case TextLineFontKind.BOLD_ITALIC:
retval = self.bold_italic font = self.bold_italic
baseline_pos = BaselinePos.BASELINE
case TextLineFontKind.SUBSCRIPT:
font = self.subscript
baseline_pos = BaselinePos.BELOW
case TextLineFontKind.SUPERSCRIPT:
font = self.subscript
baseline_pos = BaselinePos.ABOVE
case _: case _:
assert_never(part_kind) assert_never(part_kind)
if retval is None: if font is None:
return default return default
return retval return font, baseline_pos
@cached_property @cached_property
def __font_to_kind_map(self) -> dict[Font, TextLineFontKind]: def __font_to_kind_map(self) -> dict[tuple[Font, BaselinePos], TextLineFontKind]:
retval = {} retval: dict[tuple[Font, BaselinePos], TextLineFontKind] = {}
for kind in TextLineFontKind: for kind in TextLineFontKind:
font = self.get_font(kind) fonts = self.get_font(kind)
if font is None: if fonts is None:
continue continue
assert font not in retval, \ fonts, baseline_pos = fonts
f"duplicate font: kind={kind} old_kind={retval[font]} font={font}" for font in fonts:
retval[font] = kind assert font not in retval, \
f"duplicate font: kind={kind} old_kind={retval[font]} font={font}"
retval[font, baseline_pos] = kind
return retval return retval
def get_kind(self, font: Font, default: _T=None) -> _T | TextLineFontKind: def get_kind(self, font: Font, baseline_pos: BaselinePos, default: _T=None) -> _T | TextLineFontKind:
return self.__font_to_kind_map.get(font, default) return self.__font_to_kind_map.get((font, baseline_pos), default)
class TextLineFontKind(enum.Enum): class TextLineFontKind(enum.Enum):
REGULAR = "regular" REGULAR = "regular"
ITALIC = "italic" ITALIC = "italic"
BOLD = "bold" BOLD = "bold"
BOLD_ITALIC = "bold_italic" BOLD_ITALIC = "bold_italic"
SUBSCRIPT = "subscript"
SUPERSCRIPT = "superscript"
@cached_property @cached_property
def text_line_tags(self) -> tuple[str, ...]: def text_line_tags(self) -> tuple[str, ...]:
@ -198,12 +374,19 @@ class TextLineFontKind(enum.Enum):
return "b", return "b",
case TextLineFontKind.BOLD_ITALIC: case TextLineFontKind.BOLD_ITALIC:
return "b", "i" return "b", "i"
case TextLineFontKind.SUBSCRIPT:
return "sub",
case TextLineFontKind.SUPERSCRIPT:
return "sup",
case _: case _:
assert_never(self) assert_never(self)
class PageParseFailed(Exception): class PageParseFailed(Exception):
pass pass
class InstrParseFailed(Exception):
pass
class ElementBodyBuilder: class ElementBodyBuilder:
def __init__(self, containing_element: ElementTree.Element): def __init__(self, containing_element: ElementTree.Element):
self.__containing_element = containing_element self.__containing_element = containing_element
@ -266,8 +449,29 @@ class InstrBitField:
def __str__(self) -> str: def __str__(self) -> str:
return f"<InstrBitField: x={self.box_min_x}..{self.box_max_x} name={self.name} start_bit={self.start_bit}>" return f"<InstrBitField: x={self.box_min_x}..{self.box_max_x} name={self.name} start_bit={self.start_bit}>"
@dataclass(unsafe_hash=True, frozen=True)
class InstrBitFieldsPrefix:
box_min_x: float
box_min_y: float
box_max_x: float
box_max_y: float
prefix_text: ParsedTextLine
fields: tuple[InstrBitField, ...]
suffix_text: ParsedTextLine
def __str__(self):
sep = ",\n "
return (f"<InstrBitFieldsPrefix: ({self.box_min_x},{self.box_min_y}).."
f"({self.box_max_x},{self.box_max_y})\n"
f" prefix_text={self.prefix_text}\n"
f" [\n"
f" {sep.join(map(str, self.fields))},\n"
f" ]\n"
f" suffix_text={self.suffix_text}>")
@dataclass(unsafe_hash=True, frozen=True) @dataclass(unsafe_hash=True, frozen=True)
class InstrBitFields: class InstrBitFields:
prefix: None | InstrBitFieldsPrefix
box_min_x: float box_min_x: float
box_min_y: float box_min_y: float
box_max_x: float box_max_x: float
@ -276,9 +480,20 @@ class InstrBitFields:
def __str__(self): def __str__(self):
sep = ",\n " sep = ",\n "
return (f"<InstrBitFields: ({self.box_min_x},{self.box_min_y}).." prefix_str = ""
if self.prefix is not None:
prefix_str = f"{self.prefix}\n"
return (f"{prefix_str}<InstrBitFields: ({self.box_min_x},{self.box_min_y}).."
f"({self.box_max_x},{self.box_max_y}) [\n" f"({self.box_max_x},{self.box_max_y}) [\n"
f" {sep.join(map(str, self.fields))}]>") f" {sep.join(map(str, self.fields))},\n]>")
CHAR_TO_EXPANDED = {
"\ufb00": "ff",
"\ufb01": "fi",
"\ufb02": "fl",
"\ufb03": "ffi",
"\ufb04": "ffl",
}
@dataclass() @dataclass()
class PageParser: class PageParser:
@ -315,16 +530,24 @@ class PageParser:
self.unprocessed_chars[char.font].add(char) self.unprocessed_chars[char.font].add(char)
for i in self.unprocessed_chars.values(): for i in self.unprocessed_chars.values():
i.sort(key=Char.top_down_left_to_right_sort_key) i.sort(key=Char.top_down_left_to_right_sort_key)
unknown_fonts=[]
unknown_font_errors=[]
for font, chars in self.unprocessed_chars.items(): for font, chars in self.unprocessed_chars.items():
print() if font.known_name is None:
print(font) text = ""
text = "" char = None
char = None for char in chars:
for char in chars: text += char.text
text += char.text unknown_fonts.append(repr(font) + ",")
print(repr(text)) unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}")
assert font.known_name is not None, f"unknown font {font}\nlast char: {char}" unknown_fonts.sort()
self.extract_instrs() if len(unknown_fonts) != 0:
raise AssertionError("\nunknown fonts:\n" + "\n".join(unknown_fonts)
+ "\n\n" + "\n".join(unknown_font_errors))
try:
self.extract_instrs()
except InstrParseFailed:
traceback.print_exc()
def extract_text_line( def extract_text_line(
self, *, self, *,
@ -334,60 +557,85 @@ class PageParser:
max_x: float, max_x: float,
fonts: TextLineFonts, fonts: TextLineFonts,
skip_initial_spaces=False, skip_initial_spaces=False,
allowed_start_min_y_error=None,
) -> None | ParsedTextLine: ) -> None | ParsedTextLine:
chars: list[Char] = [] chars: list[Char] = []
chars_set: SetById[Char] = SetById()
if start_char is not None: if start_char is not None:
chars.append(start_char) chars.append(start_char)
self.unprocessed_chars[start_char.font].remove(start_char) chars_set.add(start_char)
for x, y, char in self.qt.range( for x, y, char in self.qt.range(
min_x=min_x, min_x=min_x - fonts.regular[0].size * 0.5,
max_x=max_x, max_x=max_x,
min_y=start_min_y - fonts.regular.size * 0.5, min_y=start_min_y - fonts.regular[0].size * 0.4,
max_y=start_min_y + fonts.regular.size * 0.5, max_y=start_min_y + fonts.regular[0].size * 0.6,
): ):
if not isinstance(char, Char): if not isinstance(char, Char):
continue continue
if char not in self.unprocessed_chars[char.font]: if char not in self.unprocessed_chars[char.font] or char in chars_set:
continue continue
self.unprocessed_chars[char.font].remove(char) chars_set.add(char)
chars.append(char) chars.append(char)
if len(chars) == 0: if len(chars) == 0:
return None return None
chars.sort(key=Char.top_down_left_to_right_sort_key) chars.sort(key=lambda char: (char.min_x, char.text))
retval = ParsedTextLine( retval = ParsedTextLine(
element=ElementTree.Element("text-line"), element=ElementTree.Element("text-line"),
regular_min_y=chars[0].min_y, regular_min_y=chars[0].min_y,
fonts=fonts, fonts=fonts,
chars=chars, chars=chars,
) )
with ElementBodyBuilder(retval.element) as body_builder: text_and_tag_stacks: list[tuple[str, tuple[str, ...]]] = []
last_max_x = min_x last_max_x = min_x
last_kind = None last_kind = None
for char in chars: last_char = None
kind = fonts.get_kind(char.font) for char in chars:
if kind is None: if char.min_y - retval.regular_min_y < -0.2:
print(f"font kind is None:\nfonts={fonts}\nchar={char}") baseline_pos = BaselinePos.BELOW
return None elif char.min_y - retval.regular_min_y > 1:
if last_kind is None: baseline_pos = BaselinePos.ABOVE
space_kind = kind else:
elif last_kind != kind: baseline_pos = BaselinePos.BASELINE
space_kind = TextLineFontKind.REGULAR kind = fonts.get_kind(font=char.font, baseline_pos=baseline_pos)
else: if kind is None:
space_kind = kind print(f"font kind is None:\nfonts={fonts}\nchar={char}\nbaseline_pos={baseline_pos}")
space_font = fonts.get_font(space_kind, fonts.regular) return None
space_width = char.min_x - last_max_x if last_kind is None:
space_count_f = space_width / space_font.space_width space_kind = kind
space_count = round(space_count_f) elif last_kind != kind:
if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15: space_kind = TextLineFontKind.REGULAR
print(f"spaces: space_count_f={space_count_f} space_width={space_width}") else:
if space_count > 0 and not skip_initial_spaces: space_kind = kind
body_builder.set_tag_stack(space_kind.text_line_tags) space_font, _ = fonts.get_font(space_kind, (fonts.regular, BaselinePos.BASELINE))
body_builder.write_text(" " * space_count) space_width = char.min_x - last_max_x
skip_initial_spaces = False space_count_f = space_width / space_font[0].space_width
body_builder.set_tag_stack(kind.text_line_tags) space_count = round(space_count_f)
body_builder.write_text(char.text) if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
if space_count > 0 and not skip_initial_spaces:
text_and_tag_stacks.append((" " * space_count, space_kind.text_line_tags))
skip_initial_spaces = False
if (char.text == "\u0338" and last_char is not None and last_char.text == "=" and abs(char.min_x - last_char.min_x) < 0.01 and abs(char.min_y - last_char.min_y) < 0.01):
text_and_tag_stacks[-1] = "\u2260", ()
last_max_x = last_char.max_x
else:
char_text = CHAR_TO_EXPANDED.get(char.text, char.text)
text_and_tag_stacks.append((char_text, kind.text_line_tags))
last_max_x = char.max_x last_max_x = char.max_x
last_kind = kind last_kind = kind
last_char = char
with ElementBodyBuilder(retval.element) as body_builder:
for text, tag_stack in text_and_tag_stacks:
body_builder.set_tag_stack(tag_stack)
body_builder.write_text(text)
for char in chars:
self.unprocessed_chars[char.font].remove(char)
if allowed_start_min_y_error is None:
allowed_start_min_y_error = 0.01
assert abs(start_min_y - chars[0].min_y) < allowed_start_min_y_error, (
f"start_min_y={start_min_y} regular_min_y={chars[0].min_y}\n"
f"start_min_y error: {start_min_y - chars[0].min_y}\n"
f"allowed_start_min_y_error={allowed_start_min_y_error}")
return retval return retval
def extract_following_text_lines( def extract_following_text_lines(
@ -395,16 +643,18 @@ class PageParser:
first_text_line: ParsedTextLine, first_text_line: ParsedTextLine,
min_x: float, min_x: float,
max_x: float, max_x: float,
allowed_start_min_y_error=None,
) -> list[ParsedTextLine]: ) -> list[ParsedTextLine]:
retval: list[ParsedTextLine] = [] retval: list[ParsedTextLine] = []
line = first_text_line line = first_text_line
while line is not None: while line is not None:
retval.append(line) retval.append(line)
line = self.extract_text_line( line = self.extract_text_line(
start_min_y=line.regular_min_y - first_text_line.fonts.regular.line_height, start_min_y=line.regular_min_y - first_text_line.fonts.regular[0].line_height,
min_x=min_x, min_x=min_x,
max_x=max_x, max_x=max_x,
fonts=first_text_line.fonts, fonts=first_text_line.fonts,
allowed_start_min_y_error=allowed_start_min_y_error,
) )
return retval return retval
@ -412,18 +662,112 @@ class PageParser:
self, self,
min_x: float, min_x: float,
max_x: float, max_x: float,
last_mnemonic_line_min_y: float, mnemonic_lines: list[ParsedTextLine],
) -> None | InstrBitFields:
found_non_affix_line = False
if len(mnemonic_lines) > 1:
expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y
- INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2)
else:
expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y
- INSTR_BIT_FIELDS_TOP_PAD_HEIGHT)
for x, y, line in self.qt.range(
min_x=min_x - 5,
max_x=max_x + 5,
min_y=expected_non_affix_line_y - 5,
max_y=expected_non_affix_line_y + 5,
):
if not isinstance(line, LTLine):
continue
if line.width > line.height:
found_non_affix_line = True
break
if found_non_affix_line:
return self.extract_instr_bit_fields_box(
min_x=min_x,
max_x=max_x,
expected_box_max_y=expected_non_affix_line_y,
)
prefix_text = self.extract_text_line(
start_min_y=mnemonic_lines[-1].regular_min_y
- INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT,
min_x=min_x,
max_x=max_x,
fonts=TextLineFonts(
regular=(Font.INSTR_DESC_SMALL,),
bold=(Font.INSTR_DESC_SMALL_BOLD,),
),
allowed_start_min_y_error=2,
)
if prefix_text is None:
raise InstrParseFailed("can't find instr prefix bit fields title")
prefix_text_str = "".join(prefix_text.element.itertext())
if prefix_text_str != "Prefix:":
raise InstrParseFailed(
f"instr prefix bit fields title is not as expected: {prefix_text_str!r}")
prefix_bit_fields = self.extract_instr_bit_fields_box(
min_x=min_x,
max_x=max_x,
expected_box_max_y=prefix_text.regular_min_y
- INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT,
)
if prefix_bit_fields is None:
raise InstrParseFailed("can't find instr prefix bit fields")
suffix_text = self.extract_text_line(
start_min_y=prefix_bit_fields.box_min_y
- INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT,
min_x=min_x,
max_x=max_x,
fonts=TextLineFonts(
regular=(Font.INSTR_DESC_SMALL,),
bold=(Font.INSTR_DESC_SMALL_BOLD,),
),
allowed_start_min_y_error=2,
)
if suffix_text is None:
raise InstrParseFailed("can't find instr suffix bit fields title")
suffix_text_str = "".join(suffix_text.element.itertext())
if suffix_text_str != "Suffix:":
raise InstrParseFailed(
f"instr suffix bit fields title is not as expected: {suffix_text_str!r}")
suffix_bit_fields = self.extract_instr_bit_fields_box(
min_x=min_x,
max_x=max_x,
expected_box_max_y=suffix_text.regular_min_y
- INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT,
)
if suffix_bit_fields is None:
raise InstrParseFailed("can't find instr suffix bit fields")
return InstrBitFields(
prefix=InstrBitFieldsPrefix(
box_min_x=prefix_bit_fields.box_min_x,
box_min_y=prefix_bit_fields.box_min_y,
box_max_x=prefix_bit_fields.box_max_x,
box_max_y=prefix_bit_fields.box_max_y,
prefix_text=prefix_text,
fields=prefix_bit_fields.fields,
suffix_text=suffix_text,
),
box_min_x=suffix_bit_fields.box_min_x,
box_min_y=suffix_bit_fields.box_min_y,
box_max_x=suffix_bit_fields.box_max_x,
box_max_y=suffix_bit_fields.box_max_y,
fields=suffix_bit_fields.fields,
)
def extract_instr_bit_fields_box(
self,
min_x: float,
max_x: float,
expected_box_max_y: float,
) -> None | InstrBitFields: ) -> None | InstrBitFields:
h_lines: list[LTLine] = [] h_lines: list[LTLine] = []
v_lines: list[LTLine] = [] v_lines: list[LTLine] = []
for x, y, line in self.qt.range( for x, y, line in self.qt.range(
min_x=min_x - 5, min_x=min_x - 5,
max_x=max_x + 5, max_x=max_x + 5,
min_y=last_mnemonic_line_min_y min_y=expected_box_max_y - INSTR_BIT_FIELDS_BOX_HEIGHT - 5,
- INSTR_BIT_FIELDS_PADDED_HEIGHT max_y=expected_box_max_y + 5,
+ INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT / 2,
max_y=last_mnemonic_line_min_y
- INSTR_BIT_FIELDS_TOP_PAD_HEIGHT / 2,
): ):
if not isinstance(line, LTLine): if not isinstance(line, LTLine):
continue continue
@ -439,10 +783,10 @@ class PageParser:
if len(h_lines) == 0 and len(v_lines) == 0: if len(h_lines) == 0 and len(v_lines) == 0:
return None return None
if len(h_lines) != 2: if len(h_lines) != 2:
raise PageParseFailed( raise InstrParseFailed(
f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}") f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}")
if len(v_lines) < 2: if len(v_lines) < 2:
raise PageParseFailed( raise InstrParseFailed(
f"instruction bit fields box has too few vertical lines:\n{h_lines}") f"instruction bit fields box has too few vertical lines:\n{h_lines}")
bottom_line, top_line = h_lines bottom_line, top_line = h_lines
box_min_x = v_lines[0].x0 box_min_x = v_lines[0].x0
@ -463,7 +807,7 @@ class PageParser:
box_min_x=field_box_min_x, box_min_x=field_box_min_x,
box_max_x=field_box_max_x, box_max_x=field_box_max_x,
name=self.extract_text_line( name=self.extract_text_line(
start_min_y=box_mid_y + 3, start_min_y=box_mid_y + 3.288,
min_x=field_box_min_x, min_x=field_box_min_x,
max_x=field_box_max_x, max_x=field_box_max_x,
fonts=TextLineFonts( fonts=TextLineFonts(
@ -472,16 +816,17 @@ class PageParser:
skip_initial_spaces=True, skip_initial_spaces=True,
), ),
start_bit=self.extract_text_line( start_bit=self.extract_text_line(
start_min_y=box_min_y + 3, start_min_y=box_min_y + 3.487,
min_x=field_box_min_x, min_x=field_box_min_x,
max_x=field_box_max_x, max_x=field_box_max_x,
fonts=TextLineFonts( fonts=TextLineFonts(
regular=Font.INSTR_FIELD_BIT_NUMS, regular=(Font.INSTR_DESC_SMALL,),
), ),
skip_initial_spaces=True, skip_initial_spaces=True,
), ),
)) ))
return InstrBitFields( return InstrBitFields(
prefix=None,
box_min_x=box_min_x, box_min_x=box_min_x,
box_min_y=box_min_y, box_min_y=box_min_y,
box_max_x=box_max_x, box_max_x=box_max_x,
@ -501,7 +846,7 @@ class PageParser:
start_min_y=header_start_char.min_y, start_min_y=header_start_char.min_y,
min_x=column_min_x, min_x=column_min_x,
max_x=column_max_x, max_x=column_max_x,
fonts=TextLineFonts(regular=Font.INSTR_HEADER), fonts=TextLineFonts(regular=(Font.INSTR_HEADER,)),
) )
if header_line is None: if header_line is None:
raise PageParseFailed("can't find header text line") raise PageParseFailed("can't find header text line")
@ -509,20 +854,22 @@ class PageParser:
first_text_line=header_line, first_text_line=header_line,
min_x=column_min_x, min_x=column_min_x,
max_x=column_max_x, max_x=column_max_x,
allowed_start_min_y_error=1.5,
) )
print("instr header lines:") print("instr header lines:")
print("\n".join(map(str, header_lines))) print("\n".join(map(str, header_lines)))
mnemonic_line = self.extract_text_line( mnemonic_line = self.extract_text_line(
start_min_y=header_lines[-1].regular_min_y - 18.788, start_min_y=header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT,
min_x=column_min_x, min_x=column_min_x,
max_x=column_max_x, max_x=column_max_x,
fonts=TextLineFonts( fonts=TextLineFonts(
regular=Font.INSTR_DESC, regular=Font.INSTR_DESC,
), ),
skip_initial_spaces=True, skip_initial_spaces=True,
allowed_start_min_y_error=3,
) )
if mnemonic_line is None: if mnemonic_line is None:
raise PageParseFailed("can't find instr mnemonic text line") raise InstrParseFailed("can't find instr mnemonic text line")
mnemonic_lines = self.extract_following_text_lines( mnemonic_lines = self.extract_following_text_lines(
first_text_line=mnemonic_line, first_text_line=mnemonic_line,
min_x=mnemonic_line.chars[0].min_x, min_x=mnemonic_line.chars[0].min_x,
@ -533,9 +880,94 @@ class PageParser:
instr_bit_fields = self.extract_instr_bit_fields( instr_bit_fields = self.extract_instr_bit_fields(
min_x=column_min_x, min_x=column_min_x,
max_x=column_max_x, max_x=column_max_x,
last_mnemonic_line_min_y=mnemonic_lines[-1].regular_min_y, mnemonic_lines=mnemonic_lines,
) )
print(instr_bit_fields) print(instr_bit_fields)
if instr_bit_fields is None:
raise InstrParseFailed("can't find instr bit fields")
alt_header_line = self.extract_text_line(
start_min_y=instr_bit_fields.box_min_y
- INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT,
min_x=column_min_x,
max_x=column_max_x,
fonts=TextLineFonts(
regular=(Font.INSTR_HEADER,),
),
skip_initial_spaces=True,
allowed_start_min_y_error=6,
)
if alt_header_line is not None:
print(f"found alt header line:\n{alt_header_line}")
alt_header_lines = self.extract_following_text_lines(
first_text_line=alt_header_line,
min_x=column_min_x,
max_x=column_max_x,
allowed_start_min_y_error=1.5,
)
print("instr alt header lines:")
print("\n".join(map(str, alt_header_lines)))
alt_mnemonic_line = self.extract_text_line(
start_min_y=alt_header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT,
min_x=column_min_x,
max_x=column_max_x,
fonts=TextLineFonts(
regular=Font.INSTR_DESC,
),
skip_initial_spaces=True,
allowed_start_min_y_error=1.5,
)
if alt_mnemonic_line is None:
raise InstrParseFailed("can't find instr alt mnemonic text line")
alt_mnemonic_lines = self.extract_following_text_lines(
first_text_line=alt_mnemonic_line,
min_x=alt_mnemonic_line.chars[0].min_x,
max_x=column_max_x,
)
print("instr alt mnemonic lines:")
print("\n".join(map(str, alt_mnemonic_lines)))
alt_instr_bit_fields = self.extract_instr_bit_fields(
min_x=column_min_x,
max_x=column_max_x,
mnemonic_lines=alt_mnemonic_lines,
)
print(alt_instr_bit_fields)
if alt_instr_bit_fields is None:
raise InstrParseFailed("can't find instr alt bit fields")
last_instr_bit_fields = alt_instr_bit_fields
else:
print("no alt header line")
alt_header_lines = None
alt_mnemonic_lines = None
alt_instr_bit_fields = None
last_instr_bit_fields = instr_bit_fields
code_line = None
for y_offset in reversed(range(4)):
code_line = self.extract_text_line(
start_min_y=last_instr_bit_fields.box_min_y
- INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT
+ y_offset * 0.5 * Font.INSTR_CODE[0].line_height,
min_x=column_min_x,
max_x=column_max_x,
fonts=TextLineFonts(
regular=Font.INSTR_CODE,
subscript=(Font.INSTR_CODE_SUBSCRIPT,),
),
skip_initial_spaces=True,
allowed_start_min_y_error=1,
)
if code_line is not None:
break
if code_line is None:
raise InstrParseFailed("can't find instr code text line")
code_lines = self.extract_following_text_lines(
first_text_line=code_line,
min_x=code_line.chars[0].min_x,
max_x=column_max_x,
allowed_start_min_y_error=0.05,
)
print("instr code lines:")
print("\n".join(map(str, code_lines)))
# TODO: finish # TODO: finish
def extract_instrs(self): def extract_instrs(self):
@ -544,4 +976,4 @@ class PageParser:
self.extract_instr(next(iter(unprocessed_header_chars))) self.extract_instr(next(iter(unprocessed_header_chars)))
def main(): def main():
Parser().parse_pdf(sys.argv[1], page_numbers=range(76, 78)) Parser().parse_pdf(sys.argv[1], page_numbers=range(1495))