WIP... all fonts included
This commit is contained in:
parent
08141ce560
commit
b9f47e5ae1
|
@ -4,9 +4,10 @@ from dataclasses import dataclass, field
|
|||
import dataclasses
|
||||
from functools import cached_property
|
||||
import sys
|
||||
from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
|
||||
from typing import ClassVar, Container, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
|
||||
from xml.etree import ElementTree
|
||||
import enum
|
||||
import traceback
|
||||
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
|
||||
|
@ -22,22 +23,30 @@ class Font:
|
|||
|
||||
@cached_property
def space_width(self) -> float:
    """Return the width used for one space character in this font.

    The value is 3.985 scaled by the ratio of this font's size to the
    size of the first ``Font.INSTR_CODE`` entry.
    """
    # NOTE(review): 3.985 is presumably the space width measured at the
    # INSTR_CODE font size -- confirm against the source PDF.
    return 3.985 * self.size / Font.INSTR_CODE[0].size
|
||||
|
||||
@cached_property
def line_height(self) -> float:
    """Return the baseline-to-baseline distance used for this font.

    Fonts sharing a font name with any ``Font.INSTR_CODE`` entry use
    9.464 scaled relative to the size of ``Font.INSTR_CODE[0]``.  Fonts
    belonging to the description/math/notation groups use 10.959 scaled
    relative to the size of ``Font.INSTR_DESC[0]``.

    Raises:
        AssertionError: if the font belongs to none of those groups.
    """
    name = self.font_name

    # Code fonts are checked first, exactly as in the original ordering.
    if any(name == code_font.font_name for code_font in Font.INSTR_CODE):
        return 9.464 * self.size / Font.INSTR_CODE[0].size

    # Every remaining recognised group shares one line-height formula, so
    # the group checks are combined instead of repeating the return.
    desc_like_names = (
        Font.INSTR_DESC_BOLD.font_name,
        Font.INSTR_DESC_ITALIC.font_name,
        Font.INSTR_DESC_BOLD_ITALIC.font_name,
        Font.NOTATION_PAGE_SUBSCRIPT.font_name,
        Font.NOTATION_PAGE_SUBSCRIPT_SYM.font_name,
        Font.INSTR_DESC[0].font_name,
    )
    if (name in desc_like_names
            or self in Font.INSTR_DESC
            or self in Font.MATH_MISC
            or self in Font.NOTATION_PAGE_SYM):
        return 10.959 * self.size / Font.INSTR_DESC[0].size

    raise AssertionError(f"no line height: {self}")
|
||||
|
||||
@classmethod
def known_fonts(cls) -> Iterator[Font]:
    """Return an iterator over the fonts stored as keys of the known-name table."""
    # Iterating a dict yields its keys, so an explicit .keys() is not needed.
    return iter(cls.__KNOWN_NAMES)
|
||||
|
||||
@property
|
||||
|
@ -47,25 +56,160 @@ class Font:
|
|||
@classmethod
|
||||
def _register_known_fonts(cls) -> None:
|
||||
cls.INSTR_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963)
|
||||
cls.RTL_FN_HEADER = Font(font_name='APUYSQ+zcoN-Regular', size=9.963)
|
||||
cls.PAGE_HEADER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=9.963)
|
||||
cls.PAGE_FOOTER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.981)
|
||||
cls.INSTR_DESC = Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966)
|
||||
cls.INSTR_DESC = (
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966),
|
||||
Font(font_name='WHMZPU+CMEX9', size=8.966),
|
||||
)
|
||||
cls.INSTR_DESC_MISC = (
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.377),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.561),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.492),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.641),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.772),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.864),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.925),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.097),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.123),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.131),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.516),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.604),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.634),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.906),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.033),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.068),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.213),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.252),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.962),
|
||||
Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.977),
|
||||
)
|
||||
cls.INSTR_DESC_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=6.974)
|
||||
cls.INSTR_DESC_CODE_MISC = (
|
||||
Font(font_name='APUYSQ+zcoN-Regular', size=3.587),
|
||||
Font(font_name='APUYSQ+zcoN-Regular', size=4.483),
|
||||
)
|
||||
cls.INSTR_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966)
|
||||
cls.INSTR_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966)
|
||||
cls.INSTR_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966)
|
||||
cls.INSTR_DESC_SMALL = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97)
|
||||
cls.INSTR_DESC_SMALL_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=7.97)
|
||||
cls.INSTR_DESC_SMALL_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=7.97)
|
||||
cls.INSTR_DESC_SMALL_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=7.97)
|
||||
cls.INSTR_DESC_BOLD_MISC = (
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.21),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.399),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.763),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.946),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.949),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.999),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.065),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.086),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.183),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.686),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.744),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.825),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.842),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.857),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.979),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.032),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.112),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.161),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.206),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.353),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.378),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.434),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.595),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.619),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.647),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.68),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.693),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.736),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.781),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.802),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.995),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.201),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.258),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.363),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.442),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.473),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.485),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.512),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.543),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.613),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.744),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.774),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.809),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.849),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.911),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.92),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.962),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.981),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.146),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.213),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.221),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.243),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.55),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.62),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.699),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.725),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.751),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.856),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.029),
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.406),
|
||||
)
|
||||
cls.INSTR_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978)
|
||||
cls.INSTR_FIELD_BIT_NUMS = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97)
|
||||
cls.INSTR_DESC_BOLD_SUBSCRIPT = \
|
||||
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.978)
|
||||
cls.INSTR_DESC_BOLD_ITALIC_SUBSCRIPT = \
|
||||
Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=5.978)
|
||||
cls.INSTR_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966)
|
||||
cls.INSTR_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=7.97)
|
||||
cls.INSTR_CODE_SYM = Font(font_name='RRFUNA+CMSY8', size=7.97)
|
||||
cls.INSTR_CODE_NE_EQ_SIGN = Font(font_name='HPXOZC+CMSS8', size=7.97)
|
||||
cls.INSTR_CODE = (
|
||||
Font(font_name='APUYSQ+zcoN-Regular', size=7.97),
|
||||
Font(font_name='RRFUNA+CMSY8', size=7.97),
|
||||
Font(font_name='HPXOZC+CMSS8', size=7.97),
|
||||
)
|
||||
cls.INSTR_CODE_SUBSCRIPT = Font(font_name='APUYSQ+zcoN-Regular', size=5.978)
|
||||
cls.TITLE_PAGE_BIG = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=24.787)
|
||||
cls.TITLE_PAGE_VERSION = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=9.963)
|
||||
cls.TITLE_PAGE_TM = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.974)
|
||||
cls.TITLE_PAGE_REV = Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.974)
|
||||
cls.TITLE_PAGE_BOOK = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=20.663)
|
||||
cls.LEGAL_PAGE_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=9.963)
|
||||
cls.CHANGE_SUMMARY_PAGE_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=11.955)
|
||||
cls.CHAPTER_TITLE = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=17.215)
|
||||
cls.NOTATION_PAGE_SYM = (
|
||||
Font(font_name='FZTIYT+CMMI9', size=8.966),
|
||||
Font(font_name='ONUAYC+CMSSI9', size=8.966),
|
||||
Font(font_name='TNGBFZ+CMSY9', size=8.966),
|
||||
Font(font_name='ZJTMSG+CMSS9', size=8.966),
|
||||
)
|
||||
cls.NOTATION_PAGE_SUBSCRIPT_SYM = Font(font_name='DBQTKF+CMSY6', size=5.978)
|
||||
cls.NOTATION_PAGE_SUBSCRIPT = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=5.978)
|
||||
cls.MATH_MISC = (
|
||||
Font(font_name='AAJMKT+CMMI6', size=5.978),
|
||||
Font(font_name='CUTMFD+CMSSI8', size=5.978),
|
||||
Font(font_name='CUTMFD+CMSSI8', size=7.97),
|
||||
Font(font_name='FZTIYT+CMMI9', size=5.734),
|
||||
Font(font_name='FZTIYT+CMMI9', size=7.168),
|
||||
Font(font_name='HONFQS+CMMI8', size=7.97),
|
||||
Font(font_name='HPXOZC+CMSS8', size=5.978),
|
||||
Font(font_name='LLVRDD+CMSY10', size=11.955),
|
||||
Font(font_name='ZJTMSG+CMSS9', size=7.168),
|
||||
)
|
||||
|
||||
cls.__KNOWN_NAMES = {}
|
||||
for name, value in cls.__dict__.items():
|
||||
if name[0].isupper() and isinstance(value, cls):
|
||||
if name[0].isupper():
|
||||
if isinstance(value, cls):
|
||||
assert value not in cls.__KNOWN_NAMES, f"duplicate known font: {value}"
|
||||
cls.__KNOWN_NAMES[value] = name
|
||||
elif isinstance(value, tuple) and all(isinstance(i, cls) for i in value):
|
||||
for i, font in enumerate(value):
|
||||
assert isinstance(font, cls)
|
||||
assert font not in cls.__KNOWN_NAMES, f"duplicate known font: {font}"
|
||||
cls.__KNOWN_NAMES[font] = f"{name}[{i}]"
|
||||
|
||||
old_repr = cls.__repr__
|
||||
def __repr__(self: cls) -> str:
|
||||
|
@ -77,6 +221,10 @@ class Font:
|
|||
|
||||
del cls._register_known_fonts
|
||||
|
||||
for font in Font.known_fonts():
|
||||
font.space_width # initialize
|
||||
font.line_height # initialize
|
||||
|
||||
Font._register_known_fonts()
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
|
||||
|
@ -103,17 +251,25 @@ class Char:
|
|||
|
||||
@dataclass()
class Parser:
    """Top-level driver that feeds every extracted PDF page to a PageParser."""

    def parse_pdf(self, file: str, page_numbers: Container[int] | None = None):
        """Parse the pages of ``file`` one at a time.

        Args:
            file: path of the PDF, passed to ``extract_pages``.
            page_numbers: forwarded to ``extract_pages`` as ``page_numbers``;
                ``None`` is passed through unchanged.

        Raises:
            Exception: whatever parsing a page raises; the exception is
                re-raised after a ``pageid=...`` note is attached to it.
        """
        for page in extract_pages(file, page_numbers=page_numbers):
            try:
                PageParser(parser=self, page_id=page.pageid).parse_page(page)
            except Exception as e:
                # Record which page failed, then let the error propagate.
                e.add_note(f"pageid={page.pageid}")
                raise
|
||||
|
||||
|
||||
# Page-layout measurements used when locating instruction elements.
# NOTE(review): values are presumably PDF user-space units (points) -- confirm.
COLUMN_SPLIT_X = 300.0

# Offsets around the optional "Prefix:" / "Suffix:" bit-field boxes.
INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT = 29.938
INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT = 9.278
INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT = 20.971

# Distance from the last mnemonic line down to the top of the bit-fields box.
# The "2" variant is the one used when there is more than one mnemonic line.
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 20.175
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2 = 14.694
INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317

# NOTE(review): the two names below look superseded by the constants that
# follow them; they are kept so any remaining user keeps working -- confirm
# whether they can be deleted.
INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT = 24.657
INSTR_BIT_FIELDS_PADDED_HEIGHT = (INSTR_BIT_FIELDS_TOP_PAD_HEIGHT
    + INSTR_BIT_FIELDS_BOX_HEIGHT + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT)

INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT = 24.657 + 1.586
INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT = 32.927 + 2.1519216
INSTR_MNEMONIC_TOP_PAD_HEIGHT = 15.75
|
||||
|
||||
@dataclass()
|
||||
class ParsedTextLine:
|
||||
|
@ -143,49 +299,69 @@ class ParsedTextLine:
|
|||
|
||||
_T = TypeVar("_T")
|
||||
|
||||
class BaselinePos(enum.Enum):
    """Where a character sits vertically relative to its text line's baseline."""

    ABOVE = "above"
    BASELINE = "baseline"
    BELOW = "below"
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
|
||||
class TextLineFonts:
|
||||
regular: Font
|
||||
italic: Font | None = None
|
||||
bold: Font | None = None
|
||||
bold_italic: Font | None = None
|
||||
regular: tuple[Font, ...]
|
||||
italic: tuple[Font, ...] | None = None
|
||||
bold: tuple[Font, ...] | None = None
|
||||
bold_italic: tuple[Font, ...] | None = None
|
||||
subscript: tuple[Font, ...] | None = None
|
||||
|
||||
def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | Font:
|
||||
def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | tuple[tuple[Font, ...], BaselinePos]:
|
||||
match part_kind:
|
||||
case TextLineFontKind.REGULAR:
|
||||
retval = self.regular
|
||||
font = self.regular
|
||||
baseline_pos = BaselinePos.BASELINE
|
||||
case TextLineFontKind.ITALIC:
|
||||
retval = self.italic
|
||||
font = self.italic
|
||||
baseline_pos = BaselinePos.BASELINE
|
||||
case TextLineFontKind.BOLD:
|
||||
retval = self.bold
|
||||
font = self.bold
|
||||
baseline_pos = BaselinePos.BASELINE
|
||||
case TextLineFontKind.BOLD_ITALIC:
|
||||
retval = self.bold_italic
|
||||
font = self.bold_italic
|
||||
baseline_pos = BaselinePos.BASELINE
|
||||
case TextLineFontKind.SUBSCRIPT:
|
||||
font = self.subscript
|
||||
baseline_pos = BaselinePos.BELOW
|
||||
case TextLineFontKind.SUPERSCRIPT:
|
||||
font = self.subscript
|
||||
baseline_pos = BaselinePos.ABOVE
|
||||
case _:
|
||||
assert_never(part_kind)
|
||||
if retval is None:
|
||||
if font is None:
|
||||
return default
|
||||
return retval
|
||||
return font, baseline_pos
|
||||
|
||||
@cached_property
|
||||
def __font_to_kind_map(self) -> dict[Font, TextLineFontKind]:
|
||||
retval = {}
|
||||
def __font_to_kind_map(self) -> dict[tuple[Font, BaselinePos], TextLineFontKind]:
|
||||
retval: dict[tuple[Font, BaselinePos], TextLineFontKind] = {}
|
||||
for kind in TextLineFontKind:
|
||||
font = self.get_font(kind)
|
||||
if font is None:
|
||||
fonts = self.get_font(kind)
|
||||
if fonts is None:
|
||||
continue
|
||||
fonts, baseline_pos = fonts
|
||||
for font in fonts:
|
||||
assert font not in retval, \
|
||||
f"duplicate font: kind={kind} old_kind={retval[font]} font={font}"
|
||||
retval[font] = kind
|
||||
retval[font, baseline_pos] = kind
|
||||
return retval
|
||||
|
||||
def get_kind(self, font: Font, default: _T=None) -> _T | TextLineFontKind:
|
||||
return self.__font_to_kind_map.get(font, default)
|
||||
def get_kind(self, font: Font, baseline_pos: BaselinePos, default: _T=None) -> _T | TextLineFontKind:
|
||||
return self.__font_to_kind_map.get((font, baseline_pos), default)
|
||||
|
||||
class TextLineFontKind(enum.Enum):
|
||||
REGULAR = "regular"
|
||||
ITALIC = "italic"
|
||||
BOLD = "bold"
|
||||
BOLD_ITALIC = "bold_italic"
|
||||
SUBSCRIPT = "subscript"
|
||||
SUPERSCRIPT = "superscript"
|
||||
|
||||
@cached_property
|
||||
def text_line_tags(self) -> tuple[str, ...]:
|
||||
|
@ -198,12 +374,19 @@ class TextLineFontKind(enum.Enum):
|
|||
return "b",
|
||||
case TextLineFontKind.BOLD_ITALIC:
|
||||
return "b", "i"
|
||||
case TextLineFontKind.SUBSCRIPT:
|
||||
return "sub",
|
||||
case TextLineFontKind.SUPERSCRIPT:
|
||||
return "sup",
|
||||
case _:
|
||||
assert_never(self)
|
||||
|
||||
class PageParseFailed(Exception):
    """Raised when a page does not have the layout the parser expects."""
|
||||
|
||||
class InstrParseFailed(Exception):
    """Raised when an instruction's layout cannot be parsed."""
|
||||
|
||||
class ElementBodyBuilder:
|
||||
def __init__(self, containing_element: ElementTree.Element):
|
||||
self.__containing_element = containing_element
|
||||
|
@ -266,8 +449,29 @@ class InstrBitField:
|
|||
def __str__(self) -> str:
    """Return a one-line summary: horizontal box extent, name and start bit."""
    return (f"<InstrBitField: x={self.box_min_x}..{self.box_max_x} "
            f"name={self.name} start_bit={self.start_bit}>")
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
class InstrBitFieldsPrefix:
    """Prefix bit-fields box of an instruction plus its surrounding title lines."""

    # Bounding box of the prefix bit-fields box.
    box_min_x: float
    box_min_y: float
    box_max_x: float
    box_max_y: float
    # Title line found before the prefix box.
    prefix_text: ParsedTextLine
    # Bit fields found inside the prefix box.
    fields: tuple[InstrBitField, ...]
    # Title line found after the prefix box.
    suffix_text: ParsedTextLine

    def __str__(self):
        """Return a multi-line dump of the box extent, both titles and every field."""
        sep = ",\n "
        fields_str = sep.join(map(str, self.fields))
        return (f"<InstrBitFieldsPrefix: ({self.box_min_x},{self.box_min_y}).."
                f"({self.box_max_x},{self.box_max_y})\n"
                f" prefix_text={self.prefix_text}\n"
                " [\n"
                f" {fields_str},\n"
                " ]\n"
                f" suffix_text={self.suffix_text}>")
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
|
||||
class InstrBitFields:
|
||||
prefix: None | InstrBitFieldsPrefix
|
||||
box_min_x: float
|
||||
box_min_y: float
|
||||
box_max_x: float
|
||||
|
@ -276,9 +480,20 @@ class InstrBitFields:
|
|||
|
||||
def __str__(self):
    """Return a multi-line dump of the box extent and every field.

    When a prefix is present, its own string form is emitted first on the
    preceding line(s).
    """
    sep = ",\n "
    prefix_str = f"{self.prefix}\n" if self.prefix is not None else ""
    fields_str = sep.join(map(str, self.fields))
    return (f"{prefix_str}<InstrBitFields: ({self.box_min_x},{self.box_min_y}).."
            f"({self.box_max_x},{self.box_max_y}) [\n"
            f" {fields_str},\n]>")
|
||||
|
||||
# Typographic ligature code points mapped to the plain letters they stand for.
CHAR_TO_EXPANDED: dict[str, str] = {
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
}
|
||||
|
||||
@dataclass()
|
||||
class PageParser:
|
||||
|
@ -315,16 +530,24 @@ class PageParser:
|
|||
self.unprocessed_chars[char.font].add(char)
|
||||
for i in self.unprocessed_chars.values():
|
||||
i.sort(key=Char.top_down_left_to_right_sort_key)
|
||||
unknown_fonts=[]
|
||||
unknown_font_errors=[]
|
||||
for font, chars in self.unprocessed_chars.items():
|
||||
print()
|
||||
print(font)
|
||||
if font.known_name is None:
|
||||
text = ""
|
||||
char = None
|
||||
for char in chars:
|
||||
text += char.text
|
||||
print(repr(text))
|
||||
assert font.known_name is not None, f"unknown font {font}\nlast char: {char}"
|
||||
unknown_fonts.append(repr(font) + ",")
|
||||
unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}")
|
||||
unknown_fonts.sort()
|
||||
if len(unknown_fonts) != 0:
|
||||
raise AssertionError("\nunknown fonts:\n" + "\n".join(unknown_fonts)
|
||||
+ "\n\n" + "\n".join(unknown_font_errors))
|
||||
try:
|
||||
self.extract_instrs()
|
||||
except InstrParseFailed:
|
||||
traceback.print_exc()
|
||||
|
||||
def extract_text_line(
|
||||
self, *,
|
||||
|
@ -334,39 +557,48 @@ class PageParser:
|
|||
max_x: float,
|
||||
fonts: TextLineFonts,
|
||||
skip_initial_spaces=False,
|
||||
allowed_start_min_y_error=None,
|
||||
) -> None | ParsedTextLine:
|
||||
chars: list[Char] = []
|
||||
chars_set: SetById[Char] = SetById()
|
||||
if start_char is not None:
|
||||
chars.append(start_char)
|
||||
self.unprocessed_chars[start_char.font].remove(start_char)
|
||||
chars_set.add(start_char)
|
||||
for x, y, char in self.qt.range(
|
||||
min_x=min_x,
|
||||
min_x=min_x - fonts.regular[0].size * 0.5,
|
||||
max_x=max_x,
|
||||
min_y=start_min_y - fonts.regular.size * 0.5,
|
||||
max_y=start_min_y + fonts.regular.size * 0.5,
|
||||
min_y=start_min_y - fonts.regular[0].size * 0.4,
|
||||
max_y=start_min_y + fonts.regular[0].size * 0.6,
|
||||
):
|
||||
if not isinstance(char, Char):
|
||||
continue
|
||||
if char not in self.unprocessed_chars[char.font]:
|
||||
if char not in self.unprocessed_chars[char.font] or char in chars_set:
|
||||
continue
|
||||
self.unprocessed_chars[char.font].remove(char)
|
||||
chars_set.add(char)
|
||||
chars.append(char)
|
||||
if len(chars) == 0:
|
||||
return None
|
||||
chars.sort(key=Char.top_down_left_to_right_sort_key)
|
||||
chars.sort(key=lambda char: (char.min_x, char.text))
|
||||
retval = ParsedTextLine(
|
||||
element=ElementTree.Element("text-line"),
|
||||
regular_min_y=chars[0].min_y,
|
||||
fonts=fonts,
|
||||
chars=chars,
|
||||
)
|
||||
with ElementBodyBuilder(retval.element) as body_builder:
|
||||
text_and_tag_stacks: list[tuple[str, tuple[str, ...]]] = []
|
||||
last_max_x = min_x
|
||||
last_kind = None
|
||||
last_char = None
|
||||
for char in chars:
|
||||
kind = fonts.get_kind(char.font)
|
||||
if char.min_y - retval.regular_min_y < -0.2:
|
||||
baseline_pos = BaselinePos.BELOW
|
||||
elif char.min_y - retval.regular_min_y > 1:
|
||||
baseline_pos = BaselinePos.ABOVE
|
||||
else:
|
||||
baseline_pos = BaselinePos.BASELINE
|
||||
kind = fonts.get_kind(font=char.font, baseline_pos=baseline_pos)
|
||||
if kind is None:
|
||||
print(f"font kind is None:\nfonts={fonts}\nchar={char}")
|
||||
print(f"font kind is None:\nfonts={fonts}\nchar={char}\nbaseline_pos={baseline_pos}")
|
||||
return None
|
||||
if last_kind is None:
|
||||
space_kind = kind
|
||||
|
@ -374,20 +606,36 @@ class PageParser:
|
|||
space_kind = TextLineFontKind.REGULAR
|
||||
else:
|
||||
space_kind = kind
|
||||
space_font = fonts.get_font(space_kind, fonts.regular)
|
||||
space_font, _ = fonts.get_font(space_kind, (fonts.regular, BaselinePos.BASELINE))
|
||||
space_width = char.min_x - last_max_x
|
||||
space_count_f = space_width / space_font.space_width
|
||||
space_count_f = space_width / space_font[0].space_width
|
||||
space_count = round(space_count_f)
|
||||
if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
|
||||
print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
|
||||
if space_count > 0 and not skip_initial_spaces:
|
||||
body_builder.set_tag_stack(space_kind.text_line_tags)
|
||||
body_builder.write_text(" " * space_count)
|
||||
text_and_tag_stacks.append((" " * space_count, space_kind.text_line_tags))
|
||||
skip_initial_spaces = False
|
||||
body_builder.set_tag_stack(kind.text_line_tags)
|
||||
body_builder.write_text(char.text)
|
||||
if (char.text == "\u0338" and last_char is not None and last_char.text == "=" and abs(char.min_x - last_char.min_x) < 0.01 and abs(char.min_y - last_char.min_y) < 0.01):
|
||||
text_and_tag_stacks[-1] = "\u2260", ()
|
||||
last_max_x = last_char.max_x
|
||||
else:
|
||||
char_text = CHAR_TO_EXPANDED.get(char.text, char.text)
|
||||
text_and_tag_stacks.append((char_text, kind.text_line_tags))
|
||||
last_max_x = char.max_x
|
||||
last_kind = kind
|
||||
last_char = char
|
||||
with ElementBodyBuilder(retval.element) as body_builder:
|
||||
for text, tag_stack in text_and_tag_stacks:
|
||||
body_builder.set_tag_stack(tag_stack)
|
||||
body_builder.write_text(text)
|
||||
for char in chars:
|
||||
self.unprocessed_chars[char.font].remove(char)
|
||||
if allowed_start_min_y_error is None:
|
||||
allowed_start_min_y_error = 0.01
|
||||
assert abs(start_min_y - chars[0].min_y) < allowed_start_min_y_error, (
|
||||
f"start_min_y={start_min_y} regular_min_y={chars[0].min_y}\n"
|
||||
f"start_min_y error: {start_min_y - chars[0].min_y}\n"
|
||||
f"allowed_start_min_y_error={allowed_start_min_y_error}")
|
||||
return retval
|
||||
|
||||
def extract_following_text_lines(
|
||||
|
@ -395,16 +643,18 @@ class PageParser:
|
|||
first_text_line: ParsedTextLine,
|
||||
min_x: float,
|
||||
max_x: float,
|
||||
allowed_start_min_y_error=None,
|
||||
) -> list[ParsedTextLine]:
|
||||
retval: list[ParsedTextLine] = []
|
||||
line = first_text_line
|
||||
while line is not None:
|
||||
retval.append(line)
|
||||
line = self.extract_text_line(
|
||||
start_min_y=line.regular_min_y - first_text_line.fonts.regular.line_height,
|
||||
start_min_y=line.regular_min_y - first_text_line.fonts.regular[0].line_height,
|
||||
min_x=min_x,
|
||||
max_x=max_x,
|
||||
fonts=first_text_line.fonts,
|
||||
allowed_start_min_y_error=allowed_start_min_y_error,
|
||||
)
|
||||
return retval
|
||||
|
||||
|
@ -412,18 +662,112 @@ class PageParser:
|
|||
self,
|
||||
min_x: float,
|
||||
max_x: float,
|
||||
last_mnemonic_line_min_y: float,
|
||||
mnemonic_lines: list[ParsedTextLine],
|
||||
) -> None | InstrBitFields:
|
||||
found_non_affix_line = False
|
||||
if len(mnemonic_lines) > 1:
|
||||
expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y
|
||||
- INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2)
|
||||
else:
|
||||
expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y
|
||||
- INSTR_BIT_FIELDS_TOP_PAD_HEIGHT)
|
||||
for x, y, line in self.qt.range(
|
||||
min_x=min_x - 5,
|
||||
max_x=max_x + 5,
|
||||
min_y=expected_non_affix_line_y - 5,
|
||||
max_y=expected_non_affix_line_y + 5,
|
||||
):
|
||||
if not isinstance(line, LTLine):
|
||||
continue
|
||||
if line.width > line.height:
|
||||
found_non_affix_line = True
|
||||
break
|
||||
if found_non_affix_line:
|
||||
return self.extract_instr_bit_fields_box(
|
||||
min_x=min_x,
|
||||
max_x=max_x,
|
||||
expected_box_max_y=expected_non_affix_line_y,
|
||||
)
|
||||
prefix_text = self.extract_text_line(
|
||||
start_min_y=mnemonic_lines[-1].regular_min_y
|
||||
- INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT,
|
||||
min_x=min_x,
|
||||
max_x=max_x,
|
||||
fonts=TextLineFonts(
|
||||
regular=(Font.INSTR_DESC_SMALL,),
|
||||
bold=(Font.INSTR_DESC_SMALL_BOLD,),
|
||||
),
|
||||
allowed_start_min_y_error=2,
|
||||
)
|
||||
if prefix_text is None:
|
||||
raise InstrParseFailed("can't find instr prefix bit fields title")
|
||||
prefix_text_str = "".join(prefix_text.element.itertext())
|
||||
if prefix_text_str != "Prefix:":
|
||||
raise InstrParseFailed(
|
||||
f"instr prefix bit fields title is not as expected: {prefix_text_str!r}")
|
||||
prefix_bit_fields = self.extract_instr_bit_fields_box(
|
||||
min_x=min_x,
|
||||
max_x=max_x,
|
||||
expected_box_max_y=prefix_text.regular_min_y
|
||||
- INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT,
|
||||
)
|
||||
if prefix_bit_fields is None:
|
||||
raise InstrParseFailed("can't find instr prefix bit fields")
|
||||
suffix_text = self.extract_text_line(
|
||||
start_min_y=prefix_bit_fields.box_min_y
|
||||
- INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT,
|
||||
min_x=min_x,
|
||||
max_x=max_x,
|
||||
fonts=TextLineFonts(
|
||||
regular=(Font.INSTR_DESC_SMALL,),
|
||||
bold=(Font.INSTR_DESC_SMALL_BOLD,),
|
||||
),
|
||||
allowed_start_min_y_error=2,
|
||||
)
|
||||
if suffix_text is None:
|
||||
raise InstrParseFailed("can't find instr suffix bit fields title")
|
||||
suffix_text_str = "".join(suffix_text.element.itertext())
|
||||
if suffix_text_str != "Suffix:":
|
||||
raise InstrParseFailed(
|
||||
f"instr suffix bit fields title is not as expected: {suffix_text_str!r}")
|
||||
suffix_bit_fields = self.extract_instr_bit_fields_box(
|
||||
min_x=min_x,
|
||||
max_x=max_x,
|
||||
expected_box_max_y=suffix_text.regular_min_y
|
||||
- INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT,
|
||||
)
|
||||
if suffix_bit_fields is None:
|
||||
raise InstrParseFailed("can't find instr suffix bit fields")
|
||||
return InstrBitFields(
|
||||
prefix=InstrBitFieldsPrefix(
|
||||
box_min_x=prefix_bit_fields.box_min_x,
|
||||
box_min_y=prefix_bit_fields.box_min_y,
|
||||
box_max_x=prefix_bit_fields.box_max_x,
|
||||
box_max_y=prefix_bit_fields.box_max_y,
|
||||
prefix_text=prefix_text,
|
||||
fields=prefix_bit_fields.fields,
|
||||
suffix_text=suffix_text,
|
||||
),
|
||||
box_min_x=suffix_bit_fields.box_min_x,
|
||||
box_min_y=suffix_bit_fields.box_min_y,
|
||||
box_max_x=suffix_bit_fields.box_max_x,
|
||||
box_max_y=suffix_bit_fields.box_max_y,
|
||||
fields=suffix_bit_fields.fields,
|
||||
)
|
||||
|
||||
def extract_instr_bit_fields_box(
|
||||
self,
|
||||
min_x: float,
|
||||
max_x: float,
|
||||
expected_box_max_y: float,
|
||||
) -> None | InstrBitFields:
|
||||
h_lines: list[LTLine] = []
|
||||
v_lines: list[LTLine] = []
|
||||
for x, y, line in self.qt.range(
|
||||
min_x=min_x - 5,
|
||||
max_x=max_x + 5,
|
||||
min_y=last_mnemonic_line_min_y
|
||||
- INSTR_BIT_FIELDS_PADDED_HEIGHT
|
||||
+ INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT / 2,
|
||||
max_y=last_mnemonic_line_min_y
|
||||
- INSTR_BIT_FIELDS_TOP_PAD_HEIGHT / 2,
|
||||
min_y=expected_box_max_y - INSTR_BIT_FIELDS_BOX_HEIGHT - 5,
|
||||
max_y=expected_box_max_y + 5,
|
||||
):
|
||||
if not isinstance(line, LTLine):
|
||||
continue
|
||||
|
@ -439,10 +783,10 @@ class PageParser:
|
|||
if len(h_lines) == 0 and len(v_lines) == 0:
|
||||
return None
|
||||
if len(h_lines) != 2:
|
||||
raise PageParseFailed(
|
||||
raise InstrParseFailed(
|
||||
f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}")
|
||||
if len(v_lines) < 2:
|
||||
raise PageParseFailed(
|
||||
raise InstrParseFailed(
|
||||
f"instruction bit fields box has too few vertical lines:\n{h_lines}")
|
||||
bottom_line, top_line = h_lines
|
||||
box_min_x = v_lines[0].x0
|
||||
|
@ -463,7 +807,7 @@ class PageParser:
|
|||
box_min_x=field_box_min_x,
|
||||
box_max_x=field_box_max_x,
|
||||
name=self.extract_text_line(
|
||||
start_min_y=box_mid_y + 3,
|
||||
start_min_y=box_mid_y + 3.288,
|
||||
min_x=field_box_min_x,
|
||||
max_x=field_box_max_x,
|
||||
fonts=TextLineFonts(
|
||||
|
@ -472,16 +816,17 @@ class PageParser:
|
|||
skip_initial_spaces=True,
|
||||
),
|
||||
start_bit=self.extract_text_line(
|
||||
start_min_y=box_min_y + 3,
|
||||
start_min_y=box_min_y + 3.487,
|
||||
min_x=field_box_min_x,
|
||||
max_x=field_box_max_x,
|
||||
fonts=TextLineFonts(
|
||||
regular=Font.INSTR_FIELD_BIT_NUMS,
|
||||
regular=(Font.INSTR_DESC_SMALL,),
|
||||
),
|
||||
skip_initial_spaces=True,
|
||||
),
|
||||
))
|
||||
return InstrBitFields(
|
||||
prefix=None,
|
||||
box_min_x=box_min_x,
|
||||
box_min_y=box_min_y,
|
||||
box_max_x=box_max_x,
|
||||
|
@ -501,7 +846,7 @@ class PageParser:
|
|||
start_min_y=header_start_char.min_y,
|
||||
min_x=column_min_x,
|
||||
max_x=column_max_x,
|
||||
fonts=TextLineFonts(regular=Font.INSTR_HEADER),
|
||||
fonts=TextLineFonts(regular=(Font.INSTR_HEADER,)),
|
||||
)
|
||||
if header_line is None:
|
||||
raise PageParseFailed("can't find header text line")
|
||||
|
@ -509,20 +854,22 @@ class PageParser:
|
|||
first_text_line=header_line,
|
||||
min_x=column_min_x,
|
||||
max_x=column_max_x,
|
||||
allowed_start_min_y_error=1.5,
|
||||
)
|
||||
print("instr header lines:")
|
||||
print("\n".join(map(str, header_lines)))
|
||||
mnemonic_line = self.extract_text_line(
|
||||
start_min_y=header_lines[-1].regular_min_y - 18.788,
|
||||
start_min_y=header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT,
|
||||
min_x=column_min_x,
|
||||
max_x=column_max_x,
|
||||
fonts=TextLineFonts(
|
||||
regular=Font.INSTR_DESC,
|
||||
),
|
||||
skip_initial_spaces=True,
|
||||
allowed_start_min_y_error=3,
|
||||
)
|
||||
if mnemonic_line is None:
|
||||
raise PageParseFailed("can't find instr mnemonic text line")
|
||||
raise InstrParseFailed("can't find instr mnemonic text line")
|
||||
mnemonic_lines = self.extract_following_text_lines(
|
||||
first_text_line=mnemonic_line,
|
||||
min_x=mnemonic_line.chars[0].min_x,
|
||||
|
@ -533,9 +880,94 @@ class PageParser:
|
|||
instr_bit_fields = self.extract_instr_bit_fields(
|
||||
min_x=column_min_x,
|
||||
max_x=column_max_x,
|
||||
last_mnemonic_line_min_y=mnemonic_lines[-1].regular_min_y,
|
||||
mnemonic_lines=mnemonic_lines,
|
||||
)
|
||||
print(instr_bit_fields)
|
||||
if instr_bit_fields is None:
|
||||
raise InstrParseFailed("can't find instr bit fields")
|
||||
alt_header_line = self.extract_text_line(
|
||||
start_min_y=instr_bit_fields.box_min_y
|
||||
- INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT,
|
||||
min_x=column_min_x,
|
||||
max_x=column_max_x,
|
||||
fonts=TextLineFonts(
|
||||
regular=(Font.INSTR_HEADER,),
|
||||
),
|
||||
skip_initial_spaces=True,
|
||||
allowed_start_min_y_error=6,
|
||||
)
|
||||
if alt_header_line is not None:
|
||||
print(f"found alt header line:\n{alt_header_line}")
|
||||
alt_header_lines = self.extract_following_text_lines(
|
||||
first_text_line=alt_header_line,
|
||||
min_x=column_min_x,
|
||||
max_x=column_max_x,
|
||||
allowed_start_min_y_error=1.5,
|
||||
)
|
||||
print("instr alt header lines:")
|
||||
print("\n".join(map(str, alt_header_lines)))
|
||||
alt_mnemonic_line = self.extract_text_line(
|
||||
start_min_y=alt_header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT,
|
||||
min_x=column_min_x,
|
||||
max_x=column_max_x,
|
||||
fonts=TextLineFonts(
|
||||
regular=Font.INSTR_DESC,
|
||||
),
|
||||
skip_initial_spaces=True,
|
||||
allowed_start_min_y_error=1.5,
|
||||
)
|
||||
if alt_mnemonic_line is None:
|
||||
raise InstrParseFailed("can't find instr alt mnemonic text line")
|
||||
alt_mnemonic_lines = self.extract_following_text_lines(
|
||||
first_text_line=alt_mnemonic_line,
|
||||
min_x=alt_mnemonic_line.chars[0].min_x,
|
||||
max_x=column_max_x,
|
||||
)
|
||||
print("instr alt mnemonic lines:")
|
||||
print("\n".join(map(str, alt_mnemonic_lines)))
|
||||
alt_instr_bit_fields = self.extract_instr_bit_fields(
|
||||
min_x=column_min_x,
|
||||
max_x=column_max_x,
|
||||
mnemonic_lines=alt_mnemonic_lines,
|
||||
)
|
||||
print(alt_instr_bit_fields)
|
||||
if alt_instr_bit_fields is None:
|
||||
raise InstrParseFailed("can't find instr alt bit fields")
|
||||
last_instr_bit_fields = alt_instr_bit_fields
|
||||
else:
|
||||
print("no alt header line")
|
||||
alt_header_lines = None
|
||||
alt_mnemonic_lines = None
|
||||
alt_instr_bit_fields = None
|
||||
last_instr_bit_fields = instr_bit_fields
|
||||
|
||||
code_line = None
|
||||
for y_offset in reversed(range(4)):
|
||||
code_line = self.extract_text_line(
|
||||
start_min_y=last_instr_bit_fields.box_min_y
|
||||
- INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT
|
||||
+ y_offset * 0.5 * Font.INSTR_CODE[0].line_height,
|
||||
min_x=column_min_x,
|
||||
max_x=column_max_x,
|
||||
fonts=TextLineFonts(
|
||||
regular=Font.INSTR_CODE,
|
||||
subscript=(Font.INSTR_CODE_SUBSCRIPT,),
|
||||
),
|
||||
skip_initial_spaces=True,
|
||||
allowed_start_min_y_error=1,
|
||||
)
|
||||
if code_line is not None:
|
||||
break
|
||||
if code_line is None:
|
||||
raise InstrParseFailed("can't find instr code text line")
|
||||
code_lines = self.extract_following_text_lines(
|
||||
first_text_line=code_line,
|
||||
min_x=code_line.chars[0].min_x,
|
||||
max_x=column_max_x,
|
||||
allowed_start_min_y_error=0.05,
|
||||
)
|
||||
print("instr code lines:")
|
||||
print("\n".join(map(str, code_lines)))
|
||||
# TODO: finish
|
||||
|
||||
def extract_instrs(self):
|
||||
|
@ -544,4 +976,4 @@ class PageParser:
|
|||
self.extract_instr(next(iter(unprocessed_header_chars)))
|
||||
|
||||
def main():
|
||||
Parser().parse_pdf(sys.argv[1], page_numbers=range(76, 78))
|
||||
Parser().parse_pdf(sys.argv[1], page_numbers=range(1495))
|
Loading…
Reference in a new issue