WIP... all fonts included
This commit is contained in:
parent
08141ce560
commit
b9f47e5ae1
|
@ -4,9 +4,10 @@ from dataclasses import dataclass, field
|
||||||
import dataclasses
|
import dataclasses
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
import sys
|
import sys
|
||||||
from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
|
from typing import ClassVar, Container, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
|
||||||
from xml.etree import ElementTree
|
from xml.etree import ElementTree
|
||||||
import enum
|
import enum
|
||||||
|
import traceback
|
||||||
|
|
||||||
from pdfminer.high_level import extract_pages
|
from pdfminer.high_level import extract_pages
|
||||||
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
|
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
|
||||||
|
@ -22,22 +23,30 @@ class Font:
|
||||||
|
|
||||||
@cached_property
def space_width(self) -> float:
    """Return the width of a single space character in this font.

    The width is 3.985 at the size of ``Font.INSTR_CODE[0]`` and scales
    linearly with ``self.size``.
    """
    reference_size = Font.INSTR_CODE[0].size
    return 3.985 * self.size / reference_size
|
|
||||||
|
|
||||||
@cached_property
def line_height(self) -> float:
    """Return the vertical distance between consecutive lines of this font.

    The font is classified by its name, or by membership in one of the known
    font groups, and the reference line height of that family is scaled
    linearly by ``self.size``.

    Raises:
        AssertionError: if the font belongs to none of the known families.
    """
    name = self.font_name

    # Any font sharing a name with an instruction-code font uses code metrics.
    if any(name == code_font.font_name for code_font in Font.INSTR_CODE):
        return 9.464 * self.size / Font.INSTR_CODE[0].size

    # Every remaining known family shares the description-text metrics.
    desc_style_names = (
        Font.INSTR_DESC_BOLD.font_name,
        Font.INSTR_DESC_ITALIC.font_name,
        Font.INSTR_DESC_BOLD_ITALIC.font_name,
        Font.NOTATION_PAGE_SUBSCRIPT.font_name,
        Font.NOTATION_PAGE_SUBSCRIPT_SYM.font_name,
    )
    uses_desc_metrics = (
        name in desc_style_names
        or self in Font.INSTR_DESC
        or name == Font.INSTR_DESC[0].font_name
        or self in Font.MATH_MISC
        or self in Font.NOTATION_PAGE_SYM
    )
    if uses_desc_metrics:
        return 10.959 * self.size / Font.INSTR_DESC[0].size

    raise AssertionError(f"no line height: {self}")
|
||||||
|
|
||||||
@classmethod
def known_fonts(cls) -> Iterator[Font]:
    """Return an iterator over every font in the known-font table."""
    # Iterating the dict directly yields its keys, which are the fonts.
    return iter(cls.__KNOWN_NAMES)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -47,25 +56,160 @@ class Font:
|
||||||
@classmethod
|
@classmethod
|
||||||
def _register_known_fonts(cls) -> None:
|
def _register_known_fonts(cls) -> None:
|
||||||
cls.INSTR_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963)
|
cls.INSTR_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963)
|
||||||
|
cls.RTL_FN_HEADER = Font(font_name='APUYSQ+zcoN-Regular', size=9.963)
|
||||||
cls.PAGE_HEADER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=9.963)
|
cls.PAGE_HEADER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=9.963)
|
||||||
cls.PAGE_FOOTER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.981)
|
cls.PAGE_FOOTER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.981)
|
||||||
cls.INSTR_DESC = Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966)
|
cls.INSTR_DESC = (
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966),
|
||||||
|
Font(font_name='WHMZPU+CMEX9', size=8.966),
|
||||||
|
)
|
||||||
|
cls.INSTR_DESC_MISC = (
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.377),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=2.561),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.492),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.641),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.772),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.864),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.925),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.097),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.123),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.131),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.516),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.604),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.634),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.906),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.033),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.068),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.213),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.252),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.962),
|
||||||
|
Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.977),
|
||||||
|
)
|
||||||
|
cls.INSTR_DESC_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=6.974)
|
||||||
|
cls.INSTR_DESC_CODE_MISC = (
|
||||||
|
Font(font_name='APUYSQ+zcoN-Regular', size=3.587),
|
||||||
|
Font(font_name='APUYSQ+zcoN-Regular', size=4.483),
|
||||||
|
)
|
||||||
cls.INSTR_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966)
|
cls.INSTR_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966)
|
||||||
cls.INSTR_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966)
|
cls.INSTR_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966)
|
||||||
cls.INSTR_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966)
|
cls.INSTR_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966)
|
||||||
|
cls.INSTR_DESC_SMALL = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97)
|
||||||
|
cls.INSTR_DESC_SMALL_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=7.97)
|
||||||
|
cls.INSTR_DESC_SMALL_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=7.97)
|
||||||
|
cls.INSTR_DESC_SMALL_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=7.97)
|
||||||
|
cls.INSTR_DESC_BOLD_MISC = (
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.21),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.399),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.763),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.946),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.949),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=2.999),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.065),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.086),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.183),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.686),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.744),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.825),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.842),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.857),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=3.979),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.032),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.112),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.161),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.206),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.353),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.378),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.434),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.595),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.619),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.647),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.68),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.693),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.736),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.781),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.802),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=4.995),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.201),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.258),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.363),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.442),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.473),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.485),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.512),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.543),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.613),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.744),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.774),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.809),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.849),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.911),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.92),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.962),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.981),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.146),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.213),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.221),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.243),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.55),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.62),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.699),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.725),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.751),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.856),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.029),
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.406),
|
||||||
|
)
|
||||||
cls.INSTR_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978)
|
cls.INSTR_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978)
|
||||||
cls.INSTR_FIELD_BIT_NUMS = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97)
|
cls.INSTR_DESC_BOLD_SUBSCRIPT = \
|
||||||
|
Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.978)
|
||||||
|
cls.INSTR_DESC_BOLD_ITALIC_SUBSCRIPT = \
|
||||||
|
Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=5.978)
|
||||||
cls.INSTR_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966)
|
cls.INSTR_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966)
|
||||||
cls.INSTR_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=7.97)
|
cls.INSTR_CODE = (
|
||||||
cls.INSTR_CODE_SYM = Font(font_name='RRFUNA+CMSY8', size=7.97)
|
Font(font_name='APUYSQ+zcoN-Regular', size=7.97),
|
||||||
cls.INSTR_CODE_NE_EQ_SIGN = Font(font_name='HPXOZC+CMSS8', size=7.97)
|
Font(font_name='RRFUNA+CMSY8', size=7.97),
|
||||||
|
Font(font_name='HPXOZC+CMSS8', size=7.97),
|
||||||
|
)
|
||||||
cls.INSTR_CODE_SUBSCRIPT = Font(font_name='APUYSQ+zcoN-Regular', size=5.978)
|
cls.INSTR_CODE_SUBSCRIPT = Font(font_name='APUYSQ+zcoN-Regular', size=5.978)
|
||||||
|
cls.TITLE_PAGE_BIG = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=24.787)
|
||||||
|
cls.TITLE_PAGE_VERSION = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=9.963)
|
||||||
|
cls.TITLE_PAGE_TM = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=6.974)
|
||||||
|
cls.TITLE_PAGE_REV = Font(font_name='MJBFWM+DejaVuSansCondensed', size=6.974)
|
||||||
|
cls.TITLE_PAGE_BOOK = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=20.663)
|
||||||
|
cls.LEGAL_PAGE_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=9.963)
|
||||||
|
cls.CHANGE_SUMMARY_PAGE_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=11.955)
|
||||||
|
cls.CHAPTER_TITLE = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=17.215)
|
||||||
|
cls.NOTATION_PAGE_SYM = (
|
||||||
|
Font(font_name='FZTIYT+CMMI9', size=8.966),
|
||||||
|
Font(font_name='ONUAYC+CMSSI9', size=8.966),
|
||||||
|
Font(font_name='TNGBFZ+CMSY9', size=8.966),
|
||||||
|
Font(font_name='ZJTMSG+CMSS9', size=8.966),
|
||||||
|
)
|
||||||
|
cls.NOTATION_PAGE_SUBSCRIPT_SYM = Font(font_name='DBQTKF+CMSY6', size=5.978)
|
||||||
|
cls.NOTATION_PAGE_SUBSCRIPT = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=5.978)
|
||||||
|
cls.MATH_MISC = (
|
||||||
|
Font(font_name='AAJMKT+CMMI6', size=5.978),
|
||||||
|
Font(font_name='CUTMFD+CMSSI8', size=5.978),
|
||||||
|
Font(font_name='CUTMFD+CMSSI8', size=7.97),
|
||||||
|
Font(font_name='FZTIYT+CMMI9', size=5.734),
|
||||||
|
Font(font_name='FZTIYT+CMMI9', size=7.168),
|
||||||
|
Font(font_name='HONFQS+CMMI8', size=7.97),
|
||||||
|
Font(font_name='HPXOZC+CMSS8', size=5.978),
|
||||||
|
Font(font_name='LLVRDD+CMSY10', size=11.955),
|
||||||
|
Font(font_name='ZJTMSG+CMSS9', size=7.168),
|
||||||
|
)
|
||||||
|
|
||||||
cls.__KNOWN_NAMES = {}
|
cls.__KNOWN_NAMES = {}
|
||||||
for name, value in cls.__dict__.items():
|
for name, value in cls.__dict__.items():
|
||||||
if name[0].isupper() and isinstance(value, cls):
|
if name[0].isupper():
|
||||||
|
if isinstance(value, cls):
|
||||||
assert value not in cls.__KNOWN_NAMES, f"duplicate known font: {value}"
|
assert value not in cls.__KNOWN_NAMES, f"duplicate known font: {value}"
|
||||||
cls.__KNOWN_NAMES[value] = name
|
cls.__KNOWN_NAMES[value] = name
|
||||||
|
elif isinstance(value, tuple) and all(isinstance(i, cls) for i in value):
|
||||||
|
for i, font in enumerate(value):
|
||||||
|
assert isinstance(font, cls)
|
||||||
|
assert font not in cls.__KNOWN_NAMES, f"duplicate known font: {font}"
|
||||||
|
cls.__KNOWN_NAMES[font] = f"{name}[{i}]"
|
||||||
|
|
||||||
old_repr = cls.__repr__
|
old_repr = cls.__repr__
|
||||||
def __repr__(self: cls) -> str:
|
def __repr__(self: cls) -> str:
|
||||||
|
@ -77,6 +221,10 @@ class Font:
|
||||||
|
|
||||||
del cls._register_known_fonts
|
del cls._register_known_fonts
|
||||||
|
|
||||||
|
for font in Font.known_fonts():
|
||||||
|
font.space_width # initialize
|
||||||
|
font.line_height # initialize
|
||||||
|
|
||||||
Font._register_known_fonts()
|
Font._register_known_fonts()
|
||||||
|
|
||||||
@dataclass(unsafe_hash=True, frozen=True)
|
@dataclass(unsafe_hash=True, frozen=True)
|
||||||
|
@ -103,17 +251,25 @@ class Char:
|
||||||
|
|
||||||
@dataclass()
class Parser:
    """Runs a ``PageParser`` over the pages of a PDF file."""

    def parse_pdf(self, file: str, page_numbers: Container[int] | None = None):
        """Parse the selected pages of ``file`` one page at a time.

        Args:
            file: PDF path handed to pdfminer's ``extract_pages``.
            page_numbers: forwarded unchanged to ``extract_pages``.

        Raises:
            Exception: whatever page parsing raises, re-raised with a
                ``pageid=...`` note naming the page that failed.
        """
        for page in extract_pages(file, page_numbers=page_numbers):
            try:
                page_parser = PageParser(parser=self, page_id=page.pageid)
                page_parser.parse_page(page)
            except Exception as exc:
                # Attach the page id so the traceback shows where parsing broke.
                exc.add_note(f"pageid={page.pageid}")
                raise
|
||||||
|
|
||||||
|
|
||||||
# Page-layout measurements used while locating instruction parts.
# NOTE(review): presumably the x coordinate separating the two page columns;
# no usage is visible in this excerpt -- confirm.
COLUMN_SPLIT_X = 300.0

# Offset below the last mnemonic line at which the "Prefix:" title is searched.
INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT = 29.938
INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT = 9.278
INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT = 20.971

# Offset below the last mnemonic line of the bit-field box's top line:
# the first value applies with a single mnemonic line, the second with several.
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 20.175
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2 = 14.694

INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317
INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT = 24.657 + 1.586
INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT = 32.927 + 2.1519216
INSTR_MNEMONIC_TOP_PAD_HEIGHT = 15.75
|
||||||
|
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class ParsedTextLine:
|
class ParsedTextLine:
|
||||||
|
@ -143,49 +299,69 @@ class ParsedTextLine:
|
||||||
|
|
||||||
_T = TypeVar("_T")
|
_T = TypeVar("_T")
|
||||||
|
|
||||||
|
class BaselinePos(enum.Enum):
    """Where a character sits vertically relative to its text line."""

    # Character's min_y is clearly higher than the line's regular min_y.
    ABOVE = "above"
    # Character sits on the line itself.
    BASELINE = "baseline"
    # Character's min_y is lower than the line's regular min_y.
    BELOW = "below"
|
||||||
|
|
||||||
@dataclass(unsafe_hash=True, frozen=True)
|
@dataclass(unsafe_hash=True, frozen=True)
|
||||||
class TextLineFonts:
|
class TextLineFonts:
|
||||||
regular: Font
|
regular: tuple[Font, ...]
|
||||||
italic: Font | None = None
|
italic: tuple[Font, ...] | None = None
|
||||||
bold: Font | None = None
|
bold: tuple[Font, ...] | None = None
|
||||||
bold_italic: Font | None = None
|
bold_italic: tuple[Font, ...] | None = None
|
||||||
|
subscript: tuple[Font, ...] | None = None
|
||||||
|
|
||||||
def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | Font:
|
def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | tuple[tuple[Font, ...], BaselinePos]:
|
||||||
match part_kind:
|
match part_kind:
|
||||||
case TextLineFontKind.REGULAR:
|
case TextLineFontKind.REGULAR:
|
||||||
retval = self.regular
|
font = self.regular
|
||||||
|
baseline_pos = BaselinePos.BASELINE
|
||||||
case TextLineFontKind.ITALIC:
|
case TextLineFontKind.ITALIC:
|
||||||
retval = self.italic
|
font = self.italic
|
||||||
|
baseline_pos = BaselinePos.BASELINE
|
||||||
case TextLineFontKind.BOLD:
|
case TextLineFontKind.BOLD:
|
||||||
retval = self.bold
|
font = self.bold
|
||||||
|
baseline_pos = BaselinePos.BASELINE
|
||||||
case TextLineFontKind.BOLD_ITALIC:
|
case TextLineFontKind.BOLD_ITALIC:
|
||||||
retval = self.bold_italic
|
font = self.bold_italic
|
||||||
|
baseline_pos = BaselinePos.BASELINE
|
||||||
|
case TextLineFontKind.SUBSCRIPT:
|
||||||
|
font = self.subscript
|
||||||
|
baseline_pos = BaselinePos.BELOW
|
||||||
|
case TextLineFontKind.SUPERSCRIPT:
|
||||||
|
font = self.subscript
|
||||||
|
baseline_pos = BaselinePos.ABOVE
|
||||||
case _:
|
case _:
|
||||||
assert_never(part_kind)
|
assert_never(part_kind)
|
||||||
if retval is None:
|
if font is None:
|
||||||
return default
|
return default
|
||||||
return retval
|
return font, baseline_pos
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def __font_to_kind_map(self) -> dict[Font, TextLineFontKind]:
|
def __font_to_kind_map(self) -> dict[tuple[Font, BaselinePos], TextLineFontKind]:
|
||||||
retval = {}
|
retval: dict[tuple[Font, BaselinePos], TextLineFontKind] = {}
|
||||||
for kind in TextLineFontKind:
|
for kind in TextLineFontKind:
|
||||||
font = self.get_font(kind)
|
fonts = self.get_font(kind)
|
||||||
if font is None:
|
if fonts is None:
|
||||||
continue
|
continue
|
||||||
|
fonts, baseline_pos = fonts
|
||||||
|
for font in fonts:
|
||||||
assert font not in retval, \
|
assert font not in retval, \
|
||||||
f"duplicate font: kind={kind} old_kind={retval[font]} font={font}"
|
f"duplicate font: kind={kind} old_kind={retval[font]} font={font}"
|
||||||
retval[font] = kind
|
retval[font, baseline_pos] = kind
|
||||||
return retval
|
return retval
|
||||||
|
|
||||||
def get_kind(self, font: Font, default: _T=None) -> _T | TextLineFontKind:
|
def get_kind(self, font: Font, baseline_pos: BaselinePos, default: _T=None) -> _T | TextLineFontKind:
|
||||||
return self.__font_to_kind_map.get(font, default)
|
return self.__font_to_kind_map.get((font, baseline_pos), default)
|
||||||
|
|
||||||
class TextLineFontKind(enum.Enum):
|
class TextLineFontKind(enum.Enum):
|
||||||
REGULAR = "regular"
|
REGULAR = "regular"
|
||||||
ITALIC = "italic"
|
ITALIC = "italic"
|
||||||
BOLD = "bold"
|
BOLD = "bold"
|
||||||
BOLD_ITALIC = "bold_italic"
|
BOLD_ITALIC = "bold_italic"
|
||||||
|
SUBSCRIPT = "subscript"
|
||||||
|
SUPERSCRIPT = "superscript"
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def text_line_tags(self) -> tuple[str, ...]:
|
def text_line_tags(self) -> tuple[str, ...]:
|
||||||
|
@ -198,12 +374,19 @@ class TextLineFontKind(enum.Enum):
|
||||||
return "b",
|
return "b",
|
||||||
case TextLineFontKind.BOLD_ITALIC:
|
case TextLineFontKind.BOLD_ITALIC:
|
||||||
return "b", "i"
|
return "b", "i"
|
||||||
|
case TextLineFontKind.SUBSCRIPT:
|
||||||
|
return "sub",
|
||||||
|
case TextLineFontKind.SUPERSCRIPT:
|
||||||
|
return "sup",
|
||||||
case _:
|
case _:
|
||||||
assert_never(self)
|
assert_never(self)
|
||||||
|
|
||||||
class PageParseFailed(Exception):
    """Exception type for a page-level parse failure."""
|
||||||
|
|
||||||
|
class InstrParseFailed(Exception):
    """Raised when an instruction's layout does not match what is expected.

    The page parser catches this around instruction extraction and prints the
    traceback instead of aborting.
    """
|
||||||
|
|
||||||
class ElementBodyBuilder:
|
class ElementBodyBuilder:
|
||||||
def __init__(self, containing_element: ElementTree.Element):
|
def __init__(self, containing_element: ElementTree.Element):
|
||||||
self.__containing_element = containing_element
|
self.__containing_element = containing_element
|
||||||
|
@ -266,8 +449,29 @@ class InstrBitField:
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
return f"<InstrBitField: x={self.box_min_x}..{self.box_max_x} name={self.name} start_bit={self.start_bit}>"
|
return f"<InstrBitField: x={self.box_min_x}..{self.box_max_x} name={self.name} start_bit={self.start_bit}>"
|
||||||
|
|
||||||
|
@dataclass(unsafe_hash=True, frozen=True)
class InstrBitFieldsPrefix:
    """Prefix part of an instruction's bit-field diagram.

    Holds the bounding box of the prefix's field boxes, the text line found
    before them, the parsed fields, and the text line stored after them.
    """

    box_min_x: float
    box_min_y: float
    box_max_x: float
    box_max_y: float
    prefix_text: ParsedTextLine
    fields: tuple[InstrBitField, ...]
    suffix_text: ParsedTextLine

    def __str__(self):
        """Render the prefix as a multi-line debug string, one field per line."""
        # NOTE(review): spacing inside these literals is copied from the
        # source as given -- confirm it matches the intended indentation.
        sep = ",\n "
        fields_text = sep.join(str(item) for item in self.fields)
        pieces = [
            f"<InstrBitFieldsPrefix: ({self.box_min_x},{self.box_min_y})..",
            f"({self.box_max_x},{self.box_max_y})\n",
            f" prefix_text={self.prefix_text}\n",
            " [\n",
            f" {fields_text},\n",
            " ]\n",
            f" suffix_text={self.suffix_text}>",
        ]
        return "".join(pieces)
|
||||||
|
|
||||||
@dataclass(unsafe_hash=True, frozen=True)
|
@dataclass(unsafe_hash=True, frozen=True)
|
||||||
class InstrBitFields:
|
class InstrBitFields:
|
||||||
|
prefix: None | InstrBitFieldsPrefix
|
||||||
box_min_x: float
|
box_min_x: float
|
||||||
box_min_y: float
|
box_min_y: float
|
||||||
box_max_x: float
|
box_max_x: float
|
||||||
|
@ -276,9 +480,20 @@ class InstrBitFields:
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
sep = ",\n "
|
sep = ",\n "
|
||||||
return (f"<InstrBitFields: ({self.box_min_x},{self.box_min_y}).."
|
prefix_str = ""
|
||||||
|
if self.prefix is not None:
|
||||||
|
prefix_str = f"{self.prefix}\n"
|
||||||
|
return (f"{prefix_str}<InstrBitFields: ({self.box_min_x},{self.box_min_y}).."
|
||||||
f"({self.box_max_x},{self.box_max_y}) [\n"
|
f"({self.box_max_x},{self.box_max_y}) [\n"
|
||||||
f" {sep.join(map(str, self.fields))}]>")
|
f" {sep.join(map(str, self.fields))},\n]>")
|
||||||
|
|
||||||
|
# Maps ligature code points to the plain letters they stand for; characters
# that are not keys here are meant to be kept unchanged by the lookup.
CHAR_TO_EXPANDED = {
    "\ufb00": "ff",   # LATIN SMALL LIGATURE FF
    "\ufb01": "fi",   # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",   # LATIN SMALL LIGATURE FL
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
}
|
||||||
|
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class PageParser:
|
class PageParser:
|
||||||
|
@ -315,16 +530,24 @@ class PageParser:
|
||||||
self.unprocessed_chars[char.font].add(char)
|
self.unprocessed_chars[char.font].add(char)
|
||||||
for i in self.unprocessed_chars.values():
|
for i in self.unprocessed_chars.values():
|
||||||
i.sort(key=Char.top_down_left_to_right_sort_key)
|
i.sort(key=Char.top_down_left_to_right_sort_key)
|
||||||
|
unknown_fonts=[]
|
||||||
|
unknown_font_errors=[]
|
||||||
for font, chars in self.unprocessed_chars.items():
|
for font, chars in self.unprocessed_chars.items():
|
||||||
print()
|
if font.known_name is None:
|
||||||
print(font)
|
|
||||||
text = ""
|
text = ""
|
||||||
char = None
|
char = None
|
||||||
for char in chars:
|
for char in chars:
|
||||||
text += char.text
|
text += char.text
|
||||||
print(repr(text))
|
unknown_fonts.append(repr(font) + ",")
|
||||||
assert font.known_name is not None, f"unknown font {font}\nlast char: {char}"
|
unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}")
|
||||||
|
unknown_fonts.sort()
|
||||||
|
if len(unknown_fonts) != 0:
|
||||||
|
raise AssertionError("\nunknown fonts:\n" + "\n".join(unknown_fonts)
|
||||||
|
+ "\n\n" + "\n".join(unknown_font_errors))
|
||||||
|
try:
|
||||||
self.extract_instrs()
|
self.extract_instrs()
|
||||||
|
except InstrParseFailed:
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
def extract_text_line(
|
def extract_text_line(
|
||||||
self, *,
|
self, *,
|
||||||
|
@ -334,39 +557,48 @@ class PageParser:
|
||||||
max_x: float,
|
max_x: float,
|
||||||
fonts: TextLineFonts,
|
fonts: TextLineFonts,
|
||||||
skip_initial_spaces=False,
|
skip_initial_spaces=False,
|
||||||
|
allowed_start_min_y_error=None,
|
||||||
) -> None | ParsedTextLine:
|
) -> None | ParsedTextLine:
|
||||||
chars: list[Char] = []
|
chars: list[Char] = []
|
||||||
|
chars_set: SetById[Char] = SetById()
|
||||||
if start_char is not None:
|
if start_char is not None:
|
||||||
chars.append(start_char)
|
chars.append(start_char)
|
||||||
self.unprocessed_chars[start_char.font].remove(start_char)
|
chars_set.add(start_char)
|
||||||
for x, y, char in self.qt.range(
|
for x, y, char in self.qt.range(
|
||||||
min_x=min_x,
|
min_x=min_x - fonts.regular[0].size * 0.5,
|
||||||
max_x=max_x,
|
max_x=max_x,
|
||||||
min_y=start_min_y - fonts.regular.size * 0.5,
|
min_y=start_min_y - fonts.regular[0].size * 0.4,
|
||||||
max_y=start_min_y + fonts.regular.size * 0.5,
|
max_y=start_min_y + fonts.regular[0].size * 0.6,
|
||||||
):
|
):
|
||||||
if not isinstance(char, Char):
|
if not isinstance(char, Char):
|
||||||
continue
|
continue
|
||||||
if char not in self.unprocessed_chars[char.font]:
|
if char not in self.unprocessed_chars[char.font] or char in chars_set:
|
||||||
continue
|
continue
|
||||||
self.unprocessed_chars[char.font].remove(char)
|
chars_set.add(char)
|
||||||
chars.append(char)
|
chars.append(char)
|
||||||
if len(chars) == 0:
|
if len(chars) == 0:
|
||||||
return None
|
return None
|
||||||
chars.sort(key=Char.top_down_left_to_right_sort_key)
|
chars.sort(key=lambda char: (char.min_x, char.text))
|
||||||
retval = ParsedTextLine(
|
retval = ParsedTextLine(
|
||||||
element=ElementTree.Element("text-line"),
|
element=ElementTree.Element("text-line"),
|
||||||
regular_min_y=chars[0].min_y,
|
regular_min_y=chars[0].min_y,
|
||||||
fonts=fonts,
|
fonts=fonts,
|
||||||
chars=chars,
|
chars=chars,
|
||||||
)
|
)
|
||||||
with ElementBodyBuilder(retval.element) as body_builder:
|
text_and_tag_stacks: list[tuple[str, tuple[str, ...]]] = []
|
||||||
last_max_x = min_x
|
last_max_x = min_x
|
||||||
last_kind = None
|
last_kind = None
|
||||||
|
last_char = None
|
||||||
for char in chars:
|
for char in chars:
|
||||||
kind = fonts.get_kind(char.font)
|
if char.min_y - retval.regular_min_y < -0.2:
|
||||||
|
baseline_pos = BaselinePos.BELOW
|
||||||
|
elif char.min_y - retval.regular_min_y > 1:
|
||||||
|
baseline_pos = BaselinePos.ABOVE
|
||||||
|
else:
|
||||||
|
baseline_pos = BaselinePos.BASELINE
|
||||||
|
kind = fonts.get_kind(font=char.font, baseline_pos=baseline_pos)
|
||||||
if kind is None:
|
if kind is None:
|
||||||
print(f"font kind is None:\nfonts={fonts}\nchar={char}")
|
print(f"font kind is None:\nfonts={fonts}\nchar={char}\nbaseline_pos={baseline_pos}")
|
||||||
return None
|
return None
|
||||||
if last_kind is None:
|
if last_kind is None:
|
||||||
space_kind = kind
|
space_kind = kind
|
||||||
|
@ -374,20 +606,36 @@ class PageParser:
|
||||||
space_kind = TextLineFontKind.REGULAR
|
space_kind = TextLineFontKind.REGULAR
|
||||||
else:
|
else:
|
||||||
space_kind = kind
|
space_kind = kind
|
||||||
space_font = fonts.get_font(space_kind, fonts.regular)
|
space_font, _ = fonts.get_font(space_kind, (fonts.regular, BaselinePos.BASELINE))
|
||||||
space_width = char.min_x - last_max_x
|
space_width = char.min_x - last_max_x
|
||||||
space_count_f = space_width / space_font.space_width
|
space_count_f = space_width / space_font[0].space_width
|
||||||
space_count = round(space_count_f)
|
space_count = round(space_count_f)
|
||||||
if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
|
if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
|
||||||
print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
|
print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
|
||||||
if space_count > 0 and not skip_initial_spaces:
|
if space_count > 0 and not skip_initial_spaces:
|
||||||
body_builder.set_tag_stack(space_kind.text_line_tags)
|
text_and_tag_stacks.append((" " * space_count, space_kind.text_line_tags))
|
||||||
body_builder.write_text(" " * space_count)
|
|
||||||
skip_initial_spaces = False
|
skip_initial_spaces = False
|
||||||
body_builder.set_tag_stack(kind.text_line_tags)
|
if (char.text == "\u0338" and last_char is not None and last_char.text == "=" and abs(char.min_x - last_char.min_x) < 0.01 and abs(char.min_y - last_char.min_y) < 0.01):
|
||||||
body_builder.write_text(char.text)
|
text_and_tag_stacks[-1] = "\u2260", ()
|
||||||
|
last_max_x = last_char.max_x
|
||||||
|
else:
|
||||||
|
char_text = CHAR_TO_EXPANDED.get(char.text, char.text)
|
||||||
|
text_and_tag_stacks.append((char_text, kind.text_line_tags))
|
||||||
last_max_x = char.max_x
|
last_max_x = char.max_x
|
||||||
last_kind = kind
|
last_kind = kind
|
||||||
|
last_char = char
|
||||||
|
with ElementBodyBuilder(retval.element) as body_builder:
|
||||||
|
for text, tag_stack in text_and_tag_stacks:
|
||||||
|
body_builder.set_tag_stack(tag_stack)
|
||||||
|
body_builder.write_text(text)
|
||||||
|
for char in chars:
|
||||||
|
self.unprocessed_chars[char.font].remove(char)
|
||||||
|
if allowed_start_min_y_error is None:
|
||||||
|
allowed_start_min_y_error = 0.01
|
||||||
|
assert abs(start_min_y - chars[0].min_y) < allowed_start_min_y_error, (
|
||||||
|
f"start_min_y={start_min_y} regular_min_y={chars[0].min_y}\n"
|
||||||
|
f"start_min_y error: {start_min_y - chars[0].min_y}\n"
|
||||||
|
f"allowed_start_min_y_error={allowed_start_min_y_error}")
|
||||||
return retval
|
return retval
|
||||||
|
|
||||||
def extract_following_text_lines(
|
def extract_following_text_lines(
|
||||||
|
@ -395,16 +643,18 @@ class PageParser:
|
||||||
first_text_line: ParsedTextLine,
|
first_text_line: ParsedTextLine,
|
||||||
min_x: float,
|
min_x: float,
|
||||||
max_x: float,
|
max_x: float,
|
||||||
|
allowed_start_min_y_error=None,
|
||||||
) -> list[ParsedTextLine]:
|
) -> list[ParsedTextLine]:
|
||||||
retval: list[ParsedTextLine] = []
|
retval: list[ParsedTextLine] = []
|
||||||
line = first_text_line
|
line = first_text_line
|
||||||
while line is not None:
|
while line is not None:
|
||||||
retval.append(line)
|
retval.append(line)
|
||||||
line = self.extract_text_line(
|
line = self.extract_text_line(
|
||||||
start_min_y=line.regular_min_y - first_text_line.fonts.regular.line_height,
|
start_min_y=line.regular_min_y - first_text_line.fonts.regular[0].line_height,
|
||||||
min_x=min_x,
|
min_x=min_x,
|
||||||
max_x=max_x,
|
max_x=max_x,
|
||||||
fonts=first_text_line.fonts,
|
fonts=first_text_line.fonts,
|
||||||
|
allowed_start_min_y_error=allowed_start_min_y_error,
|
||||||
)
|
)
|
||||||
return retval
|
return retval
|
||||||
|
|
||||||
|
@ -412,18 +662,112 @@ class PageParser:
|
||||||
self,
|
self,
|
||||||
min_x: float,
|
min_x: float,
|
||||||
max_x: float,
|
max_x: float,
|
||||||
last_mnemonic_line_min_y: float,
|
mnemonic_lines: list[ParsedTextLine],
|
||||||
|
) -> None | InstrBitFields:
|
||||||
|
found_non_affix_line = False
|
||||||
|
if len(mnemonic_lines) > 1:
|
||||||
|
expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y
|
||||||
|
- INSTR_BIT_FIELDS_TOP_PAD_HEIGHT2)
|
||||||
|
else:
|
||||||
|
expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y
|
||||||
|
- INSTR_BIT_FIELDS_TOP_PAD_HEIGHT)
|
||||||
|
for x, y, line in self.qt.range(
|
||||||
|
min_x=min_x - 5,
|
||||||
|
max_x=max_x + 5,
|
||||||
|
min_y=expected_non_affix_line_y - 5,
|
||||||
|
max_y=expected_non_affix_line_y + 5,
|
||||||
|
):
|
||||||
|
if not isinstance(line, LTLine):
|
||||||
|
continue
|
||||||
|
if line.width > line.height:
|
||||||
|
found_non_affix_line = True
|
||||||
|
break
|
||||||
|
if found_non_affix_line:
|
||||||
|
return self.extract_instr_bit_fields_box(
|
||||||
|
min_x=min_x,
|
||||||
|
max_x=max_x,
|
||||||
|
expected_box_max_y=expected_non_affix_line_y,
|
||||||
|
)
|
||||||
|
prefix_text = self.extract_text_line(
|
||||||
|
start_min_y=mnemonic_lines[-1].regular_min_y
|
||||||
|
- INSTR_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT,
|
||||||
|
min_x=min_x,
|
||||||
|
max_x=max_x,
|
||||||
|
fonts=TextLineFonts(
|
||||||
|
regular=(Font.INSTR_DESC_SMALL,),
|
||||||
|
bold=(Font.INSTR_DESC_SMALL_BOLD,),
|
||||||
|
),
|
||||||
|
allowed_start_min_y_error=2,
|
||||||
|
)
|
||||||
|
if prefix_text is None:
|
||||||
|
raise InstrParseFailed("can't find instr prefix bit fields title")
|
||||||
|
prefix_text_str = "".join(prefix_text.element.itertext())
|
||||||
|
if prefix_text_str != "Prefix:":
|
||||||
|
raise InstrParseFailed(
|
||||||
|
f"instr prefix bit fields title is not as expected: {prefix_text_str!r}")
|
||||||
|
prefix_bit_fields = self.extract_instr_bit_fields_box(
|
||||||
|
min_x=min_x,
|
||||||
|
max_x=max_x,
|
||||||
|
expected_box_max_y=prefix_text.regular_min_y
|
||||||
|
- INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT,
|
||||||
|
)
|
||||||
|
if prefix_bit_fields is None:
|
||||||
|
raise InstrParseFailed("can't find instr prefix bit fields")
|
||||||
|
suffix_text = self.extract_text_line(
|
||||||
|
start_min_y=prefix_bit_fields.box_min_y
|
||||||
|
- INSTR_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT,
|
||||||
|
min_x=min_x,
|
||||||
|
max_x=max_x,
|
||||||
|
fonts=TextLineFonts(
|
||||||
|
regular=(Font.INSTR_DESC_SMALL,),
|
||||||
|
bold=(Font.INSTR_DESC_SMALL_BOLD,),
|
||||||
|
),
|
||||||
|
allowed_start_min_y_error=2,
|
||||||
|
)
|
||||||
|
if suffix_text is None:
|
||||||
|
raise InstrParseFailed("can't find instr suffix bit fields title")
|
||||||
|
suffix_text_str = "".join(suffix_text.element.itertext())
|
||||||
|
if suffix_text_str != "Suffix:":
|
||||||
|
raise InstrParseFailed(
|
||||||
|
f"instr suffix bit fields title is not as expected: {suffix_text_str!r}")
|
||||||
|
suffix_bit_fields = self.extract_instr_bit_fields_box(
|
||||||
|
min_x=min_x,
|
||||||
|
max_x=max_x,
|
||||||
|
expected_box_max_y=suffix_text.regular_min_y
|
||||||
|
- INSTR_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT,
|
||||||
|
)
|
||||||
|
if suffix_bit_fields is None:
|
||||||
|
raise InstrParseFailed("can't find instr suffix bit fields")
|
||||||
|
return InstrBitFields(
|
||||||
|
prefix=InstrBitFieldsPrefix(
|
||||||
|
box_min_x=prefix_bit_fields.box_min_x,
|
||||||
|
box_min_y=prefix_bit_fields.box_min_y,
|
||||||
|
box_max_x=prefix_bit_fields.box_max_x,
|
||||||
|
box_max_y=prefix_bit_fields.box_max_y,
|
||||||
|
prefix_text=prefix_text,
|
||||||
|
fields=prefix_bit_fields.fields,
|
||||||
|
suffix_text=suffix_text,
|
||||||
|
),
|
||||||
|
box_min_x=suffix_bit_fields.box_min_x,
|
||||||
|
box_min_y=suffix_bit_fields.box_min_y,
|
||||||
|
box_max_x=suffix_bit_fields.box_max_x,
|
||||||
|
box_max_y=suffix_bit_fields.box_max_y,
|
||||||
|
fields=suffix_bit_fields.fields,
|
||||||
|
)
|
||||||
|
|
||||||
|
def extract_instr_bit_fields_box(
|
||||||
|
self,
|
||||||
|
min_x: float,
|
||||||
|
max_x: float,
|
||||||
|
expected_box_max_y: float,
|
||||||
) -> None | InstrBitFields:
|
) -> None | InstrBitFields:
|
||||||
h_lines: list[LTLine] = []
|
h_lines: list[LTLine] = []
|
||||||
v_lines: list[LTLine] = []
|
v_lines: list[LTLine] = []
|
||||||
for x, y, line in self.qt.range(
|
for x, y, line in self.qt.range(
|
||||||
min_x=min_x - 5,
|
min_x=min_x - 5,
|
||||||
max_x=max_x + 5,
|
max_x=max_x + 5,
|
||||||
min_y=last_mnemonic_line_min_y
|
min_y=expected_box_max_y - INSTR_BIT_FIELDS_BOX_HEIGHT - 5,
|
||||||
- INSTR_BIT_FIELDS_PADDED_HEIGHT
|
max_y=expected_box_max_y + 5,
|
||||||
+ INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT / 2,
|
|
||||||
max_y=last_mnemonic_line_min_y
|
|
||||||
- INSTR_BIT_FIELDS_TOP_PAD_HEIGHT / 2,
|
|
||||||
):
|
):
|
||||||
if not isinstance(line, LTLine):
|
if not isinstance(line, LTLine):
|
||||||
continue
|
continue
|
||||||
|
@ -439,10 +783,10 @@ class PageParser:
|
||||||
if len(h_lines) == 0 and len(v_lines) == 0:
|
if len(h_lines) == 0 and len(v_lines) == 0:
|
||||||
return None
|
return None
|
||||||
if len(h_lines) != 2:
|
if len(h_lines) != 2:
|
||||||
raise PageParseFailed(
|
raise InstrParseFailed(
|
||||||
f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}")
|
f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}")
|
||||||
if len(v_lines) < 2:
|
if len(v_lines) < 2:
|
||||||
raise PageParseFailed(
|
raise InstrParseFailed(
|
||||||
f"instruction bit fields box has too few vertical lines:\n{h_lines}")
|
f"instruction bit fields box has too few vertical lines:\n{h_lines}")
|
||||||
bottom_line, top_line = h_lines
|
bottom_line, top_line = h_lines
|
||||||
box_min_x = v_lines[0].x0
|
box_min_x = v_lines[0].x0
|
||||||
|
@ -463,7 +807,7 @@ class PageParser:
|
||||||
box_min_x=field_box_min_x,
|
box_min_x=field_box_min_x,
|
||||||
box_max_x=field_box_max_x,
|
box_max_x=field_box_max_x,
|
||||||
name=self.extract_text_line(
|
name=self.extract_text_line(
|
||||||
start_min_y=box_mid_y + 3,
|
start_min_y=box_mid_y + 3.288,
|
||||||
min_x=field_box_min_x,
|
min_x=field_box_min_x,
|
||||||
max_x=field_box_max_x,
|
max_x=field_box_max_x,
|
||||||
fonts=TextLineFonts(
|
fonts=TextLineFonts(
|
||||||
|
@ -472,16 +816,17 @@ class PageParser:
|
||||||
skip_initial_spaces=True,
|
skip_initial_spaces=True,
|
||||||
),
|
),
|
||||||
start_bit=self.extract_text_line(
|
start_bit=self.extract_text_line(
|
||||||
start_min_y=box_min_y + 3,
|
start_min_y=box_min_y + 3.487,
|
||||||
min_x=field_box_min_x,
|
min_x=field_box_min_x,
|
||||||
max_x=field_box_max_x,
|
max_x=field_box_max_x,
|
||||||
fonts=TextLineFonts(
|
fonts=TextLineFonts(
|
||||||
regular=Font.INSTR_FIELD_BIT_NUMS,
|
regular=(Font.INSTR_DESC_SMALL,),
|
||||||
),
|
),
|
||||||
skip_initial_spaces=True,
|
skip_initial_spaces=True,
|
||||||
),
|
),
|
||||||
))
|
))
|
||||||
return InstrBitFields(
|
return InstrBitFields(
|
||||||
|
prefix=None,
|
||||||
box_min_x=box_min_x,
|
box_min_x=box_min_x,
|
||||||
box_min_y=box_min_y,
|
box_min_y=box_min_y,
|
||||||
box_max_x=box_max_x,
|
box_max_x=box_max_x,
|
||||||
|
@ -501,7 +846,7 @@ class PageParser:
|
||||||
start_min_y=header_start_char.min_y,
|
start_min_y=header_start_char.min_y,
|
||||||
min_x=column_min_x,
|
min_x=column_min_x,
|
||||||
max_x=column_max_x,
|
max_x=column_max_x,
|
||||||
fonts=TextLineFonts(regular=Font.INSTR_HEADER),
|
fonts=TextLineFonts(regular=(Font.INSTR_HEADER,)),
|
||||||
)
|
)
|
||||||
if header_line is None:
|
if header_line is None:
|
||||||
raise PageParseFailed("can't find header text line")
|
raise PageParseFailed("can't find header text line")
|
||||||
|
@ -509,20 +854,22 @@ class PageParser:
|
||||||
first_text_line=header_line,
|
first_text_line=header_line,
|
||||||
min_x=column_min_x,
|
min_x=column_min_x,
|
||||||
max_x=column_max_x,
|
max_x=column_max_x,
|
||||||
|
allowed_start_min_y_error=1.5,
|
||||||
)
|
)
|
||||||
print("instr header lines:")
|
print("instr header lines:")
|
||||||
print("\n".join(map(str, header_lines)))
|
print("\n".join(map(str, header_lines)))
|
||||||
mnemonic_line = self.extract_text_line(
|
mnemonic_line = self.extract_text_line(
|
||||||
start_min_y=header_lines[-1].regular_min_y - 18.788,
|
start_min_y=header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT,
|
||||||
min_x=column_min_x,
|
min_x=column_min_x,
|
||||||
max_x=column_max_x,
|
max_x=column_max_x,
|
||||||
fonts=TextLineFonts(
|
fonts=TextLineFonts(
|
||||||
regular=Font.INSTR_DESC,
|
regular=Font.INSTR_DESC,
|
||||||
),
|
),
|
||||||
skip_initial_spaces=True,
|
skip_initial_spaces=True,
|
||||||
|
allowed_start_min_y_error=3,
|
||||||
)
|
)
|
||||||
if mnemonic_line is None:
|
if mnemonic_line is None:
|
||||||
raise PageParseFailed("can't find instr mnemonic text line")
|
raise InstrParseFailed("can't find instr mnemonic text line")
|
||||||
mnemonic_lines = self.extract_following_text_lines(
|
mnemonic_lines = self.extract_following_text_lines(
|
||||||
first_text_line=mnemonic_line,
|
first_text_line=mnemonic_line,
|
||||||
min_x=mnemonic_line.chars[0].min_x,
|
min_x=mnemonic_line.chars[0].min_x,
|
||||||
|
@ -533,9 +880,94 @@ class PageParser:
|
||||||
instr_bit_fields = self.extract_instr_bit_fields(
|
instr_bit_fields = self.extract_instr_bit_fields(
|
||||||
min_x=column_min_x,
|
min_x=column_min_x,
|
||||||
max_x=column_max_x,
|
max_x=column_max_x,
|
||||||
last_mnemonic_line_min_y=mnemonic_lines[-1].regular_min_y,
|
mnemonic_lines=mnemonic_lines,
|
||||||
)
|
)
|
||||||
print(instr_bit_fields)
|
print(instr_bit_fields)
|
||||||
|
if instr_bit_fields is None:
|
||||||
|
raise InstrParseFailed("can't find instr bit fields")
|
||||||
|
alt_header_line = self.extract_text_line(
|
||||||
|
start_min_y=instr_bit_fields.box_min_y
|
||||||
|
- INSTR_BIT_FIELDS_BOTTOM_TO_HEADER_TEXT_HEIGHT,
|
||||||
|
min_x=column_min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
fonts=TextLineFonts(
|
||||||
|
regular=(Font.INSTR_HEADER,),
|
||||||
|
),
|
||||||
|
skip_initial_spaces=True,
|
||||||
|
allowed_start_min_y_error=6,
|
||||||
|
)
|
||||||
|
if alt_header_line is not None:
|
||||||
|
print(f"found alt header line:\n{alt_header_line}")
|
||||||
|
alt_header_lines = self.extract_following_text_lines(
|
||||||
|
first_text_line=alt_header_line,
|
||||||
|
min_x=column_min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
allowed_start_min_y_error=1.5,
|
||||||
|
)
|
||||||
|
print("instr alt header lines:")
|
||||||
|
print("\n".join(map(str, alt_header_lines)))
|
||||||
|
alt_mnemonic_line = self.extract_text_line(
|
||||||
|
start_min_y=alt_header_lines[-1].regular_min_y - INSTR_MNEMONIC_TOP_PAD_HEIGHT,
|
||||||
|
min_x=column_min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
fonts=TextLineFonts(
|
||||||
|
regular=Font.INSTR_DESC,
|
||||||
|
),
|
||||||
|
skip_initial_spaces=True,
|
||||||
|
allowed_start_min_y_error=1.5,
|
||||||
|
)
|
||||||
|
if alt_mnemonic_line is None:
|
||||||
|
raise InstrParseFailed("can't find instr alt mnemonic text line")
|
||||||
|
alt_mnemonic_lines = self.extract_following_text_lines(
|
||||||
|
first_text_line=alt_mnemonic_line,
|
||||||
|
min_x=alt_mnemonic_line.chars[0].min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
)
|
||||||
|
print("instr alt mnemonic lines:")
|
||||||
|
print("\n".join(map(str, alt_mnemonic_lines)))
|
||||||
|
alt_instr_bit_fields = self.extract_instr_bit_fields(
|
||||||
|
min_x=column_min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
mnemonic_lines=alt_mnemonic_lines,
|
||||||
|
)
|
||||||
|
print(alt_instr_bit_fields)
|
||||||
|
if alt_instr_bit_fields is None:
|
||||||
|
raise InstrParseFailed("can't find instr alt bit fields")
|
||||||
|
last_instr_bit_fields = alt_instr_bit_fields
|
||||||
|
else:
|
||||||
|
print("no alt header line")
|
||||||
|
alt_header_lines = None
|
||||||
|
alt_mnemonic_lines = None
|
||||||
|
alt_instr_bit_fields = None
|
||||||
|
last_instr_bit_fields = instr_bit_fields
|
||||||
|
|
||||||
|
code_line = None
|
||||||
|
for y_offset in reversed(range(4)):
|
||||||
|
code_line = self.extract_text_line(
|
||||||
|
start_min_y=last_instr_bit_fields.box_min_y
|
||||||
|
- INSTR_BIT_FIELDS_BOTTOM_TO_CODE_TEXT_HEIGHT
|
||||||
|
+ y_offset * 0.5 * Font.INSTR_CODE[0].line_height,
|
||||||
|
min_x=column_min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
fonts=TextLineFonts(
|
||||||
|
regular=Font.INSTR_CODE,
|
||||||
|
subscript=(Font.INSTR_CODE_SUBSCRIPT,),
|
||||||
|
),
|
||||||
|
skip_initial_spaces=True,
|
||||||
|
allowed_start_min_y_error=1,
|
||||||
|
)
|
||||||
|
if code_line is not None:
|
||||||
|
break
|
||||||
|
if code_line is None:
|
||||||
|
raise InstrParseFailed("can't find instr code text line")
|
||||||
|
code_lines = self.extract_following_text_lines(
|
||||||
|
first_text_line=code_line,
|
||||||
|
min_x=code_line.chars[0].min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
allowed_start_min_y_error=0.05,
|
||||||
|
)
|
||||||
|
print("instr code lines:")
|
||||||
|
print("\n".join(map(str, code_lines)))
|
||||||
# TODO: finish
|
# TODO: finish
|
||||||
|
|
||||||
def extract_instrs(self):
|
def extract_instrs(self):
|
||||||
|
@ -544,4 +976,4 @@ class PageParser:
|
||||||
self.extract_instr(next(iter(unprocessed_header_chars)))
|
self.extract_instr(next(iter(unprocessed_header_chars)))
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
Parser().parse_pdf(sys.argv[1], page_numbers=range(76, 78))
|
Parser().parse_pdf(sys.argv[1], page_numbers=range(1495))
|
Loading…
Reference in a new issue