1937 lines
78 KiB
Python
Executable file
1937 lines
78 KiB
Python
Executable file
from __future__ import annotations
|
|
from collections import defaultdict
|
|
from collections.abc import Generator, Iterable, Iterator, Callable
|
|
from contextlib import contextmanager
|
|
from dataclasses import dataclass, field
|
|
import dataclasses
|
|
from functools import cached_property
|
|
import sys
|
|
from typing import ClassVar, TypeVar, assert_never
|
|
from xml.etree import ElementTree
|
|
import enum
|
|
import traceback
|
|
from copy import deepcopy
|
|
from pathlib import Path
|
|
|
|
from pdfminer.high_level import extract_pages
|
|
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
|
|
|
|
from parse_powerisa_pdf.quad_tree import QuadTree
|
|
from parse_powerisa_pdf.set_by_id import SetById
|
|
|
|
@dataclass(unsafe_hash=True, frozen=True)
class Font:
    """A font as it appears in the PDF: the embedded font name plus a size.

    Instances compare and hash by ``(font_name, size)``.  After
    ``_register_known_fonts()`` has run, the class also carries one class
    attribute per recognised font (or tuple of fonts) and a registry that maps
    each recognised font to the name of the attribute it was declared under.
    """

    font_name: str
    size: float
    # Registry filled in by `_register_known_fonts`: font -> attribute label,
    # e.g. "INSN_HEADER" or "INSN_DESC[0]".
    __KNOWN_NAMES: ClassVar[dict[Font, str]]

    @cached_property
    def space_width(self) -> float:
        """Width of a space: 3.985 scaled by this size over ``INSN_CODE[0]``'s size."""
        reference_size = Font.INSN_CODE[0].size
        return 3.985 * self.size / reference_size

    @cached_property
    def line_height(self) -> float:
        """Line height for this font, scaled linearly with its size.

        Fonts sharing a name with any instruction-code font scale from 9.464
        at the size of ``INSN_CODE[0]``; description-style and math fonts
        scale from 10.959 at the size of ``INSN_DESC[0]``.

        Raises:
            AssertionError: if the font matches none of the known families.
        """
        name = self.font_name
        if any(name == code_font.font_name for code_font in Font.insn_code_fonts()):
            return 9.464 * self.size / Font.INSN_CODE[0].size
        styled_desc_names = (
            Font.INSN_DESC_BOLD.font_name,
            Font.INSN_DESC_ITALIC.font_name,
            Font.INSN_DESC_BOLD_ITALIC.font_name,
        )
        uses_desc_metrics = (
            name in styled_desc_names
            or self in Font.INSN_DESC
            or name == Font.INSN_DESC[0].font_name
            or self in Font.MATH_MISC
        )
        if uses_desc_metrics:
            return 10.959 * self.size / Font.INSN_DESC[0].size
        raise AssertionError(f"no line height: {self}")

    @classmethod
    def insn_code_fonts(cls) -> Iterator[Font]:
        """Yield the fonts of ``INSN_CODE`` followed by ``INSN_CODE_SUBSCRIPT``."""
        for group in (cls.INSN_CODE, cls.INSN_CODE_SUBSCRIPT):
            yield from group

    @classmethod
    def known_fonts(cls) -> Iterator[Font]:
        """Iterate over every registered font, in registration order."""
        return iter(cls.__KNOWN_NAMES)

    @property
    def known_name(self) -> None | str:
        """The label this font was registered under, or ``None`` if unknown."""
        registry = self.__KNOWN_NAMES
        return registry.get(self)

    @classmethod
    def _register_known_fonts(cls) -> None:
        """Define the named font constants and build the known-font registry.

        Runs once: the method removes itself from the class before returning.
        It also replaces ``__repr__`` so registered fonts show their label, and
        evaluates ``space_width`` / ``line_height`` for every registered font
        so a font without a usable line height fails immediately.
        """
        sans = 'MJBFWM+DejaVuSansCondensed'
        sans_bold = 'NHUPPK+DejaVuSansCondensed-Bold'
        sans_oblique = 'CGMSHV+DejaVuSansCondensed-Oblique'
        sans_bold_oblique = 'YDJYQV+DejaVuSansCondensed-BoldOblique'
        mono = 'APUYSQ+zcoN-Regular'

        def sized(name, sizes):
            # one Font per size, all sharing the same font name
            return tuple(Font(font_name=name, size=s) for s in sizes)

        cls.INSN_HEADER = Font(font_name=sans_bold_oblique, size=9.963)
        cls.RTL_FN_HEADER = Font(font_name=mono, size=9.963)
        cls.PAGE_HEADER = Font(font_name=sans, size=9.963)
        cls.PAGE_FOOTER = Font(font_name=sans, size=4.981)
        cls.INSN_DESC = tuple(
            Font(font_name=name, size=8.966)
            for name in (
                sans,
                'FZTIYT+CMMI9',
                'ONUAYC+CMSSI9',
                'TNGBFZ+CMSY9',
                'WHMZPU+CMEX9',
                'ZJTMSG+CMSS9',
            )
        )
        cls.INSN_DESC_MISC = sized(sans, [
            2.377, 2.561, 4.492, 4.641, 4.772, 4.864, 4.925,
            5.097, 5.123, 5.131, 5.516, 5.604, 5.634, 5.906,
            6.033, 6.068, 6.213, 6.252, 6.962, 7.977,
        ])
        cls.INSN_DESC_CODE = Font(font_name=mono, size=6.974)
        cls.INSN_DESC_CODE_MISC = sized(mono, [3.587, 4.483])
        cls.INSN_DESC_ITALIC = Font(font_name=sans_oblique, size=8.966)
        cls.INSN_DESC_BOLD = Font(font_name=sans_bold, size=8.966)
        cls.INSN_DESC_BOLD_ITALIC = Font(font_name=sans_bold_oblique, size=8.966)
        cls.INSN_DESC_SMALL = Font(font_name=sans, size=7.97)
        cls.INSN_DESC_SMALL_ITALIC = Font(font_name=sans_oblique, size=7.97)
        cls.INSN_DESC_SMALL_BOLD = Font(font_name=sans_bold, size=7.97)
        cls.INSN_DESC_SMALL_BOLD_ITALIC = Font(font_name=sans_bold_oblique, size=7.97)
        cls.INSN_DESC_BOLD_MISC = sized(sans_bold, [
            2.21, 2.399, 2.763, 2.946, 2.949, 2.999,
            3.065, 3.086, 3.183, 3.686, 3.744, 3.825, 3.842, 3.857, 3.979,
            4.032, 4.112, 4.161, 4.206, 4.353, 4.378, 4.434,
            4.595, 4.619, 4.647, 4.68, 4.693, 4.736, 4.781, 4.802, 4.995,
            5.201, 5.258, 5.363, 5.442, 5.473, 5.485,
            5.512, 5.543, 5.613, 5.744, 5.774, 5.809, 5.849, 5.911, 5.92, 5.962, 5.981,
            6.146, 6.213, 6.221, 6.243, 6.55, 6.62, 6.699, 6.725, 6.751, 6.856,
            8.029, 8.406,
        ])
        cls.INSN_DESC_SUBSCRIPT = Font(font_name=sans, size=5.978)
        cls.INSN_DESC_BOLD_SUBSCRIPT = Font(font_name=sans_bold, size=5.978)
        cls.INSN_DESC_ITALIC_SUBSCRIPT = Font(font_name=sans_oblique, size=5.978)
        cls.INSN_DESC_BOLD_ITALIC_SUBSCRIPT = Font(font_name=sans_bold_oblique, size=5.978)
        cls.INSN_EXT_MNEMONIC = Font(font_name=mono, size=8.966)
        cls.INSN_CODE = (
            Font(font_name=mono, size=7.97),
            Font(font_name='RRFUNA+CMSY8', size=7.97),
            Font(font_name='HPXOZC+CMSS8', size=7.97),
        )
        cls.INSN_CODE_SUBSCRIPT = (
            Font(font_name=mono, size=5.978),
            Font(font_name='DBQTKF+CMSY6', size=5.978),
        )
        cls.TITLE_PAGE_BIG = Font(font_name=sans_bold, size=24.787)
        cls.TITLE_PAGE_VERSION = Font(font_name=sans_bold, size=9.963)
        cls.TITLE_PAGE_TM = Font(font_name=sans_bold, size=6.974)
        cls.TITLE_PAGE_REV = Font(font_name=sans, size=6.974)
        cls.TITLE_PAGE_BOOK = Font(font_name=sans_bold, size=20.663)
        cls.LEGAL_PAGE_ITALIC = Font(font_name=sans_oblique, size=9.963)
        cls.CHANGE_SUMMARY_PAGE_BOLD = Font(font_name=sans_bold, size=11.955)
        cls.CHAPTER_TITLE = Font(font_name=sans_bold, size=17.215)
        cls.MATH_MISC = (
            Font(font_name='AAJMKT+CMMI6', size=5.978),
            Font(font_name='CUTMFD+CMSSI8', size=5.978),
            Font(font_name='CUTMFD+CMSSI8', size=7.97),
            Font(font_name='FZTIYT+CMMI9', size=5.734),
            Font(font_name='FZTIYT+CMMI9', size=7.168),
            Font(font_name='HONFQS+CMMI8', size=7.97),
            Font(font_name='HPXOZC+CMSS8', size=5.978),
            Font(font_name='LLVRDD+CMSY10', size=11.955),
            Font(font_name='ZJTMSG+CMSS9', size=7.168),
        )

        # Every upper-case class attribute holding a Font (or a tuple made
        # only of Fonts) is a known font; remember which attribute named it.
        registry = {}
        cls.__KNOWN_NAMES = registry
        for attr_name, attr_value in vars(cls).items():
            if not attr_name[0].isupper():
                continue
            if isinstance(attr_value, cls):
                labelled = [(attr_value, attr_name)]
            elif isinstance(attr_value, tuple) and all(isinstance(item, cls) for item in attr_value):
                labelled = [
                    (item, f"{attr_name}[{index}]")
                    for index, item in enumerate(attr_value)
                ]
            else:
                continue
            for known_font, label in labelled:
                assert known_font not in registry, f"duplicate known font: {known_font}"
                registry[known_font] = label

        # Wrap the dataclass-generated repr so known fonts also show their label.
        old_repr = cls.__repr__

        def __repr__(self: cls) -> str:
            plain = old_repr(self)
            label = self.known_name
            if label is None:
                return plain
            return f"<{self.__class__.__name__}.{label}: {plain}>"

        cls.__repr__ = __repr__

        # one-shot initializer: remove it so it cannot be run twice
        del cls._register_known_fonts

        for font in Font.known_fonts():
            font.space_width  # initialize
            font.line_height  # initialize
|
|
|
|
# Define Font's named constants and fill its known-font registry.  The
# classmethod deletes itself from the class while running, so this is the
# only call that can ever be made.
Font._register_known_fonts()
|
|
|
|
@dataclass(unsafe_hash=True, frozen=True)
class Char:
    """A single character with its font, advance and bounding box.

    ``min_x``/``min_y``/``max_x``/``max_y`` are the corners of the bounding
    box; `Page.from_lt_page` fills them from a pdfminer ``LTChar``'s
    ``x0``/``y0``/``x1``/``y1``.
    """

    font: Font
    text: str
    adv: float
    min_x: float
    min_y: float
    max_x: float
    max_y: float

    def top_down_left_to_right_sort_key(self):
        """Sort key ordering chars by descending ``min_y``, then ascending ``min_x``."""
        descending_y = -self.min_y
        return descending_y, self.min_x

    @property
    def width(self) -> float:
        """Horizontal extent of the bounding box (``max_x - min_x``)."""
        left, right = self.min_x, self.max_x
        return right - left

    @property
    def height(self) -> float:
        """Vertical extent of the bounding box (``max_y - min_y``)."""
        bottom, top = self.min_y, self.max_y
        return top - bottom
|
|
|
|
|
|
# --- Page geometry -----------------------------------------------------------
# NOTE(review): all values below look like PDF page coordinates/lengths
# (points, y growing upwards) — confirm against the source PDF.

# x boundary between the two columns: max_x of TextSection.left_column and
# min_x of TextSection.right_column.
COLUMN_SPLIT_X: float = 300.0
# Default bounds of the page body used by the TextSection factory methods.
PAGE_BODY_MAX_X: float = 600.0
PAGE_BODY_MIN_X: float = 50
PAGE_BODY_MAX_Y: float = 780.0
PAGE_BODY_MIN_Y: float = 45.0

# presumably the y below a one-line / two-line page title — verify at use sites
ONE_TITLE_LINE_SPLIT_Y: float = 734.0
TWO_TITLE_LINES_SPLIT_Y: float = 715.0

# --- Instruction bit-field diagram metrics -----------------------------------
# (names suggest vertical distances around the bit-field boxes; the code that
# consumes them is not in this part of the file)
INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT: float = 29.938
INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT: float = 9.278
INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT: float = 20.971
INSN_BIT_FIELDS_TOP_PAD_HEIGHT: float = 20.175
INSN_BIT_FIELDS_TOP_PAD_HEIGHT2: float = 14.694
INSN_BIT_FIELDS_BOX_HEIGHT: float = 22.317

# --- "Special Registers Altered" table metrics -------------------------------
# (presumably x offsets of the table's columns — TODO confirm)
INSN_SP_REGS_ALTERED_REGISTER_COLUMN_X: float = 34.405
INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X: float = 86.692
INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X: float = 188.74
|
|
|
|
@dataclass()
class ParsedTextLine:
    """One parsed text line: its content as an XML element plus layout data.

    ``element`` holds the line's text and inline markup; only its ``text``
    and children are ever copied out by `write_xml`.  ``preceding_blank_lines``
    is emitted as that many ``<br>`` elements (or newlines in ``str()``)
    before the line's content.
    """

    element: ElementTree.Element
    regular_min_y: float
    regular_max_y: float
    fonts: TextLineFonts
    chars: list[Char]
    preceding_blank_lines: int

    @property
    def regular_height(self) -> float:
        """Return ``regular_max_y - regular_min_y``."""
        low, high = self.regular_min_y, self.regular_max_y
        return high - low

    def get_header_text(self) -> None | str:
        """Return the line's text if the line is a bold "Title:" header.

        The line qualifies when it consists of exactly one childless ``<b>``
        element with no non-whitespace text around it, and the text ends with
        ``":"`` and starts with a title-case character.  Otherwise ``None``.
        Only valid for lines parsed with ``TextLineFonts.INSN_DESC_FONTS``.
        """
        assert self.fonts == TextLineFonts.INSN_DESC_FONTS
        root = self.element
        has_loose_text = (
            (root.text or "").strip() != ""
            or (root.tail or "").strip() != ""
        )
        if has_loose_text or len(root) != 1:
            return None
        only_child = root[0]
        if only_child.tag != "b" or len(only_child) != 0:
            return None
        text = "".join(root.itertext())
        if text.endswith(":") and text[0].istitle():
            return text
        return None

    def __repr__(self) -> str:
        """Multi-line repr; ``element`` is shown as serialized XML."""
        # sentinel distinguishing "attribute missing" from any real value
        missing = object()
        parts = []
        for spec in dataclasses.fields(self):
            if spec.name == "element":
                rendered = ElementTree.tostring(self.element, encoding="unicode")
            else:
                value = getattr(self, spec.name, missing)
                rendered = "<unset>" if value is missing else repr(value)
            parts.append(spec.name + "=" + rendered)
        sep = ",\n "
        return f"{__class__.__name__}({sep.join(parts)})"

    def __str__(self) -> str:
        """Blank-line newlines followed by the serialized ``element``."""
        xml = ElementTree.tostring(self.element, encoding="unicode")
        return "\n" * self.preceding_blank_lines + xml

    def write_xml(self, parent: ElementTree.Element, trailing_nl: bool):
        """Append this line's content to ``parent``.

        Emits one ``<br>`` (with a newline tail) per preceding blank line,
        then the element's leading text and deep copies of its children, and
        finally another ``<br>`` when ``trailing_nl`` is true.
        """
        for _ in range(self.preceding_blank_lines):
            ElementTree.SubElement(parent, "br").tail = "\n"
        leading_text = self.element.text
        if leading_text is not None:
            # text goes after the last existing child, or into parent.text
            # when parent has no children yet
            if len(parent) == 0:
                parent.text = (parent.text or "") + leading_text
            else:
                last_child = parent[-1]
                last_child.tail = (last_child.tail or "") + leading_text
        for child in self.element:
            parent.append(deepcopy(child))
        if trailing_nl:
            ElementTree.SubElement(parent, "br").tail = "\n"

    @staticmethod
    def write_xml_lines(
        lines: Iterable[ParsedTextLine],
        parent: ElementTree.Element,
        trailing_nl: bool,
        preceding_nl: bool=False,
    ):
        """Write ``lines`` into ``parent``, separated by ``<br>`` elements.

        ``preceding_nl`` / ``trailing_nl`` add one extra ``<br>`` before the
        first / after the last line respectively.
        """
        if preceding_nl:
            ElementTree.SubElement(parent, "br").tail = "\n"
        for index, line in enumerate(lines):
            if index > 0:
                ElementTree.SubElement(parent, "br").tail = "\n"
            line.write_xml(parent, trailing_nl=False)
        if trailing_nl:
            ElementTree.SubElement(parent, "br").tail = "\n"
|
|
|
|
|
|
# Type of the caller-supplied `default` returned by the lookup helpers in this
# file (e.g. TextLineFonts.get_font, TextLineFonts.get_kind, Pages.get).
_T = TypeVar("_T")
|
|
|
|
@enum.unique
class BaselinePos(enum.Enum):
    """Position relative to the baseline.

    `FontVariantSubSuper.baseline_pos` maps superscripts to ``ABOVE`` and
    subscripts to ``BELOW``.
    """

    ABOVE = "above"
    BELOW = "below"
|
|
|
|
@dataclass(unsafe_hash=True, frozen=True)
|
|
class TextLineFonts:
|
|
regular: tuple[Font, ...]
|
|
italic: tuple[Font, ...] | None = None
|
|
bold: tuple[Font, ...] | None = None
|
|
bold_italic: tuple[Font, ...] | None = None
|
|
subscript: tuple[Font, ...] | None = None
|
|
bold_subscript: tuple[Font, ...] | None = None
|
|
italic_subscript: tuple[Font, ...] | None = None
|
|
bold_italic_subscript: tuple[Font, ...] | None = None
|
|
code: tuple[Font, ...] | None = None
|
|
code_subscript: tuple[Font, ...] | None = None
|
|
|
|
@classmethod
|
|
def _define_fonts(cls):
|
|
cls.INSN_MNEMONIC_FONTS = cls(
|
|
regular=Font.INSN_DESC,
|
|
)
|
|
cls.INSN_HEADER_FONTS = cls(
|
|
regular=(Font.INSN_HEADER,),
|
|
)
|
|
cls.INSN_BIT_FIELD_BIT_NUMBER_FONTS = cls(
|
|
regular=(Font.INSN_DESC_SMALL, Font.TITLE_PAGE_REV),
|
|
)
|
|
cls.INSN_BIT_FIELD_NAME_FONTS = cls(
|
|
regular=Font.INSN_DESC,
|
|
subscript=(Font.INSN_DESC_SUBSCRIPT,),
|
|
)
|
|
cls.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS = cls(
|
|
regular=(Font.INSN_DESC_SMALL,),
|
|
bold=(Font.INSN_DESC_SMALL_BOLD,),
|
|
)
|
|
cls.INSN_CODE_FONTS = cls(
|
|
regular=Font.INSN_CODE,
|
|
subscript=Font.INSN_CODE_SUBSCRIPT,
|
|
)
|
|
cls.INSN_DESC_FONTS = cls(
|
|
regular=(*Font.INSN_DESC, Font.INSN_DESC_SMALL),
|
|
bold=(Font.INSN_DESC_BOLD, Font.INSN_DESC_SMALL_BOLD),
|
|
italic=(Font.INSN_DESC_ITALIC, Font.INSN_DESC_SMALL_ITALIC),
|
|
bold_italic=(Font.INSN_DESC_BOLD_ITALIC, Font.INSN_DESC_SMALL_BOLD_ITALIC),
|
|
subscript=(Font.INSN_DESC_SUBSCRIPT,),
|
|
bold_subscript=(Font.INSN_DESC_BOLD_SUBSCRIPT,),
|
|
italic_subscript=(Font.INSN_DESC_ITALIC_SUBSCRIPT,),
|
|
bold_italic_subscript=(Font.INSN_DESC_BOLD_ITALIC_SUBSCRIPT,),
|
|
code=(Font.INSN_DESC_CODE, Font.INSN_EXT_MNEMONIC),
|
|
code_subscript=Font.INSN_CODE_SUBSCRIPT,
|
|
)
|
|
|
|
del cls._define_fonts
|
|
|
|
def get_font(
|
|
self,
|
|
part_kind: TextLineFontKind,
|
|
default: _T=None,
|
|
) -> _T | tuple[tuple[Font, ...], None | BaselinePos]:
|
|
match part_kind:
|
|
case TextLineFontKind.REGULAR:
|
|
font = self.regular
|
|
case TextLineFontKind.ITALIC:
|
|
font = self.italic
|
|
case TextLineFontKind.BOLD:
|
|
font = self.bold
|
|
case TextLineFontKind.BOLD_ITALIC:
|
|
font = self.bold_italic
|
|
case TextLineFontKind.SUBSCRIPT:
|
|
font = self.subscript
|
|
case TextLineFontKind.SUPERSCRIPT:
|
|
font = self.subscript
|
|
case TextLineFontKind.BOLD_SUBSCRIPT:
|
|
font = self.bold_subscript
|
|
case TextLineFontKind.BOLD_SUPERSCRIPT:
|
|
font = self.bold_subscript
|
|
case TextLineFontKind.ITALIC_SUBSCRIPT:
|
|
font = self.italic_subscript
|
|
case TextLineFontKind.ITALIC_SUPERSCRIPT:
|
|
font = self.italic_subscript
|
|
case TextLineFontKind.BOLD_ITALIC_SUBSCRIPT:
|
|
font = self.bold_italic_subscript
|
|
case TextLineFontKind.BOLD_ITALIC_SUPERSCRIPT:
|
|
font = self.bold_italic_subscript
|
|
case TextLineFontKind.CODE:
|
|
font = self.code
|
|
case TextLineFontKind.CODE_SUBSCRIPT:
|
|
font = self.code_subscript
|
|
case TextLineFontKind.CODE_SUPERSCRIPT:
|
|
font = self.code_subscript
|
|
case _:
|
|
assert_never(part_kind)
|
|
if font is None:
|
|
return default
|
|
return font, part_kind.sub_super.baseline_pos
|
|
|
|
@cached_property
|
|
def __font_to_kind_map(self) -> dict[tuple[Font, None | BaselinePos], TextLineFontKind]:
|
|
retval: dict[tuple[Font, None | BaselinePos], TextLineFontKind] = {}
|
|
for kind in TextLineFontKind:
|
|
fonts = self.get_font(kind)
|
|
if fonts is None:
|
|
continue
|
|
fonts, baseline_pos = fonts
|
|
for font in fonts:
|
|
assert font not in retval, \
|
|
f"duplicate font: kind={kind} old_kind={retval[font]} font={font}"
|
|
retval[font, baseline_pos] = kind
|
|
return retval
|
|
|
|
@cached_property
|
|
def fonts(self) -> frozenset[Font]:
|
|
retval: set[Font] = set()
|
|
for kind in TextLineFontKind:
|
|
fonts = self.get_font(kind)
|
|
if fonts is None:
|
|
continue
|
|
fonts, baseline_pos = fonts
|
|
retval.update(fonts)
|
|
return frozenset(retval)
|
|
|
|
def get_kind(self, font: Font, baseline_pos: BaselinePos, default: _T=None) -> _T | TextLineFontKind:
|
|
retval = self.__font_to_kind_map.get((font, baseline_pos))
|
|
if retval is None:
|
|
retval = self.__font_to_kind_map.get((font, None))
|
|
if retval is None:
|
|
return default
|
|
return retval
|
|
|
|
# Create the predefined TextLineFonts constants (INSN_DESC_FONTS, ...).  The
# classmethod deletes itself from the class while running, so this is the
# only call that can ever be made.
TextLineFonts._define_fonts()
|
|
|
|
class FontVariantCode(tuple, enum.Enum):
    """Whether text is code.

    Each member's value is the tuple of tag names this variant contributes to
    `TextLineFontKind.text_line_tags` (empty when it contributes none).
    """

    # contributes the "code" tag
    CODE = ("code",)
    # contributes no tag
    NOT_CODE = ()
|
|
|
|
class FontVariantBold(tuple, enum.Enum):
    """Whether text is bold.

    Each member's value is the tuple of tag names this variant contributes to
    `TextLineFontKind.text_line_tags` (empty when it contributes none).
    """

    # contributes the "b" tag
    BOLD = ("b",)
    # contributes no tag
    NOT_BOLD = ()
|
|
|
|
class FontVariantItalic(tuple, enum.Enum):
    """Whether text is italic.

    Each member's value is the tuple of tag names this variant contributes to
    `TextLineFontKind.text_line_tags` (empty when it contributes none).
    """

    # contributes the "i" tag
    ITALIC = ("i",)
    # contributes no tag
    NOT_ITALIC = ()
|
|
|
|
class FontVariantSubSuper(tuple, enum.Enum):
|
|
NOT_SUB_SUPER = ()
|
|
SUBSCRIPT = ("sub",)
|
|
SUPERSCRIPT = ("sup",)
|
|
|
|
@cached_property
|
|
def baseline_pos(self) -> None | BaselinePos:
|
|
match self:
|
|
case FontVariantSubSuper.NOT_SUB_SUPER:
|
|
return None
|
|
case FontVariantSubSuper.SUBSCRIPT:
|
|
return BaselinePos.BELOW
|
|
case FontVariantSubSuper.SUPERSCRIPT:
|
|
return BaselinePos.ABOVE
|
|
case _:
|
|
assert_never(self)
|
|
|
|
class TextLineFontKind(enum.Enum):
    """A text style: one combination of code / bold / italic / sub-super.

    Every member's value is the 4-tuple ``(code, bold, italic, sub_super)`` of
    variant enum members; the same four parts are also exposed as attributes.
    """

    def __init__(
        self,
        code: FontVariantCode,
        bold: FontVariantBold,
        italic: FontVariantItalic,
        sub_super: FontVariantSubSuper,
    ):
        # called by Enum with the member's value tuple unpacked
        self.code = code
        self.bold = bold
        self.italic = italic
        self.sub_super = sub_super

    # --- plain text ---
    REGULAR = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    SUBSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    SUPERSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )
    # --- italic ---
    ITALIC = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    ITALIC_SUBSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    ITALIC_SUPERSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )
    # --- bold ---
    BOLD = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    BOLD_SUBSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    BOLD_SUPERSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )
    # --- bold italic ---
    BOLD_ITALIC = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    BOLD_ITALIC_SUBSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    BOLD_ITALIC_SUPERSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )
    # --- code ---
    CODE = (
        FontVariantCode.CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    CODE_SUBSCRIPT = (
        FontVariantCode.CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    CODE_SUPERSCRIPT = (
        FontVariantCode.CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )

    @cached_property
    def text_line_tags(self) -> tuple[str, ...]:
        """Tag names for this style, ordered code, bold, italic, sub/super."""
        variants = (self.code, self.bold, self.italic, self.sub_super)
        tags = ()
        for variant in variants:
            tags = tags + variant.value
        return tags
|
|
|
|
class PageParseError(Exception):
    """Page-level parse failure.

    NOTE(review): the raise sites are outside this part of the file;
    presumably raised when a page cannot be parsed — confirm.
    """
|
|
|
|
class InsnParseError(Exception):
    """Instruction-level parse failure.

    NOTE(review): the raise sites are outside this part of the file;
    presumably raised when an instruction cannot be parsed — confirm.
    """
|
|
|
|
class ElementBodyBuilder:
    """Incrementally fill an XML element with text wrapped in nested tags.

    The builder keeps a stack of currently open child elements below
    ``containing_element``.  `write_text` buffers text; the buffer is written
    into the innermost open element whenever the stack changes or the builder
    is flushed.  Usable as a context manager, which flushes on exit.
    """

    def __init__(self, containing_element: ElementTree.Element):
        self.__containing_element = containing_element
        # currently open elements, outermost first
        self.__stack: list[ElementTree.Element] = []
        # text written since the last flush
        self.__text_buffer: list[str] = []

    def __shrink_stack(self, new_len: int):
        """Close open elements until at most ``new_len`` remain."""
        while new_len < len(self.__stack):
            # pending text belongs to the innermost element, so write it
            # before that element is closed
            self.__flush_text_buffer()
            self.__stack.pop()

    def set_tag_stack(self, tag_stack: Iterable[str]):
        """Make the open elements match ``tag_stack`` (outermost tag first).

        Open elements forming a matching prefix are kept.  From the first
        position where the open element's tag differs from the requested one,
        the open elements are closed and fresh elements are created for the
        requested tags.  Text buffered so far is flushed into the element
        that was innermost before the change.
        """
        new_len = 0
        for i, tag in enumerate(tag_stack):
            new_len = i + 1
            if i < len(self.__stack) and self.__stack[i].tag != tag:
                # Mismatch: close this element and everything nested in it so
                # that the requested tag gets opened in its place below.
                self.__shrink_stack(i)
            if i >= len(self.__stack):
                self.__flush_text_buffer()
                self.__stack.append(ElementTree.SubElement(self.__insert_point(), tag))
        self.__shrink_stack(new_len)

    def write_text(self, text: str):
        """Buffer ``text`` for the currently innermost open element."""
        self.__text_buffer.append(text)

    def __insert_point(self) -> ElementTree.Element:
        """Return the innermost open element, or the containing element."""
        if len(self.__stack) != 0:
            return self.__stack[-1]
        return self.__containing_element

    def __flush_text_buffer(self):
        """Append the buffered text at the end of the current insert point."""
        if len(self.__text_buffer) == 0:
            return
        insert_point = self.__insert_point()
        text = "".join(self.__text_buffer)
        self.__text_buffer.clear()
        if len(insert_point) != 0:
            # ElementTree keeps text that follows a child in that child's tail
            element = insert_point[-1]
            element.tail = (element.tail or "") + text
        else:
            insert_point.text = (insert_point.text or "") + text

    def __enter__(self) -> ElementBodyBuilder:
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.flush()

    def flush(self):
        """Close every open element and write out any buffered text."""
        self.set_tag_stack(())
        self.__flush_text_buffer()
|
|
|
|
@dataclass(unsafe_hash=True, frozen=True)
class InsnBitField:
    """One field of an instruction's bit-field diagram.

    Holds the horizontal extent of the field's box together with the parsed
    text lines for the field's name and its bit number.
    """

    box_min_x: float
    box_max_x: float
    name: ParsedTextLine
    bit_number: ParsedTextLine

    def __str__(self) -> str:
        extent = f"{self.box_min_x}..{self.box_max_x}"
        return f"<InsnBitField: x={extent} name={self.name} bit_number={self.bit_number}>"

    def write_xml(self, parent: ElementTree.Element):
        """Append a ``<field>`` element with ``<name>`` and ``<bit-number>`` children."""
        field_elm = ElementTree.SubElement(parent, "field")
        field_elm.text = "\n"
        field_elm.tail = "\n"
        for tag, line in (("name", self.name), ("bit-number", self.bit_number)):
            child = ElementTree.SubElement(field_elm, tag)
            child.tail = "\n"
            line.write_xml(child, trailing_nl=False)
|
|
|
|
@dataclass(unsafe_hash=True, frozen=True)
class InsnBitFieldsPrefix:
    """The prefix part of an instruction's bit-field diagram.

    Consists of a bounding box, a line of text before the fields, the fields
    themselves and a line of text after them.
    """

    box_min_x: float
    box_min_y: float
    box_max_x: float
    box_max_y: float
    prefix_text: ParsedTextLine
    fields: tuple[InsnBitField, ...]
    suffix_text: ParsedTextLine

    def __str__(self):
        sep = ",\n "
        joined_fields = sep.join(map(str, self.fields))
        pieces = [
            f"<InsnBitFieldsPrefix: ({self.box_min_x},{self.box_min_y})..",
            f"({self.box_max_x},{self.box_max_y})\n",
            f" prefix_text={self.prefix_text}\n",
            f" [\n",
            f" {joined_fields},\n",
            f" ]\n",
            f" suffix_text={self.suffix_text}>",
        ]
        return "".join(pieces)

    def write_xml(self, parent: ElementTree.Element):
        """Append ``<prefix>`` holding prefix text, the fields and suffix text."""
        prefix_elm = ElementTree.SubElement(parent, "prefix")
        prefix_elm.text = "\n"
        prefix_elm.tail = "\n"

        def write_text_child(tag, line):
            # a child element carrying one parsed line, followed by a newline
            child = ElementTree.SubElement(prefix_elm, tag)
            child.tail = "\n"
            line.write_xml(child, trailing_nl=False)

        write_text_child("prefix-text", self.prefix_text)
        InsnBitFields.write_xml_fields(self.fields, prefix_elm)
        write_text_child("suffix-text", self.suffix_text)
|
|
|
|
@dataclass(unsafe_hash=True, frozen=True)
class InsnBitFields:
    """An instruction's bit-field diagram: optional prefix, box and fields."""

    prefix: None | InsnBitFieldsPrefix
    box_min_x: float
    box_min_y: float
    box_max_x: float
    box_max_y: float
    fields: tuple[InsnBitField, ...]

    def __str__(self):
        sep = ",\n "
        # the prefix, when present, is printed on its own line(s) first
        prefix_str = "" if self.prefix is None else f"{self.prefix}\n"
        joined_fields = sep.join(map(str, self.fields))
        header = (f"{prefix_str}<InsnBitFields: ({self.box_min_x},{self.box_min_y}).."
                  f"({self.box_max_x},{self.box_max_y}) [\n")
        return header + f" {joined_fields},\n]>"

    @staticmethod
    def write_xml_fields(fields: Iterable[InsnBitField], parent: ElementTree.Element):
        """Append a ``<fields>`` element containing one entry per field."""
        container = ElementTree.SubElement(parent, "fields")
        container.text = "\n"
        container.tail = "\n"
        for item in fields:
            item.write_xml(container)

    def write_xml(self, parent: ElementTree.Element):
        """Append ``<bit-fields>`` holding the optional prefix and the fields."""
        container = ElementTree.SubElement(parent, "bit-fields")
        container.text = "\n"
        container.tail = "\n"
        prefix = self.prefix
        if prefix is not None:
            prefix.write_xml(container)
        InsnBitFields.write_xml_fields(self.fields, container)
|
|
|
|
|
|
@dataclass(unsafe_hash=True, frozen=True)
class InsnSpRegsAlteredEntry:
    """One entry of a special-registers-altered table.

    Holds the register's text line plus the text lines of its fields and of
    its conditions.
    """

    reg: ParsedTextLine
    fields: tuple[ParsedTextLine, ...]
    conds: tuple[ParsedTextLine, ...]

    def __str__(self, indent="") -> str:
        """Multi-line description; ``indent`` is prepended to continuation lines."""

        def render(lines):
            # an empty tuple collapses to "()", anything else is listed one
            # item per line
            if lines == ():
                return "()"
            return "\n".join([
                "(",
                *(f"{indent} {i}," for i in lines),
                f"{indent} )",
            ])

        fields = render(self.fields)
        conds = render(self.conds)
        return (f"Entry(\n"
                f"{indent} reg={self.reg},\n"
                f"{indent} fields={fields},\n"
                f"{indent} conds={conds},\n"
                f"{indent})")

    def write_xml(self, parent: ElementTree.Element):
        """Append ``<entry>`` with ``<register>``, ``<fields>`` and ``<conditions>``."""
        entry = ElementTree.SubElement(parent, "entry")
        entry.text = "\n"
        entry.tail = "\n"

        register_elm = ElementTree.SubElement(entry, "register")
        register_elm.tail = "\n"
        self.reg.write_xml(register_elm, trailing_nl=False)

        for tag, lines in (("fields", self.fields), ("conditions", self.conds)):
            child = ElementTree.SubElement(entry, tag)
            child.tail = "\n"
            ParsedTextLine.write_xml_lines(lines, child, trailing_nl=False)
|
|
|
|
@dataclass(unsafe_hash=True, frozen=True)
class InsnSpRegsAltered:
    """The special-registers-altered section of an instruction.

    Holds the section's title line, optional special text, optional table
    header lines, the table entries and ``final_regular_min_y``.
    """

    sp_regs_altered_text: ParsedTextLine
    special_text: None | ParsedTextLine
    table_header_reg: None | ParsedTextLine
    table_header_fields: None | ParsedTextLine
    entries: tuple[InsnSpRegsAlteredEntry, ...]
    final_regular_min_y: float

    def __str__(self) -> str:
        out = ["InsnSpRegsAltered("]
        out.append(f" sp_regs_altered_text={self.sp_regs_altered_text},")
        # optional parts are listed only when present
        if self.special_text is not None:
            out.append(f" special_text={self.special_text},")
        if self.table_header_reg is not None:
            out.append(f" table_header_reg={self.table_header_reg},")
        if self.table_header_fields is not None:
            out.append(f" table_header_fields={self.table_header_fields},")
        if len(self.entries) != 0:
            out.append(" entries=(")
            out.extend(f" {entry.__str__(' ')}," for entry in self.entries)
            out.append(" ),")
        else:
            out.append(" entries=(),")
        out.append(f" final_regular_min_y={self.final_regular_min_y},")
        out.append(")")
        return "\n".join(out)

    def write_xml(self, parent: ElementTree.Element):
        """Append ``<special-registers-altered>`` describing this section."""
        container = ElementTree.SubElement(parent, "special-registers-altered")
        container.text = "\n"
        container.tail = "\n"

        title = ElementTree.SubElement(container, "title")
        title.tail = "\n"
        self.sp_regs_altered_text.write_xml(title, trailing_nl=False)

        optional_lines = (
            ("special-text", self.special_text),
            ("table-header-register", self.table_header_reg),
            ("table-header-fields", self.table_header_fields),
        )
        for tag, line in optional_lines:
            if line is None:
                continue
            child = ElementTree.SubElement(container, tag)
            child.tail = "\n"
            line.write_xml(child, trailing_nl=False)

        for entry in self.entries:
            entry.write_xml(container)
|
|
|
|
class _InsnParseSection(enum.Enum):
|
|
CODE = "code"
|
|
HEADER = "header"
|
|
DESC = "desc"
|
|
|
|
# Maps each Latin ligature character (U+FB00..U+FB04) to the plain letters it
# stands for.
CHAR_TO_EXPANDED = {
    "\N{LATIN SMALL LIGATURE FF}": "ff",
    "\N{LATIN SMALL LIGATURE FI}": "fi",
    "\N{LATIN SMALL LIGATURE FL}": "fl",
    "\N{LATIN SMALL LIGATURE FFI}": "ffi",
    "\N{LATIN SMALL LIGATURE FFL}": "ffl",
}
|
|
|
|
@dataclass()
class Page:
    """The characters and line/rectangle shapes of one PDF page.

    Items are grouped by the `TextSection` that contains their center point.
    """

    page_num: int
    # per-section spatial index of every char / line / rect
    qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]]
    # chars not yet consumed, grouped by section and then by font
    unprocessed_chars: defaultdict[TextSection, defaultdict[Font, SetById[Char]]]
    # lines / rects not yet consumed
    unprocessed_non_text: SetById[LTLine | LTRect]

    @staticmethod
    def from_lt_page(page_num: int, page: LTPage) -> Page:
        """Build a `Page` from a pdfminer ``LTPage``.

        Lines and rectangles whose center lies in a text section are indexed
        and recorded as unprocessed; characters inside text boxes become
        `Char` objects (font size rounded to 3 decimals).  Other components
        are ignored.  Progress is printed for every non-text component.

        Raises:
            AssertionError: if a character lies within the page body's y range
                but in no text section (page 1072 excepted), or if any
                character uses a font that is not a known `Font`.
        """
        qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree)
        unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char]))
        unprocessed_non_text: SetById[LTLine | LTRect] = SetById()

        def section_at_center(item):
            # the section containing the center of the item's bounding box
            return TextSection.for_position(
                page_num=page_num,
                x=(item.x0 + item.x1) * 0.5,
                y=(item.y0 + item.y1) * 0.5,
            )

        for component in page:
            if isinstance(component, (LTLine, LTRect)):
                crosses_column_split = (
                    component.width > 100
                    and component.x0 < COLUMN_SPLIT_X - 10
                    and component.x1 > COLUMN_SPLIT_X + 10
                )
                if crosses_column_split:
                    print(f"wide component: {component}")
                else:
                    print(f"component: {component}")
                text_section = section_at_center(component)
                if text_section is not None:
                    qt[text_section].insert(component.x0, component.y0, component)
                    unprocessed_non_text.add(component)
                continue
            if not isinstance(component, LTTextBox):
                print(f"ignoring: {component}")
                continue
            for text_line in component:
                for element in text_line:
                    if not isinstance(element, LTChar):
                        continue
                    text_section = section_at_center(element)
                    if text_section is None:
                        in_body_rows = PAGE_BODY_MIN_Y <= element.y0 <= PAGE_BODY_MAX_Y
                        # page 1072 has characters in the margins
                        if in_body_rows and page_num != 1072:
                            raise AssertionError(
                                f"char not in text section: {element}\npage_num={page_num}")
                        continue
                    char = Char(
                        text=element.get_text(),
                        font=Font(font_name=element.fontname, size=round(element.size, 3)),
                        adv=element.adv,
                        min_x=element.x0,
                        min_y=element.y0,
                        max_x=element.x1,
                        max_y=element.y1,
                    )
                    qt[text_section].insert(char.min_x, char.min_y, char)
                    unprocessed_chars[text_section][char.font].add(char)

        # Put every char set into reading order, and collect a report entry
        # for each font that is not registered as a known font.
        unknown_fonts = []
        unknown_font_errors = []
        for chars_by_font in unprocessed_chars.values():
            for font, chars in chars_by_font.items():
                chars.sort(key=Char.top_down_left_to_right_sort_key)
                if font.known_name is not None:
                    continue
                members = list(chars)
                text = "".join(member.text for member in members)
                char = members[-1] if members else None
                unknown_fonts.append(repr(font) + ",")
                unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}")
        unknown_fonts.sort()
        if len(unknown_fonts) != 0:
            raise AssertionError("\nunknown fonts:\n" + "\n".join(unknown_fonts)
                                 + "\n\n" + "\n".join(unknown_font_errors))
        return Page(
            page_num=page_num,
            qt=qt,
            unprocessed_chars=unprocessed_chars,
            unprocessed_non_text=unprocessed_non_text,
        )
|
|
|
|
class Pages:
    """Lazily filled, read-only mapping from page number to `Page`.

    Pages are pulled on demand from ``pages_gen``, which must yield pages with
    strictly increasing positive page numbers.  Once the generator is
    exhausted or closed, ``pages_gen`` becomes ``None``.  Usable as a context
    manager, which closes the generator on exit.
    """

    pages_gen: None | Generator[Page, None, None]
    __pages: dict[int, Page]
    __max_page_num: int

    def __init__(self, pages_gen: None | Generator[Page, None, None]=None):
        self.__pages = {}
        # highest page number received so far (0 before the first page)
        self.__max_page_num = 0
        self.pages_gen = pages_gen

    def __enter__(self) -> Pages:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def close(self):
        """Close the page generator, if any; safe to call repeatedly."""
        gen, self.pages_gen = self.pages_gen, None
        if gen is not None:
            gen.close()

    def is_past_end(self, page_num: int) -> bool:
        """Return whether ``page_num`` is greater than the last page number.

        Pulls pages from the generator as needed to decide.
        """
        while page_num > self.__max_page_num:
            if self.pages_gen is None:
                return True
            self.__fill_page()
        return False

    def __fill_page(self) -> bool:
        """Pull one more page from the generator; return whether one arrived.

        Raises:
            AssertionError: if the new page's number is not greater than every
                earlier one (the error is first thrown into the generator).
        """
        if self.pages_gen is None:
            return False
        try:
            page = self.pages_gen.send(None)
        except StopIteration:
            page = None
        if page is None:
            self.close()
            return False
        previous_max = self.__max_page_num
        if page.page_num <= previous_max:
            e = AssertionError(
                f"page numbers must be a strictly-increasing positive integer sequence:\n"
                f"got {page.page_num} which isn't more than {self.__max_page_num}")
            if self.pages_gen is not None:
                self.pages_gen.throw(e)
            raise e  # either no generator or generator failed to propagate exception
        self.__max_page_num = page.page_num
        self.__pages[page.page_num] = page
        return True

    def get(self, page_num: int, default: _T=None) -> _T | Page:
        """Return page ``page_num``, or ``default`` if there is no such page.

        Pulls pages from the generator until the page shows up, the generator
        is finished, or a later page number has already been seen.
        """
        page = self.__pages.get(page_num)
        while page is None:
            if self.pages_gen is None or page_num < self.__max_page_num:
                return default
            self.__fill_page()
            page = self.__pages.get(page_num)
        return page

    def __contains__(self, page_num: int, /) -> bool:
        return self.get(page_num) is not None

    def __getitem__(self, page_num: int, /) -> Page:
        page = self.get(page_num)
        if page is None:
            raise KeyError(page_num)
        return page
|
|
|
|
@dataclass(unsafe_hash=True, frozen=True)
class TextSection:
    """Rectangular region of one page that is parsed as a unit.

    A page is split into an ordered tuple of sections (see `page_sections`).
    Pages that are not listed in any of the layout tables below are treated
    as a left column followed by a right column.
    """
    page_num: int
    min_x: float
    min_y: float
    max_x: float
    max_y: float

    @classmethod
    def first(cls) -> TextSection:
        """Return the first section of page 1."""
        sections = cls.page_sections(page_num=1)
        return sections[0]

    @cached_property
    def next(self) -> TextSection:
        """The section after this one: the next entry on the same page, or
        else the first section of the nearest later page that has any."""
        same_page = self.page_sections(page_num=self.page_num)
        following = same_page.index(self) + 1
        if following < len(same_page):
            return same_page[following]
        for later_page_num in range(self.page_num + 1, self.page_num + 100000):
            later_sections = self.page_sections(page_num=later_page_num)
            if later_sections:
                return later_sections[0]
        raise AssertionError(f"can't find next TextSection after {self}")

    @classmethod
    def left_column(
        cls, *,
        page_num: int,
        min_y=PAGE_BODY_MIN_Y,
        max_y=PAGE_BODY_MAX_Y,
    ) -> TextSection:
        """Section from the body's left edge to the column split."""
        return cls(page_num, PAGE_BODY_MIN_X, min_y, COLUMN_SPLIT_X, max_y)

    @classmethod
    def right_column(
        cls, *,
        page_num: int,
        min_y=PAGE_BODY_MIN_Y,
        max_y=PAGE_BODY_MAX_Y,
    ) -> TextSection:
        """Section from the column split to the body's right edge."""
        return cls(page_num, COLUMN_SPLIT_X, min_y, PAGE_BODY_MAX_X, max_y)

    @classmethod
    def columns(
        cls, *,
        page_num: int,
        min_y=PAGE_BODY_MIN_Y,
        max_y=PAGE_BODY_MAX_Y,
    ) -> tuple[TextSection, TextSection]:
        """Left and right column sections sharing the same vertical bounds."""
        bounds = dict(page_num=page_num, min_y=min_y, max_y=max_y)
        return cls.left_column(**bounds), cls.right_column(**bounds)

    @classmethod
    def full_page(
        cls, *,
        page_num: int,
        min_y=PAGE_BODY_MIN_Y,
        max_y=PAGE_BODY_MAX_Y,
    ) -> TextSection:
        """Section spanning the whole width of the page body."""
        return cls(page_num, PAGE_BODY_MIN_X, min_y, PAGE_BODY_MAX_X, max_y)

    # Layout tables.  The dict-valued ones map a page number to the y
    # coordinate at which that page's layout changes; the set-valued ones
    # only list page numbers.

    __COLUMNS_THEN_FULL_PAGE: ClassVar = {
        129: 438.992,
        241: 512.419,
        242: 408.077,
        243: 488.509,
        244: 437.518,
        245: 444.522,
        247: 352.082,
        248: 356.723,
        249: 365.944,
        251: 334.553,
        264: 184.67,
        296: 267.29,
        297: 200.043,
        298: 440.64,
        299: 197.356,
        300: 160.076,
        301: 364.924,
        303: 330.055,
        305: 344.867,
        306: 335.403,
        307: 336.897,
        308: 365.233,
        309: 364.735,
    }

    __FULL_PAGE_THEN_COLUMNS: ClassVar = {
        246: 689.039,
        250: 615.315,
        266: 678.088,
    }

    __ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE: ClassVar = {
        128: 301.55,
    }

    __TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE: ClassVar = {
        304: 242.732,
    }

    __COLUMNS_THEN_COLUMNS: ClassVar = {
        79: 621.66,
        126: 519.89,
    }

    __ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS: ClassVar = {
        130: 550.43,
        162: 599.247,
        194: 622.161,
        196: 682.933,
        204: 613.195,
        215: 633.12,
    }

    __ONE_TITLE_LINE_THEN_COLUMNS: ClassVar = {
        103, 104, 105, 121, 139, 143, 146,
        151, 153, 158, 207, 213, 218,
    }

    __TWO_TITLE_LINES_THEN_COLUMNS: ClassVar = {
        198, 206,
    }

    __FULL_PAGE: ClassVar = {
        118, 157, 252, 254, 255, 257, 259,
        260, 265, 268, 270, 271, 272,
        *range(274, 286),
    }

    @classmethod
    def __page_sections(cls, page_num: int) -> tuple[TextSection, ...]:
        """Compute (uncached) the ordered sections of page `page_num` by
        looking the page up in the layout tables."""

        def full(**y_bounds) -> TextSection:
            return cls.full_page(page_num=page_num, **y_bounds)

        def cols(**y_bounds) -> tuple[TextSection, TextSection]:
            return cls.columns(page_num=page_num, **y_bounds)

        # The tables are consulted in a fixed order; the first hit wins.
        if page_num in cls.__COLUMNS_THEN_COLUMNS:
            y = cls.__COLUMNS_THEN_COLUMNS[page_num]
            return (*cols(min_y=y), *cols(max_y=y))
        if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS:
            return (
                full(min_y=ONE_TITLE_LINE_SPLIT_Y),
                *cols(max_y=ONE_TITLE_LINE_SPLIT_Y),
            )
        if page_num in cls.__FULL_PAGE:
            return (full(),)
        if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS:
            y = cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS[page_num]
            return (
                full(min_y=ONE_TITLE_LINE_SPLIT_Y),
                *cols(min_y=y, max_y=ONE_TITLE_LINE_SPLIT_Y),
                *cols(max_y=y),
            )
        if page_num in cls.__TWO_TITLE_LINES_THEN_COLUMNS:
            return (
                full(min_y=TWO_TITLE_LINES_SPLIT_Y),
                *cols(max_y=TWO_TITLE_LINES_SPLIT_Y),
            )
        if page_num in cls.__COLUMNS_THEN_FULL_PAGE:
            y = cls.__COLUMNS_THEN_FULL_PAGE[page_num]
            return (*cols(min_y=y), full(max_y=y))
        if page_num in cls.__FULL_PAGE_THEN_COLUMNS:
            y = cls.__FULL_PAGE_THEN_COLUMNS[page_num]
            return (full(min_y=y), *cols(max_y=y))
        if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE:
            y = cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE[page_num]
            return (
                full(min_y=ONE_TITLE_LINE_SPLIT_Y),
                *cols(min_y=y, max_y=ONE_TITLE_LINE_SPLIT_Y),
                full(max_y=y),
            )
        if page_num in cls.__TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE:
            y = cls.__TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE[page_num]
            return (
                full(min_y=TWO_TITLE_LINES_SPLIT_Y),
                *cols(min_y=y, max_y=TWO_TITLE_LINES_SPLIT_Y),
                full(max_y=y),
            )
        if page_num == 263:
            return (
                full(min_y=699.997),
                *cols(min_y=366.396, max_y=699.997),
                *cols(min_y=207, max_y=366.396),
                full(max_y=207),
            )
        # TODO: checked up to page 309 (page named 273)
        return cols()

    __PAGE_SECTIONS_CACHE: ClassVar[dict[int, tuple[TextSection, ...]]] = {}

    @classmethod
    def page_sections(cls, page_num: int) -> tuple[TextSection, ...]:
        """Return the ordered sections of page `page_num`, memoised per page
        so repeated calls hand back the same tuple."""
        cache = cls.__PAGE_SECTIONS_CACHE
        sections = cache.get(page_num)
        if sections is None:
            sections = cache[page_num] = cls.__page_sections(page_num=page_num)
        return sections

    @classmethod
    def for_position(cls, page_num: int, x: float, y: float) -> None | TextSection:
        """Return the first section of page `page_num` whose bounds (edges
        included) contain the point (x, y), or None if there is none."""
        for section in cls.page_sections(page_num=page_num):
            inside_x = section.min_x <= x <= section.max_x
            if inside_x and section.min_y <= y <= section.max_y:
                return section
        return None
|
|
|
|
@dataclass(frozen=True)
class InsnHeader:
    """One instruction header: title lines, mnemonic lines and the bit-field
    box that follows them."""
    header_lines: tuple[ParsedTextLine, ...]
    mnemonic_lines: tuple[ParsedTextLine, ...]
    bit_fields: InsnBitFields

    @property
    def min_y(self) -> float:
        """`box_min_y` of the header's bit-field box."""
        box = self.bit_fields
        return box.box_min_y

    def write_xml(self, parent: ElementTree.Element):
        """Append a <header> element to `parent` containing <title>,
        <mnemonics> and whatever the bit fields write, in that order."""
        header = ElementTree.SubElement(parent, "header")
        header.text = header.tail = "\n"
        text_children = (
            ("title", self.header_lines),
            ("mnemonics", self.mnemonic_lines),
        )
        for tag, lines in text_children:
            child = ElementTree.SubElement(header, tag)
            child.tail = "\n"
            ParsedTextLine.write_xml_lines(lines, child, trailing_nl=False)
        self.bit_fields.write_xml(header)
|
|
|
|
@dataclass(frozen=True)
class Insn:
    """A parsed instruction: its header(s), code lines, description lines and
    optional "special registers altered" block."""
    headers: tuple[InsnHeader, ...]
    code_lines: tuple[ParsedTextLine, ...]
    desc_lines: tuple[ParsedTextLine, ...]
    sp_regs_altered: None | InsnSpRegsAltered

    def write_xml(self, parent: ElementTree.Element):
        """Append an <instruction> element to `parent`.

        Children are written in this order: every header, then <code> and
        <description> (each only when it has lines), then the special
        registers block when present.
        """
        insn = ElementTree.SubElement(parent, "instruction")
        insn.text = insn.tail = "\n"
        for header in self.headers:
            header.write_xml(insn)
        text_children = (
            ("code", self.code_lines),
            ("description", self.desc_lines),
        )
        for tag, lines in text_children:
            if len(lines) == 0:
                continue  # empty parts are left out of the XML entirely
            child = ElementTree.SubElement(insn, tag)
            child.tail = "\n"
            ParsedTextLine.write_xml_lines(lines, child, trailing_nl=False)
        altered = self.sp_regs_altered
        if altered is not None:
            altered.write_xml(insn)
|
|
|
|
@dataclass()
|
|
class Parser:
|
|
pages: Pages = field(default_factory=Pages)
|
|
text_section: TextSection = TextSection.first()
|
|
insns: list[Insn] = field(default_factory=list)
|
|
|
|
@property
def page(self) -> Page:
    """The page that the current text section belongs to."""
    current_page_num = self.text_section.page_num
    return self.pages[current_page_num]
|
|
|
|
@property
def unprocessed_chars(self) -> defaultdict[Font, SetById[Char]]:
    """Per-font sets of not-yet-consumed characters for the current text
    section, looked up on the current section's page."""
    section = self.text_section
    section_page = self.pages[section.page_num]
    return section_page.unprocessed_chars[section]
|
|
|
|
@staticmethod
def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
    """Yield a `Page` for each selected page of the PDF `file`.

    `page_numbers` holds 1-based page numbers, or is None to select every
    page.  Pages are yielded in ascending page-number order and each one is
    announced on stdout before it is converted.
    """
    if page_numbers is not None:
        # pdfminer takes 0-based indices and yields each selected page once,
        # in document order.  De-duplicate so that position `i` of the sorted
        # list really is the i-th page yielded below.
        page_numbers = sorted({i - 1 for i in page_numbers})
        if len(page_numbers) == 0:
            # pdfminer treats an empty selection like "no selection" and
            # would return every page, which would make the index lookup
            # below fail; an empty request simply produces no pages.
            return
    for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
        if page_numbers is not None:
            page_num = page_numbers[i] + 1
        else:
            page_num = i + 1
        print(f"page {page_num}")
        yield Page.from_lt_page(page_num=page_num, page=page)
|
|
|
|
def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None):
    """Parse `file`, walking its text sections in order until the section's
    page is past the last page that was loaded.

    `page_numbers` optionally restricts parsing to those 1-based pages;
    sections on pages that were not loaded are skipped.
    """
    self.pages = Pages(pages_gen=Parser.__pages_gen(
        file=file, page_numbers=page_numbers))
    # NOTE(review): walking starts at the section *after* TextSection.first(),
    # so that very first section is never parsed -- confirm this is intended.
    self.text_section = TextSection.first().next
    while not self.pages.is_past_end(self.text_section.page_num):
        if self.text_section.page_num in self.pages:
            print(f"section {self.text_section}")
            with self.note_text_section():
                self.parse_text_section()
        # parsing may itself have advanced self.text_section; continue from
        # wherever it ended up
        self.text_section = self.text_section.next
|
|
|
|
@contextmanager
|
|
def note_text_section(self):
|
|
start_text_section = self.text_section
|
|
try:
|
|
yield
|
|
except Exception as e:
|
|
if self.text_section == start_text_section:
|
|
note = f"text_section={self.text_section}"
|
|
else:
|
|
note = f"start_text_section={start_text_section}\ntext_section={self.text_section}"
|
|
if note not in getattr(e, "__notes__", ()):
|
|
e.add_note(note)
|
|
raise
|
|
|
|
def parse_text_section(self):
    """Extract the instructions of the current text section.

    `InsnParseError` and `PageParseError` are reported on stdout/stderr and
    swallowed so that parsing can carry on; any other exception propagates.
    """
    try:
        with self.note_text_section():
            self.extract_insns()
    except (InsnParseError, PageParseError) as e:
        summary = "".join(traceback.format_exception_only(e))
        print(summary, flush=True)
        traceback.print_exc()
    return None
|
|
|
|
def find_top_left_char_in_range(self, *,
                                min_x: float,
                                max_x: float,
                                min_y: float,
                                max_y: float,
                                allow_processed: bool,
                                pred: None | Callable[[Char], bool] = None,
                                ) -> None | Char:
    """Return the `Char` inside the given rectangle of the current section
    that has the smallest ``min_x - min_y``, or None if nothing qualifies.

    Non-`Char` items are ignored.  Unless `allow_processed` is true, only
    characters still in `unprocessed_chars` are considered.  When `pred` is
    given, a character must also satisfy it to be chosen; `pred` is only
    consulted for characters that would otherwise replace the current best.
    """
    best = None
    candidates = self.page.qt[self.text_section].range(
        min_x=min_x,
        max_x=max_x,
        min_y=min_y,
        max_y=max_y,
    )
    for _x, _y, item in candidates:
        if not isinstance(item, Char):
            continue
        if not (allow_processed or item in self.unprocessed_chars[item.font]):
            continue
        # smaller min_x - min_y means further left and/or higher up
        improves = best is None or \
            item.min_x - item.min_y < best.min_x - best.min_y
        if improves and (pred is None or pred(item)):
            best = item
    return best
|
|
|
|
def extract_text_line(
    self, *,
    start_char: None | Char = None,
    start_min_y: float,
    min_x: float,
    max_x: float,
    fonts: TextLineFonts,
    preceding_blank_lines: int = 0,
    skip_initial_spaces=False,
    allowed_start_min_y_error=None,
) -> None | ParsedTextLine:
    """Collect the unprocessed characters lying on one text line of the
    current section and turn them into a `ParsedTextLine`.

    Parameters:
        start_char: optional character that is always part of the line.
        start_min_y: expected `min_y` of the line's regular-sized characters.
        min_x, max_x: horizontal extent to search (the search starts half a
            regular font size left of `min_x`); `min_x` is also the position
            from which leading spaces are measured.
        fonts: the fonts allowed on the line and how they map to kinds.
        preceding_blank_lines: stored on the returned line.
        skip_initial_spaces: when true, a gap before the first character
            does not produce spaces.
        allowed_start_min_y_error: largest accepted difference between
            `start_min_y` and the line's actual `regular_min_y`; None means
            0.01.

    Returns None if no characters were found, or if some character's font
    has no kind in `fonts` (diagnostics are printed and the characters are
    left unprocessed).

    Raises PageParseError if the line's `regular_min_y` is further than the
    allowed error from `start_min_y`.  Note that by then the line's
    characters have already been removed from `unprocessed_chars`.
    """
    chars: list[Char] = []
    # identity-based companion of `chars`, used to avoid collecting a
    # character twice
    chars_set: SetById[Char] = SetById()
    if start_char is not None:
        chars.append(start_char)
        chars_set.add(start_char)
    # Special case for page 168: when the line starts with a subscript-font
    # "*", derive the line position from that character instead of the
    # caller-provided start_min_y.
    if start_char is not None and \
            start_char.text == "*" and \
            self.text_section.page_num == 168 and \
            start_char.font in (fonts.subscript or ()):
        start_min_y = start_char.max_y - fonts.regular[0].size
    # Gather everything in a band around start_min_y whose height is one
    # regular font size (0.4 below, 0.6 above).
    for x, y, char in self.page.qt[self.text_section].range(
        min_x=min_x - fonts.regular[0].size * 0.5,
        max_x=max_x,
        min_y=start_min_y - fonts.regular[0].size * 0.4,
        max_y=start_min_y + fonts.regular[0].size * 0.6,
    ):
        if not isinstance(char, Char):
            continue
        if char not in self.unprocessed_chars[char.font] or char in chars_set:
            continue
        chars_set.add(char)
        chars.append(char)
    if len(chars) == 0:
        return None
    # left-to-right order; the text is only a tie-breaker
    chars.sort(key=lambda char: (char.min_x, char.text))
    # The line's vertical extent is taken from the first character that is
    # neither subscript nor superscript, falling back to the first character.
    regular_min_y = chars[0].min_y
    regular_max_y = chars[0].max_y
    for char in chars:
        kind = fonts.get_kind(font=char.font, baseline_pos=BaselinePos.BELOW)
        if kind is not None and kind.sub_super is FontVariantSubSuper.NOT_SUB_SUPER:
            regular_min_y = char.min_y
            regular_max_y = char.max_y
            break
    retval = ParsedTextLine(
        element=ElementTree.Element("text-line"),
        regular_min_y=regular_min_y,
        regular_max_y=regular_max_y,
        fonts=fonts,
        chars=chars,
        preceding_blank_lines=preceding_blank_lines,
    )
    # (text, tag stack) runs, in output order
    text_and_tag_stacks: list[tuple[str, tuple[str, ...]]] = []
    last_max_x = min_x
    last_kind = None
    last_char = None
    for char in chars:
        # a character whose vertical midpoint lies above the regular
        # characters' midpoint is classified as sitting above the baseline
        if (char.max_y + char.min_y) * 0.5 > (retval.regular_max_y + retval.regular_min_y) * 0.5:
            baseline_pos = BaselinePos.ABOVE
        else:
            baseline_pos = BaselinePos.BELOW
        kind = fonts.get_kind(font=char.font, baseline_pos=baseline_pos)
        if kind is None:
            print(
                f"font kind is None:\n"
                f"regular_min_y={retval.regular_min_y}\n"
                f"fonts={fonts}\n"
                f"char={char}\n"
                f"baseline_pos={baseline_pos}\n"
                f"chars[0]={chars[0]}"
            )
            return None
        # Spaces between two characters of different kinds are emitted as
        # regular text; otherwise they take the kind of the character that
        # follows them.
        if last_kind is None:
            space_kind = kind
        elif last_kind != kind:
            space_kind = TextLineFontKind.REGULAR
        else:
            space_kind = kind
        space_font, _ = fonts.get_font(space_kind, (fonts.regular, None))
        # Convert the horizontal gap since the previous character into a
        # whole number of spaces of that font.
        space_width = char.min_x - last_max_x
        space_count_f = space_width / space_font[0].space_width
        space_count = round(space_count_f)
        if space_count == 0 and space_count_f > 0.35:
            space_count = 1
        # report gaps that are not close to a whole number of spaces
        if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
            print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
        if space_count > 0 and not skip_initial_spaces:
            text_and_tag_stacks.append((" " * space_count, space_kind.text_line_tags))
        # only the gap before the very first character is ever skipped
        skip_initial_spaces = False
        # A combining long solidus overlay (U+0338) drawn at the same
        # position as the preceding "=" replaces that "=" with U+2260
        # (not equal to).
        if (char.text == "\u0338"
                and last_char is not None
                and last_char.text == "="
                and abs(char.min_x - last_char.min_x) < 0.01
                and abs(char.min_y - last_char.min_y) < 0.01):
            text_and_tag_stacks[-1] = "\u2260", ()
            last_max_x = last_char.max_x
        else:
            char_text = CHAR_TO_EXPANDED.get(char.text, char.text)
            text_and_tag_stacks.append((char_text, kind.text_line_tags))
            last_max_x = char.max_x
        last_kind = kind
        last_char = char
    with ElementBodyBuilder(retval.element) as body_builder:
        for text, tag_stack in text_and_tag_stacks:
            body_builder.set_tag_stack(tag_stack)
            body_builder.write_text(text)
    # the line is complete: mark its characters as consumed
    for char in chars:
        self.unprocessed_chars[char.font].remove(char)
    if allowed_start_min_y_error is None:
        allowed_start_min_y_error = 0.01
    if abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error:
        raise PageParseError(
            f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n"
            f"start_min_y error: {start_min_y - retval.regular_min_y}\n"
            f"allowed_start_min_y_error={allowed_start_min_y_error}")
    return retval
|
|
|
|
def extract_following_text_lines(
    self,
    first_text_line: ParsedTextLine,
    min_x: float,
    max_x: float,
    allowed_start_min_y_error=None,
) -> list[ParsedTextLine]:
    """Return `first_text_line` followed by every consecutive line below it.

    Each further line is looked for exactly one line height (of the first
    line's regular font) below the previous line, using the first line's
    fonts; the run ends at the first position where no line is found.
    """
    if first_text_line is None:
        return []
    fonts = first_text_line.fonts
    collected: list[ParsedTextLine] = [first_text_line]
    while True:
        line_pitch = fonts.regular[0].line_height
        following = self.extract_text_line(
            start_min_y=collected[-1].regular_min_y - line_pitch,
            min_x=min_x,
            max_x=max_x,
            fonts=fonts,
            allowed_start_min_y_error=allowed_start_min_y_error,
        )
        if following is None:
            return collected
        collected.append(following)
|
|
|
|
def extract_insn_bit_fields(
    self,
    mnemonic_lines: list[ParsedTextLine],
) -> None | InsnBitFields:
    """Extract the bit-field diagram that follows an instruction's mnemonics.

    If a horizontal line is found where the top of a plain box is expected,
    that single box is extracted and returned (None if no box lines are
    there).  Otherwise the diagram must consist of a "Prefix:" title and
    box followed by a "Suffix:" title and box; the result then carries the
    suffix box's geometry and fields, with the prefix nested inside.

    Raises InsnParseError if any part of the prefix/suffix form is missing
    or a title has unexpected text.
    """
    section = self.text_section
    last_mnemonic_min_y = mnemonic_lines[-1].regular_min_y
    # a different padding constant applies when the mnemonics span more
    # than one line
    top_pad = (INSN_BIT_FIELDS_TOP_PAD_HEIGHT2 if len(mnemonic_lines) > 1
               else INSN_BIT_FIELDS_TOP_PAD_HEIGHT)
    expected_non_affix_line_y = last_mnemonic_min_y - top_pad
    nearby = self.page.qt[section].range(
        min_x=section.min_x - 5,
        max_x=section.max_x + 5,
        min_y=expected_non_affix_line_y - 5,
        max_y=expected_non_affix_line_y + 5,
    )
    has_plain_box_top = any(
        isinstance(item, LTLine) and item.width > item.height
        for _x, _y, item in nearby)
    if has_plain_box_top:
        return self.extract_insn_bit_fields_box(
            expected_box_max_y=expected_non_affix_line_y,
        )

    def read_affix_title(start_min_y):
        # both titles are read with identical search settings
        return self.extract_text_line(
            start_min_y=start_min_y,
            min_x=section.min_x,
            max_x=section.max_x,
            fonts=TextLineFonts.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS,
            allowed_start_min_y_error=2,
            skip_initial_spaces=True,
        )

    def read_affix_box(title):
        # the box sits a fixed distance below its title
        return self.extract_insn_bit_fields_box(
            expected_box_max_y=title.regular_min_y
            - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT,
        )

    prefix_text = read_affix_title(
        last_mnemonic_min_y - INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT)
    if prefix_text is None:
        raise InsnParseError("can't find insn prefix bit fields title")
    prefix_text_str = "".join(prefix_text.element.itertext())
    if prefix_text_str != "Prefix:":
        raise InsnParseError(
            f"insn prefix bit fields title is not as expected: {prefix_text_str!r}")
    prefix_bit_fields = read_affix_box(prefix_text)
    if prefix_bit_fields is None:
        raise InsnParseError("can't find insn prefix bit fields")

    suffix_text = read_affix_title(
        prefix_bit_fields.box_min_y
        - INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT)
    if suffix_text is None:
        raise InsnParseError("can't find insn suffix bit fields title")
    suffix_text_str = "".join(suffix_text.element.itertext())
    if suffix_text_str != "Suffix:":
        raise InsnParseError(
            f"insn suffix bit fields title is not as expected: {suffix_text_str!r}")
    suffix_bit_fields = read_affix_box(suffix_text)
    if suffix_bit_fields is None:
        raise InsnParseError("can't find insn suffix bit fields")

    prefix = InsnBitFieldsPrefix(
        box_min_x=prefix_bit_fields.box_min_x,
        box_min_y=prefix_bit_fields.box_min_y,
        box_max_x=prefix_bit_fields.box_max_x,
        box_max_y=prefix_bit_fields.box_max_y,
        prefix_text=prefix_text,
        fields=prefix_bit_fields.fields,
        suffix_text=suffix_text,
    )
    return InsnBitFields(
        prefix=prefix,
        box_min_x=suffix_bit_fields.box_min_x,
        box_min_y=suffix_bit_fields.box_min_y,
        box_max_x=suffix_bit_fields.box_max_x,
        box_max_y=suffix_bit_fields.box_max_y,
        fields=suffix_bit_fields.fields,
    )
|
|
|
|
def extract_insn_bit_fields_box(
    self,
    expected_box_max_y: float,
) -> None | InsnBitFields:
    """Extract one bit-field box whose top edge is expected at
    `expected_box_max_y` in the current text section.

    The box is recognised from the `LTLine` objects around that position:
    two horizontal lines (bottom and top) and at least two vertical lines
    (the field separators).  For every pair of neighbouring vertical lines
    the field's name and bit number text are extracted.

    Returns None if there are no lines at all in the search area; otherwise
    an `InsnBitFields` with `prefix=None`.

    Raises InsnParseError if the number of horizontal lines is not exactly
    two, if there are fewer than two vertical lines, or if a field's name or
    bit number cannot be found.
    """
    h_lines: list[LTLine] = []
    v_lines: list[LTLine] = []
    for _x, _y, line in self.page.qt[self.text_section].range(
        min_x=self.text_section.min_x - 5,
        max_x=self.text_section.max_x + 5,
        min_y=expected_box_max_y - INSN_BIT_FIELDS_BOX_HEIGHT - 5,
        max_y=expected_box_max_y + 5,
    ):
        if not isinstance(line, LTLine):
            continue
        # wider than tall => horizontal, everything else counts as vertical
        if line.width > line.height:
            h_lines.append(line)
        else:
            v_lines.append(line)
    h_lines.sort(key=lambda line: line.y0, reverse=False)
    v_lines.sort(key=lambda line: line.x0)
    # walk backwards so deleting an entry doesn't disturb the indices that
    # are still to be visited
    for i in reversed(range(len(v_lines) - 1)):
        if abs(v_lines[i].x0 - v_lines[i + 1].x0) < 0.5:
            del v_lines[i + 1]  # remove duplicates
    if len(h_lines) == 0 and len(v_lines) == 0:
        return None
    if len(h_lines) != 2:
        raise InsnParseError(
            f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}")
    if len(v_lines) < 2:
        # report the vertical lines here: they are the ones being counted
        raise InsnParseError(
            f"instruction bit fields box has too few vertical lines:\n{v_lines}")
    bottom_line, top_line = h_lines  # sorted by ascending y0
    box_min_x = v_lines[0].x0
    box_max_x = v_lines[-1].x0
    box_min_y = bottom_line.y0
    box_max_y = top_line.y1
    box_mid_y = (box_min_y + box_max_y) * 0.5
    print(f"bottom_line={bottom_line}")
    print(f"top_line={top_line}")
    print(v_lines)
    fields: list[InsnBitField] = []
    for i in range(len(v_lines) - 1):
        left_line = v_lines[i]
        right_line = v_lines[i + 1]
        field_box_min_x = left_line.x1
        field_box_max_x = right_line.x0
        # fixed offset from the box's vertical middle at which the field
        # name line is looked for
        bit_field_name_start_min_y = box_mid_y + 3.288
        bit_field_name = self.extract_text_line(
            start_min_y=bit_field_name_start_min_y,
            min_x=field_box_min_x,
            max_x=field_box_max_x,
            fonts=TextLineFonts.INSN_BIT_FIELD_NAME_FONTS,
            skip_initial_spaces=True,
            allowed_start_min_y_error=0.4,
        )
        if bit_field_name is None:
            raise InsnParseError(f"instruction bit field name not found:\n"
                                 f"start_min_y={bit_field_name_start_min_y} "
                                 f"field_box_min_x={field_box_min_x} "
                                 f"field_box_max_x={field_box_max_x}")
        # fixed offset from the box's bottom edge at which the bit number
        # line is looked for
        bit_field_number_start_min_y = box_min_y + 3.487
        bit_number = self.extract_text_line(
            start_min_y=bit_field_number_start_min_y,
            min_x=field_box_min_x,
            max_x=field_box_max_x,
            fonts=TextLineFonts.INSN_BIT_FIELD_BIT_NUMBER_FONTS,
            skip_initial_spaces=True,
        )
        if bit_number is None:
            raise InsnParseError(f"instruction bit field bit number not found:\n"
                                 f"start_min_y={bit_field_number_start_min_y} "
                                 f"field_box_min_x={field_box_min_x} "
                                 f"field_box_max_x={field_box_max_x}")
        fields.append(InsnBitField(
            box_min_x=field_box_min_x,
            box_max_x=field_box_max_x,
            name=bit_field_name,
            bit_number=bit_number,
        ))
    return InsnBitFields(
        prefix=None,
        box_min_x=box_min_x,
        box_min_y=box_min_y,
        box_max_x=box_max_x,
        box_max_y=box_max_y,
        fields=tuple(fields),
    )
|
|
|
|
def extract_insn_header_mnemonics_and_bit_fields(
    self,
    start_min_y: float,
    header_start_char: None | Char = None,
) -> None | InsnHeader:
    """Extract one instruction header from the current text section: the
    title line(s), the mnemonic line(s) beneath them, and the bit-field
    diagram beneath those.

    `header_start_char`, when given, must use `Font.INSN_HEADER` and is
    forced into the first title line.

    Returns None if no title line is found at `start_min_y`.  Raises
    InsnParseError if a title was found but the mnemonics or the bit fields
    are missing.  Progress is printed to stdout.
    """
    if header_start_char is not None:
        assert header_start_char.font == Font.INSN_HEADER
    section_min_x = self.text_section.min_x
    section_max_x = self.text_section.max_x

    # --- title ---
    first_header_line = self.extract_text_line(
        start_char=header_start_char,
        start_min_y=start_min_y,
        min_x=section_min_x,
        max_x=section_max_x,
        fonts=TextLineFonts.INSN_HEADER_FONTS,
        skip_initial_spaces=True,
        allowed_start_min_y_error=6,
    )
    if first_header_line is None:
        return None
    print(f"found header line:\n{first_header_line}")
    header_lines = self.extract_following_text_lines(
        first_text_line=first_header_line,
        min_x=section_min_x,
        max_x=section_max_x,
        allowed_start_min_y_error=1.5,
    )
    print("insn header lines:")
    print("\n".join(str(line) for line in header_lines))

    # --- mnemonics: start from the top-left unprocessed character in a
    # window just below the last title line ---
    title_bottom_y = header_lines[-1].regular_min_y
    mnemonic_start_char = self.find_top_left_char_in_range(
        min_x=section_min_x - 5,
        max_x=section_max_x + 5,
        min_y=title_bottom_y - 50,
        max_y=title_bottom_y - 5,
        allow_processed=False,
    )
    first_mnemonic_line = None
    if mnemonic_start_char is not None:
        first_mnemonic_line = self.extract_text_line(
            start_char=mnemonic_start_char,
            start_min_y=mnemonic_start_char.min_y,
            min_x=section_min_x,
            max_x=section_max_x,
            fonts=TextLineFonts.INSN_MNEMONIC_FONTS,
            skip_initial_spaces=True,
        )
    if first_mnemonic_line is None:
        raise InsnParseError("can't find insn mnemonic text line")
    mnemonic_lines = self.extract_following_text_lines(
        first_text_line=first_mnemonic_line,
        min_x=first_mnemonic_line.chars[0].min_x,
        max_x=section_max_x,
    )
    print("insn mnemonic lines:")
    print("\n".join(str(line) for line in mnemonic_lines))

    # --- bit fields ---
    insn_bit_fields = self.extract_insn_bit_fields(
        mnemonic_lines=mnemonic_lines,
    )
    print(insn_bit_fields)
    if insn_bit_fields is None:
        raise InsnParseError("can't find insn bit fields")
    return InsnHeader(
        header_lines=tuple(header_lines),
        mnemonic_lines=tuple(mnemonic_lines),
        bit_fields=insn_bit_fields,
    )
|
|
|
|
def extract_insn_sp_regs_altered(
    self,
    sp_regs_altered_text: ParsedTextLine,
) -> InsnSpRegsAltered:
    """Extract what follows a "special registers altered" title line.

    Two forms are handled, told apart by the first character found below
    the title:

    * a single line of one of the known special texts (e.g. "None"), or
    * a table with the column headers "Register" and "Field(s)", whose
      rows are grouped into one entry per register.

    `sp_regs_altered_text` is the already-parsed title line; its
    `preceding_blank_lines` is reset to 0 here.

    Raises InsnParseError if nothing is found below the title or the first
    character is not a recognised start.  Other structural problems are
    reported through `assert` (AssertionError).
    """
    sp_regs_altered_text.preceding_blank_lines = 0
    fonts = TextLineFonts.INSN_DESC_FONTS
    column_min_x = sp_regs_altered_text.chars[0].min_x
    # Look just below the title, within the register column, for the start
    # of either the table header or a special text.
    table_header_reg_char = self.find_top_left_char_in_range(
        min_x=column_min_x - 1,
        max_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 1,
        min_y=sp_regs_altered_text.regular_min_y - 30,
        max_y=sp_regs_altered_text.regular_min_y - 5,
        allow_processed=False,
    )
    if table_header_reg_char is None:
        raise InsnParseError(
            "can't find special registers altered table's register-column's header")
    KNOWN_SPECIAL_TEXTS = (
        "None",
        "Dependent on the system service",
        "See above.",
        "See Table 5.1",
    )
    match table_header_reg_char.text:
        case "R":
            # start of the "Register" column header: fall through to the
            # table parsing below
            pass
        case text if any(text == i[0] for i in KNOWN_SPECIAL_TEXTS):
            # the character is the first letter of a known special text:
            # read the whole line and require an exact match
            special_text = self.extract_text_line(
                start_char=table_header_reg_char,
                start_min_y=table_header_reg_char.min_y,
                min_x=column_min_x,
                max_x=self.text_section.max_x,
                fonts=fonts,
                skip_initial_spaces=True,
            )
            assert special_text is not None \
                and special_text.element.text in KNOWN_SPECIAL_TEXTS, \
                f"can't find special-registers-altered special-text:\n{special_text}"
            return InsnSpRegsAltered(
                sp_regs_altered_text=sp_regs_altered_text,
                special_text=special_text,
                table_header_reg=None,
                table_header_fields=None,
                entries=(),
                final_regular_min_y=special_text.regular_min_y,
            )
        case text:
            raise InsnParseError(
                f"unknown special-registers-altered special-text start character: {text!r}")
    # Find the "Field(s)" header on (about) the same line as "Register".
    table_header_fields_char = self.find_top_left_char_in_range(
        min_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 10,
        max_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X,
        min_y=table_header_reg_char.min_y - 5,
        max_y=table_header_reg_char.min_y + 5,
        allow_processed=False,
    )
    assert table_header_fields_char is not None, \
        "can't find special registers altered table's fields-column's header"
    assert table_header_fields_char.text == "F", (
        f"can't find special registers altered table's fields-column's header:\n"
        f"table_header_fields_char={table_header_fields_char}")
    # (min_x, max_x) of the three columns: register, field(s), and a third
    # column stretching to the section's right edge (its cells are collected
    # as `conds` below)
    columns_x_bounds = (
        (table_header_reg_char.min_x, table_header_fields_char.min_x - 1),
        (table_header_fields_char.min_x,
         column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X),
        (column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, self.text_section.max_x),
    )
    table_header_reg = self.extract_text_line(
        start_char=table_header_reg_char,
        start_min_y=table_header_reg_char.min_y,
        min_x=columns_x_bounds[0][0],
        max_x=columns_x_bounds[0][1],
        fonts=fonts,
    )
    assert table_header_reg is not None, \
        "can't find special registers altered table's register-column's header"
    table_header_reg_text = "".join(table_header_reg.element.itertext())
    assert table_header_reg_text == "Register", (
        f"can't find special registers altered table's register-column's header:\n"
        f"table_header_reg_text={table_header_reg_text!r}")
    table_header_fields = self.extract_text_line(
        start_char=table_header_fields_char,
        start_min_y=table_header_fields_char.min_y,
        min_x=columns_x_bounds[1][0],
        max_x=columns_x_bounds[1][1],
        fonts=fonts,
    )
    assert table_header_fields is not None, \
        "can't find special registers altered table's fields-column's header"
    table_header_fields_text = "".join(table_header_fields.element.itertext())
    assert table_header_fields_text == "Field(s)", (
        f"can't find special registers altered table's fields-column's header:\n"
        f"table_header_fields_text={table_header_fields_text!r}")
    # y position of the most recently read row (starts at the header row)
    regular_min_y = table_header_reg.regular_min_y
    entries: list[InsnSpRegsAlteredEntry] = []
    # cells of the row currently being read, one slot per column
    row: list[None | ParsedTextLine] = [None, None, None]
    # the entry being accumulated: its register plus all field/condition
    # lines seen since that register appeared
    cur_reg: None | ParsedTextLine = None
    cur_fields: list[ParsedTextLine] = []
    cur_conds: list[ParsedTextLine] = []
    while True:
        # Read the next row: one text line per column, each expected one
        # line height below the previous row.
        next_regular_min_y = None
        for i, (min_x, max_x) in enumerate(columns_x_bounds):
            row[i] = cell = self.extract_text_line(
                start_min_y=regular_min_y - fonts.regular[0].line_height,
                min_x=min_x,
                max_x=max_x,
                fonts=fonts,
                skip_initial_spaces=True,
                allowed_start_min_y_error=2,
            )
            # the first non-empty cell defines the row's y position
            if cell is not None and next_regular_min_y is None:
                next_regular_min_y = cell.regular_min_y
        if next_regular_min_y is None:
            # a completely empty row ends the table
            break
        regular_min_y = next_regular_min_y
        cur_reg_cell, cur_fields_cell, cur_conds_cell = row
        if cur_reg_cell is None:
            # continuation row: belongs to the register seen most recently
            assert cur_reg is not None, \
                "can't find special registers altered table's first register"
            if cur_fields_cell is not None:
                cur_fields.append(cur_fields_cell)
            if cur_conds_cell is not None:
                cur_conds.append(cur_conds_cell)
            continue
        # a new register starts: flush the entry accumulated so far
        if cur_reg is not None:
            entries.append(InsnSpRegsAlteredEntry(
                reg=cur_reg,
                fields=tuple(cur_fields),
                conds=tuple(cur_conds),
            ))
            cur_fields.clear()
            cur_conds.clear()
        cur_reg = cur_reg_cell
        if cur_fields_cell is not None:
            cur_fields.append(cur_fields_cell)
        if cur_conds_cell is not None:
            cur_conds.append(cur_conds_cell)
    # flush the last entry; a table without any register row is an error
    assert cur_reg is not None, \
        "can't find special registers altered table's first register"
    entries.append(InsnSpRegsAlteredEntry(
        reg=cur_reg,
        fields=tuple(cur_fields),
        conds=tuple(cur_conds),
    ))
    return InsnSpRegsAltered(
        sp_regs_altered_text=sp_regs_altered_text,
        special_text=None,
        table_header_reg=table_header_reg,
        table_header_fields=table_header_fields,
        entries=tuple(entries),
        final_regular_min_y=regular_min_y,
    )
|
|
|
|
def extract_insn(self, header_start_char: Char) -> Insn:
    """Parse a single instruction whose header begins at `header_start_char`.

    After the first header is extracted, the method repeatedly asks
    `find_top_left_char_in_range` for the next unprocessed char in the
    window from `next_start_min_y - 70` up to `next_start_min_y` (clamped
    to the current text section) and dispatches on that char's font:

    * instruction-code font: extract a code line plus its following lines;
      stops the scan instead if description lines were already collected.
    * instruction-description font: extract a description line.  A line
      with no header text is followed by more description lines; the
      "Special Registers Altered:" header is handed to
      `extract_insn_sp_regs_altered` and ends the scan; any other header
      text is an error.
    * instruction-header font: extract an additional header; stops the
      scan instead if any code or description lines were already collected.

    When nothing is found and the window reached the bottom of the current
    text section, scanning moves on to `self.text_section.next` (only if
    that section's page is in `self.pages`).

    Raises:
        PageParseError: the first header text line can't be extracted.
        InsnParseError: no further text is found, a char with an
            unexpected font is found, or a line can't be extracted.
        AssertionError: `header_start_char` isn't in the header font, or
            a description line carries an unhandled header text.
    """
    assert header_start_char.font == Font.INSN_HEADER
    print(header_start_char)
    first_header = self.extract_insn_header_mnemonics_and_bit_fields(
        start_min_y=header_start_char.min_y,
        header_start_char=header_start_char,
    )
    if first_header is None:
        raise PageParseError("can't find header text line")

    headers = [first_header]
    code_lines: list[ParsedTextLine] = []
    desc_lines: list[ParsedTextLine] = []
    sp_regs_altered = None
    next_start_min_y = first_header.min_y - 5

    while True:
        search_min_y = next_start_min_y - 70
        section = self.text_section
        next_char = self.find_top_left_char_in_range(
            min_x=section.min_x - 5,
            max_x=section.max_x + 5,
            min_y=max(search_min_y, section.min_y),
            max_y=next_start_min_y,
            allow_processed=False,
        )

        if next_char is None:
            # Nothing left in this window: only acceptable when the window
            # hit the section's bottom and there is a following section on
            # one of the pages being parsed.
            if not (
                search_min_y <= section.min_y
                and section.next is not None
                and section.next.page_num in self.pages
            ):
                raise InsnParseError("can't find insn code or description text")
            # go to next section
            self.text_section = section.next
            next_start_min_y = self.text_section.max_y
            continue

        font = next_char.font

        if font in TextLineFonts.INSN_CODE_FONTS.fonts:
            # Code appearing after the description has started is not part
            # of this instruction's code block, so stop scanning.
            if desc_lines:
                break
            code_line = self.extract_text_line(
                start_char=next_char,
                start_min_y=next_char.min_y,
                min_x=next_char.min_x,
                max_x=self.text_section.max_x,
                fonts=TextLineFonts.INSN_CODE_FONTS,
                preceding_blank_lines=1 if code_lines else 0,
            )
            if code_line is None:
                raise InsnParseError("can't find insn code text line")
            more_code_lines = self.extract_following_text_lines(
                first_text_line=code_line,
                min_x=code_line.chars[0].min_x,
                max_x=self.text_section.max_x,
                allowed_start_min_y_error=0.05,
            )
            print("more insn code lines:")
            print("\n".join(str(line) for line in more_code_lines))
            code_lines.extend(more_code_lines)
            next_start_min_y = code_lines[-1].regular_min_y - 5

        elif font in TextLineFonts.INSN_DESC_FONTS.fonts:
            desc_line = self.extract_text_line(
                start_char=next_char,
                start_min_y=next_char.min_y,
                min_x=next_char.min_x,
                max_x=self.text_section.max_x,
                fonts=TextLineFonts.INSN_DESC_FONTS,
                preceding_blank_lines=1 if desc_lines else 0,
                allowed_start_min_y_error=3,
            )
            if desc_line is None:
                raise InsnParseError("can't find insn desc text line")

            header_text = desc_line.get_header_text()
            if header_text is None:
                # Ordinary description text: gather the lines that follow.
                more_desc_lines = self.extract_following_text_lines(
                    first_text_line=desc_line,
                    min_x=desc_line.chars[0].min_x,
                    max_x=self.text_section.max_x,
                    allowed_start_min_y_error=3.5,
                )
                print("more insn desc lines:")
                print("\n".join(str(line) for line in more_desc_lines))
                desc_lines.extend(more_desc_lines)
                next_start_min_y = desc_lines[-1].regular_min_y - 5
            elif header_text == "Special Registers Altered:":
                sp_regs_altered = self.extract_insn_sp_regs_altered(
                    sp_regs_altered_text=desc_line,
                )
                next_start_min_y = sp_regs_altered.final_regular_min_y
                # The special-registers section ends the scan.
                break
            else:
                raise AssertionError(f"unhandled header text: {header_text!r}\n{desc_line}")

        elif font == Font.INSN_HEADER:
            # A header after code/description has started is not an extra
            # header for this instruction, so stop scanning.
            if code_lines or desc_lines:
                break
            extra_header = self.extract_insn_header_mnemonics_and_bit_fields(
                start_min_y=next_char.min_y,
                header_start_char=next_char,
            )
            if extra_header is None:
                raise InsnParseError("can't find header text line")
            headers.append(extra_header)
            next_start_min_y = extra_header.min_y - 5

        else:
            raise InsnParseError(f"can't find insn code or description text\nfont={font}")

    print("insn code lines:")
    print("\n".join(str(line) for line in code_lines))
    print("insn desc lines:")
    print("\n".join(str(line) for line in desc_lines))
    print("sp_regs_altered:")
    print(sp_regs_altered)
    # TODO: finish
    return Insn(
        headers=tuple(headers),
        code_lines=tuple(code_lines),
        desc_lines=tuple(desc_lines),
        sp_regs_altered=sp_regs_altered,
    )
|
|
|
|
def extract_insns(self):
    """Extract instructions until no unprocessed header-font chars remain.

    Each pass takes the first char yielded by iterating
    `self.unprocessed_chars[Font.INSN_HEADER]`, parses the instruction that
    starts there with `extract_insn`, and appends the result to `self.insns`.
    """
    # NOTE(review): termination relies on extract_insn removing the header
    # chars it consumes from self.unprocessed_chars -- confirm.
    exhausted = object()
    while True:
        header_start_char = next(
            iter(self.unprocessed_chars[Font.INSN_HEADER]),
            exhausted,
        )
        if header_start_char is exhausted:
            return
        self.insns.append(self.extract_insn(header_start_char=header_start_char))
|
|
|
|
def main():
    """Command-line entry point: parse a PDF and write the instructions as XML.

    Arguments are read from `sys.argv`:

    * `sys.argv[1]` (required): path of the PDF file to parse.
    * `sys.argv[2]` (optional): page selection.  If it contains ":" it is
      split on ":" and the integer parts are passed to `range(...)`;
      otherwise it is split on "," into a tuple of integers.  When absent,
      `page_numbers` is None.

    The output is written to "powerisa-instructions.xml" (a relative path,
    so it lands in the current working directory).  The root element is
    `<instructions>` with an `is-subset` attribute that is "True" when a
    page selection was given and "False" otherwise.

    Raises:
        SystemExit: with a usage message when the PDF path is missing.
        ValueError: when a page-selection component isn't an integer.
    """
    # Check the required argument up front so a missing path yields a usage
    # message instead of an IndexError after the parser was already built.
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "parse_powerisa_pdf"
        sys.exit(
            f"usage: {prog} <powerisa.pdf> "
            "[<start>:<stop>[:<step>] | <page>[,<page>...]]"
        )

    if 2 < len(sys.argv):
        if ":" in sys.argv[2]:
            page_numbers = range(*map(int, sys.argv[2].split(":")))
        else:
            page_numbers = tuple(int(i) for i in sys.argv[2].split(","))
    else:
        page_numbers = None

    parser = Parser()
    file_name = Path(sys.argv[1])
    parser.parse_pdf(file_name, page_numbers=page_numbers)

    insns = ElementTree.Element("instructions", attrib={"is-subset": str(page_numbers is not None)})
    # Explicit text/tail newlines put the comment and the closing tag on
    # their own lines in the serialized output.
    insns.text = "\n"
    insns.tail = "\n"
    comment = ElementTree.Comment(f" Automatically generated from {file_name.name} ")
    comment.tail = "\n"
    insns.append(comment)
    for insn in parser.insns:
        insn.write_xml(insns)
    ElementTree.ElementTree(insns).write(
        "powerisa-instructions.xml",
        encoding="utf-8",
        xml_declaration=True,
    )
|