# parse_powerisa_pdf/parse_powerisa_pdf/parse_powerisa_pdf.py
#
# Repository-viewer metadata: 1937 lines, 78 KiB, Python, executable file.

from __future__ import annotations
from collections import defaultdict
from collections.abc import Generator, Iterable, Iterator, Callable
from contextlib import contextmanager
from dataclasses import dataclass, field
import dataclasses
from functools import cached_property
import sys
from typing import ClassVar, TypeVar, assert_never
from xml.etree import ElementTree
import enum
import traceback
from copy import deepcopy
from pathlib import Path
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
from parse_powerisa_pdf.quad_tree import QuadTree
from parse_powerisa_pdf.set_by_id import SetById
@dataclass(unsafe_hash=True, frozen=True)
class Font:
    """A font identified by its PDF font name and its size.

    Every font expected in the document is registered as a class attribute
    (either a single `Font` or a tuple of them) by `_register_known_fonts`,
    which runs once right after the class is created.
    """

    font_name: str
    size: float
    # Maps each registered font to the class attribute name that holds it
    # ("NAME" for a single font, "NAME[i]" for a member of a tuple).
    __KNOWN_NAMES: ClassVar[dict[Font, str]]

    @cached_property
    def space_width(self) -> float:
        """Width of a space, scaled from 3.985 at the size of `INSN_CODE[0]`."""
        reference_size = Font.INSN_CODE[0].size
        return 3.985 * self.size / reference_size

    @cached_property
    def line_height(self) -> float:
        """Line height, scaled from a per-family reference value.

        Raises:
            AssertionError: if the font belongs to no family with a known
                line height.
        """
        name = self.font_name
        if any(name == code_font.font_name for code_font in Font.insn_code_fonts()):
            return 9.464 * self.size / Font.INSN_CODE[0].size
        desc_family_names = (
            Font.INSN_DESC_BOLD.font_name,
            Font.INSN_DESC_ITALIC.font_name,
            Font.INSN_DESC_BOLD_ITALIC.font_name,
            Font.INSN_DESC[0].font_name,
        )
        # All description-like and math fonts share the same reference height.
        if name in desc_family_names or self in Font.INSN_DESC or self in Font.MATH_MISC:
            return 10.959 * self.size / Font.INSN_DESC[0].size
        raise AssertionError(f"no line height: {self}")

    @classmethod
    def insn_code_fonts(cls) -> Iterator[Font]:
        """Yield the instruction-code fonts followed by their subscript fonts."""
        for group in (cls.INSN_CODE, cls.INSN_CODE_SUBSCRIPT):
            yield from group

    @classmethod
    def known_fonts(cls) -> Iterator[Font]:
        """Iterate over every registered font."""
        return iter(cls.__KNOWN_NAMES.keys())

    @property
    def known_name(self) -> None | str:
        """The registered attribute name of this font, or None if unregistered."""
        return self.__KNOWN_NAMES.get(self)

    @classmethod
    def _register_known_fonts(cls) -> None:
        """Define all known fonts, build the name registry and patch `__repr__`.

        Deletes itself from the class once it has run.
        """
        # Font family names that occur repeatedly below.
        sans = 'MJBFWM+DejaVuSansCondensed'
        sans_bold = 'NHUPPK+DejaVuSansCondensed-Bold'
        sans_oblique = 'CGMSHV+DejaVuSansCondensed-Oblique'
        sans_bold_oblique = 'YDJYQV+DejaVuSansCondensed-BoldOblique'
        mono = 'APUYSQ+zcoN-Regular'

        cls.INSN_HEADER = cls(font_name=sans_bold_oblique, size=9.963)
        cls.RTL_FN_HEADER = cls(font_name=mono, size=9.963)
        cls.PAGE_HEADER = cls(font_name=sans, size=9.963)
        cls.PAGE_FOOTER = cls(font_name=sans, size=4.981)
        cls.INSN_DESC = (
            cls(font_name=sans, size=8.966),
            cls(font_name='FZTIYT+CMMI9', size=8.966),
            cls(font_name='ONUAYC+CMSSI9', size=8.966),
            cls(font_name='TNGBFZ+CMSY9', size=8.966),
            cls(font_name='WHMZPU+CMEX9', size=8.966),
            cls(font_name='ZJTMSG+CMSS9', size=8.966),
        )
        desc_misc_sizes = [
            2.377, 2.561, 4.492, 4.641, 4.772, 4.864, 4.925,
            5.097, 5.123, 5.131, 5.516, 5.604, 5.634, 5.906,
            6.033, 6.068, 6.213, 6.252, 6.962, 7.977,
        ]
        cls.INSN_DESC_MISC = tuple(
            cls(font_name=sans, size=size) for size in desc_misc_sizes
        )
        cls.INSN_DESC_CODE = cls(font_name=mono, size=6.974)
        cls.INSN_DESC_CODE_MISC = (
            cls(font_name=mono, size=3.587),
            cls(font_name=mono, size=4.483),
        )
        cls.INSN_DESC_ITALIC = cls(font_name=sans_oblique, size=8.966)
        cls.INSN_DESC_BOLD = cls(font_name=sans_bold, size=8.966)
        cls.INSN_DESC_BOLD_ITALIC = cls(font_name=sans_bold_oblique, size=8.966)
        cls.INSN_DESC_SMALL = cls(font_name=sans, size=7.97)
        cls.INSN_DESC_SMALL_ITALIC = cls(font_name=sans_oblique, size=7.97)
        cls.INSN_DESC_SMALL_BOLD = cls(font_name=sans_bold, size=7.97)
        cls.INSN_DESC_SMALL_BOLD_ITALIC = cls(font_name=sans_bold_oblique, size=7.97)
        bold_misc_sizes = [
            2.21, 2.399, 2.763, 2.946, 2.949, 2.999,
            3.065, 3.086, 3.183, 3.686, 3.744, 3.825, 3.842, 3.857, 3.979,
            4.032, 4.112, 4.161, 4.206, 4.353, 4.378, 4.434,
            4.595, 4.619, 4.647, 4.68, 4.693, 4.736, 4.781, 4.802, 4.995,
            5.201, 5.258, 5.363, 5.442, 5.473, 5.485,
            5.512, 5.543, 5.613, 5.744, 5.774, 5.809, 5.849, 5.911, 5.92, 5.962, 5.981,
            6.146, 6.213, 6.221, 6.243, 6.55, 6.62, 6.699, 6.725, 6.751, 6.856,
            8.029, 8.406,
        ]
        cls.INSN_DESC_BOLD_MISC = tuple(
            cls(font_name=sans_bold, size=size) for size in bold_misc_sizes
        )
        cls.INSN_DESC_SUBSCRIPT = cls(font_name=sans, size=5.978)
        cls.INSN_DESC_BOLD_SUBSCRIPT = cls(font_name=sans_bold, size=5.978)
        cls.INSN_DESC_ITALIC_SUBSCRIPT = cls(font_name=sans_oblique, size=5.978)
        cls.INSN_DESC_BOLD_ITALIC_SUBSCRIPT = cls(font_name=sans_bold_oblique, size=5.978)
        cls.INSN_EXT_MNEMONIC = cls(font_name=mono, size=8.966)
        cls.INSN_CODE = (
            cls(font_name=mono, size=7.97),
            cls(font_name='RRFUNA+CMSY8', size=7.97),
            cls(font_name='HPXOZC+CMSS8', size=7.97),
        )
        cls.INSN_CODE_SUBSCRIPT = (
            cls(font_name=mono, size=5.978),
            cls(font_name='DBQTKF+CMSY6', size=5.978),
        )
        cls.TITLE_PAGE_BIG = cls(font_name=sans_bold, size=24.787)
        cls.TITLE_PAGE_VERSION = cls(font_name=sans_bold, size=9.963)
        cls.TITLE_PAGE_TM = cls(font_name=sans_bold, size=6.974)
        cls.TITLE_PAGE_REV = cls(font_name=sans, size=6.974)
        cls.TITLE_PAGE_BOOK = cls(font_name=sans_bold, size=20.663)
        cls.LEGAL_PAGE_ITALIC = cls(font_name=sans_oblique, size=9.963)
        cls.CHANGE_SUMMARY_PAGE_BOLD = cls(font_name=sans_bold, size=11.955)
        cls.CHAPTER_TITLE = cls(font_name=sans_bold, size=17.215)
        cls.MATH_MISC = (
            cls(font_name='AAJMKT+CMMI6', size=5.978),
            cls(font_name='CUTMFD+CMSSI8', size=5.978),
            cls(font_name='CUTMFD+CMSSI8', size=7.97),
            cls(font_name='FZTIYT+CMMI9', size=5.734),
            cls(font_name='FZTIYT+CMMI9', size=7.168),
            cls(font_name='HONFQS+CMMI8', size=7.97),
            cls(font_name='HPXOZC+CMSS8', size=5.978),
            cls(font_name='LLVRDD+CMSY10', size=11.955),
            cls(font_name='ZJTMSG+CMSS9', size=7.168),
        )

        # The registry attribute must exist before the class dict is
        # iterated, so that the loop below never adds a new class attribute.
        registry: dict[Font, str] = {}
        cls.__KNOWN_NAMES = registry
        for attr_name, attr_value in cls.__dict__.items():
            # Only the upper-case attributes defined above are font constants.
            if not attr_name[0].isupper():
                continue
            if isinstance(attr_value, cls):
                assert attr_value not in registry, f"duplicate known font: {attr_value}"
                registry[attr_value] = attr_name
            elif isinstance(attr_value, tuple) and all(isinstance(i, cls) for i in attr_value):
                for index, member in enumerate(attr_value):
                    assert member not in registry, f"duplicate known font: {member}"
                    registry[member] = f"{attr_name}[{index}]"

        dataclass_repr = cls.__repr__

        def __repr__(self: cls) -> str:
            label = self.known_name
            if label is None:
                return dataclass_repr(self)
            return f"<{self.__class__.__name__}.{label}: {dataclass_repr(self)}>"

        cls.__repr__ = __repr__
        del cls._register_known_fonts
        # Touch both cached properties so that a font with no line height is
        # reported at import time rather than on first use.
        for font in Font.known_fonts():
            font.space_width  # initialize
            font.line_height  # initialize


Font._register_known_fonts()
@dataclass(unsafe_hash=True, frozen=True)
class Char:
    """A single character with its font, advance and bounding box."""

    font: Font
    text: str
    adv: float
    min_x: float
    min_y: float
    max_x: float
    max_y: float

    def top_down_left_to_right_sort_key(self):
        """Sort key ordering by descending `min_y`, then ascending `min_x`."""
        return (-self.min_y, self.min_x)

    @property
    def width(self) -> float:
        """Horizontal extent of the bounding box."""
        right, left = self.max_x, self.min_x
        return right - left

    @property
    def height(self) -> float:
        """Vertical extent of the bounding box."""
        top, bottom = self.max_y, self.min_y
        return top - bottom
# Page-layout coordinates. Presumably PDF points in pdfminer's coordinate
# space (y increasing upwards) -- TODO confirm.

# X coordinate separating the left text column from the right one.
COLUMN_SPLIT_X = 300.0
# Horizontal bounds of the page body.
PAGE_BODY_MAX_X = 600.0
PAGE_BODY_MIN_X = 50
# Vertical bounds of the page body.
PAGE_BODY_MAX_Y = 780.0
PAGE_BODY_MIN_Y = 45.0
# Y coordinate below which columns start on pages that begin with a
# full-width title of one / two lines.
ONE_TITLE_LINE_SPLIT_Y = 734.0
TWO_TITLE_LINES_SPLIT_Y = 715.0
# Measurements of the instruction bit-field diagrams.
# NOTE(review): not used in this part of the file; meanings are inferred from
# the names only -- confirm against the code that reads them.
INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT = 29.938
INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT = 9.278
INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT = 20.971
INSN_BIT_FIELDS_TOP_PAD_HEIGHT = 20.175
INSN_BIT_FIELDS_TOP_PAD_HEIGHT2 = 14.694
INSN_BIT_FIELDS_BOX_HEIGHT = 22.317
# X positions for the "special registers altered" table.
# NOTE(review): likewise unused in this part of the file -- confirm meaning.
INSN_SP_REGS_ALTERED_REGISTER_COLUMN_X = 34.405
INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X = 86.692
INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X = 188.74
@dataclass()
class ParsedTextLine:
    """One parsed line of text, held as an XML element plus layout data."""

    element: ElementTree.Element
    regular_min_y: float
    regular_max_y: float
    fonts: TextLineFonts
    chars: list[Char]
    preceding_blank_lines: int

    @property
    def regular_height(self) -> float:
        """Difference between `regular_max_y` and `regular_min_y`."""
        return self.regular_max_y - self.regular_min_y

    def get_header_text(self) -> None | str:
        """Return the line's text if it looks like a section header.

        A header is a line consisting of exactly one childless `<b>` element
        (ignoring surrounding whitespace) whose text ends in ":" and starts
        with a title-case character. Anything else yields None.
        """
        assert self.fonts == TextLineFonts.INSN_DESC_FONTS
        root = self.element
        has_stray_text = (root.text or "").strip() != "" or (root.tail or "").strip() != ""
        if has_stray_text or len(root) != 1:
            return None
        only_child = root[0]
        if only_child.tag != "b" or len(only_child) != 0:
            return None
        text = "".join(root.itertext())
        if text.endswith(":") and text[0].istitle():
            return text
        return None

    def __repr__(self) -> str:
        unset = object()  # sentinel: no real field value can be this object
        parts = []
        for spec in dataclasses.fields(self):
            if spec.name == "element":
                xml = ElementTree.tostring(self.element, encoding="unicode")
                parts.append(spec.name + "=" + xml)
                continue
            value = getattr(self, spec.name, unset)
            rendered = "<unset>" if value is unset else repr(value)
            parts.append(spec.name + "=" + rendered)
        sep = ",\n "
        return f"{__class__.__name__}({sep.join(parts)})"

    def __str__(self) -> str:
        blank_lines = "\n" * self.preceding_blank_lines
        return blank_lines + ElementTree.tostring(self.element, encoding="unicode")

    def write_xml(self, parent: ElementTree.Element, trailing_nl: bool):
        """Append this line's content to `parent`.

        Emits one `<br>` per preceding blank line, then the element's leading
        text and deep copies of its children, then an optional final `<br>`.
        """
        for _ in range(self.preceding_blank_lines):
            ElementTree.SubElement(parent, "br").tail = "\n"
        leading_text = self.element.text
        if leading_text is not None:
            if len(parent) == 0:
                parent.text = (parent.text or "") + leading_text
            else:
                # Text after existing children belongs to the last child's tail.
                last_child = parent[-1]
                last_child.tail = (last_child.tail or "") + leading_text
        parent.extend([deepcopy(child) for child in self.element])
        if trailing_nl:
            ElementTree.SubElement(parent, "br").tail = "\n"

    @staticmethod
    def write_xml_lines(
        lines: Iterable[ParsedTextLine],
        parent: ElementTree.Element,
        trailing_nl: bool,
        preceding_nl: bool=False,
    ):
        """Write several lines into `parent`, separated by `<br>` elements."""
        if preceding_nl:
            ElementTree.SubElement(parent, "br").tail = "\n"
        for index, line in enumerate(lines):
            if index != 0:
                ElementTree.SubElement(parent, "br").tail = "\n"
            line.write_xml(parent, trailing_nl=False)
        if trailing_nl:
            ElementTree.SubElement(parent, "br").tail = "\n"
# Generic type variable for the `default` parameters of the lookup helpers
# below (their return type is "the default's type or the looked-up value").
_T = TypeVar("_T")
class BaselinePos(enum.Enum):
    """Position of text relative to the baseline (superscript vs. subscript)."""
    ABOVE = "above"
    BELOW = "below"
@dataclass(unsafe_hash=True, frozen=True)
class TextLineFonts:
regular: tuple[Font, ...]
italic: tuple[Font, ...] | None = None
bold: tuple[Font, ...] | None = None
bold_italic: tuple[Font, ...] | None = None
subscript: tuple[Font, ...] | None = None
bold_subscript: tuple[Font, ...] | None = None
italic_subscript: tuple[Font, ...] | None = None
bold_italic_subscript: tuple[Font, ...] | None = None
code: tuple[Font, ...] | None = None
code_subscript: tuple[Font, ...] | None = None
@classmethod
def _define_fonts(cls):
cls.INSN_MNEMONIC_FONTS = cls(
regular=Font.INSN_DESC,
)
cls.INSN_HEADER_FONTS = cls(
regular=(Font.INSN_HEADER,),
)
cls.INSN_BIT_FIELD_BIT_NUMBER_FONTS = cls(
regular=(Font.INSN_DESC_SMALL, Font.TITLE_PAGE_REV),
)
cls.INSN_BIT_FIELD_NAME_FONTS = cls(
regular=Font.INSN_DESC,
subscript=(Font.INSN_DESC_SUBSCRIPT,),
)
cls.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS = cls(
regular=(Font.INSN_DESC_SMALL,),
bold=(Font.INSN_DESC_SMALL_BOLD,),
)
cls.INSN_CODE_FONTS = cls(
regular=Font.INSN_CODE,
subscript=Font.INSN_CODE_SUBSCRIPT,
)
cls.INSN_DESC_FONTS = cls(
regular=(*Font.INSN_DESC, Font.INSN_DESC_SMALL),
bold=(Font.INSN_DESC_BOLD, Font.INSN_DESC_SMALL_BOLD),
italic=(Font.INSN_DESC_ITALIC, Font.INSN_DESC_SMALL_ITALIC),
bold_italic=(Font.INSN_DESC_BOLD_ITALIC, Font.INSN_DESC_SMALL_BOLD_ITALIC),
subscript=(Font.INSN_DESC_SUBSCRIPT,),
bold_subscript=(Font.INSN_DESC_BOLD_SUBSCRIPT,),
italic_subscript=(Font.INSN_DESC_ITALIC_SUBSCRIPT,),
bold_italic_subscript=(Font.INSN_DESC_BOLD_ITALIC_SUBSCRIPT,),
code=(Font.INSN_DESC_CODE, Font.INSN_EXT_MNEMONIC),
code_subscript=Font.INSN_CODE_SUBSCRIPT,
)
del cls._define_fonts
def get_font(
self,
part_kind: TextLineFontKind,
default: _T=None,
) -> _T | tuple[tuple[Font, ...], None | BaselinePos]:
match part_kind:
case TextLineFontKind.REGULAR:
font = self.regular
case TextLineFontKind.ITALIC:
font = self.italic
case TextLineFontKind.BOLD:
font = self.bold
case TextLineFontKind.BOLD_ITALIC:
font = self.bold_italic
case TextLineFontKind.SUBSCRIPT:
font = self.subscript
case TextLineFontKind.SUPERSCRIPT:
font = self.subscript
case TextLineFontKind.BOLD_SUBSCRIPT:
font = self.bold_subscript
case TextLineFontKind.BOLD_SUPERSCRIPT:
font = self.bold_subscript
case TextLineFontKind.ITALIC_SUBSCRIPT:
font = self.italic_subscript
case TextLineFontKind.ITALIC_SUPERSCRIPT:
font = self.italic_subscript
case TextLineFontKind.BOLD_ITALIC_SUBSCRIPT:
font = self.bold_italic_subscript
case TextLineFontKind.BOLD_ITALIC_SUPERSCRIPT:
font = self.bold_italic_subscript
case TextLineFontKind.CODE:
font = self.code
case TextLineFontKind.CODE_SUBSCRIPT:
font = self.code_subscript
case TextLineFontKind.CODE_SUPERSCRIPT:
font = self.code_subscript
case _:
assert_never(part_kind)
if font is None:
return default
return font, part_kind.sub_super.baseline_pos
@cached_property
def __font_to_kind_map(self) -> dict[tuple[Font, None | BaselinePos], TextLineFontKind]:
retval: dict[tuple[Font, None | BaselinePos], TextLineFontKind] = {}
for kind in TextLineFontKind:
fonts = self.get_font(kind)
if fonts is None:
continue
fonts, baseline_pos = fonts
for font in fonts:
assert font not in retval, \
f"duplicate font: kind={kind} old_kind={retval[font]} font={font}"
retval[font, baseline_pos] = kind
return retval
@cached_property
def fonts(self) -> frozenset[Font]:
retval: set[Font] = set()
for kind in TextLineFontKind:
fonts = self.get_font(kind)
if fonts is None:
continue
fonts, baseline_pos = fonts
retval.update(fonts)
return frozenset(retval)
def get_kind(self, font: Font, baseline_pos: BaselinePos, default: _T=None) -> _T | TextLineFontKind:
retval = self.__font_to_kind_map.get((font, baseline_pos))
if retval is None:
retval = self.__font_to_kind_map.get((font, None))
if retval is None:
return default
return retval
TextLineFonts._define_fonts()
class FontVariantCode(tuple, enum.Enum):
    """Code-font variant; each value is the tuple of tag names it contributes
    to `TextLineFontKind.text_line_tags`."""
    CODE = ("code",)
    NOT_CODE = ()
class FontVariantBold(tuple, enum.Enum):
    """Bold variant; each value is the tuple of tag names it contributes
    to `TextLineFontKind.text_line_tags`."""
    BOLD = ("b",)
    NOT_BOLD = ()
class FontVariantItalic(tuple, enum.Enum):
    """Italic variant; each value is the tuple of tag names it contributes
    to `TextLineFontKind.text_line_tags`."""
    ITALIC = ("i",)
    NOT_ITALIC = ()
class FontVariantSubSuper(tuple, enum.Enum):
NOT_SUB_SUPER = ()
SUBSCRIPT = ("sub",)
SUPERSCRIPT = ("sup",)
@cached_property
def baseline_pos(self) -> None | BaselinePos:
match self:
case FontVariantSubSuper.NOT_SUB_SUPER:
return None
case FontVariantSubSuper.SUBSCRIPT:
return BaselinePos.BELOW
case FontVariantSubSuper.SUPERSCRIPT:
return BaselinePos.ABOVE
case _:
assert_never(self)
class TextLineFontKind(enum.Enum):
    """A text style: one combination of code / bold / italic / sub-super variants."""

    def __init__(
        self,
        code: FontVariantCode,
        bold: FontVariantBold,
        italic: FontVariantItalic,
        sub_super: FontVariantSubSuper,
    ):
        # Each member's value is a 4-tuple that Enum unpacks into these arguments.
        self.code = code
        self.bold = bold
        self.italic = italic
        self.sub_super = sub_super

    REGULAR = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    SUBSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    SUPERSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )
    ITALIC = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    ITALIC_SUBSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    ITALIC_SUPERSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )
    BOLD = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    BOLD_SUBSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    BOLD_SUPERSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )
    BOLD_ITALIC = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    BOLD_ITALIC_SUBSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    BOLD_ITALIC_SUPERSCRIPT = (
        FontVariantCode.NOT_CODE, FontVariantBold.BOLD,
        FontVariantItalic.ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )
    CODE = (
        FontVariantCode.CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER,
    )
    CODE_SUBSCRIPT = (
        FontVariantCode.CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT,
    )
    CODE_SUPERSCRIPT = (
        FontVariantCode.CODE, FontVariantBold.NOT_BOLD,
        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT,
    )

    @cached_property
    def text_line_tags(self) -> tuple[str, ...]:
        """Tag names for this style, ordered code, bold, italic, sub/super."""
        variants = (self.code, self.bold, self.italic, self.sub_super)
        return tuple(tag for variant in variants for tag in variant.value)
class PageParseError(Exception):
    """Error type for failures while parsing a page (raised outside this part of the file)."""
    pass
class InsnParseError(Exception):
    """Error type for failures while parsing an instruction (raised outside this part of the file)."""
    pass
class ElementBodyBuilder:
    """Incrementally fills an XML element with text wrapped in nested tags.

    The caller alternates `set_tag_stack` (which tags should currently be
    open, outermost first) and `write_text`. Text is buffered and attached
    to the right `.text` / `.tail` slot whenever the open tags change.
    Use as a context manager, or call `flush` when done.
    """

    def __init__(self, containing_element: ElementTree.Element):
        self.__containing_element = containing_element
        # Currently open elements, outermost first.
        self.__stack: list[ElementTree.Element] = []
        # Text written since the last flush; it belongs to the innermost open element.
        self.__text_buffer: list[str] = []

    def __shrink_stack(self, new_len: int):
        """Close open elements until at most `new_len` remain."""
        while new_len < len(self.__stack):
            # Flush first so pending text lands inside the element being closed.
            self.__flush_text_buffer()
            self.__stack.pop()

    def set_tag_stack(self, tag_stack: Iterable[str]):
        """Make the open elements match `tag_stack` (outermost first).

        Already-open elements are reused as long as their tags match. At the
        first mismatch that element and everything nested in it are closed
        and a new element with the requested tag is opened in its place.
        """
        new_len = 0
        for i, tag in enumerate(tag_stack):
            new_len = i + 1
            if i < len(self.__stack) and self.__stack[i].tag != tag:
                # Close the mismatching element itself (shrink to i, not
                # i + 1); otherwise text would keep going into the wrong tag.
                self.__shrink_stack(i)
            if i >= len(self.__stack):
                self.__flush_text_buffer()
                self.__stack.append(ElementTree.SubElement(self.__insert_point(), tag))
        self.__shrink_stack(new_len)

    def write_text(self, text: str):
        """Buffer `text` for the innermost currently open element."""
        self.__text_buffer.append(text)

    def __insert_point(self) -> ElementTree.Element:
        """Return the innermost open element, or the container if none is open."""
        if len(self.__stack) != 0:
            return self.__stack[-1]
        return self.__containing_element

    def __flush_text_buffer(self):
        """Attach the buffered text at the current insert point."""
        if len(self.__text_buffer) == 0:
            return
        insert_point = self.__insert_point()
        text = "".join(self.__text_buffer)
        self.__text_buffer.clear()
        if len(insert_point) != 0:
            # Text after child elements goes in the last child's tail.
            element = insert_point[-1]
            element.tail = (element.tail or "") + text
        else:
            insert_point.text = (insert_point.text or "") + text

    def __enter__(self) -> ElementBodyBuilder:
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.flush()

    def flush(self):
        """Close all open elements and write out any remaining text."""
        self.set_tag_stack(())
        self.__flush_text_buffer()
@dataclass(unsafe_hash=True, frozen=True)
class InsnBitField:
    """One field of an instruction bit-field diagram: box x-range, name and bit number."""

    box_min_x: float
    box_max_x: float
    name: ParsedTextLine
    bit_number: ParsedTextLine

    def __str__(self) -> str:
        x_range = f"{self.box_min_x}..{self.box_max_x}"
        return f"<InsnBitField: x={x_range} name={self.name} bit_number={self.bit_number}>"

    def write_xml(self, parent: ElementTree.Element):
        """Append a `<field>` with `<name>` and `<bit-number>` children to `parent`."""
        field_elm = ElementTree.SubElement(parent, "field")
        field_elm.text = "\n"
        field_elm.tail = "\n"
        for tag, line in (("name", self.name), ("bit-number", self.bit_number)):
            child = ElementTree.SubElement(field_elm, tag)
            child.tail = "\n"
            line.write_xml(child, trailing_nl=False)
@dataclass(unsafe_hash=True, frozen=True)
class InsnBitFieldsPrefix:
    """The prefix part of a bit-field diagram: its box, fields and the
    text lines before and after it."""

    box_min_x: float
    box_min_y: float
    box_max_x: float
    box_max_y: float
    prefix_text: ParsedTextLine
    fields: tuple[InsnBitField, ...]
    suffix_text: ParsedTextLine

    def __str__(self):
        fields_text = ",\n ".join(str(field) for field in self.fields)
        pieces = [
            f"<InsnBitFieldsPrefix: ({self.box_min_x},{self.box_min_y})..",
            f"({self.box_max_x},{self.box_max_y})\n",
            f" prefix_text={self.prefix_text}\n",
            " [\n",
            f" {fields_text},\n",
            " ]\n",
            f" suffix_text={self.suffix_text}>",
        ]
        return "".join(pieces)

    def write_xml(self, parent: ElementTree.Element):
        """Append `<prefix>` holding prefix text, the fields and suffix text."""
        prefix_elm = ElementTree.SubElement(parent, "prefix")
        prefix_elm.text = "\n"
        prefix_elm.tail = "\n"

        before = ElementTree.SubElement(prefix_elm, "prefix-text")
        before.tail = "\n"
        self.prefix_text.write_xml(before, trailing_nl=False)

        InsnBitFields.write_xml_fields(self.fields, prefix_elm)

        after = ElementTree.SubElement(prefix_elm, "suffix-text")
        after.tail = "\n"
        self.suffix_text.write_xml(after, trailing_nl=False)
@dataclass(unsafe_hash=True, frozen=True)
class InsnBitFields:
    """An instruction's bit-field diagram: optional prefix, box and fields."""

    prefix: None | InsnBitFieldsPrefix
    box_min_x: float
    box_min_y: float
    box_max_x: float
    box_max_y: float
    fields: tuple[InsnBitField, ...]

    def __str__(self):
        prefix_str = "" if self.prefix is None else f"{self.prefix}\n"
        fields_text = ",\n ".join(str(field) for field in self.fields)
        box = f"({self.box_min_x},{self.box_min_y})..({self.box_max_x},{self.box_max_y})"
        return f"{prefix_str}<InsnBitFields: {box} [\n {fields_text},\n]>"

    @staticmethod
    def write_xml_fields(fields: Iterable[InsnBitField], parent: ElementTree.Element):
        """Append a `<fields>` element containing every field to `parent`."""
        container = ElementTree.SubElement(parent, "fields")
        container.text = "\n"
        container.tail = "\n"
        for item in fields:
            item.write_xml(container)

    def write_xml(self, parent: ElementTree.Element):
        """Append `<bit-fields>` (prefix first, if any, then the fields)."""
        root = ElementTree.SubElement(parent, "bit-fields")
        root.text = "\n"
        root.tail = "\n"
        if self.prefix is not None:
            self.prefix.write_xml(root)
        InsnBitFields.write_xml_fields(self.fields, root)
@dataclass(unsafe_hash=True, frozen=True)
class InsnSpRegsAlteredEntry:
    """One row of a "special registers altered" table: register, fields, conditions."""

    reg: ParsedTextLine
    fields: tuple[ParsedTextLine, ...]
    conds: tuple[ParsedTextLine, ...]

    def __str__(self, indent="") -> str:
        def render(items) -> str:
            # An empty tuple is shown compactly; anything else one item per line.
            if items == ():
                return "()"
            body = [f"{indent} {item}," for item in items]
            return "\n".join(["(", *body, f"{indent} )"])

        fields = render(self.fields)
        conds = render(self.conds)
        return (f"Entry(\n"
                f"{indent} reg={self.reg},\n"
                f"{indent} fields={fields},\n"
                f"{indent} conds={conds},\n"
                f"{indent})")

    def write_xml(self, parent: ElementTree.Element):
        """Append `<entry>` with `<register>`, `<fields>` and `<conditions>`."""
        entry = ElementTree.SubElement(parent, "entry")
        entry.text = "\n"
        entry.tail = "\n"

        reg = ElementTree.SubElement(entry, "register")
        reg.tail = "\n"
        self.reg.write_xml(reg, trailing_nl=False)

        for tag, lines in (("fields", self.fields), ("conditions", self.conds)):
            child = ElementTree.SubElement(entry, tag)
            child.tail = "\n"
            ParsedTextLine.write_xml_lines(lines, child, trailing_nl=False)
@dataclass(unsafe_hash=True, frozen=True)
class InsnSpRegsAltered:
    """The "special registers altered" section of an instruction."""

    sp_regs_altered_text: ParsedTextLine
    special_text: None | ParsedTextLine
    table_header_reg: None | ParsedTextLine
    table_header_fields: None | ParsedTextLine
    entries: tuple[InsnSpRegsAlteredEntry, ...]
    final_regular_min_y: float

    def __str__(self) -> str:
        out = [
            "InsnSpRegsAltered(",
            f" sp_regs_altered_text={self.sp_regs_altered_text},",
        ]
        optional = (
            ("special_text", self.special_text),
            ("table_header_reg", self.table_header_reg),
            ("table_header_fields", self.table_header_fields),
        )
        out.extend(f" {label}={value}," for label, value in optional if value is not None)
        if len(self.entries) == 0:
            out.append(" entries=(),")
        else:
            out.append(" entries=(")
            out.extend(f" {entry.__str__(' ')}," for entry in self.entries)
            out.append(" ),")
        out.append(f" final_regular_min_y={self.final_regular_min_y},")
        out.append(")")
        return "\n".join(out)

    def write_xml(self, parent: ElementTree.Element):
        """Append `<special-registers-altered>`: title, optional texts, entries."""
        root = ElementTree.SubElement(parent, "special-registers-altered")
        root.text = "\n"
        root.tail = "\n"
        sections = (
            ("title", self.sp_regs_altered_text),
            ("special-text", self.special_text),
            ("table-header-register", self.table_header_reg),
            ("table-header-fields", self.table_header_fields),
        )
        for tag, line in sections:
            # The title is always present; the remaining parts are optional.
            if line is None:
                continue
            child = ElementTree.SubElement(root, tag)
            child.tail = "\n"
            line.write_xml(child, trailing_nl=False)
        for entry in self.entries:
            entry.write_xml(root)
class _InsnParseSection(enum.Enum):
    """Names the sections of an instruction (code, header, description).

    NOTE(review): not used in this part of the file; presumably tracks which
    section the instruction parser is currently in -- confirm at the use site.
    """
    CODE = "code"
    HEADER = "header"
    DESC = "desc"
# Maps the Unicode ligature characters U+FB00..U+FB04 to the plain letter
# sequences they stand for.
CHAR_TO_EXPANDED = {
"\ufb00": "ff",
"\ufb01": "fi",
"\ufb02": "fl",
"\ufb03": "ffi",
"\ufb04": "ffl",
}
@dataclass()
class Page:
    """The characters and line/rectangle shapes of one PDF page, grouped by text section."""

    page_num: int
    # Per-section spatial index over characters and shapes.
    qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]]
    # Characters grouped by section, then by font.
    unprocessed_chars: defaultdict[TextSection, defaultdict[Font, SetById[Char]]]
    # Line/rectangle shapes that fall inside some text section.
    unprocessed_non_text: SetById[LTLine | LTRect]

    @staticmethod
    def from_lt_page(page_num: int, page: LTPage) -> Page:
        """Build a `Page` from a pdfminer `LTPage`.

        Raises:
            AssertionError: if a character lies within the body's vertical
                range but in no text section (except on page 1072), or if
                any character uses a font that is not registered in `Font`.
        """
        # NOTE(review): the source's indentation was lost. The nesting of the
        # shape branch below (text_section lookup after the if/else,
        # `unprocessed_non_text.add` inside `if text_section is not None`) is
        # the most plausible reading -- confirm against the original file.
        qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree)
        unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char]))
        unprocessed_non_text: SetById[LTLine | LTRect] = SetById()
        for component in page:
            if isinstance(component, (LTLine, LTRect)):
                # Debug output; shapes spanning the column split are labelled "wide".
                if component.width > 100 and \
                        component.x0 < COLUMN_SPLIT_X - 10 and \
                        component.x1 > COLUMN_SPLIT_X + 10:
                    print(f"wide component: {component}")
                else:
                    print(f"component: {component}")
                # A shape is assigned to the section containing its centre.
                text_section = TextSection.for_position(
                    page_num=page_num,
                    x=(component.x0 + component.x1) * 0.5,
                    y=(component.y0 + component.y1) * 0.5,
                )
                if text_section is not None:
                    qt[text_section].insert(component.x0, component.y0, component)
                    unprocessed_non_text.add(component)
                continue
            if not isinstance(component, LTTextBox):
                print(f"ignoring: {component}")
                continue
            for text_line in component:
                for element in text_line:
                    if not isinstance(element, LTChar):
                        continue
                    # A character is assigned to the section containing its centre.
                    text_section = TextSection.for_position(
                        page_num=page_num,
                        x=(element.x0 + element.x1) * 0.5,
                        y=(element.y0 + element.y1) * 0.5,
                    )
                    if text_section is None:
                        # Characters outside the body's vertical range are
                        # skipped silently; inside that range it is an error.
                        if PAGE_BODY_MIN_Y <= element.y0 <= PAGE_BODY_MAX_Y:
                            if page_num != 1072: # page 1072 has characters in the margins
                                raise AssertionError(
                                    f"char not in text section: {element}\npage_num={page_num}")
                        continue
                    char = Char(
                        text=element.get_text(),
                        # Size is rounded to 3 decimals so it can match the registered fonts.
                        font=Font(font_name=element.fontname, size=round(element.size, 3)),
                        adv=element.adv,
                        min_x=element.x0,
                        min_y=element.y0,
                        max_x=element.x1,
                        max_y=element.y1,
                    )
                    qt[text_section].insert(char.min_x, char.min_y, char)
                    unprocessed_chars[text_section][char.font].add(char)
        for i in unprocessed_chars.values():
            for j in i.values():
                j.sort(key=Char.top_down_left_to_right_sort_key)
        # Report every unknown font at once, with the text set in it.
        unknown_fonts=[]
        unknown_font_errors=[]
        for i in unprocessed_chars.values():
            for font, chars in i.items():
                if font.known_name is None:
                    text = ""
                    char = None
                    for char in chars:
                        text += char.text
                    unknown_fonts.append(repr(font) + ",")
                    unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}")
        unknown_fonts.sort()
        if len(unknown_fonts) != 0:
            raise AssertionError("\nunknown fonts:\n" + "\n".join(unknown_fonts)
                                 + "\n\n" + "\n".join(unknown_font_errors))
        return Page(
            page_num=page_num,
            qt=qt,
            unprocessed_chars=unprocessed_chars,
            unprocessed_non_text=unprocessed_non_text,
        )
class Pages:
    """Page-number-indexed cache that pulls pages lazily from a generator.

    The generator must yield pages with strictly increasing, positive
    `page_num`. Pages are read only as far as a lookup requires. Use as a
    context manager (or call `close`) to close the generator early.
    """

    pages_gen: None | Generator[Page, None, None]
    __pages: dict[int, Page]
    __max_page_num: int

    def __init__(self, pages_gen: None | Generator[Page, None, None]=None):
        self.pages_gen = pages_gen
        self.__pages = {}
        # Highest page number read so far; 0 means nothing has been read yet.
        self.__max_page_num = 0

    def __enter__(self) -> Pages:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def close(self):
        """Close the generator, if still open. Cached pages stay available."""
        gen, self.pages_gen = self.pages_gen, None
        if gen is not None:
            gen.close()

    def is_past_end(self, page_num: int) -> bool:
        """Return True if `page_num` is beyond the last page of the document.

        Reads from the generator until it reaches `page_num` or is exhausted.
        """
        while self.pages_gen is not None and page_num > self.__max_page_num:
            self.__fill_page()
        return page_num > self.__max_page_num

    def __fill_page(self) -> bool:
        """Read one more page; return False when the generator is exhausted.

        Raises:
            AssertionError: if the new page number is not greater than all
                earlier ones. The error is also thrown into the generator.
        """
        if self.pages_gen is None:
            return False
        # A yielded None is treated the same as exhaustion.
        page = next(self.pages_gen, None)
        if page is None:
            self.close()
            return False
        if page.page_num <= self.__max_page_num:
            e = AssertionError(
                f"page numbers must be a strictly-increasing positive integer sequence:\n"
                f"got {page.page_num} which isn't more than {self.__max_page_num}")
            if self.pages_gen is not None:
                self.pages_gen.throw(e)
            raise e # either no generator or generator failed to propagate exception
        self.__pages[page.page_num] = page
        self.__max_page_num = page.page_num
        return True

    def get(self, page_num: int, default: _T=None) -> _T | Page:
        """Return the page numbered `page_num`, or `default` if it doesn't exist."""
        while True:
            page = self.__pages.get(page_num)
            if page is not None:
                return page
            if self.pages_gen is None:
                return default
            # Page numbers only increase, so a number at or below the highest
            # one seen can no longer turn up. `<=` (rather than `<`) keeps
            # get(0) on a fresh object from draining the whole generator.
            if page_num <= self.__max_page_num:
                return default
            self.__fill_page()

    def __contains__(self, page_num: int, /) -> bool:
        return self.get(page_num) is not None

    def __getitem__(self, page_num: int, /) -> Page:
        retval = self.get(page_num)
        if retval is None:
            raise KeyError(page_num)
        return retval
@dataclass(unsafe_hash=True, frozen=True)
class TextSection:
page_num: int
min_x: float
min_y: float
max_x: float
max_y: float
@classmethod
def first(cls) -> TextSection:
    """Return the first text section of page 1."""
    sections_of_first_page = cls.page_sections(page_num=1)
    return sections_of_first_page[0]
@cached_property
def next(self) -> TextSection:
    """The text section following this one.

    That is the next section on the same page if there is one, otherwise
    the first section of the nearest later page that has any sections.

    Raises:
        AssertionError: if none of the following 99999 pages has a section.
    """
    sections = self.page_sections(page_num=self.page_num)
    following = sections.index(self) + 1
    if following < len(sections):
        return sections[following]
    search_end = self.page_num + 100000
    for page_num in range(self.page_num + 1, search_end):
        sections = self.page_sections(page_num=page_num)
        if len(sections) != 0:
            return sections[0]
    raise AssertionError(f"can't find next TextSection after {self}")
@classmethod
def left_column(
    cls, *,
    page_num: int,
    min_y=PAGE_BODY_MIN_Y,
    max_y=PAGE_BODY_MAX_Y,
) -> TextSection:
    """Section spanning the left column (body left edge to the column split)."""
    bounds = dict(
        min_x=PAGE_BODY_MIN_X,
        min_y=min_y,
        max_x=COLUMN_SPLIT_X,
        max_y=max_y,
    )
    return cls(page_num=page_num, **bounds)
@classmethod
def right_column(
    cls, *,
    page_num: int,
    min_y=PAGE_BODY_MIN_Y,
    max_y=PAGE_BODY_MAX_Y,
) -> TextSection:
    """Section spanning the right column (column split to the body right edge)."""
    bounds = dict(
        min_x=COLUMN_SPLIT_X,
        min_y=min_y,
        max_x=PAGE_BODY_MAX_X,
        max_y=max_y,
    )
    return cls(page_num=page_num, **bounds)
@classmethod
def columns(
    cls, *,
    page_num: int,
    min_y=PAGE_BODY_MIN_Y,
    max_y=PAGE_BODY_MAX_Y,
) -> tuple[TextSection, TextSection]:
    """Return the (left, right) column sections for the given vertical range."""
    shared = dict(page_num=page_num, min_y=min_y, max_y=max_y)
    left = cls.left_column(**shared)
    right = cls.right_column(**shared)
    return (left, right)
@classmethod
def full_page(
    cls, *,
    page_num: int,
    min_y=PAGE_BODY_MIN_Y,
    max_y=PAGE_BODY_MAX_Y,
) -> TextSection:
    """Section spanning the full body width for the given vertical range."""
    bounds = dict(
        min_x=PAGE_BODY_MIN_X,
        min_y=min_y,
        max_x=PAGE_BODY_MAX_X,
        max_y=max_y,
    )
    return cls(page_num=page_num, **bounds)
# Per-page layout tables read by __page_sections. The dicts map a page number
# to the y coordinate (split_y) where the layout changes; the sets only list
# page numbers.

# Two columns above split_y, one full-width section below it.
__COLUMNS_THEN_FULL_PAGE: ClassVar = {
129: 438.992, 241: 512.419, 242: 408.077, 243: 488.509,
244: 437.518, 245: 444.522, 247: 352.082, 248: 356.723,
249: 365.944, 251: 334.553, 264: 184.67, 296: 267.29,
297: 200.043, 298: 440.64, 299: 197.356, 300: 160.076,
301: 364.924, 303: 330.055, 305: 344.867, 306: 335.403,
307: 336.897, 308: 365.233, 309: 364.735,
}
# One full-width section above split_y, two columns below it.
__FULL_PAGE_THEN_COLUMNS: ClassVar = {
246: 689.039,
250: 615.315,
266: 678.088,
}
# Full-width title above ONE_TITLE_LINE_SPLIT_Y, then two columns down to
# split_y, then one full-width section below it.
__ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE: ClassVar = {
128: 301.55,
}
# NOTE(review): the code using this table is cut off in this part of the
# file; presumably the same as above with TWO_TITLE_LINES_SPLIT_Y -- confirm.
__TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE: ClassVar = {
304: 242.732,
}
# Two columns above split_y, then a second pair of columns below it.
__COLUMNS_THEN_COLUMNS: ClassVar = {
79: 621.66,
126: 519.89,
}
# Full-width title above ONE_TITLE_LINE_SPLIT_Y, then columns down to
# split_y, then a second pair of columns below it.
__ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS: ClassVar = {
130: 550.43,
162: 599.247,
194: 622.161,
196: 682.933,
204: 613.195,
215: 633.12,
}
# Full-width title above ONE_TITLE_LINE_SPLIT_Y, two columns below it.
__ONE_TITLE_LINE_THEN_COLUMNS: ClassVar = {
103, 104, 105, 121, 139, 143, 146, 151, 153, 158, 207, 213, 218,
}
# Full-width title above TWO_TITLE_LINES_SPLIT_Y, two columns below it.
__TWO_TITLE_LINES_THEN_COLUMNS: ClassVar = {
198, 206,
}
# Pages that are a single full-width section.
__FULL_PAGE: ClassVar = {
118, 157, 252, 254, 255, 257, 259, 260, 265, 268, 270, 271, 272,
*range(274, 286),
}
@classmethod
def __page_sections(cls, page_num: int) -> tuple[TextSection, ...]:
    """Compute the ordered text sections that make up page `page_num`.

    The layout is looked up in the per-page tables of this class; a page that
    is not listed anywhere is split into a left and a right column.  The
    tables are consulted in a fixed order, so a page listed in more than one
    of them always resolves to the first matching layout.
    """
    if page_num in cls.__COLUMNS_THEN_COLUMNS:
        split_y = cls.__COLUMNS_THEN_COLUMNS[page_num]
        return (cls.columns(page_num=page_num, min_y=split_y)
                + cls.columns(page_num=page_num, max_y=split_y))
    if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS:
        title = cls.full_page(page_num=page_num, min_y=ONE_TITLE_LINE_SPLIT_Y)
        return (title,) + cls.columns(page_num=page_num, max_y=ONE_TITLE_LINE_SPLIT_Y)
    if page_num in cls.__FULL_PAGE:
        return (cls.full_page(page_num=page_num),)
    if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS:
        split_y = cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS[page_num]
        title = cls.full_page(page_num=page_num, min_y=ONE_TITLE_LINE_SPLIT_Y)
        return (
            (title,)
            + cls.columns(page_num=page_num, min_y=split_y, max_y=ONE_TITLE_LINE_SPLIT_Y)
            + cls.columns(page_num=page_num, max_y=split_y)
        )
    if page_num in cls.__TWO_TITLE_LINES_THEN_COLUMNS:
        title = cls.full_page(page_num=page_num, min_y=TWO_TITLE_LINES_SPLIT_Y)
        return (title,) + cls.columns(page_num=page_num, max_y=TWO_TITLE_LINES_SPLIT_Y)
    if page_num in cls.__COLUMNS_THEN_FULL_PAGE:
        split_y = cls.__COLUMNS_THEN_FULL_PAGE[page_num]
        return (cls.columns(page_num=page_num, min_y=split_y)
                + (cls.full_page(page_num=page_num, max_y=split_y),))
    if page_num in cls.__FULL_PAGE_THEN_COLUMNS:
        split_y = cls.__FULL_PAGE_THEN_COLUMNS[page_num]
        return ((cls.full_page(page_num=page_num, min_y=split_y),)
                + cls.columns(page_num=page_num, max_y=split_y))
    if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE:
        split_y = cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE[page_num]
        title = cls.full_page(page_num=page_num, min_y=ONE_TITLE_LINE_SPLIT_Y)
        return (
            (title,)
            + cls.columns(page_num=page_num, min_y=split_y, max_y=ONE_TITLE_LINE_SPLIT_Y)
            + (cls.full_page(page_num=page_num, max_y=split_y),)
        )
    if page_num in cls.__TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE:
        split_y = cls.__TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE[page_num]
        title = cls.full_page(page_num=page_num, min_y=TWO_TITLE_LINES_SPLIT_Y)
        return (
            (title,)
            + cls.columns(page_num=page_num, min_y=split_y, max_y=TWO_TITLE_LINES_SPLIT_Y)
            + (cls.full_page(page_num=page_num, max_y=split_y),)
        )
    if page_num == 263:
        # One-off layout: full width, two stacked column pairs, full width.
        return (
            (cls.full_page(page_num=page_num, min_y=699.997),)
            + cls.columns(page_num=page_num, min_y=366.396, max_y=699.997)
            + cls.columns(page_num=page_num, min_y=207, max_y=366.396)
            + (cls.full_page(page_num=page_num, max_y=207),)
        )
    # TODO: checked up to page 309 (page named 273)
    return cls.columns(page_num=page_num)
# Memo of page number -> sections, filled lazily by page_sections().
__PAGE_SECTIONS_CACHE: ClassVar[dict[int, tuple[TextSection, ...]]] = {}


@classmethod
def page_sections(cls, page_num: int) -> tuple[TextSection, ...]:
    """Return the sections of page `page_num`, computing them at most once.

    The result of `__page_sections()` is stored in a class-level cache, so
    repeated calls for the same page return the very same tuple.
    """
    cache = cls.__PAGE_SECTIONS_CACHE
    sections = cache.get(page_num)
    if sections is None:
        # Cached values are always tuples, so None can only mean "not cached".
        sections = cache[page_num] = cls.__page_sections(page_num=page_num)
    return sections
@classmethod
def for_position(cls, page_num: int, x: float, y: float) -> None | TextSection:
    """Find the first section of page `page_num` that contains point (x, y).

    Section bounds are inclusive on all four sides.  Returns None when the
    point lies in none of the page's sections.
    """
    def contains_point(section) -> bool:
        inside_x = section.min_x <= x <= section.max_x
        return inside_x and section.min_y <= y <= section.max_y

    candidates = cls.page_sections(page_num=page_num)
    return next((section for section in candidates if contains_point(section)), None)
@dataclass(frozen=True)
class InsnHeader:
    """One instruction header: title lines, mnemonic lines and bit-fields."""
    header_lines: tuple[ParsedTextLine, ...]
    mnemonic_lines: tuple[ParsedTextLine, ...]
    bit_fields: InsnBitFields

    @property
    def min_y(self) -> float:
        """The `box_min_y` of this header's bit-fields box."""
        return self.bit_fields.box_min_y

    def write_xml(self, parent: ElementTree.Element):
        """Append a `<header>` element describing this header to `parent`.

        The element contains `<title>` and `<mnemonics>` children followed by
        whatever `bit_fields.write_xml()` emits.
        """
        header_elem = ElementTree.SubElement(parent, "header")
        header_elem.text = header_elem.tail = "\n"
        text_children = (
            ("title", self.header_lines),
            ("mnemonics", self.mnemonic_lines),
        )
        for tag, lines in text_children:
            child = ElementTree.SubElement(header_elem, tag)
            child.tail = "\n"
            ParsedTextLine.write_xml_lines(lines, child, trailing_nl=False)
        self.bit_fields.write_xml(header_elem)
@dataclass(frozen=True)
class Insn:
    """A parsed instruction: headers, code, description and altered registers."""
    headers: tuple[InsnHeader, ...]
    code_lines: tuple[ParsedTextLine, ...]
    desc_lines: tuple[ParsedTextLine, ...]
    sp_regs_altered: None | InsnSpRegsAltered

    def write_xml(self, parent: ElementTree.Element):
        """Append an `<instruction>` element for this instruction to `parent`.

        Headers come first, then `<code>` and `<description>` (each only when
        it has at least one line), then the special-registers-altered data
        when present.
        """
        insn_elem = ElementTree.SubElement(parent, "instruction")
        insn_elem.text = insn_elem.tail = "\n"
        for hdr in self.headers:
            hdr.write_xml(insn_elem)
        text_children = (
            ("code", self.code_lines),
            ("description", self.desc_lines),
        )
        for tag, lines in text_children:
            if len(lines) == 0:
                continue
            child = ElementTree.SubElement(insn_elem, tag)
            child.tail = "\n"
            ParsedTextLine.write_xml_lines(lines, child, trailing_nl=False)
        if self.sp_regs_altered is not None:
            self.sp_regs_altered.write_xml(insn_elem)
# Stateful PDF parser: walks the document one TextSection at a time and
# accumulates the instructions it recognises in `insns`.
@dataclass()
class Parser:
# Page store; parse_pdf() replaces it with one fed from the PDF being parsed.
pages: Pages = field(default_factory=Pages)
# Section currently being parsed.  Advanced by parse_pdf() and, when an
# instruction continues into the following section, by extract_insn().
text_section: TextSection = TextSection.first()
# Instructions parsed so far; extract_insns() appends to this list.
insns: list[Insn] = field(default_factory=list)
@property
def page(self) -> Page:
    """The page that the current text section belongs to."""
    current_page_num = self.text_section.page_num
    return self.pages[current_page_num]
@property
def unprocessed_chars(self) -> defaultdict[Font, SetById[Char]]:
    """Per-font sets of not-yet-consumed characters of the current section."""
    section = self.text_section
    current_page = self.pages[section.page_num]
    return current_page.unprocessed_chars[section]
@staticmethod
def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
    """Lazily yield parsed `Page` objects from the PDF at `file`.

    `page_numbers` holds the 1-based page numbers to extract, or None for
    every page.  Pages are yielded in ascending page order and each one is
    tagged with its real 1-based page number.

    Duplicate and non-positive entries in `page_numbers` are ignored: they
    can never produce an extra page, but they used to shift the mapping from
    "n-th extracted page" to "page number" and so mislabel pages.  An empty
    selection yields nothing; pdfminer treats an empty page list as "all
    pages", which previously ended in an IndexError.
    """
    if page_numbers is not None:
        # pdfminer wants 0-based indices; keep each valid index exactly once.
        page_numbers = sorted({i - 1 for i in page_numbers if i >= 1})
        if not page_numbers:
            return
    for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
        if page_numbers is not None:
            # The i-th extracted page is the i-th smallest requested index.
            page_num = page_numbers[i] + 1
        else:
            page_num = i + 1
        print(f"page {page_num}")
        yield Page.from_lt_page(page_num=page_num, page=page)
def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None):
    """Parse the PDF at `file`, appending recognised instructions to `insns`.

    `page_numbers` optionally restricts parsing to the given pages.  Text
    sections are visited in `TextSection.next` order until one lies past the
    end of the document; sections on pages that were not loaded are skipped.
    """
    page_source = Parser.__pages_gen(file=file, page_numbers=page_numbers)
    self.pages = Pages(pages_gen=page_source)
    self.text_section = TextSection.first()
    while True:
        # parse_text_section() may itself advance self.text_section, so always
        # step from whatever section is current now.
        self.text_section = self.text_section.next
        page_num = self.text_section.page_num
        if self.pages.is_past_end(page_num):
            return
        if page_num not in self.pages:
            continue
        print(f"section {self.text_section}")
        with self.note_text_section():
            self.parse_text_section()
@contextmanager
def note_text_section(self):
start_text_section = self.text_section
try:
yield
except Exception as e:
if self.text_section == start_text_section:
note = f"text_section={self.text_section}"
else:
note = f"start_text_section={start_text_section}\ntext_section={self.text_section}"
if note not in getattr(e, "__notes__", ()):
e.add_note(note)
raise
def parse_text_section(self):
    """Extract all instructions of the current section, logging parse errors.

    `InsnParseError` and `PageParseError` are reported (one-line summary on
    stdout, then the full traceback) and swallowed so parsing can continue
    with the next section; every other exception propagates.
    """
    try:
        with self.note_text_section():
            self.extract_insns()
    except (InsnParseError, PageParseError) as err:
        summary = "".join(traceback.format_exception_only(err))
        print(summary, flush=True)
        traceback.print_exc()
def find_top_left_char_in_range(self, *,
                                min_x: float,
                                max_x: float,
                                min_y: float,
                                max_y: float,
                                allow_processed: bool,
                                pred: None | Callable[[Char], bool] = None,
                                ) -> None | Char:
    """Return the character with the smallest `min_x - min_y` in a rectangle.

    Only `Char` items of the current section's quad tree that fall in the
    given range are considered.  Unless `allow_processed` is true, characters
    that are no longer in `unprocessed_chars` are skipped.  When `pred` is
    given, a character is accepted only if `pred(char)` is true; `pred` is
    consulted only for characters that would replace the current best one.
    Returns None when no character qualifies.
    """
    best = None
    items_in_range = self.page.qt[self.text_section].range(
        min_x=min_x,
        max_x=max_x,
        min_y=min_y,
        max_y=max_y,
    )
    for _x, _y, item in items_in_range:
        if not isinstance(item, Char):
            continue
        if not allow_processed and item not in self.unprocessed_chars[item.font]:
            continue
        if best is not None and not (item.min_x - item.min_y < best.min_x - best.min_y):
            # Not further towards the top-left than the current best.
            continue
        if pred is None or pred(item):
            best = item
    return best
def extract_text_line(
self, *,
start_char: None | Char = None,
start_min_y: float,
min_x: float,
max_x: float,
fonts: TextLineFonts,
preceding_blank_lines: int = 0,
skip_initial_spaces=False,
allowed_start_min_y_error=None,
) -> None | ParsedTextLine:
# Extract one line of text from the current section and mark its characters
# as processed.
#
# Parameters:
#   start_char: optional character that is always part of the line.
#   start_min_y: expected min_y of the line's regular (non sub/superscript)
#       characters; it also positions the vertical search window.
#   min_x, max_x: horizontal extent to search (the search starts half a
#       regular font size left of min_x).
#   fonts: fonts allowed on this line; they decide each character's markup.
#   preceding_blank_lines: stored unchanged on the returned line.
#   skip_initial_spaces: when true, no spaces are emitted in front of the
#       first character of the line.
#   allowed_start_min_y_error: tolerance for |start_min_y - regular_min_y|;
#       None means 0.01.
#
# Returns the parsed line, or None when no characters were found or when a
# character's font is not valid for `fonts` (in that case nothing is marked
# as processed).
#
# Raises PageParseError when the line's regular_min_y is further from
# start_min_y than allowed.  NOTE: at that point the characters have already
# been removed from unprocessed_chars.
chars: list[Char] = []
chars_set: SetById[Char] = SetById()
if start_char is not None:
chars.append(start_char)
chars_set.add(start_char)
# Special case for page 168: a line starting with a subscript-font "*" gets
# its expected min_y derived from the top of that character instead.
if start_char is not None and \
start_char.text == "*" and \
self.text_section.page_num == 168 and \
start_char.font in (fonts.subscript or ()):
start_min_y = start_char.max_y - fonts.regular[0].size
# Gather every still-unprocessed character in a window around start_min_y
# (0.4 font sizes below to 0.6 font sizes above).
for x, y, char in self.page.qt[self.text_section].range(
min_x=min_x - fonts.regular[0].size * 0.5,
max_x=max_x,
min_y=start_min_y - fonts.regular[0].size * 0.4,
max_y=start_min_y + fonts.regular[0].size * 0.6,
):
if not isinstance(char, Char):
continue
if char not in self.unprocessed_chars[char.font] or char in chars_set:
continue
chars_set.add(char)
chars.append(char)
if len(chars) == 0:
return None
# Left-to-right order; the text is only a tie-breaker for equal min_x.
chars.sort(key=lambda char: (char.min_x, char.text))
# The vertical extent of the line is taken from the first character that is
# neither sub- nor superscript, falling back to the first character.
regular_min_y = chars[0].min_y
regular_max_y = chars[0].max_y
for char in chars:
kind = fonts.get_kind(font=char.font, baseline_pos=BaselinePos.BELOW)
if kind is not None and kind.sub_super is FontVariantSubSuper.NOT_SUB_SUPER:
regular_min_y = char.min_y
regular_max_y = char.max_y
break
retval = ParsedTextLine(
element=ElementTree.Element("text-line"),
regular_min_y=regular_min_y,
regular_max_y=regular_max_y,
fonts=fonts,
chars=chars,
preceding_blank_lines=preceding_blank_lines,
)
# Pieces of output text, each paired with the tag stack it is wrapped in.
text_and_tag_stacks: list[tuple[str, tuple[str, ...]]] = []
last_max_x = min_x
last_kind = None
last_char = None
for char in chars:
# A character whose vertical centre lies above the centre of the regular
# text is classified with BaselinePos.ABOVE, otherwise BELOW.
if (char.max_y + char.min_y) * 0.5 > (retval.regular_max_y + retval.regular_min_y) * 0.5:
baseline_pos = BaselinePos.ABOVE
else:
baseline_pos = BaselinePos.BELOW
kind = fonts.get_kind(font=char.font, baseline_pos=baseline_pos)
if kind is None:
print(
f"font kind is None:\n"
f"regular_min_y={retval.regular_min_y}\n"
f"fonts={fonts}\n"
f"char={char}\n"
f"baseline_pos={baseline_pos}\n"
f"chars[0]={chars[0]}"
)
return None
# Spaces between two characters of different kinds use the regular kind;
# otherwise they use the kind of the character that follows them.
if last_kind is None:
space_kind = kind
elif last_kind != kind:
space_kind = TextLineFontKind.REGULAR
else:
space_kind = kind
space_font, _ = fonts.get_font(space_kind, (fonts.regular, None))
# The PDF stores no space characters here: the number of spaces is inferred
# from the horizontal gap measured in space widths.
space_width = char.min_x - last_max_x
space_count_f = space_width / space_font[0].space_width
space_count = round(space_count_f)
# A gap that rounds to zero but exceeds 0.35 space widths still counts as
# one space.
if space_count == 0 and space_count_f > 0.35:
space_count = 1
# Diagnostic only: report gaps that are not close to a whole number of spaces.
if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
if space_count > 0 and not skip_initial_spaces:
text_and_tag_stacks.append((" " * space_count, space_kind.text_line_tags))
# Only the gap in front of the first character is ever skipped.
skip_initial_spaces = False
# U+0338 (combining long solidus overlay) drawn at the position of the
# preceding "=" replaces that "=" with U+2260 (not equal to).
if (char.text == "\u0338"
and last_char is not None
and last_char.text == "="
and abs(char.min_x - last_char.min_x) < 0.01
and abs(char.min_y - last_char.min_y) < 0.01):
text_and_tag_stacks[-1] = "\u2260", ()
last_max_x = last_char.max_x
else:
char_text = CHAR_TO_EXPANDED.get(char.text, char.text)
text_and_tag_stacks.append((char_text, kind.text_line_tags))
last_max_x = char.max_x
last_kind = kind
last_char = char
with ElementBodyBuilder(retval.element) as body_builder:
for text, tag_stack in text_and_tag_stacks:
body_builder.set_tag_stack(tag_stack)
body_builder.write_text(text)
# The line was built successfully: its characters are now consumed.
for char in chars:
self.unprocessed_chars[char.font].remove(char)
if allowed_start_min_y_error is None:
allowed_start_min_y_error = 0.01
if abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error:
raise PageParseError(
f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n"
f"start_min_y error: {start_min_y - retval.regular_min_y}\n"
f"allowed_start_min_y_error={allowed_start_min_y_error}")
return retval
def extract_following_text_lines(
        self,
        first_text_line: ParsedTextLine,
        min_x: float,
        max_x: float,
        allowed_start_min_y_error=None,
) -> list[ParsedTextLine]:
    """Collect `first_text_line` and the consecutive lines that follow it.

    Each further line is looked for one line height (of the first line's
    regular font) below the previously found line, within `min_x`..`max_x`
    and using the first line's fonts.  Collection stops at the first line
    that cannot be extracted.  The returned list starts with
    `first_text_line`.
    """
    collected: list[ParsedTextLine] = []
    current: None | ParsedTextLine = first_text_line
    while current is not None:
        collected.append(current)
        line_height = first_text_line.fonts.regular[0].line_height
        expected_min_y = current.regular_min_y - line_height
        current = self.extract_text_line(
            start_min_y=expected_min_y,
            min_x=min_x,
            max_x=max_x,
            fonts=first_text_line.fonts,
            allowed_start_min_y_error=allowed_start_min_y_error,
        )
    return collected
def extract_insn_bit_fields(
        self,
        mnemonic_lines: list[ParsedTextLine],
) -> None | InsnBitFields:
    """Extract the bit-fields diagram that follows an instruction's mnemonics.

    If a horizontal line is found near the position expected for a plain
    (non prefixed) diagram, a single bit-fields box is parsed and returned
    (None if no box is found there).  Otherwise the instruction is expected
    to have a "Prefix:" box followed by a "Suffix:" box; both are parsed and
    returned as one `InsnBitFields` whose `prefix` holds the prefix box.

    Raises InsnParseError when a prefix/suffix title or box is missing or a
    title has unexpected text.
    """
    section = self.text_section
    last_mnemonic_min_y = mnemonic_lines[-1].regular_min_y
    top_pad = (INSN_BIT_FIELDS_TOP_PAD_HEIGHT2 if len(mnemonic_lines) > 1
               else INSN_BIT_FIELDS_TOP_PAD_HEIGHT)
    expected_non_affix_line_y = last_mnemonic_min_y - top_pad
    nearby_items = self.page.qt[section].range(
        min_x=section.min_x - 5,
        max_x=section.max_x + 5,
        min_y=expected_non_affix_line_y - 5,
        max_y=expected_non_affix_line_y + 5,
    )
    # A line that is wider than tall here is the top edge of a plain box.
    has_horizontal_line = any(
        isinstance(item, LTLine) and item.width > item.height
        for _x, _y, item in nearby_items
    )
    if has_horizontal_line:
        return self.extract_insn_bit_fields_box(
            expected_box_max_y=expected_non_affix_line_y,
        )

    # Prefixed instruction: "Prefix:" title + box, then "Suffix:" title + box.
    prefix_title = self.extract_text_line(
        start_min_y=last_mnemonic_min_y - INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT,
        min_x=section.min_x,
        max_x=section.max_x,
        fonts=TextLineFonts.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS,
        allowed_start_min_y_error=2,
        skip_initial_spaces=True,
    )
    if prefix_title is None:
        raise InsnParseError("can't find insn prefix bit fields title")
    prefix_text_str = "".join(prefix_title.element.itertext())
    if prefix_text_str != "Prefix:":
        raise InsnParseError(
            f"insn prefix bit fields title is not as expected: {prefix_text_str!r}")
    prefix_box = self.extract_insn_bit_fields_box(
        expected_box_max_y=(prefix_title.regular_min_y
                            - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT),
    )
    if prefix_box is None:
        raise InsnParseError("can't find insn prefix bit fields")

    suffix_title = self.extract_text_line(
        start_min_y=(prefix_box.box_min_y
                     - INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT),
        min_x=section.min_x,
        max_x=section.max_x,
        fonts=TextLineFonts.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS,
        allowed_start_min_y_error=2,
        skip_initial_spaces=True,
    )
    if suffix_title is None:
        raise InsnParseError("can't find insn suffix bit fields title")
    suffix_text_str = "".join(suffix_title.element.itertext())
    if suffix_text_str != "Suffix:":
        raise InsnParseError(
            f"insn suffix bit fields title is not as expected: {suffix_text_str!r}")
    suffix_box = self.extract_insn_bit_fields_box(
        expected_box_max_y=(suffix_title.regular_min_y
                            - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT),
    )
    if suffix_box is None:
        raise InsnParseError("can't find insn suffix bit fields")

    prefix = InsnBitFieldsPrefix(
        box_min_x=prefix_box.box_min_x,
        box_min_y=prefix_box.box_min_y,
        box_max_x=prefix_box.box_max_x,
        box_max_y=prefix_box.box_max_y,
        prefix_text=prefix_title,
        fields=prefix_box.fields,
        suffix_text=suffix_title,
    )
    return InsnBitFields(
        prefix=prefix,
        box_min_x=suffix_box.box_min_x,
        box_min_y=suffix_box.box_min_y,
        box_max_x=suffix_box.box_max_x,
        box_max_y=suffix_box.box_max_y,
        fields=suffix_box.fields,
    )
def extract_insn_bit_fields_box(
        self,
        expected_box_max_y: float,
) -> None | InsnBitFields:
    """Parse one bit-fields box whose top edge is near `expected_box_max_y`.

    All `LTLine` items within 5 units of the expected box area are split into
    horizontal lines (wider than tall) and vertical lines.  Vertical lines
    closer than 0.5 units in x are treated as duplicates.  Each pair of
    adjacent vertical lines bounds one field, for which a name line and a
    bit-number line are extracted.

    Returns None when no lines at all are found, otherwise an `InsnBitFields`
    with `prefix=None`.

    Raises InsnParseError when the box does not have exactly two horizontal
    lines, has fewer than two vertical lines, or when a field's name or bit
    number cannot be found.
    """
    h_lines: list[LTLine] = []
    v_lines: list[LTLine] = []
    for x, y, line in self.page.qt[self.text_section].range(
            min_x=self.text_section.min_x - 5,
            max_x=self.text_section.max_x + 5,
            min_y=expected_box_max_y - INSN_BIT_FIELDS_BOX_HEIGHT - 5,
            max_y=expected_box_max_y + 5,
    ):
        if not isinstance(line, LTLine):
            continue
        if line.width > line.height:
            h_lines.append(line)
        else:
            v_lines.append(line)
    h_lines.sort(key=lambda line: line.y0)
    v_lines.sort(key=lambda line: line.x0)
    # Walk backwards so that deleting index i + 1 never disturbs the pairs
    # that are still to be compared.
    for i in reversed(range(len(v_lines) - 1)):
        if abs(v_lines[i].x0 - v_lines[i + 1].x0) < 0.5:
            del v_lines[i + 1]  # remove duplicates
    if len(h_lines) == 0 and len(v_lines) == 0:
        return None
    if len(h_lines) != 2:
        raise InsnParseError(
            f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}")
    if len(v_lines) < 2:
        # Report the vertical lines here; this message used to print h_lines.
        raise InsnParseError(
            f"instruction bit fields box has too few vertical lines:\n{v_lines}")
    # h_lines is sorted by ascending y0, so the lower line comes first.
    bottom_line, top_line = h_lines
    box_min_x = v_lines[0].x0
    box_max_x = v_lines[-1].x0
    box_min_y = bottom_line.y0
    box_max_y = top_line.y1
    box_mid_y = (box_min_y + box_max_y) * 0.5
    print(f"bottom_line={bottom_line}")
    print(f"top_line={top_line}")
    print(v_lines)
    fields: list[InsnBitField] = []
    for i in range(len(v_lines) - 1):
        left_line = v_lines[i]
        right_line = v_lines[i + 1]
        field_box_min_x = left_line.x1
        field_box_max_x = right_line.x0
        # The field name is expected 3.288 units above the box's mid height.
        bit_field_name_start_min_y = box_mid_y + 3.288
        bit_field_name = self.extract_text_line(
            start_min_y=bit_field_name_start_min_y,
            min_x=field_box_min_x,
            max_x=field_box_max_x,
            fonts=TextLineFonts.INSN_BIT_FIELD_NAME_FONTS,
            skip_initial_spaces=True,
            allowed_start_min_y_error=0.4,
        )
        if bit_field_name is None:
            raise InsnParseError(f"instruction bit field name not found:\n"
                                 f"start_min_y={bit_field_name_start_min_y} "
                                 f"field_box_min_x={field_box_min_x} "
                                 f"field_box_max_x={field_box_max_x}")
        # The bit number is expected 3.487 units above the box's bottom edge.
        bit_field_number_start_min_y = box_min_y + 3.487
        bit_number = self.extract_text_line(
            start_min_y=bit_field_number_start_min_y,
            min_x=field_box_min_x,
            max_x=field_box_max_x,
            fonts=TextLineFonts.INSN_BIT_FIELD_BIT_NUMBER_FONTS,
            skip_initial_spaces=True,
        )
        if bit_number is None:
            raise InsnParseError(f"instruction bit field bit number not found:\n"
                                 f"start_min_y={bit_field_number_start_min_y} "
                                 f"field_box_min_x={field_box_min_x} "
                                 f"field_box_max_x={field_box_max_x}")
        fields.append(InsnBitField(
            box_min_x=field_box_min_x,
            box_max_x=field_box_max_x,
            name=bit_field_name,
            bit_number=bit_number,
        ))
    return InsnBitFields(
        prefix=None,
        box_min_x=box_min_x,
        box_min_y=box_min_y,
        box_max_x=box_max_x,
        box_max_y=box_max_y,
        fields=tuple(fields),
    )
def extract_insn_header_mnemonics_and_bit_fields(
        self,
        start_min_y: float,
        header_start_char: None | Char = None,
) -> None | InsnHeader:
    """Parse one instruction header: title, mnemonics and bit-fields.

    `start_min_y` is the expected position of the first title line and
    `header_start_char`, when given, must be in the `Font.INSN_HEADER` font
    and is forced to be part of that line.

    Returns None when no title line is found at `start_min_y`.  Raises
    InsnParseError when the mnemonic lines or the bit-fields that must follow
    the title cannot be found.
    """
    if header_start_char is not None:
        assert header_start_char.font == Font.INSN_HEADER
    section = self.text_section
    first_header_line = self.extract_text_line(
        start_char=header_start_char,
        start_min_y=start_min_y,
        min_x=section.min_x,
        max_x=section.max_x,
        fonts=TextLineFonts.INSN_HEADER_FONTS,
        skip_initial_spaces=True,
        allowed_start_min_y_error=6,
    )
    if first_header_line is None:
        return None
    print(f"found header line:\n{first_header_line}")
    header_lines = self.extract_following_text_lines(
        first_text_line=first_header_line,
        min_x=section.min_x,
        max_x=section.max_x,
        allowed_start_min_y_error=1.5,
    )
    print("insn header lines:")
    print("\n".join(str(line) for line in header_lines))

    # The mnemonics start at the top-left unprocessed character found between
    # 5 and 50 units below the last title line.
    title_bottom = header_lines[-1].regular_min_y
    mnemonic_start_char = self.find_top_left_char_in_range(
        min_x=section.min_x - 5,
        max_x=section.max_x + 5,
        min_y=title_bottom - 50,
        max_y=title_bottom - 5,
        allow_processed=False,
    )
    if mnemonic_start_char is None:
        raise InsnParseError("can't find insn mnemonic text line")
    first_mnemonic_line = self.extract_text_line(
        start_char=mnemonic_start_char,
        start_min_y=mnemonic_start_char.min_y,
        min_x=section.min_x,
        max_x=section.max_x,
        fonts=TextLineFonts.INSN_MNEMONIC_FONTS,
        skip_initial_spaces=True,
    )
    if first_mnemonic_line is None:
        raise InsnParseError("can't find insn mnemonic text line")
    mnemonic_lines = self.extract_following_text_lines(
        first_text_line=first_mnemonic_line,
        min_x=first_mnemonic_line.chars[0].min_x,
        max_x=section.max_x,
    )
    print("insn mnemonic lines:")
    print("\n".join(str(line) for line in mnemonic_lines))

    insn_bit_fields = self.extract_insn_bit_fields(
        mnemonic_lines=mnemonic_lines,
    )
    print(insn_bit_fields)
    if insn_bit_fields is None:
        raise InsnParseError("can't find insn bit fields")
    return InsnHeader(
        header_lines=tuple(header_lines),
        mnemonic_lines=tuple(mnemonic_lines),
        bit_fields=insn_bit_fields,
    )
def extract_insn_sp_regs_altered(
self,
sp_regs_altered_text: ParsedTextLine,
) -> InsnSpRegsAltered:
# Parse what follows a "Special Registers Altered:" title line.
#
# Two forms are handled:
#   * a single line holding one of KNOWN_SPECIAL_TEXTS (e.g. "None"), or
#   * a table with "Register" and "Field(s)" column headers plus a third,
#     untitled column whose cells are collected as `conds`.
#
# sp_regs_altered_text is the already-parsed title line; its
# preceding_blank_lines is reset to 0 here (the argument is mutated).
#
# Raises InsnParseError when nothing follows the title or when the first
# character found is neither "R" nor the first letter of a known special
# text.  Other malformed input is reported through `assert`, i.e. as
# AssertionError.
sp_regs_altered_text.preceding_blank_lines = 0
fonts = TextLineFonts.INSN_DESC_FONTS
column_min_x = sp_regs_altered_text.chars[0].min_x
# First unprocessed character 5..30 units below the title, within the
# register column.
table_header_reg_char = self.find_top_left_char_in_range(
min_x=column_min_x - 1,
max_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 1,
min_y=sp_regs_altered_text.regular_min_y - 30,
max_y=sp_regs_altered_text.regular_min_y - 5,
allow_processed=False,
)
if table_header_reg_char is None:
raise InsnParseError(
"can't find special registers altered table's register-column's header")
KNOWN_SPECIAL_TEXTS = (
"None",
"Dependent on the system service",
"See above.",
"See Table 5.1",
)
match table_header_reg_char.text:
case "R":
# Start of the "Register" column header: handled as a table below.
pass
case text if any(text == i[0] for i in KNOWN_SPECIAL_TEXTS):
# The character is the first letter of a known special text: read the
# whole line and require it to be exactly one of those texts.
special_text = self.extract_text_line(
start_char=table_header_reg_char,
start_min_y=table_header_reg_char.min_y,
min_x=column_min_x,
max_x=self.text_section.max_x,
fonts=fonts,
skip_initial_spaces=True,
)
assert special_text is not None \
and special_text.element.text in KNOWN_SPECIAL_TEXTS, \
f"can't find special-registers-altered special-text:\n{special_text}"
return InsnSpRegsAltered(
sp_regs_altered_text=sp_regs_altered_text,
special_text=special_text,
table_header_reg=None,
table_header_fields=None,
entries=(),
final_regular_min_y=special_text.regular_min_y,
)
case text:
raise InsnParseError(
f"unknown special-registers-altered special-text start character: {text!r}")
# Table form: locate the "Field(s)" header on the same line as "Register".
table_header_fields_char = self.find_top_left_char_in_range(
min_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 10,
max_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X,
min_y=table_header_reg_char.min_y - 5,
max_y=table_header_reg_char.min_y + 5,
allow_processed=False,
)
assert table_header_fields_char is not None, \
"can't find special registers altered table's fields-column's header"
assert table_header_fields_char.text == "F", (
f"can't find special registers altered table's fields-column's header:\n"
f"table_header_fields_char={table_header_fields_char}")
# (min_x, max_x) of the three columns: register, fields, conditions.
columns_x_bounds = (
(table_header_reg_char.min_x, table_header_fields_char.min_x - 1),
(table_header_fields_char.min_x,
column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X),
(column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, self.text_section.max_x),
)
table_header_reg = self.extract_text_line(
start_char=table_header_reg_char,
start_min_y=table_header_reg_char.min_y,
min_x=columns_x_bounds[0][0],
max_x=columns_x_bounds[0][1],
fonts=fonts,
)
assert table_header_reg is not None, \
"can't find special registers altered table's register-column's header"
table_header_reg_text = "".join(table_header_reg.element.itertext())
assert table_header_reg_text == "Register", (
f"can't find special registers altered table's register-column's header:\n"
f"table_header_reg_text={table_header_reg_text!r}")
table_header_fields = self.extract_text_line(
start_char=table_header_fields_char,
start_min_y=table_header_fields_char.min_y,
min_x=columns_x_bounds[1][0],
max_x=columns_x_bounds[1][1],
fonts=fonts,
)
assert table_header_fields is not None, \
"can't find special registers altered table's fields-column's header"
table_header_fields_text = "".join(table_header_fields.element.itertext())
assert table_header_fields_text == "Field(s)", (
f"can't find special registers altered table's fields-column's header:\n"
f"table_header_fields_text={table_header_fields_text!r}")
# Read the table body one text row at a time.  `regular_min_y` tracks the
# row read last; `cur_*` accumulate the entry currently being built.
regular_min_y = table_header_reg.regular_min_y
entries: list[InsnSpRegsAlteredEntry] = []
row: list[None | ParsedTextLine] = [None, None, None]
cur_reg: None | ParsedTextLine = None
cur_fields: list[ParsedTextLine] = []
cur_conds: list[ParsedTextLine] = []
while True:
next_regular_min_y = None
# Try to read one cell per column, one line height below the previous row.
for i, (min_x, max_x) in enumerate(columns_x_bounds):
row[i] = cell = self.extract_text_line(
start_min_y=regular_min_y - fonts.regular[0].line_height,
min_x=min_x,
max_x=max_x,
fonts=fonts,
skip_initial_spaces=True,
allowed_start_min_y_error=2,
)
# The first cell found in this row defines the row's y position.
if cell is not None and next_regular_min_y is None:
next_regular_min_y = cell.regular_min_y
# A row with no cell in any column ends the table.
if next_regular_min_y is None:
break
regular_min_y = next_regular_min_y
cur_reg_cell, cur_fields_cell, cur_conds_cell = row
if cur_reg_cell is None:
# No register in this row: it continues the entry being built.
assert cur_reg is not None, \
"can't find special registers altered table's first register"
if cur_fields_cell is not None:
cur_fields.append(cur_fields_cell)
if cur_conds_cell is not None:
cur_conds.append(cur_conds_cell)
continue
# A register cell starts a new entry; flush the previous one first.
if cur_reg is not None:
entries.append(InsnSpRegsAlteredEntry(
reg=cur_reg,
fields=tuple(cur_fields),
conds=tuple(cur_conds),
))
cur_fields.clear()
cur_conds.clear()
cur_reg = cur_reg_cell
if cur_fields_cell is not None:
cur_fields.append(cur_fields_cell)
if cur_conds_cell is not None:
cur_conds.append(cur_conds_cell)
# Flush the last entry; a table without any register row is an error.
assert cur_reg is not None, \
"can't find special registers altered table's first register"
entries.append(InsnSpRegsAlteredEntry(
reg=cur_reg,
fields=tuple(cur_fields),
conds=tuple(cur_conds),
))
return InsnSpRegsAltered(
sp_regs_altered_text=sp_regs_altered_text,
special_text=None,
table_header_reg=table_header_reg,
table_header_fields=table_header_fields,
entries=tuple(entries),
final_regular_min_y=regular_min_y,
)
def extract_insn(self, header_start_char: Char) -> Insn:
# Parse one complete instruction starting at `header_start_char`, which must
# be in the Font.INSN_HEADER font.
#
# After the first header, the text below it is consumed piece by piece; each
# piece is classified by the font of its top-left character as another
# header, code, or description.  Parsing of the instruction ends when a
# "Special Registers Altered:" part has been read, or when the next piece
# can only belong to the following instruction.
#
# Raises PageParseError when the first header cannot be read and
# InsnParseError when expected text is missing or is in an unexpected font.
# May advance self.text_section when the instruction continues in the next
# section.
assert header_start_char.font == Font.INSN_HEADER
print(header_start_char)
header = self.extract_insn_header_mnemonics_and_bit_fields(
start_min_y=header_start_char.min_y,
header_start_char=header_start_char,
)
if header is None:
raise PageParseError("can't find header text line")
# Upper y bound of the search for the next piece of text.
next_start_min_y = header.min_y - 5
headers = [header]
code_lines: list[ParsedTextLine] = []
desc_lines: list[ParsedTextLine] = []
sp_regs_altered = None
while True:
# Look for the next unprocessed character within 70 units below the
# previous piece, but not below the bottom of the current section.
search_min_y = next_start_min_y - 70
next_char = self.find_top_left_char_in_range(
min_x=self.text_section.min_x - 5,
max_x=self.text_section.max_x + 5,
min_y=max(search_min_y, self.text_section.min_y),
max_y=next_start_min_y,
allow_processed=False,
)
if next_char is None:
# Nothing left here.  If the search window reached the bottom of the
# section and the next section's page is available, continue there.
if search_min_y <= self.text_section.min_y \
and self.text_section.next is not None and \
self.text_section.next.page_num in self.pages:
# go to next section
self.text_section = self.text_section.next
next_start_min_y = self.text_section.max_y
continue
else:
raise InsnParseError("can't find insn code or description text")
match next_char.font:
case font if font in TextLineFonts.INSN_CODE_FONTS.fonts:
next_section = _InsnParseSection.CODE
case font if font in TextLineFonts.INSN_DESC_FONTS.fonts:
next_section = _InsnParseSection.DESC
case Font.INSN_HEADER:
next_section = _InsnParseSection.HEADER
case font:
raise InsnParseError(f"can't find insn code or description text\nfont={font}")
match next_section:
case _InsnParseSection.CODE:
# Code found after description text ends this instruction; the
# character is left unprocessed.
if len(desc_lines) != 0:
break
code_line = self.extract_text_line(
start_char=next_char,
start_min_y=next_char.min_y,
min_x=next_char.min_x,
max_x=self.text_section.max_x,
fonts=TextLineFonts.INSN_CODE_FONTS,
preceding_blank_lines=0 if len(code_lines) == 0 else 1,
)
if code_line is None:
raise InsnParseError("can't find insn code text line")
more_code_lines = self.extract_following_text_lines(
first_text_line=code_line,
min_x=code_line.chars[0].min_x,
max_x=self.text_section.max_x,
allowed_start_min_y_error=0.05,
)
print("more insn code lines:")
print("\n".join(map(str, more_code_lines)))
code_lines.extend(more_code_lines)
next_start_min_y = code_lines[-1].regular_min_y - 5
case _InsnParseSection.HEADER:
# A header that follows code or description text ends this
# instruction (it is left unprocessed); otherwise it is an additional
# header of the same instruction.
if len(code_lines) != 0 or len(desc_lines) != 0:
break
header = self.extract_insn_header_mnemonics_and_bit_fields(
start_min_y=next_char.min_y,
header_start_char=next_char,
)
if header is None:
raise InsnParseError("can't find header text line")
headers.append(header)
next_start_min_y = header.min_y - 5
case _InsnParseSection.DESC:
desc_line = self.extract_text_line(
start_char=next_char,
start_min_y=next_char.min_y,
min_x=next_char.min_x,
max_x=self.text_section.max_x,
fonts=TextLineFonts.INSN_DESC_FONTS,
preceding_blank_lines=0 if len(desc_lines) == 0 else 1,
allowed_start_min_y_error=3,
)
if desc_line is None:
raise InsnParseError("can't find insn desc text line")
match desc_line.get_header_text():
case None:
# Ordinary description text: take this line and those below it.
more_desc_lines = self.extract_following_text_lines(
first_text_line=desc_line,
min_x=desc_line.chars[0].min_x,
max_x=self.text_section.max_x,
allowed_start_min_y_error=3.5,
)
print("more insn desc lines:")
print("\n".join(map(str, more_desc_lines)))
desc_lines.extend(more_desc_lines)
next_start_min_y = desc_lines[-1].regular_min_y - 5
case "Special Registers Altered:":
# This part is the last one parsed for an instruction.
sp_regs_altered = self.extract_insn_sp_regs_altered(
sp_regs_altered_text=desc_line,
)
next_start_min_y = sp_regs_altered.final_regular_min_y
break
case header_text:
raise AssertionError(f"unhandled header text: {header_text!r}\n{desc_line}")
case _:
assert_never(next_section)
print("insn code lines:")
print("\n".join(map(str, code_lines)))
print("insn desc lines:")
print("\n".join(map(str, desc_lines)))
print("sp_regs_altered:")
print(sp_regs_altered)
# TODO: finish
return Insn(
headers=tuple(headers),
code_lines=tuple(code_lines),
desc_lines=tuple(desc_lines),
sp_regs_altered=sp_regs_altered,
)
def extract_insns(self):
    """Parse instructions until no unprocessed header character is left.

    Each round takes an arbitrary remaining `Font.INSN_HEADER` character of
    the current section as the start of an instruction and appends the parsed
    instruction to `self.insns`.
    """
    while True:
        # Look the set up afresh on every round: extract_insn() consumes
        # characters and may move on to another text section.
        for header_start_char in self.unprocessed_chars[Font.INSN_HEADER]:
            break
        else:
            return
        self.insns.append(self.extract_insn(header_start_char=header_start_char))
def main():
    """Command-line entry point: parse a PDF and write the instructions as XML.

    Usage: ``PROG PDF_FILE [PAGES]`` where PAGES is either
    ``START:STOP[:STEP]`` (arguments of `range`, 1-based page numbers) or a
    comma-separated list of page numbers.  Without PAGES the whole document
    is parsed.  The result is written to ``powerisa-instructions.xml`` in the
    current directory; its root element records in ``is-subset`` whether only
    some pages were parsed.

    Exits with a usage message when PDF_FILE is missing.
    """
    if len(sys.argv) < 2:
        # A missing file argument used to surface as a bare IndexError.
        prog = sys.argv[0] if sys.argv else "parse_powerisa_pdf"
        sys.exit(f"usage: {prog} PDF_FILE [START:STOP[:STEP] | PAGE[,PAGE...]]")
    if 2 < len(sys.argv):
        if ":" in sys.argv[2]:
            page_numbers = range(*map(int, sys.argv[2].split(":")))
        else:
            page_numbers = tuple(int(i) for i in sys.argv[2].split(","))
    else:
        page_numbers = None
    parser = Parser()
    file_name = Path(sys.argv[1])
    parser.parse_pdf(file_name, page_numbers=page_numbers)
    insns = ElementTree.Element("instructions", attrib={"is-subset": str(page_numbers is not None)})
    insns.text = "\n"
    insns.tail = "\n"
    comment = ElementTree.Comment(f" Automatically generated from {file_name.name} ")
    comment.tail = "\n"
    insns.append(comment)
    for insn in parser.insns:
        insn.write_xml(insns)
    ElementTree.ElementTree(insns).write(
        "powerisa-instructions.xml",
        encoding="utf-8",
        xml_declaration=True,
    )