initial commit

This commit is contained in:
Jacob Lifshay 2024-10-24 21:42:22 -07:00
commit 77b0ce2c3d
Signed by: programmerjake
SSH key fingerprint: SHA256:B1iRVvUJkvd7upMIiMqn6OyxvD2SgJkAH3ZnUOj6z+c
8 changed files with 1289 additions and 0 deletions

View file

@ -0,0 +1,392 @@
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass, field
from functools import cached_property
import sys
from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
from xml.etree import ElementTree
import enum
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
from parse_powerisa_pdf.quad_tree import QuadTree
from parse_powerisa_pdf.set_by_id import SetById
@dataclass(unsafe_hash=True, frozen=True)
class Font:
    """A font identified by its embedded PDF font name and its size.

    Instances are immutable and hashable, so they can be used as dict keys.
    A fixed set of "known" fonts is attached to the class as upper-case
    attributes (``Font.INSTR_HEADER`` etc.) by ``_register_known_fonts``,
    which is run exactly once right after the class definition.
    """

    font_name: str
    size: float
    # Maps each known font to the name of the class attribute that holds it;
    # populated by _register_known_fonts().
    __KNOWN_NAMES: ClassVar[dict[Font, str]]

    @cached_property
    def space_width(self) -> float:
        """Width of one space in this font.

        ``INSTR_HEADER`` has a fixed value; every other font uses a value
        proportional to its size.
        """
        if self == Font.INSTR_HEADER:
            return 3.12
        return self.size * 0.31

    @cached_property
    def line_height(self) -> float:
        """Vertical distance from one text line to the next in this font.

        ``INSTR_HEADER`` has a fixed value; every other font uses a value
        proportional to its size.
        """
        if self == Font.INSTR_HEADER:
            return 10.961
        return self.size * 1.1

    @classmethod
    def __iter__(cls) -> Iterator[Font]:
        """Return an iterator over all known fonts, in registration order."""
        return iter(cls.__KNOWN_NAMES)

    @property
    def known_name(self) -> None | str:
        """Class-attribute name of this font, or None if it is not a known font."""
        return self.__KNOWN_NAMES.get(self)

    @classmethod
    def _register_known_fonts(cls) -> None:
        """Create the known-font attributes, index them by value and patch ``__repr__``.

        Removes itself from the class when done, so it can only run once.
        """
        cls.INSTR_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963)
        cls.PAGE_HEADER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=9.963)
        cls.PAGE_FOOTER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.981)
        cls.INSTR_DESC = Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966)
        cls.INSTR_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966)
        cls.INSTR_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966)
        cls.INSTR_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966)
        cls.INSTR_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978)
        cls.INSTR_FIELD_BIT_NUMS = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97)
        cls.INSTR_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966)
        cls.INSTR_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=7.97)
        cls.INSTR_CODE_SYM = Font(font_name='RRFUNA+CMSY8', size=7.97)
        cls.INSTR_CODE_NE_EQ_SIGN = Font(font_name='HPXOZC+CMSS8', size=7.97)
        cls.INSTR_CODE_SUBSCRIPT = Font(font_name='APUYSQ+zcoN-Regular', size=5.978)

        # Index every upper-case class attribute that holds a Font instance.
        known: dict[Font, str] = {}
        cls.__KNOWN_NAMES = known
        for attr_name, attr_value in vars(cls).items():
            if not (attr_name[0].isupper() and isinstance(attr_value, cls)):
                continue
            assert attr_value not in known, f"duplicate known font: {attr_value}"
            known[attr_value] = attr_name

        # Wrap the dataclass-generated repr so known fonts show their name.
        dataclass_repr = cls.__repr__

        def __repr__(self: cls) -> str:
            label = self.known_name
            if label is None:
                return dataclass_repr(self)
            return f"<{self.__class__.__name__}.{label}: {dataclass_repr(self)}>"

        cls.__repr__ = __repr__
        del cls._register_known_fonts


Font._register_known_fonts()
@dataclass(unsafe_hash=True, frozen=True)
class Char:
    """A single character taken from a page, with its font and bounding box.

    Immutable and hashable; equality and hash are based on all fields.
    """

    font: Font
    text: str
    adv: float  # filled from pdfminer's LTChar.adv by PageParser.parse_page
    min_x: float
    min_y: float
    max_x: float
    max_y: float

    def top_down_left_to_right_sort_key(self) -> tuple[float, float]:
        """Sort key ordering chars by descending ``min_y``, then ascending ``min_x``."""
        return (-self.min_y, self.min_x)

    @property
    def width(self) -> float:
        """Horizontal extent of the bounding box."""
        return self.max_x - self.min_x

    @property
    def height(self) -> float:
        """Vertical extent of the bounding box."""
        return self.max_y - self.min_y
@dataclass()
class Parser:
    """Top-level driver: runs a fresh ``PageParser`` over each extracted page."""

    def parse_pdf(self, file: str, page_numbers: range | None = None):
        """Parse the selected pages of a PDF file.

        ``file`` and ``page_numbers`` are handed straight to pdfminer's
        ``extract_pages``.
        NOTE(review): pdfminer presumably treats ``page_numbers`` as
        zero-based page indices and ``None`` as "all pages" -- confirm.
        """
        pages = extract_pages(file, page_numbers=page_numbers)
        for page in pages:
            page_parser = PageParser(parser=self, page_id=page.pageid)
            page_parser.parse_page(page)
# X coordinate used by PageParser.extract_instruction to tell the page columns
# apart: a header starting left of this value has its text clipped at this x,
# any other header is searched up to x = 1000.
# NOTE(review): presumably tuned to this document's two-column layout -- confirm.
COLUMN_SPLIT_X = 300.0
@dataclass()
class ParsedTextLine:
    """One line of text assembled from page characters.

    ``str()`` of an instance is the XML serialization of ``element``.
    """

    # XML element holding the line's text and inline formatting tags.
    element: ElementTree.Element
    # min_y of the first character of the line (as set by PageParser.extract_text_line).
    regular_min_y: float
    # Fonts that were accepted while assembling this line.
    fonts: TextLineFonts
    # Characters that make up the line.
    chars: list[Char]

    def __str__(self) -> str:
        serialized = ElementTree.tostring(self.element, encoding="unicode")
        return serialized
# Type of the caller-supplied `default` accepted by TextLineFonts.get_font and
# TextLineFonts.get_kind.
_T = TypeVar("_T")
@dataclass(unsafe_hash=True, frozen=True)
class TextLineFonts:
    """The fonts that may appear in one text line, one per ``TextLineFontKind``.

    Only ``regular`` is mandatory; the other kinds are ``None`` when the line
    style has no such font.
    """

    regular: Font
    italic: Font | None = None
    bold: Font | None = None
    bold_italic: Font | None = None

    def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | Font:
        """Return the font configured for ``part_kind``, or ``default`` if it is None."""
        if part_kind == TextLineFontKind.REGULAR:
            found = self.regular
        elif part_kind == TextLineFontKind.ITALIC:
            found = self.italic
        elif part_kind == TextLineFontKind.BOLD:
            found = self.bold
        elif part_kind == TextLineFontKind.BOLD_ITALIC:
            found = self.bold_italic
        else:
            assert_never(part_kind)
        return default if found is None else found

    @cached_property
    def __font_to_kind_map(self) -> dict[Font, TextLineFontKind]:
        """Reverse lookup table from each configured font to its kind.

        Asserts that no font is configured for more than one kind.
        """
        mapping = {}
        for kind in TextLineFontKind:
            font = self.get_font(kind)
            if font is not None:
                assert font not in mapping, \
                    f"duplicate font: kind={kind} old_kind={mapping[font]} font={font}"
                mapping[font] = kind
        return mapping

    def get_kind(self, font: Font, default: _T=None) -> _T | TextLineFontKind:
        """Return the kind ``font`` is configured as, or ``default`` if it is not configured."""
        kinds_by_font = self.__font_to_kind_map
        return kinds_by_font.get(font, default)
class TextLineFontKind(enum.Enum):
REGULAR = "regular"
ITALIC = "italic"
BOLD = "bold"
BOLD_ITALIC = "bold_italic"
@cached_property
def text_line_tags(self) -> tuple[str, ...]:
match self:
case TextLineFontKind.REGULAR:
return ()
case TextLineFontKind.ITALIC:
return "i",
case TextLineFontKind.BOLD:
return "b",
case TextLineFontKind.BOLD_ITALIC:
return "b", "i"
case _:
assert_never(self)
class PageParseFailed(Exception):
    """Raised when a page cannot be parsed (e.g. a header text line is not found)."""
class ElementBodyBuilder:
    """Incrementally builds mixed text/element content inside an XML element.

    The builder keeps a stack of currently open sub-elements below
    ``containing_element``.  ``set_tag_stack`` declares which tags (outermost
    first) should be open for the text written next, ``write_text`` buffers
    text, and ``flush`` (also called on leaving a ``with`` block) closes all
    open sub-elements and writes out any remaining text.

    Buffered text is attached following ElementTree conventions: to the
    ``text`` of the innermost open element if it has no children yet,
    otherwise to the ``tail`` of its last child.
    """

    def __init__(self, containing_element: ElementTree.Element):
        self.__containing_element = containing_element
        # Currently open sub-elements, outermost first.
        self.__stack: list[ElementTree.Element] = []
        # Text written since the last flush; it belongs to the current insert point.
        self.__text_buffer: list[str] = []

    def __shrink_stack(self, new_len: int) -> None:
        """Close open elements until at most ``new_len`` remain open."""
        while new_len < len(self.__stack):
            # Pending text belongs to the element that is about to be closed.
            self.__flush_text_buffer()
            self.__stack.pop()

    def set_tag_stack(self, tag_stack: Iterable[str]) -> None:
        """Make the open elements match ``tag_stack`` (outermost tag first).

        The longest common prefix of already-open elements is kept.  From the
        first position where the open element's tag differs, that element and
        everything nested in it are closed and new elements are opened for the
        remaining requested tags.  Elements open beyond the requested depth
        are closed as well.
        """
        new_len = 0
        for i, tag in enumerate(tag_stack):
            new_len = i + 1
            if i < len(self.__stack) and self.__stack[i].tag != tag:
                # Close the mismatching element itself (not just its
                # descendants) so the requested tag can be opened in its place.
                self.__shrink_stack(i)
            if i >= len(self.__stack):
                # Pending text must land before the new child is created.
                self.__flush_text_buffer()
                self.__stack.append(
                    ElementTree.SubElement(self.__insert_point(), tag))
        self.__shrink_stack(new_len)

    def write_text(self, text: str) -> None:
        """Buffer ``text`` for insertion at the current insert point."""
        self.__text_buffer.append(text)

    def __insert_point(self) -> ElementTree.Element:
        """Return the innermost open element, or the containing element if none is open."""
        if len(self.__stack) != 0:
            return self.__stack[-1]
        return self.__containing_element

    def __flush_text_buffer(self) -> None:
        """Move buffered text into the tree at the current insert point."""
        if len(self.__text_buffer) == 0:
            return
        insert_point = self.__insert_point()
        text = "".join(self.__text_buffer)
        self.__text_buffer.clear()
        if len(insert_point) != 0:
            # Text after a child element is stored in that child's tail.
            element = insert_point[-1]
            element.tail = (element.tail or "") + text
        else:
            insert_point.text = (insert_point.text or "") + text

    def __enter__(self) -> ElementBodyBuilder:
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Always flush, even when an exception is propagating; exceptions are
        # not suppressed.
        self.flush()

    def flush(self) -> None:
        """Close all open elements and write out any buffered text."""
        self.set_tag_stack(())
        self.__flush_text_buffer()
@dataclass()
class PageParser:
    """Parses one pdfminer page: indexes its contents and extracts instruction headers.

    Work in progress: instruction extraction currently only locates and prints
    the header lines.
    """

    parser: Parser
    page_id: int
    # Spatial index of every char, line and rect on the page, each inserted at
    # its minimum-x/minimum-y position.
    qt: QuadTree[Char | LTLine | LTRect] = field(default_factory=QuadTree)
    # Chars not yet consumed by a text line, grouped by font.
    unprocessed_chars: defaultdict[Font, SetById[Char]] = field(
        default_factory=lambda: defaultdict(SetById[Char]))
    # NOTE(review): not populated by any code in this class yet.
    unprocessed_non_text: SetById[LTLine | LTRect] = field(
        default_factory=SetById[LTLine | LTRect])

    def parse_page(self, page: LTPage):
        """Index all components of ``page``, dump its text per font, then extract instructions.

        Lines and rectangles go into the quad tree only; characters inside
        text boxes go into the quad tree and into ``unprocessed_chars``.  Any
        other component is reported and skipped.  Asserts that every font seen
        on the page is a known font.
        """
        for component in page:
            if isinstance(component, (LTLine, LTRect)):
                self.qt.insert(component.x0, component.y0, component)
            elif isinstance(component, LTTextBox):
                for text_line in component:
                    for element in text_line:
                        if isinstance(element, LTChar):
                            char = Char(
                                text=element.get_text(),
                                font=Font(font_name=element.fontname,
                                          size=round(element.size, 3)),
                                adv=element.adv,
                                min_x=element.x0,
                                min_y=element.y0,
                                max_x=element.x1,
                                max_y=element.y1,
                            )
                            self.qt.insert(char.min_x, char.min_y, char)
                            self.unprocessed_chars[char.font].add(char)
            else:
                print(f"ignoring: {component}")

        for pending in self.unprocessed_chars.values():
            pending.sort(key=Char.top_down_left_to_right_sort_key)

        # Debug dump: all text of the page grouped by font.
        for font, chars in self.unprocessed_chars.items():
            print()
            print(font)
            last_char = None
            pieces: list[str] = []
            for last_char in chars:
                pieces.append(last_char.text)
            print(repr("".join(pieces)))
            assert font.known_name is not None, f"unknown font {font}\nlast char: {last_char}"

        self.extract_instructions()

    def extract_text_line(
        self, *,
        start_char: None | Char = None,
        start_min_y: float,
        min_x: float,
        max_x: float,
        fonts: TextLineFonts,
    ) -> None | ParsedTextLine:
        """Collect the unprocessed chars around ``start_min_y`` into one text line.

        Chars are looked up in the quad tree within ``min_x``..``max_x`` and
        within half the regular font size above and below ``start_min_y``;
        ``start_char``, if given, is always included.  Every collected char is
        removed from ``unprocessed_chars``.

        Returns None when no char was found or when a collected char uses a
        font that is not part of ``fonts``.
        """
        line_chars: list[Char] = []
        if start_char is not None:
            line_chars.append(start_char)
            self.unprocessed_chars[start_char.font].remove(start_char)

        half_size = fonts.regular.size * 0.5
        for _x, _y, item in self.qt.range(
            min_x=min_x,
            max_x=max_x,
            min_y=start_min_y - half_size,
            max_y=start_min_y + half_size,
        ):
            if not isinstance(item, Char):
                continue
            pending = self.unprocessed_chars[item.font]
            if item in pending:
                pending.remove(item)
                line_chars.append(item)

        if not line_chars:
            return None
        line_chars.sort(key=Char.top_down_left_to_right_sort_key)

        text_line = ParsedTextLine(
            element=ElementTree.Element("text-line"),
            regular_min_y=line_chars[0].min_y,
            fonts=fonts,
            chars=line_chars,
        )
        with ElementBodyBuilder(text_line.element) as body_builder:
            prev_max_x = min_x
            prev_kind = None
            for char in line_chars:
                kind = fonts.get_kind(char.font)
                if kind is None:
                    return None
                # Spaces between two differently styled chars are emitted unstyled.
                if prev_kind is None or prev_kind == kind:
                    space_kind = kind
                else:
                    space_kind = TextLineFontKind.REGULAR
                space_font = fonts.get_font(space_kind, fonts.regular)
                gap = char.min_x - prev_max_x
                space_count_f = gap / space_font.space_width
                space_count = round(space_count_f)
                # Report gaps that are not close to a whole number of spaces.
                if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
                    print(f"spaces: space_count_f={space_count_f} space_width={gap}")
                if space_count > 0:
                    body_builder.set_tag_stack(space_kind.text_line_tags)
                    body_builder.write_text(" " * space_count)
                body_builder.set_tag_stack(kind.text_line_tags)
                body_builder.write_text(char.text)
                prev_max_x = char.max_x
                prev_kind = kind
        return text_line

    def extract_following_text_lines(
        self,
        first_text_line: ParsedTextLine,
        min_x: float,
        max_x: float,
    ) -> list[ParsedTextLine]:
        """Return ``first_text_line`` plus every consecutive line found below it.

        Each next line is searched one regular-font line height below the
        previous line, using the fonts of ``first_text_line``; the search
        stops at the first position where no line can be extracted.
        """
        lines: list[ParsedTextLine] = []
        fonts = first_text_line.fonts
        current: ParsedTextLine | None = first_text_line
        while current is not None:
            lines.append(current)
            current = self.extract_text_line(
                start_min_y=current.regular_min_y - fonts.regular.line_height,
                min_x=min_x,
                max_x=max_x,
                fonts=fonts,
            )
        return lines

    def extract_instruction(self, header_start_char: Char):
        """Extract (and for now just print) the header lines starting at ``header_start_char``.

        Raises PageParseFailed if the header's first text line can't be built.
        """
        assert header_start_char.font == Font.INSTR_HEADER
        header_min_x = header_start_char.min_x
        column_max_x = COLUMN_SPLIT_X if header_min_x < COLUMN_SPLIT_X else 1000
        header_text_line = self.extract_text_line(
            start_char=header_start_char,
            start_min_y=header_start_char.min_y,
            min_x=header_min_x,
            max_x=column_max_x,
            fonts=TextLineFonts(regular=Font.INSTR_HEADER),
        )
        if header_text_line is None:
            raise PageParseFailed("can't find header text line")
        print(header_text_line)
        header_lines = self.extract_following_text_lines(
            first_text_line=header_text_line,
            min_x=header_min_x,
            max_x=column_max_x,
        )
        print(*header_lines)
        # TODO: finish

    def extract_instructions(self):
        """Run ``extract_instruction`` until no unprocessed header chars remain."""
        header_chars = self.unprocessed_chars[Font.INSTR_HEADER]
        while len(header_chars) > 0:
            first_remaining = next(iter(header_chars))
            self.extract_instruction(first_remaining)
def main():
    """Command-line entry point: parse the PDF named by the first argument.

    Only the pages selected by ``range(76, 78)`` are parsed.
    """
    pdf_path = sys.argv[1]
    page_window = range(76, 78)
    Parser().parse_pdf(pdf_path, page_numbers=page_window)