initial commit
This commit is contained in:
commit
77b0ce2c3d
8 changed files with 1289 additions and 0 deletions
392
parse_powerisa_pdf/parse_powerisa_pdf.py
Executable file
392
parse_powerisa_pdf/parse_powerisa_pdf.py
Executable file
|
|
@ -0,0 +1,392 @@
|
|||
from __future__ import annotations
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from functools import cached_property
|
||||
import sys
|
||||
from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
|
||||
from xml.etree import ElementTree
|
||||
import enum
|
||||
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
|
||||
|
||||
from parse_powerisa_pdf.quad_tree import QuadTree
|
||||
from parse_powerisa_pdf.set_by_id import SetById
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
|
||||
class Font:
|
||||
font_name: str
|
||||
size: float
|
||||
__KNOWN_NAMES: ClassVar[dict[Font, str]]
|
||||
|
||||
@cached_property
|
||||
def space_width(self) -> float:
|
||||
match self:
|
||||
case Font.INSTR_HEADER:
|
||||
return 3.12
|
||||
case _:
|
||||
return self.size * 0.31
|
||||
|
||||
@cached_property
|
||||
def line_height(self) -> float:
|
||||
match self:
|
||||
case Font.INSTR_HEADER:
|
||||
return 10.961
|
||||
case _:
|
||||
return self.size * 1.1
|
||||
|
||||
@classmethod
|
||||
def __iter__(cls) -> Iterator[Font]:
|
||||
return iter(cls.__KNOWN_NAMES.keys())
|
||||
|
||||
@property
|
||||
def known_name(self) -> None | str:
|
||||
return self.__KNOWN_NAMES.get(self)
|
||||
|
||||
@classmethod
|
||||
def _register_known_fonts(cls) -> None:
|
||||
cls.INSTR_HEADER = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=9.963)
|
||||
cls.PAGE_HEADER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=9.963)
|
||||
cls.PAGE_FOOTER = Font(font_name='MJBFWM+DejaVuSansCondensed', size=4.981)
|
||||
cls.INSTR_DESC = Font(font_name='MJBFWM+DejaVuSansCondensed', size=8.966)
|
||||
cls.INSTR_DESC_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=8.966)
|
||||
cls.INSTR_DESC_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=8.966)
|
||||
cls.INSTR_DESC_BOLD_ITALIC = Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=8.966)
|
||||
cls.INSTR_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978)
|
||||
cls.INSTR_FIELD_BIT_NUMS = Font(font_name='MJBFWM+DejaVuSansCondensed', size=7.97)
|
||||
cls.INSTR_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966)
|
||||
cls.INSTR_CODE = Font(font_name='APUYSQ+zcoN-Regular', size=7.97)
|
||||
cls.INSTR_CODE_SYM = Font(font_name='RRFUNA+CMSY8', size=7.97)
|
||||
cls.INSTR_CODE_NE_EQ_SIGN = Font(font_name='HPXOZC+CMSS8', size=7.97)
|
||||
cls.INSTR_CODE_SUBSCRIPT = Font(font_name='APUYSQ+zcoN-Regular', size=5.978)
|
||||
|
||||
cls.__KNOWN_NAMES = {}
|
||||
for name, value in cls.__dict__.items():
|
||||
if name[0].isupper() and isinstance(value, cls):
|
||||
assert value not in cls.__KNOWN_NAMES, f"duplicate known font: {value}"
|
||||
cls.__KNOWN_NAMES[value] = name
|
||||
|
||||
old_repr = cls.__repr__
|
||||
def __repr__(self: cls) -> str:
|
||||
known_name = self.known_name
|
||||
if known_name is not None:
|
||||
return f"<{self.__class__.__name__}.{known_name}: {old_repr(self)}>"
|
||||
return old_repr(self)
|
||||
cls.__repr__ = __repr__
|
||||
|
||||
del cls._register_known_fonts
|
||||
|
||||
Font._register_known_fonts()
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
class Char:
    """One character of page text together with its font and bounding box."""

    font: Font
    text: str
    adv: float
    min_x: float
    min_y: float
    max_x: float
    max_y: float

    def top_down_left_to_right_sort_key(self) -> tuple[float, float]:
        """Sort key ordering chars by descending ``min_y``, then ascending ``min_x``."""
        return (-self.min_y, self.min_x)

    @property
    def width(self) -> float:
        """Horizontal extent of the bounding box."""
        left, right = self.min_x, self.max_x
        return right - left

    @property
    def height(self) -> float:
        """Vertical extent of the bounding box."""
        bottom, top = self.min_y, self.max_y
        return top - bottom
|
||||
|
||||
|
||||
@dataclass()
class Parser:
    """Top-level driver: feeds each page of a PDF to its own ``PageParser``."""

    def parse_pdf(self, file: str, page_numbers: range | None = None):
        """Parse the pages of ``file``.

        ``page_numbers`` is forwarded unchanged to pdfminer's
        ``extract_pages``; ``None`` places no restriction on the pages.
        """
        pages = extract_pages(file, page_numbers=page_numbers)
        for page in pages:
            page_parser = PageParser(parser=self, page_id=page.pageid)
            page_parser.parse_page(page)
|
||||
|
||||
|
||||
# X coordinate used to tell the two page columns apart: extract_instruction
# treats a header starting left of this value as belonging to the left column
# and limits its text lines to x < COLUMN_SPLIT_X.
COLUMN_SPLIT_X = 300.0
|
||||
|
||||
@dataclass()
class ParsedTextLine:
    """One extracted line of text: its XML element plus the chars it came from."""

    element: ElementTree.Element
    regular_min_y: float
    fonts: TextLineFonts
    chars: list[Char]

    def __str__(self) -> str:
        """Return the line's element serialised as an XML string."""
        serialized = ElementTree.tostring(self.element, encoding="unicode")
        return serialized
|
||||
|
||||
# Type of the caller-supplied `default` returned by TextLineFonts.get_font /
# TextLineFonts.get_kind when no font / kind is found.
_T = TypeVar("_T")
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
class TextLineFonts:
    """The set of fonts a text line may use, one slot per ``TextLineFontKind``.

    Only ``regular`` is mandatory; the other slots default to ``None``.
    """

    regular: Font
    italic: Font | None = None
    bold: Font | None = None
    bold_italic: Font | None = None

    def get_font(self, part_kind: TextLineFontKind, default: _T=None) -> _T | Font:
        """Return the font stored for ``part_kind``, or ``default`` if that slot is ``None``."""
        if part_kind == TextLineFontKind.REGULAR:
            retval = self.regular
        elif part_kind == TextLineFontKind.ITALIC:
            retval = self.italic
        elif part_kind == TextLineFontKind.BOLD:
            retval = self.bold
        elif part_kind == TextLineFontKind.BOLD_ITALIC:
            retval = self.bold_italic
        else:
            assert_never(part_kind)
        return default if retval is None else retval

    @cached_property
    def __font_to_kind_map(self) -> dict[Font, TextLineFontKind]:
        """Inverse of ``get_font``: maps each configured font to its kind.

        Asserts that no font is configured for more than one kind.
        """
        retval = {}
        for kind in TextLineFontKind:
            font = self.get_font(kind)
            if font is not None:
                assert font not in retval, \
                    f"duplicate font: kind={kind} old_kind={retval[font]} font={font}"
                retval[font] = kind
        return retval

    def get_kind(self, font: Font, default: _T=None) -> _T | TextLineFontKind:
        """Return the kind ``font`` is configured for, or ``default`` if it is not configured."""
        font_to_kind = self.__font_to_kind_map
        return font_to_kind.get(font, default)
|
||||
|
||||
class TextLineFontKind(enum.Enum):
|
||||
REGULAR = "regular"
|
||||
ITALIC = "italic"
|
||||
BOLD = "bold"
|
||||
BOLD_ITALIC = "bold_italic"
|
||||
|
||||
@cached_property
|
||||
def text_line_tags(self) -> tuple[str, ...]:
|
||||
match self:
|
||||
case TextLineFontKind.REGULAR:
|
||||
return ()
|
||||
case TextLineFontKind.ITALIC:
|
||||
return "i",
|
||||
case TextLineFontKind.BOLD:
|
||||
return "b",
|
||||
case TextLineFontKind.BOLD_ITALIC:
|
||||
return "b", "i"
|
||||
case _:
|
||||
assert_never(self)
|
||||
|
||||
class PageParseFailed(Exception):
    """Raised when a page cannot be parsed, e.g. an instruction header line is not found."""
|
||||
|
||||
class ElementBodyBuilder:
    """Incrementally writes text wrapped in nested tags into an XML element.

    The builder keeps a stack of currently open child elements.  Callers
    select the desired nesting with ``set_tag_stack`` and then add text with
    ``write_text``; text is buffered and only attached to the tree when the
    open-element stack changes or the builder is flushed.

    Usable as a context manager: leaving the ``with`` block flushes all
    pending text (also when an exception is propagating).
    """

    def __init__(self, containing_element: ElementTree.Element):
        """Create a builder that appends content to ``containing_element``."""
        self.__containing_element = containing_element
        # Currently open elements, outermost first.
        self.__stack: list[ElementTree.Element] = []
        # Text written since the last flush; belongs at the current insert point.
        self.__text_buffer: list[str] = []

    def __shrink_stack(self, new_len: int):
        """Close open elements until at most ``new_len`` remain."""
        while new_len < len(self.__stack):
            # Pending text belongs to the innermost open element, so it must
            # be attached before that element is closed.
            self.__flush_text_buffer()
            self.__stack.pop()

    def set_tag_stack(self, tag_stack: Iterable[str]):
        """Make the open elements match ``tag_stack`` (outermost tag first).

        Open elements that already match a prefix of ``tag_stack`` are kept,
        so consecutive text with the same tags lands in the same element.
        The first open element whose tag differs is closed together with
        everything nested inside it, and fresh elements are opened for the
        remaining requested tags.
        """
        new_len = 0
        for i, tag in enumerate(tag_stack):
            new_len = i + 1
            if i < len(self.__stack) and self.__stack[i].tag != tag:
                # Close the mismatching element itself (not just its
                # descendants); the requested tag is opened in its place below.
                self.__shrink_stack(i)
            if i >= len(self.__stack):
                # Text buffered so far precedes the new child element.
                self.__flush_text_buffer()
                self.__stack.append(ElementTree.SubElement(self.__insert_point(), tag))
        # Close any open elements deeper than the requested nesting.
        self.__shrink_stack(new_len)

    def write_text(self, text: str):
        """Buffer ``text`` for the element selected by the current tag stack."""
        self.__text_buffer.append(text)

    def __insert_point(self) -> ElementTree.Element:
        """Return the innermost open element, or the containing element if none is open."""
        if self.__stack:
            return self.__stack[-1]
        return self.__containing_element

    def __flush_text_buffer(self):
        """Attach buffered text to the tree at the current insert point."""
        if not self.__text_buffer:
            return
        insert_point = self.__insert_point()
        text = "".join(self.__text_buffer)
        self.__text_buffer.clear()
        # Compare the child count explicitly: truth-testing an Element is
        # deprecated in ElementTree.
        if len(insert_point) != 0:
            # Text that follows a child element is stored in that child's tail.
            element = insert_point[-1]
            element.tail = (element.tail or "") + text
        else:
            insert_point.text = (insert_point.text or "") + text

    def __enter__(self) -> "ElementBodyBuilder":
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.flush()

    def flush(self):
        """Close all open elements and attach any remaining buffered text."""
        self.set_tag_stack(())
        self.__flush_text_buffer()
|
||||
|
||||
@dataclass()
class PageParser:
    """Parsing state for one page plus the extraction steps that consume it.

    ``parse_page`` indexes the page's characters, lines and rectangles in
    ``qt`` and then extracts instructions from the indexed characters.
    """

    parser: Parser
    page_id: int
    # Spatial index of everything on the page, keyed by each object's
    # minimum-x / minimum-y corner.
    qt: QuadTree[Char | LTLine | LTRect] = field(default_factory=QuadTree)
    # Characters not yet claimed by an extracted text line, grouped by font.
    unprocessed_chars: defaultdict[Font, SetById[Char]] = field(
        default_factory=lambda: defaultdict(SetById[Char]),
    )
    # NOTE(review): nothing in this class adds to this set -- lines and
    # rectangles are only inserted into `qt`; confirm whether that is intended.
    unprocessed_non_text: SetById[LTLine | LTRect] = field(
        default_factory=SetById[LTLine | LTRect],
    )

    def parse_page(self, page: LTPage):
        """Index ``page``, print its text grouped by font, then extract instructions.

        Lines and rectangles are inserted into ``qt``; characters found in
        text boxes are inserted into ``qt`` and ``unprocessed_chars``; any
        other component is reported on stdout and skipped.  Asserts that
        every font seen on the page is one of the known fonts.
        """
        for component in page:
            if isinstance(component, (LTLine, LTRect)):
                self.qt.insert(component.x0, component.y0, component)
            elif isinstance(component, LTTextBox):
                for text_line in component:
                    for element in text_line:
                        if not isinstance(element, LTChar):
                            continue
                        char = Char(
                            text=element.get_text(),
                            font=Font(font_name=element.fontname, size=round(element.size, 3)),
                            adv=element.adv,
                            min_x=element.x0,
                            min_y=element.y0,
                            max_x=element.x1,
                            max_y=element.y1,
                        )
                        self.qt.insert(char.min_x, char.min_y, char)
                        self.unprocessed_chars[char.font].add(char)
            else:
                print(f"ignoring: {component}")

        for font_chars in self.unprocessed_chars.values():
            font_chars.sort(key=Char.top_down_left_to_right_sort_key)

        # Debug dump: all text of each font, plus a check that the font is known.
        for font, chars in self.unprocessed_chars.items():
            print()
            print(font)
            ordered = list(chars)
            text = "".join(c.text for c in ordered)
            print(repr(text))
            # Last char of this font, shown in the failure message for context.
            char = ordered[-1] if ordered else None
            assert font.known_name is not None, f"unknown font {font}\nlast char: {char}"

        self.extract_instructions()

    def extract_text_line(
        self, *,
        start_char: None | Char = None,
        start_min_y: float,
        min_x: float,
        max_x: float,
        fonts: TextLineFonts,
    ) -> None | ParsedTextLine:
        """Collect the unprocessed chars around ``start_min_y`` into one text line.

        Chars are taken from ``qt`` within ``min_x..max_x`` horizontally and
        within half the regular font size of ``start_min_y`` vertically, and
        are removed from ``unprocessed_chars`` as they are collected.
        ``start_char``, if given, is always part of the line.

        Returns ``None`` when no chars were found or when a collected char
        uses a font that is not part of ``fonts``.

        NOTE(review): in the latter case the collected chars have already
        been removed from ``unprocessed_chars`` -- confirm that is intended.
        """
        chars: list[Char] = []
        if start_char is not None:
            chars.append(start_char)
            self.unprocessed_chars[start_char.font].remove(start_char)

        half_size = fonts.regular.size * 0.5
        candidates = self.qt.range(
            min_x=min_x,
            max_x=max_x,
            min_y=start_min_y - half_size,
            max_y=start_min_y + half_size,
        )
        for _x, _y, candidate in candidates:
            if not isinstance(candidate, Char):
                continue
            pending = self.unprocessed_chars[candidate.font]
            if candidate not in pending:
                # Already claimed by a line (this also skips `start_char`).
                continue
            pending.remove(candidate)
            chars.append(candidate)

        if not chars:
            return None

        # NOTE(review): the key sorts by -min_y before min_x, so chars whose
        # min_y differs are not ordered purely left to right -- confirm.
        chars.sort(key=Char.top_down_left_to_right_sort_key)
        retval = ParsedTextLine(
            element=ElementTree.Element("text-line"),
            regular_min_y=chars[0].min_y,
            fonts=fonts,
            chars=chars,
        )
        with ElementBodyBuilder(retval.element) as body_builder:
            last_max_x = min_x
            last_kind = None
            for char in chars:
                kind = fonts.get_kind(char.font)
                if kind is None:
                    return None

                # A gap between two different kinds is written as regular
                # text; otherwise it takes the kind of the char after it.
                if last_kind is None or last_kind == kind:
                    space_kind = kind
                else:
                    space_kind = TextLineFontKind.REGULAR
                space_font = fonts.get_font(space_kind, fonts.regular)

                # Turn the horizontal gap before this char into a space count.
                space_width = char.min_x - last_max_x
                space_count_f = space_width / space_font.space_width
                space_count = round(space_count_f)
                if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
                    print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
                if space_count > 0:
                    body_builder.set_tag_stack(space_kind.text_line_tags)
                    body_builder.write_text(" " * space_count)

                body_builder.set_tag_stack(kind.text_line_tags)
                body_builder.write_text(char.text)
                last_max_x = char.max_x
                last_kind = kind
        return retval

    def extract_following_text_lines(
        self,
        first_text_line: ParsedTextLine,
        min_x: float,
        max_x: float,
    ) -> list[ParsedTextLine]:
        """Return ``first_text_line`` followed by the lines found directly below it.

        Each next line is looked up one regular-font line height below the
        previous one, using the fonts of ``first_text_line``; the search
        stops at the first position where ``extract_text_line`` finds none.
        """
        retval: list[ParsedTextLine] = []
        line = first_text_line
        while line is not None:
            retval.append(line)
            fonts = first_text_line.fonts
            next_min_y = line.regular_min_y - fonts.regular.line_height
            line = self.extract_text_line(
                start_min_y=next_min_y,
                min_x=min_x,
                max_x=max_x,
                fonts=fonts,
            )
        return retval

    def extract_instruction(self, header_start_char: Char):
        """Extract the instruction whose header starts at ``header_start_char``.

        Currently extracts and prints only the header lines.

        Raises:
            PageParseFailed: if no header text line can be extracted.
        """
        assert header_start_char.font == Font.INSTR_HEADER
        # The header's x position decides which column it is in, which in
        # turn bounds how far right its text may extend.
        in_left_column = header_start_char.min_x < COLUMN_SPLIT_X
        column_max_x = COLUMN_SPLIT_X if in_left_column else 1000

        header_text_line = self.extract_text_line(
            start_char=header_start_char,
            start_min_y=header_start_char.min_y,
            min_x=header_start_char.min_x,
            max_x=column_max_x,
            fonts=TextLineFonts(regular=Font.INSTR_HEADER),
        )
        if header_text_line is None:
            raise PageParseFailed("can't find header text line")
        print(header_text_line)

        header_lines = self.extract_following_text_lines(
            first_text_line=header_text_line,
            min_x=header_start_char.min_x,
            max_x=column_max_x,
        )
        print(*header_lines)
        # TODO: finish

    def extract_instructions(self):
        """Extract instructions until no unprocessed header chars remain."""
        unprocessed_header_chars = self.unprocessed_chars[Font.INSTR_HEADER]
        while len(unprocessed_header_chars) != 0:
            # extract_text_line removes the start char, so every pass shrinks the set.
            next_header_char = next(iter(unprocessed_header_chars))
            self.extract_instruction(next_header_char)
|
||||
|
||||
def main():
    """Parse pages ``range(76, 78)`` of the PDF named by the first command-line argument."""
    pdf_path = sys.argv[1]
    page_numbers = range(76, 78)
    Parser().parse_pdf(pdf_path, page_numbers=page_numbers)
|
||||
Loading…
Add table
Add a link
Reference in a new issue