From 21b97c00640d0d36965a861abf6fe7807a4a721f Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Sun, 27 Oct 2024 22:59:27 -0700 Subject: [PATCH] WIP -- no reported errors till page 305 --- parse_powerisa_pdf/parse_powerisa_pdf.py | 821 +++++++++++++++++------ 1 file changed, 612 insertions(+), 209 deletions(-) diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index c830da1..338e6e5 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -1,10 +1,12 @@ from __future__ import annotations from collections import defaultdict +from collections.abc import Generator, Iterable, Iterator, Callable +from contextlib import contextmanager from dataclasses import dataclass, field import dataclasses from functools import cached_property import sys -from typing import Callable, ClassVar, Iterable, Iterator, TypeAlias, TypeVar, assert_never +from typing import ClassVar, TypeVar, assert_never from xml.etree import ElementTree import enum import traceback @@ -32,8 +34,7 @@ class Font: return 9.464 * self.size / Font.INSN_CODE[0].size case Font.INSN_DESC_BOLD.font_name | \ Font.INSN_DESC_ITALIC.font_name | \ - Font.INSN_DESC_BOLD_ITALIC.font_name | \ - Font.NOTATION_PAGE_SUBSCRIPT.font_name: + Font.INSN_DESC_BOLD_ITALIC.font_name: return 10.959 * self.size / Font.INSN_DESC[0].size case _ if self in Font.INSN_DESC or self.font_name == Font.INSN_DESC[0].font_name: return 10.959 * self.size / Font.INSN_DESC[0].size @@ -105,6 +106,8 @@ class Font: cls.INSN_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978) cls.INSN_DESC_BOLD_SUBSCRIPT = \ Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.978) + cls.INSN_DESC_ITALIC_SUBSCRIPT = \ + Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=5.978) cls.INSN_DESC_BOLD_ITALIC_SUBSCRIPT = \ Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=5.978) cls.INSN_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966) @@ -125,7 +128,6 @@ class Font: cls.LEGAL_PAGE_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=9.963) cls.CHANGE_SUMMARY_PAGE_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=11.955) cls.CHAPTER_TITLE = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=17.215) - cls.NOTATION_PAGE_SUBSCRIPT = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=5.978) cls.MATH_MISC = ( Font(font_name='AAJMKT+CMMI6', size=5.978), Font(font_name='CUTMFD+CMSSI8', size=5.978), @@ -188,27 +190,13 @@ class Char: return self.max_y - self.min_y -@dataclass() -class Parser: - def parse_pdf(self, file: str, page_numbers: Iterable[int] | None = None): - if page_numbers is not None: - page_numbers = sorted(i - 1 for i in page_numbers) - for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): - if page_numbers is not None: - page_num = page_numbers[i] + 1 - else: - page_num = i + 1 - print(f"page {page_num}") - try: - PageParser(parser=self, page_num=page_num).parse_page(page) - except Exception as e: - e.add_note(f"page_num={page_num}") - raise - - COLUMN_SPLIT_X = 300.0 +PAGE_BODY_MAX_X = 600.0 +PAGE_BODY_MIN_X = 50 PAGE_BODY_MAX_Y = 780.0 PAGE_BODY_MIN_Y = 45.0 +ONE_TITLE_LINE_SPLIT_Y = 734.0 +TWO_TITLE_LINES_SPLIT_Y = 715.0 INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT = 29.938 INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT = 9.278 INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT = 20.971 @@ -281,6 +269,9 @@ class TextLineFonts: bold: tuple[Font, ...] | None = None bold_italic: tuple[Font, ...] | None = None subscript: tuple[Font, ...] | None = None + bold_subscript: tuple[Font, ...] | None = None + italic_subscript: tuple[Font, ...] | None = None + bold_italic_subscript: tuple[Font, ...] | None = None code: tuple[Font, ...] | None = None code_subscript: tuple[Font, ...] | None = None @@ -293,10 +284,11 @@ class TextLineFonts: regular=(Font.INSN_HEADER,), ) cls.INSN_BIT_FIELD_BIT_NUMBER_FONTS = cls( - regular=(Font.INSN_DESC_SMALL,), + regular=(Font.INSN_DESC_SMALL, Font.TITLE_PAGE_REV), ) cls.INSN_BIT_FIELD_NAME_FONTS = cls( regular=Font.INSN_DESC, + subscript=(Font.INSN_DESC_SUBSCRIPT,), ) cls.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS = cls( regular=(Font.INSN_DESC_SMALL,), @@ -307,11 +299,14 @@ class TextLineFonts: subscript=Font.INSN_CODE_SUBSCRIPT, ) cls.INSN_DESC_FONTS = cls( - regular=Font.INSN_DESC, - bold=(Font.INSN_DESC_BOLD,), - italic=(Font.INSN_DESC_ITALIC,), - bold_italic=(Font.INSN_DESC_BOLD_ITALIC,), + regular=(*Font.INSN_DESC, Font.INSN_DESC_SMALL), + bold=(Font.INSN_DESC_BOLD, Font.INSN_DESC_SMALL_BOLD), + italic=(Font.INSN_DESC_ITALIC, Font.INSN_DESC_SMALL_ITALIC), + bold_italic=(Font.INSN_DESC_BOLD_ITALIC, Font.INSN_DESC_SMALL_BOLD_ITALIC), subscript=(Font.INSN_DESC_SUBSCRIPT,), + bold_subscript=(Font.INSN_DESC_BOLD_SUBSCRIPT,), + italic_subscript=(Font.INSN_DESC_ITALIC_SUBSCRIPT,), + bold_italic_subscript=(Font.INSN_DESC_BOLD_ITALIC_SUBSCRIPT,), code=(Font.INSN_DESC_CODE, Font.INSN_EXT_MNEMONIC), code_subscript=Font.INSN_CODE_SUBSCRIPT, ) @@ -326,36 +321,39 @@ class TextLineFonts: match part_kind: case TextLineFontKind.REGULAR: font = self.regular - baseline_pos = None case TextLineFontKind.ITALIC: font = self.italic - baseline_pos = None case TextLineFontKind.BOLD: font = self.bold - baseline_pos = None case TextLineFontKind.BOLD_ITALIC: font = self.bold_italic - baseline_pos = None case TextLineFontKind.SUBSCRIPT: font = self.subscript - baseline_pos = BaselinePos.BELOW case TextLineFontKind.SUPERSCRIPT: font = self.subscript - baseline_pos = BaselinePos.ABOVE + case TextLineFontKind.BOLD_SUBSCRIPT: + font = self.bold_subscript + case TextLineFontKind.BOLD_SUPERSCRIPT: + font = self.bold_subscript + case TextLineFontKind.ITALIC_SUBSCRIPT: + font = self.italic_subscript + case TextLineFontKind.ITALIC_SUPERSCRIPT: + font = self.italic_subscript + case TextLineFontKind.BOLD_ITALIC_SUBSCRIPT: + font = self.bold_italic_subscript + case TextLineFontKind.BOLD_ITALIC_SUPERSCRIPT: + font = self.bold_italic_subscript case TextLineFontKind.CODE: font = self.code - baseline_pos = None case TextLineFontKind.CODE_SUBSCRIPT: font = self.code_subscript - baseline_pos = BaselinePos.BELOW case TextLineFontKind.CODE_SUPERSCRIPT: font = self.code_subscript - baseline_pos = BaselinePos.ABOVE case _: assert_never(part_kind) if font is None: return default - return font, baseline_pos + return font, part_kind.sub_super.baseline_pos @cached_property def __font_to_kind_map(self) -> dict[tuple[Font, None | BaselinePos], TextLineFontKind]: @@ -392,40 +390,82 @@ class TextLineFonts: TextLineFonts._define_fonts() +class FontVariantCode(enum.Enum): + CODE = ("code",) + NOT_CODE = () + +class FontVariantBold(enum.Enum): + BOLD = ("b",) + NOT_BOLD = () + +class FontVariantItalic(enum.Enum): + ITALIC = ("i",) + NOT_ITALIC = () + +class FontVariantSubSuper(enum.Enum): + NOT_SUB_SUPER = () + SUBSCRIPT = ("sub",) + SUPERSCRIPT = ("sup",) + + @cached_property + def baseline_pos(self) -> None | BaselinePos: + match self: + case FontVariantSubSuper.NOT_SUB_SUPER: + return None + case FontVariantSubSuper.SUBSCRIPT: + return BaselinePos.BELOW + case FontVariantSubSuper.SUPERSCRIPT: + return BaselinePos.ABOVE + case _: + assert_never(self) + class TextLineFontKind(enum.Enum): - REGULAR = "regular" - ITALIC = "italic" - BOLD = "bold" - BOLD_ITALIC = "bold_italic" - SUBSCRIPT = "subscript" - SUPERSCRIPT = "superscript" - CODE = "code" - CODE_SUBSCRIPT = "code_subscript" - CODE_SUPERSCRIPT = "code_superscript" + def __init__( + self, + code: FontVariantCode, + bold: FontVariantBold, + italic: FontVariantItalic, + sub_super: FontVariantSubSuper, + ): + self.code = code + self.bold = bold + self.italic = italic + self.sub_super = sub_super + + REGULAR = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \ + FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER + SUBSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \ + FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT + SUPERSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \ + FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT + ITALIC = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \ + FontVariantItalic.ITALIC, FontVariantSubSuper.NOT_SUB_SUPER + ITALIC_SUBSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \ + FontVariantItalic.ITALIC, FontVariantSubSuper.SUBSCRIPT + ITALIC_SUPERSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \ + FontVariantItalic.ITALIC, FontVariantSubSuper.SUPERSCRIPT + BOLD = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \ + FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER + BOLD_SUBSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \ + FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT + BOLD_SUPERSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \ + FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT + BOLD_ITALIC = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \ + FontVariantItalic.ITALIC, FontVariantSubSuper.NOT_SUB_SUPER + BOLD_ITALIC_SUBSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \ + FontVariantItalic.ITALIC, FontVariantSubSuper.SUBSCRIPT + BOLD_ITALIC_SUPERSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \ + FontVariantItalic.ITALIC, FontVariantSubSuper.SUPERSCRIPT + CODE = FontVariantCode.CODE, FontVariantBold.NOT_BOLD, \ + FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER + CODE_SUBSCRIPT = FontVariantCode.CODE, FontVariantBold.NOT_BOLD, \ + FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT + CODE_SUPERSCRIPT = FontVariantCode.CODE, FontVariantBold.NOT_BOLD, \ + FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT @cached_property def text_line_tags(self) -> tuple[str, ...]: - match self: - case TextLineFontKind.REGULAR: - return () - case TextLineFontKind.ITALIC: - return "i", - case TextLineFontKind.BOLD: - return "b", - case TextLineFontKind.BOLD_ITALIC: - return "b", "i" - case TextLineFontKind.SUBSCRIPT: - return "sub", - case TextLineFontKind.SUPERSCRIPT: - return "sup", - case TextLineFontKind.CODE: - return "code", - case TextLineFontKind.CODE_SUBSCRIPT: - return "code", "sub" - case TextLineFontKind.CODE_SUPERSCRIPT: - return "code", "sup" - case _: - assert_never(self) + return (*self.code.value, *self.bold.value, *self.italic.value, *self.sub_super.value) class PageParseError(Exception): pass @@ -489,8 +529,8 @@ class ElementBodyBuilder: class InsnBitField: box_min_x: float box_max_x: float - name: None | ParsedTextLine - bit_number: None | ParsedTextLine + name: ParsedTextLine + bit_number: ParsedTextLine def __str__(self) -> str: return f"" @@ -591,6 +631,11 @@ class InsnSpRegsAltered: lines.append(f")") return "\n".join(lines) +class _InsnParseSection(enum.Enum): + CODE = "code" + HEADER = "header" + DESC = "desc" + CHAR_TO_EXPANDED = { "\ufb00": "ff", "\ufb01": "fi", @@ -600,21 +645,33 @@ CHAR_TO_EXPANDED = { } @dataclass() -class PageParser: - parser: Parser +class Page: page_num: int - qt: QuadTree[Char | LTLine | LTRect] = field(default_factory=QuadTree) - unprocessed_chars: defaultdict[Font, SetById[Char]] = field( - default_factory=lambda: defaultdict(SetById[Char])) - unprocessed_non_text: SetById[LTLine | LTRect] = field( - default_factory=SetById[LTLine | LTRect]) + qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] + unprocessed_chars: defaultdict[TextSection, defaultdict[Font, SetById[Char]]] + unprocessed_non_text: SetById[LTLine | LTRect] - def parse_page(self, page: LTPage): + @staticmethod + def from_lt_page(page_num: int, page: LTPage) -> Page: + qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree) + unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char])) + unprocessed_non_text: SetById[LTLine | LTRect] = SetById() for component in page: if isinstance(component, (LTLine, LTRect)): - if isinstance(component, LTRect): - print(component) - self.qt.insert(component.x0, component.y0, component) + if component.width > 100 and \ + component.x0 < COLUMN_SPLIT_X - 10 and \ + component.x1 > COLUMN_SPLIT_X + 10: + print(f"wide component: {component}") + else: + print(f"component: {component}") + text_section = TextSection.for_position( + page_num=page_num, + x=(component.x0 + component.x1) * 0.5, + y=(component.y0 + component.y1) * 0.5, + ) + if text_section is not None: + qt[text_section].insert(component.x0, component.y0, component) + unprocessed_non_text.add(component) continue if not isinstance(component, LTTextBox): print(f"ignoring: {component}") @@ -623,6 +680,16 @@ class PageParser: for element in text_line: if not isinstance(element, LTChar): continue + text_section = TextSection.for_position( + page_num=page_num, + x=(element.x0 + element.x1) * 0.5, + y=(element.y0 + element.y1) * 0.5, + ) + if text_section is None: + if PAGE_BODY_MIN_Y <= element.y0 <= PAGE_BODY_MAX_Y: + raise AssertionError( + f"char not in text section: {element}\npage_num={page_num}") + continue char = Char( text=element.get_text(), font=Font(font_name=element.fontname, size=round(element.size, 3)), @@ -632,28 +699,367 @@ class PageParser: max_x=element.x1, max_y=element.y1, ) - self.qt.insert(char.min_x, char.min_y, char) - self.unprocessed_chars[char.font].add(char) - for i in self.unprocessed_chars.values(): - i.sort(key=Char.top_down_left_to_right_sort_key) + qt[text_section].insert(char.min_x, char.min_y, char) + unprocessed_chars[text_section][char.font].add(char) + for i in unprocessed_chars.values(): + for j in i.values(): + j.sort(key=Char.top_down_left_to_right_sort_key) unknown_fonts=[] unknown_font_errors=[] - for font, chars in self.unprocessed_chars.items(): - if font.known_name is None: - text = "" - char = None - for char in chars: - text += char.text - unknown_fonts.append(repr(font) + ",") - unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}") + for i in unprocessed_chars.values(): + for font, chars in i.items(): + if font.known_name is None: + text = "" + char = None + for char in chars: + text += char.text + unknown_fonts.append(repr(font) + ",") + unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}") unknown_fonts.sort() if len(unknown_fonts) != 0: raise AssertionError("\nunknown fonts:\n" + "\n".join(unknown_fonts) + "\n\n" + "\n".join(unknown_font_errors)) + return Page( + page_num=page_num, + qt=qt, + unprocessed_chars=unprocessed_chars, + unprocessed_non_text=unprocessed_non_text, + ) + +class Pages: + pages_gen: None | Generator[Page, None, None] + __pages: dict[int, Page] + __max_page_num: int + + def __init__(self, pages_gen: None | Generator[Page, None, None]=None): + self.pages_gen = pages_gen + self.__pages = {} + self.__max_page_num = 0 + + def __enter__(self) -> Pages: + return self + + def __exit__(self, exc_type, exc_value, traceback) -> None: + self.close() + + def close(self): + if self.pages_gen is not None: + gen = self.pages_gen + self.pages_gen = None + gen.close() + + def is_past_end(self, page_num: int) -> bool: + while self.pages_gen is not None and page_num > self.__max_page_num: + self.__fill_page() + return page_num > self.__max_page_num + + def __fill_page(self) -> bool: + if self.pages_gen is None: + return False try: - self.extract_insns() + page = self.pages_gen.send(None) + except StopIteration: + page = None + if page is None: + self.close() + return False + if page.page_num <= self.__max_page_num: + e = AssertionError( + f"page numbers must be a strictly-increasing positive integer sequence:\n" + f"got {page.page_num} which isn't more than {self.__max_page_num}") + if self.pages_gen is not None: + self.pages_gen.throw(e) + raise e # either no generator or generator failed to propagate exception + self.__pages[page.page_num] = page + self.__max_page_num = page.page_num + return True + + def get(self, page_num: int, default: _T=None) -> _T | Page: + while True: + page = self.__pages.get(page_num) + if page is not None: + return page + if self.pages_gen is None: + return default + if page_num < self.__max_page_num: + return default + self.__fill_page() + + def __contains__(self, page_num: int, /) -> bool: + return self.get(page_num) is not None + + def __getitem__(self, page_num: int, /) -> Page: + retval = self.get(page_num) + if retval is None: + raise KeyError(page_num) + return retval + +@dataclass(unsafe_hash=True, frozen=True) +class TextSection: + page_num: int + min_x: float + min_y: float + max_x: float + max_y: float + + @classmethod + def first(cls) -> TextSection: + return cls.page_sections(page_num=1)[0] + + @cached_property + def next(self) -> TextSection: + page_sections = self.page_sections(page_num=self.page_num) + index = page_sections.index(self) + if index + 1 < len(page_sections): + return page_sections[index + 1] + for page_num in range(self.page_num + 1, self.page_num + 100000): + page_sections = self.page_sections(page_num=page_num) + if len(page_sections) != 0: + return page_sections[0] + raise AssertionError(f"can't find next TextSection after {self}") + + @classmethod + def left_column( + cls, *, + page_num: int, + min_y=PAGE_BODY_MIN_Y, + max_y=PAGE_BODY_MAX_Y, + ) -> TextSection: + return cls( + page_num=page_num, + min_x=PAGE_BODY_MIN_X, + min_y=min_y, + max_x=COLUMN_SPLIT_X, + max_y=max_y) + + @classmethod + def right_column( + cls, *, + page_num: int, + min_y=PAGE_BODY_MIN_Y, + max_y=PAGE_BODY_MAX_Y, + ) -> TextSection: + return cls( + page_num=page_num, + min_x=COLUMN_SPLIT_X, + min_y=min_y, + max_x=PAGE_BODY_MAX_X, + max_y=max_y) + + @classmethod + def columns( + cls, *, + page_num: int, + min_y=PAGE_BODY_MIN_Y, + max_y=PAGE_BODY_MAX_Y, + ) -> tuple[TextSection, TextSection]: + return (cls.left_column(page_num=page_num, min_y=min_y, max_y=max_y), + cls.right_column(page_num=page_num, min_y=min_y, max_y=max_y)) + + @classmethod + def full_page( + cls, *, + page_num: int, + min_y=PAGE_BODY_MIN_Y, + max_y=PAGE_BODY_MAX_Y, + ) -> TextSection: + return cls( + page_num=page_num, + min_x=PAGE_BODY_MIN_X, + min_y=min_y, + max_x=PAGE_BODY_MAX_X, + max_y=max_y) + + __COLUMNS_THEN_FULL_PAGE: ClassVar = { + 129: 438.992, 241: 512.419, 242: 408.077, 243: 488.509, + 244: 437.518, 245: 444.522, 247: 352.082, 248: 356.723, + 249: 365.944, 251: 334.553, 264: 184.67, 296: 267.29, + 297: 200.043, 298: 440.64, 299: 197.356, 300: 160.076, + 301: 364.924, 303: 330.055, 305: 344.867, 306: 335.403, + 307: 336.897, 308: 365.233, 309: 364.735, + } + + __FULL_PAGE_THEN_COLUMNS: ClassVar = { + 246: 689.039, + 250: 615.315, + 266: 678.088, + } + + __ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE: ClassVar = { + 128: 301.55, + } + + __TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE: ClassVar = { + 304: 242.732, + } + + __COLUMNS_THEN_COLUMNS: ClassVar = { + 79: 621.66, + 126: 519.89, + } + + __ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS: ClassVar = { + 130: 550.43, + 162: 599.247, + 194: 622.161, + 196: 682.933, + 204: 613.195, + 215: 633.12, + } + + __ONE_TITLE_LINE_THEN_COLUMNS: ClassVar = { + 103, 104, 105, 121, 139, 143, 146, 151, 153, 158, 207, 213, 218, + } + + __TWO_TITLE_LINES_THEN_COLUMNS: ClassVar = { + 198, 206, + } + + __FULL_PAGE: ClassVar = { + 118, 157, 252, 254, 255, 257, 259, 260, 265, 268, 270, 271, 272, + *range(274, 286), + } + + @classmethod + def __page_sections(cls, page_num: int) -> tuple[TextSection, ...]: + match page_num: + case _ if page_num in cls.__COLUMNS_THEN_COLUMNS: + split_y = cls.__COLUMNS_THEN_COLUMNS[page_num] + return ( + *cls.columns(page_num=page_num, min_y=split_y), + *cls.columns(page_num=page_num, max_y=split_y), + ) + case _ if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS: + return ( + cls.full_page(page_num=page_num, min_y=ONE_TITLE_LINE_SPLIT_Y), + *cls.columns(page_num=page_num, max_y=ONE_TITLE_LINE_SPLIT_Y), + ) + case _ if page_num in cls.__FULL_PAGE: + return cls.full_page(page_num=page_num), + case _ if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS: + split_y = cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS[page_num] + return ( + cls.full_page(page_num=page_num, min_y=ONE_TITLE_LINE_SPLIT_Y), + *cls.columns(page_num=page_num, min_y=split_y, max_y=ONE_TITLE_LINE_SPLIT_Y), + *cls.columns(page_num=page_num, max_y=split_y), + ) + case _ if page_num in cls.__TWO_TITLE_LINES_THEN_COLUMNS: + return ( + cls.full_page(page_num=page_num, min_y=TWO_TITLE_LINES_SPLIT_Y), + *cls.columns(page_num=page_num, max_y=TWO_TITLE_LINES_SPLIT_Y), + ) + case _ if page_num in cls.__COLUMNS_THEN_FULL_PAGE: + split_y = cls.__COLUMNS_THEN_FULL_PAGE[page_num] + return ( + *cls.columns(page_num=page_num, min_y=split_y), + cls.full_page(page_num=page_num, max_y=split_y), + ) + case _ if page_num in cls.__FULL_PAGE_THEN_COLUMNS: + split_y = cls.__FULL_PAGE_THEN_COLUMNS[page_num] + return ( + cls.full_page(page_num=page_num, min_y=split_y), + *cls.columns(page_num=page_num, max_y=split_y), + ) + case _ if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE: + split_y = cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE[page_num] + return ( + cls.full_page(page_num=page_num, min_y=ONE_TITLE_LINE_SPLIT_Y), + *cls.columns(page_num=page_num, min_y=split_y, max_y=ONE_TITLE_LINE_SPLIT_Y), + cls.full_page(page_num=page_num, max_y=split_y), + ) + case _ if page_num in cls.__TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE: + split_y = cls.__TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE[page_num] + return ( + cls.full_page(page_num=page_num, min_y=TWO_TITLE_LINES_SPLIT_Y), + *cls.columns(page_num=page_num, min_y=split_y, max_y=TWO_TITLE_LINES_SPLIT_Y), + cls.full_page(page_num=page_num, max_y=split_y), + ) + case 263: + return ( + cls.full_page(page_num=page_num, min_y=699.997), + *cls.columns(page_num=page_num, min_y=366.396, max_y=699.997), + *cls.columns(page_num=page_num, min_y=207, max_y=366.396), + cls.full_page(page_num=page_num, max_y=207), + ) + # TODO: checked up to page 309 (page named 273) + case _: + return cls.columns(page_num=page_num) + + __PAGE_SECTIONS_CACHE: ClassVar[dict[int, tuple[TextSection, ...]]] = {} + + @classmethod + def page_sections(cls, page_num: int) -> tuple[TextSection, ...]: + try: + return cls.__PAGE_SECTIONS_CACHE[page_num] + except KeyError: + pass + retval = cls.__PAGE_SECTIONS_CACHE[page_num] = cls.__page_sections(page_num=page_num) + return retval + + @classmethod + def for_position(cls, page_num: int, x: float, y: float) -> None | TextSection: + for i in cls.page_sections(page_num=page_num): + if i.min_x <= x <= i.max_x and i.min_y <= y <= i.max_y: + return i + return None + +@dataclass() +class Parser: + pages: Pages = field(default_factory=Pages) + text_section: TextSection = TextSection.first() + + @property + def page(self) -> Page: + return self.pages[self.text_section.page_num] + + @property + def unprocessed_chars(self) -> defaultdict[Font, SetById[Char]]: + return self.pages[self.text_section.page_num].unprocessed_chars[self.text_section] + + @staticmethod + def __pages_gen(file: str, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: + if page_numbers is not None: + page_numbers = sorted(i - 1 for i in page_numbers) + for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): + if page_numbers is not None: + page_num = page_numbers[i] + 1 + else: + page_num = i + 1 + print(f"page {page_num}") + yield Page.from_lt_page(page_num=page_num, page=page) + + def parse_pdf(self, file: str, page_numbers: Iterable[int] | None = None): + self.pages = Pages(pages_gen=Parser.__pages_gen( + file=file, page_numbers=page_numbers)) + self.text_section = TextSection.first() + while True: + self.text_section = self.text_section.next + if self.pages.is_past_end(self.text_section.page_num): + break + if self.text_section.page_num in self.pages: + print(f"section {self.text_section}") + with self.note_text_section(): + self.parse_text_section() + + @contextmanager + def note_text_section(self): + start_text_section = self.text_section + try: + yield + except Exception as e: + if self.text_section == start_text_section: + note = f"text_section={self.text_section}" + else: + note = f"start_text_section={start_text_section}\ntext_section={self.text_section}" + if note not in getattr(e, "__notes__", ()): + e.add_note(note) + raise + + def parse_text_section(self): + try: + with self.note_text_section(): + self.extract_insns() except InsnParseError as e: - e.add_note(f"page_num={self.page_num}") print("".join(traceback.format_exception_only(e)), flush=True) traceback.print_exc() @@ -666,7 +1072,7 @@ class PageParser: pred: None | Callable[[Char], bool] = None, ) -> None | Char: retval = None - for x, y, char in self.qt.range( + for x, y, char in self.page.qt[self.text_section].range( min_x=min_x, max_x=max_x, min_y=min_y, @@ -700,7 +1106,12 @@ class PageParser: if start_char is not None: chars.append(start_char) chars_set.add(start_char) - for x, y, char in self.qt.range( + if start_char is not None and \ + start_char.text == "*" and \ + self.text_section.page_num == 168 and \ + start_char.font in (fonts.subscript or ()): + start_min_y = start_char.max_y - fonts.regular[0].size + for x, y, char in self.page.qt[self.text_section].range( min_x=min_x - fonts.regular[0].size * 0.5, max_x=max_x, min_y=start_min_y - fonts.regular[0].size * 0.4, @@ -715,10 +1126,18 @@ class PageParser: if len(chars) == 0: return None chars.sort(key=lambda char: (char.min_x, char.text)) + regular_min_y = chars[0].min_y + regular_max_y = chars[0].max_y + for char in chars: + kind = fonts.get_kind(font=char.font, baseline_pos=BaselinePos.BELOW) + if kind is not None and kind.sub_super is FontVariantSubSuper.NOT_SUB_SUPER: + regular_min_y = char.min_y + regular_max_y = char.max_y + break retval = ParsedTextLine( element=ElementTree.Element("text-line"), - regular_min_y=chars[0].min_y, - regular_max_y=chars[0].max_y, + regular_min_y=regular_min_y, + regular_max_y=regular_max_y, fonts=fonts, chars=chars, preceding_blank_lines=preceding_blank_lines, @@ -753,7 +1172,7 @@ class PageParser: space_width = char.min_x - last_max_x space_count_f = space_width / space_font[0].space_width space_count = round(space_count_f) - if space_count == 0 and space_count_f > 0.4: + if space_count == 0 and space_count_f > 0.35: space_count = 1 if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15: print(f"spaces: space_count_f={space_count_f} space_width={space_width}") @@ -781,9 +1200,9 @@ class PageParser: self.unprocessed_chars[char.font].remove(char) if allowed_start_min_y_error is None: allowed_start_min_y_error = 0.01 - assert abs(start_min_y - chars[0].min_y) < allowed_start_min_y_error, ( - f"start_min_y={start_min_y} regular_min_y={chars[0].min_y}\n" - f"start_min_y error: {start_min_y - chars[0].min_y}\n" + assert abs(start_min_y - retval.regular_min_y) < allowed_start_min_y_error, ( + f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n" + f"start_min_y error: {start_min_y - retval.regular_min_y}\n" f"allowed_start_min_y_error={allowed_start_min_y_error}") return retval @@ -809,8 +1228,6 @@ class PageParser: def extract_insn_bit_fields( self, - min_x: float, - max_x: float, mnemonic_lines: list[ParsedTextLine], ) -> None | InsnBitFields: found_non_affix_line = False @@ -820,9 +1237,9 @@ class PageParser: else: expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y - INSN_BIT_FIELDS_TOP_PAD_HEIGHT) - for x, y, line in self.qt.range( - min_x=min_x - 5, - max_x=max_x + 5, + for x, y, line in self.page.qt[self.text_section].range( + min_x=self.text_section.min_x - 5, + max_x=self.text_section.max_x + 5, min_y=expected_non_affix_line_y - 5, max_y=expected_non_affix_line_y + 5, ): @@ -833,17 +1250,16 @@ class PageParser: break if found_non_affix_line: return self.extract_insn_bit_fields_box( - min_x=min_x, - max_x=max_x, expected_box_max_y=expected_non_affix_line_y, ) prefix_text = self.extract_text_line( start_min_y=mnemonic_lines[-1].regular_min_y - INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT, - min_x=min_x, - max_x=max_x, + min_x=self.text_section.min_x, + max_x=self.text_section.max_x, fonts=TextLineFonts.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS, allowed_start_min_y_error=2, + skip_initial_spaces=True, ) if prefix_text is None: raise InsnParseError("can't find insn prefix bit fields title") @@ -852,8 +1268,6 @@ class PageParser: raise InsnParseError( f"insn prefix bit fields title is not as expected: {prefix_text_str!r}") prefix_bit_fields = self.extract_insn_bit_fields_box( - min_x=min_x, - max_x=max_x, expected_box_max_y=prefix_text.regular_min_y - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, ) @@ -862,10 +1276,11 @@ class PageParser: suffix_text = self.extract_text_line( start_min_y=prefix_bit_fields.box_min_y - INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT, - min_x=min_x, - max_x=max_x, + min_x=self.text_section.min_x, + max_x=self.text_section.max_x, fonts=TextLineFonts.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS, allowed_start_min_y_error=2, + skip_initial_spaces=True, ) if suffix_text is None: raise InsnParseError("can't find insn suffix bit fields title") @@ -874,8 +1289,6 @@ class PageParser: raise InsnParseError( f"insn suffix bit fields title is not as expected: {suffix_text_str!r}") suffix_bit_fields = self.extract_insn_bit_fields_box( - min_x=min_x, - max_x=max_x, expected_box_max_y=suffix_text.regular_min_y - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, ) @@ -900,15 +1313,13 @@ class PageParser: def extract_insn_bit_fields_box( self, - min_x: float, - max_x: float, expected_box_max_y: float, ) -> None | InsnBitFields: h_lines: list[LTLine] = [] v_lines: list[LTLine] = [] - for x, y, line in self.qt.range( - min_x=min_x - 5, - max_x=max_x + 5, + for x, y, line in self.page.qt[self.text_section].range( + min_x=self.text_section.min_x - 5, + max_x=self.text_section.max_x + 5, min_y=expected_box_max_y - INSN_BIT_FIELDS_BOX_HEIGHT - 5, max_y=expected_box_max_y + 5, ): @@ -946,23 +1357,38 @@ class PageParser: right_line = v_lines[i + 1] field_box_min_x = left_line.x1 field_box_max_x = right_line.x0 + bit_field_name_start_min_y = box_mid_y + 3.288 + bit_field_name=self.extract_text_line( + start_min_y=bit_field_name_start_min_y, + min_x=field_box_min_x, + max_x=field_box_max_x, + fonts=TextLineFonts.INSN_BIT_FIELD_NAME_FONTS, + skip_initial_spaces=True, + allowed_start_min_y_error=0.4, + ) + if bit_field_name is None: + raise InsnParseError(f"instruction bit field name not found:\n" + f"start_min_y={bit_field_name_start_min_y} " + f"field_box_min_x={field_box_min_x} " + f"field_box_max_x={field_box_max_x}") + bit_field_number_start_min_y = box_min_y + 3.487 + bit_number=self.extract_text_line( + start_min_y=bit_field_number_start_min_y, + min_x=field_box_min_x, + max_x=field_box_max_x, + fonts=TextLineFonts.INSN_BIT_FIELD_BIT_NUMBER_FONTS, + skip_initial_spaces=True, + ) + if bit_number is None: + raise InsnParseError(f"instruction bit field bit number not found:\n" + f"start_min_y={bit_field_number_start_min_y} " + f"field_box_min_x={field_box_min_x} " + f"field_box_max_x={field_box_max_x}") fields.append(InsnBitField( box_min_x=field_box_min_x, box_max_x=field_box_max_x, - name=self.extract_text_line( - start_min_y=box_mid_y + 3.288, - min_x=field_box_min_x, - max_x=field_box_max_x, - fonts=TextLineFonts.INSN_BIT_FIELD_NAME_FONTS, - skip_initial_spaces=True, - ), - bit_number=self.extract_text_line( - start_min_y=box_min_y + 3.487, - min_x=field_box_min_x, - max_x=field_box_max_x, - fonts=TextLineFonts.INSN_BIT_FIELD_BIT_NUMBER_FONTS, - skip_initial_spaces=True, - ), + name=bit_field_name, + bit_number=bit_number, )) return InsnBitFields( prefix=None, @@ -975,8 +1401,6 @@ class PageParser: def extract_insn_header_mnemonics_and_bit_fields( self, - column_min_x: float, - column_max_x: float, start_min_y: float, header_start_char: None | Char = None, ) -> None | tuple[list[ParsedTextLine], list[ParsedTextLine], InsnBitFields]: @@ -985,8 +1409,8 @@ class PageParser: header_line = self.extract_text_line( start_char=header_start_char, start_min_y=start_min_y, - min_x=column_min_x, - max_x=column_max_x, + min_x=self.text_section.min_x, + max_x=self.text_section.max_x, fonts=TextLineFonts.INSN_HEADER_FONTS, skip_initial_spaces=True, allowed_start_min_y_error=6, @@ -996,15 +1420,15 @@ class PageParser: print(f"found header line:\n{header_line}") header_lines = self.extract_following_text_lines( first_text_line=header_line, - min_x=column_min_x, - max_x=column_max_x, + min_x=self.text_section.min_x, + max_x=self.text_section.max_x, allowed_start_min_y_error=1.5, ) print("insn header lines:") print("\n".join(map(str, header_lines))) mnemonic_start_char = self.find_top_left_char_in_range( - min_x=column_min_x - 5, - max_x=column_max_x + 5, + min_x=self.text_section.min_x - 5, + max_x=self.text_section.max_x + 5, min_y=header_lines[-1].regular_min_y - 50, max_y=header_lines[-1].regular_min_y - 5, allow_processed=False, @@ -1014,8 +1438,8 @@ class PageParser: mnemonic_line = self.extract_text_line( start_char=mnemonic_start_char, start_min_y=mnemonic_start_char.min_y, - min_x=column_min_x, - max_x=column_max_x, + min_x=self.text_section.min_x, + max_x=self.text_section.max_x, fonts=TextLineFonts.INSN_MNEMONIC_FONTS, skip_initial_spaces=True, ) @@ -1024,13 +1448,11 @@ class PageParser: mnemonic_lines = self.extract_following_text_lines( first_text_line=mnemonic_line, min_x=mnemonic_line.chars[0].min_x, - max_x=column_max_x, + max_x=self.text_section.max_x, ) print("insn mnemonic lines:") print("\n".join(map(str, mnemonic_lines))) insn_bit_fields = self.extract_insn_bit_fields( - min_x=column_min_x, - max_x=column_max_x, mnemonic_lines=mnemonic_lines, ) print(insn_bit_fields) @@ -1041,11 +1463,10 @@ class PageParser: def extract_insn_sp_regs_altered( self, sp_regs_altered_text: ParsedTextLine, - column_min_x: float, - column_max_x: float, ) -> InsnSpRegsAltered: sp_regs_altered_text.preceding_blank_lines = 0 fonts = TextLineFonts.INSN_DESC_FONTS + column_min_x = sp_regs_altered_text.chars[0].min_x table_header_reg_char = self.find_top_left_char_in_range( min_x=column_min_x - 1, max_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 1, @@ -1055,37 +1476,25 @@ class PageParser: ) assert table_header_reg_char is not None, \ "can't find special registers altered table's register-column's header" + KNOWN_SPECIAL_TEXTS = ( + "None", + "Dependent on the system service", + "See above.", + ) match table_header_reg_char.text: - case "N": - none_text = self.extract_text_line( - start_char=table_header_reg_char, - start_min_y=table_header_reg_char.min_y, - min_x=column_min_x, - max_x=column_max_x, - fonts=fonts, - skip_initial_spaces=True, - ) - assert none_text is not None and none_text.element.text == "None", \ - f"can't find special-registers-altered None: none_text={none_text}" - return InsnSpRegsAltered( - sp_regs_altered_text=sp_regs_altered_text, - special_text=none_text, - table_header_reg=None, - table_header_fields=None, - entries=(), - final_regular_min_y=none_text.regular_min_y, - ) - case "D": + case "R": + pass + case text if any(text == i[0] for i in KNOWN_SPECIAL_TEXTS): special_text = self.extract_text_line( start_char=table_header_reg_char, start_min_y=table_header_reg_char.min_y, min_x=column_min_x, - max_x=column_max_x, + max_x=self.text_section.max_x, fonts=fonts, skip_initial_spaces=True, ) assert special_text is not None \ - and special_text.element.text == "Dependent on the system service", \ + and special_text.element.text in KNOWN_SPECIAL_TEXTS, \ f"can't find special-registers-altered special-text:\n{special_text}" return InsnSpRegsAltered( sp_regs_altered_text=sp_regs_altered_text, @@ -1095,10 +1504,8 @@ class PageParser: entries=(), final_regular_min_y=special_text.regular_min_y, ) - case "R": - pass case text: - raise AssertionError( + raise InsnParseError( f"unknown special-registers-altered special-text start character: {text!r}") table_header_fields_char = self.find_top_left_char_in_range( min_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 10, @@ -1116,7 +1523,7 @@ class PageParser: (table_header_reg_char.min_x, table_header_fields_char.min_x - 1), (table_header_fields_char.min_x, column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X), - (column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, column_max_x), + (column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, self.text_section.max_x), ) table_header_reg = self.extract_text_line( start_char=table_header_reg_char, @@ -1207,14 +1614,7 @@ class PageParser: def extract_insn(self, header_start_char: Char): assert header_start_char.font == Font.INSN_HEADER print(header_start_char) - column_min_x = header_start_char.min_x - if column_min_x < COLUMN_SPLIT_X: - column_max_x = COLUMN_SPLIT_X - else: - column_max_x = 1000 header = self.extract_insn_header_mnemonics_and_bit_fields( - column_min_x=column_min_x, - column_max_x=column_max_x, start_min_y=header_start_char.min_y, header_start_char=header_start_char, ) @@ -1228,39 +1628,40 @@ class PageParser: while True: search_min_y = next_start_min_y - 70 next_char = self.find_top_left_char_in_range( - min_x=column_min_x - 5, - max_x=column_max_x + 5, - min_y=max(search_min_y, PAGE_BODY_MIN_Y), + min_x=self.text_section.min_x - 5, + max_x=self.text_section.max_x + 5, + min_y=max(search_min_y, self.text_section.min_y), max_y=next_start_min_y, allow_processed=False, ) if next_char is None: - if column_max_x == COLUMN_SPLIT_X and search_min_y <= PAGE_BODY_MIN_Y: - # go to other column - column_min_x = COLUMN_SPLIT_X - column_max_x = 1000 - next_start_min_y = PAGE_BODY_MAX_Y + if search_min_y <= self.text_section.min_y \ + and self.text_section.next is not None and \ + self.text_section.next.page_num in self.pages: + # go to next section + self.text_section = self.text_section.next + next_start_min_y = self.text_section.max_y continue else: raise InsnParseError("can't find insn code or description text") match next_char.font: case font if font in TextLineFonts.INSN_CODE_FONTS.fonts: - next_section = "code" + next_section = _InsnParseSection.CODE case font if font in TextLineFonts.INSN_DESC_FONTS.fonts: - next_section = "desc" + next_section = _InsnParseSection.DESC case Font.INSN_HEADER: - next_section = "header" + next_section = _InsnParseSection.HEADER case font: raise InsnParseError(f"can't find insn code or description text\nfont={font}") match next_section: - case "code": + case _InsnParseSection.CODE: if len(desc_lines) != 0: break code_line = self.extract_text_line( start_char=next_char, start_min_y=next_char.min_y, min_x=next_char.min_x, - max_x=column_max_x, + max_x=self.text_section.max_x, fonts=TextLineFonts.INSN_CODE_FONTS, preceding_blank_lines=0 if len(code_lines) == 0 else 1, ) @@ -1269,19 +1670,17 @@ class PageParser: more_code_lines = self.extract_following_text_lines( first_text_line=code_line, min_x=code_line.chars[0].min_x, - max_x=column_max_x, + max_x=self.text_section.max_x, allowed_start_min_y_error=0.05, ) print("more insn code lines:") print("\n".join(map(str, more_code_lines))) code_lines.extend(more_code_lines) next_start_min_y = code_lines[-1].regular_min_y - 5 - case "header": + case _InsnParseSection.HEADER: if len(code_lines) != 0 or len(desc_lines) != 0: break header = self.extract_insn_header_mnemonics_and_bit_fields( - column_min_x=column_min_x, - column_max_x=column_max_x, start_min_y=next_char.min_y, header_start_char=next_char, ) @@ -1289,13 +1688,15 @@ class PageParser: raise InsnParseError("can't find header text line") headers.append(header) next_start_min_y = header[2].box_min_y - 5 - case "desc": + case _InsnParseSection.DESC: desc_line = self.extract_text_line( + start_char=next_char, start_min_y=next_char.min_y, min_x=next_char.min_x, - max_x=column_max_x, + max_x=self.text_section.max_x, fonts=TextLineFonts.INSN_DESC_FONTS, preceding_blank_lines=0 if len(desc_lines) == 0 else 1, + allowed_start_min_y_error=3, ) if desc_line is None: raise InsnParseError("can't find insn desc text line") @@ -1304,7 +1705,7 @@ class PageParser: more_desc_lines = self.extract_following_text_lines( first_text_line=desc_line, min_x=desc_line.chars[0].min_x, - max_x=column_max_x, + max_x=self.text_section.max_x, allowed_start_min_y_error=3, ) print("more insn desc lines:") @@ -1314,8 +1715,6 @@ class PageParser: case "Special Registers Altered:": sp_regs_altered = self.extract_insn_sp_regs_altered( sp_regs_altered_text=desc_line, - column_min_x=column_min_x, - column_max_x=column_max_x, ) next_start_min_y = sp_regs_altered.final_regular_min_y break @@ -1332,9 +1731,13 @@ class PageParser: # TODO: finish def extract_insns(self): - unprocessed_header_chars = self.unprocessed_chars[Font.INSN_HEADER] - while len(unprocessed_header_chars) != 0: - self.extract_insn(next(iter(unprocessed_header_chars))) + while True: + try: + header_start_char = next(iter( + self.unprocessed_chars[Font.INSN_HEADER])) + except StopIteration: + break + self.extract_insn(header_start_char=header_start_char) def main(): if 2 < len(sys.argv):