From 21b97c00640d0d36965a861abf6fe7807a4a721f Mon Sep 17 00:00:00 2001
From: Jacob Lifshay <programmerjake@gmail.com>
Date: Sun, 27 Oct 2024 22:59:27 -0700
Subject: [PATCH] WIP -- no reported errors till page 305

---
 parse_powerisa_pdf/parse_powerisa_pdf.py | 821 +++++++++++++++++------
 1 file changed, 612 insertions(+), 209 deletions(-)

diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py
index c830da1..338e6e5 100755
--- a/parse_powerisa_pdf/parse_powerisa_pdf.py
+++ b/parse_powerisa_pdf/parse_powerisa_pdf.py
@@ -1,10 +1,12 @@
 from __future__ import annotations
 from collections import defaultdict
+from collections.abc import Generator, Iterable, Iterator, Callable
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 import dataclasses
 from functools import cached_property
 import sys
-from typing import Callable, ClassVar, Iterable, Iterator, TypeAlias, TypeVar, assert_never
+from typing import ClassVar, TypeVar, assert_never
 from xml.etree import ElementTree
 import enum
 import traceback
@@ -32,8 +34,7 @@ class Font:
                 return 9.464 * self.size / Font.INSN_CODE[0].size
             case Font.INSN_DESC_BOLD.font_name | \
                     Font.INSN_DESC_ITALIC.font_name | \
-                    Font.INSN_DESC_BOLD_ITALIC.font_name | \
-                    Font.NOTATION_PAGE_SUBSCRIPT.font_name:
+                    Font.INSN_DESC_BOLD_ITALIC.font_name:
                 return 10.959 * self.size / Font.INSN_DESC[0].size
             case _ if self in Font.INSN_DESC or self.font_name == Font.INSN_DESC[0].font_name:
                 return 10.959 * self.size / Font.INSN_DESC[0].size
@@ -105,6 +106,8 @@ class Font:
         cls.INSN_DESC_SUBSCRIPT = Font(font_name='MJBFWM+DejaVuSansCondensed', size=5.978)
         cls.INSN_DESC_BOLD_SUBSCRIPT = \
             Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=5.978)
+        cls.INSN_DESC_ITALIC_SUBSCRIPT = \
+            Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=5.978)
         cls.INSN_DESC_BOLD_ITALIC_SUBSCRIPT = \
             Font(font_name='YDJYQV+DejaVuSansCondensed-BoldOblique', size=5.978)
         cls.INSN_EXT_MNEMONIC = Font(font_name='APUYSQ+zcoN-Regular', size=8.966)
@@ -125,7 +128,6 @@ class Font:
         cls.LEGAL_PAGE_ITALIC = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=9.963)
         cls.CHANGE_SUMMARY_PAGE_BOLD = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=11.955)
         cls.CHAPTER_TITLE = Font(font_name='NHUPPK+DejaVuSansCondensed-Bold', size=17.215)
-        cls.NOTATION_PAGE_SUBSCRIPT = Font(font_name='CGMSHV+DejaVuSansCondensed-Oblique', size=5.978)
         cls.MATH_MISC = (
             Font(font_name='AAJMKT+CMMI6', size=5.978),
             Font(font_name='CUTMFD+CMSSI8', size=5.978),
@@ -188,27 +190,13 @@ class Char:
         return self.max_y - self.min_y
 
 
-@dataclass()
-class Parser:
-    def parse_pdf(self, file: str, page_numbers: Iterable[int] | None = None):
-        if page_numbers is not None:
-            page_numbers = sorted(i - 1 for i in page_numbers)
-        for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
-            if page_numbers is not None:
-                page_num = page_numbers[i] + 1
-            else:
-                page_num = i + 1
-            print(f"page {page_num}")
-            try:
-                PageParser(parser=self, page_num=page_num).parse_page(page)
-            except Exception as e:
-                e.add_note(f"page_num={page_num}")
-                raise
-
-
 COLUMN_SPLIT_X = 300.0
+PAGE_BODY_MAX_X = 600.0
+PAGE_BODY_MIN_X = 50
 PAGE_BODY_MAX_Y = 780.0
 PAGE_BODY_MIN_Y = 45.0
+ONE_TITLE_LINE_SPLIT_Y = 734.0
+TWO_TITLE_LINES_SPLIT_Y = 715.0
 INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT = 29.938
 INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT = 9.278
 INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT = 20.971
@@ -281,6 +269,9 @@ class TextLineFonts:
     bold: tuple[Font, ...] | None = None
     bold_italic: tuple[Font, ...] | None = None
     subscript: tuple[Font, ...] | None = None
+    bold_subscript: tuple[Font, ...] | None = None
+    italic_subscript: tuple[Font, ...] | None = None
+    bold_italic_subscript: tuple[Font, ...] | None = None
     code: tuple[Font, ...] | None = None
     code_subscript: tuple[Font, ...] | None = None
 
@@ -293,10 +284,11 @@ class TextLineFonts:
             regular=(Font.INSN_HEADER,),
         )
         cls.INSN_BIT_FIELD_BIT_NUMBER_FONTS = cls(
-            regular=(Font.INSN_DESC_SMALL,),
+            regular=(Font.INSN_DESC_SMALL, Font.TITLE_PAGE_REV),
         )
         cls.INSN_BIT_FIELD_NAME_FONTS = cls(
             regular=Font.INSN_DESC,
+            subscript=(Font.INSN_DESC_SUBSCRIPT,),
         )
         cls.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS = cls(
             regular=(Font.INSN_DESC_SMALL,),
@@ -307,11 +299,14 @@ class TextLineFonts:
             subscript=Font.INSN_CODE_SUBSCRIPT,
         )
         cls.INSN_DESC_FONTS = cls(
-            regular=Font.INSN_DESC,
-            bold=(Font.INSN_DESC_BOLD,),
-            italic=(Font.INSN_DESC_ITALIC,),
-            bold_italic=(Font.INSN_DESC_BOLD_ITALIC,),
+            regular=(*Font.INSN_DESC, Font.INSN_DESC_SMALL),
+            bold=(Font.INSN_DESC_BOLD, Font.INSN_DESC_SMALL_BOLD),
+            italic=(Font.INSN_DESC_ITALIC, Font.INSN_DESC_SMALL_ITALIC),
+            bold_italic=(Font.INSN_DESC_BOLD_ITALIC, Font.INSN_DESC_SMALL_BOLD_ITALIC),
             subscript=(Font.INSN_DESC_SUBSCRIPT,),
+            bold_subscript=(Font.INSN_DESC_BOLD_SUBSCRIPT,),
+            italic_subscript=(Font.INSN_DESC_ITALIC_SUBSCRIPT,),
+            bold_italic_subscript=(Font.INSN_DESC_BOLD_ITALIC_SUBSCRIPT,),
             code=(Font.INSN_DESC_CODE, Font.INSN_EXT_MNEMONIC),
             code_subscript=Font.INSN_CODE_SUBSCRIPT,
         )
@@ -326,36 +321,39 @@ class TextLineFonts:
         match part_kind:
             case TextLineFontKind.REGULAR:
                 font = self.regular
-                baseline_pos = None
             case TextLineFontKind.ITALIC:
                 font = self.italic
-                baseline_pos = None
             case TextLineFontKind.BOLD:
                 font = self.bold
-                baseline_pos = None
             case TextLineFontKind.BOLD_ITALIC:
                 font = self.bold_italic
-                baseline_pos = None
             case TextLineFontKind.SUBSCRIPT:
                 font = self.subscript
-                baseline_pos = BaselinePos.BELOW
             case TextLineFontKind.SUPERSCRIPT:
                 font = self.subscript
-                baseline_pos = BaselinePos.ABOVE
+            case TextLineFontKind.BOLD_SUBSCRIPT:
+                font = self.bold_subscript
+            case TextLineFontKind.BOLD_SUPERSCRIPT:
+                font = self.bold_subscript
+            case TextLineFontKind.ITALIC_SUBSCRIPT:
+                font = self.italic_subscript
+            case TextLineFontKind.ITALIC_SUPERSCRIPT:
+                font = self.italic_subscript
+            case TextLineFontKind.BOLD_ITALIC_SUBSCRIPT:
+                font = self.bold_italic_subscript
+            case TextLineFontKind.BOLD_ITALIC_SUPERSCRIPT:
+                font = self.bold_italic_subscript
             case TextLineFontKind.CODE:
                 font = self.code
-                baseline_pos = None
             case TextLineFontKind.CODE_SUBSCRIPT:
                 font = self.code_subscript
-                baseline_pos = BaselinePos.BELOW
             case TextLineFontKind.CODE_SUPERSCRIPT:
                 font = self.code_subscript
-                baseline_pos = BaselinePos.ABOVE
             case _:
                 assert_never(part_kind)
         if font is None:
             return default
-        return font, baseline_pos
+        return font, part_kind.sub_super.baseline_pos
 
     @cached_property
     def __font_to_kind_map(self) -> dict[tuple[Font, None | BaselinePos], TextLineFontKind]:
@@ -392,40 +390,82 @@ class TextLineFonts:
 
 TextLineFonts._define_fonts()
 
+class FontVariantCode(enum.Enum):
+    CODE = ("code",)
+    NOT_CODE = ()
+
+class FontVariantBold(enum.Enum):
+    BOLD = ("b",)
+    NOT_BOLD = ()
+
+class FontVariantItalic(enum.Enum):
+    ITALIC = ("i",)
+    NOT_ITALIC = ()
+
+class FontVariantSubSuper(enum.Enum):
+    """Sub/superscript variant of a font kind; the value is its tuple of tags."""
+    NOT_SUB_SUPER = ()
+    SUBSCRIPT = ("sub",)
+    SUPERSCRIPT = ("sup",)
+
+    @cached_property
+    def baseline_pos(self) -> None | BaselinePos:
+        """BELOW for subscript, ABOVE for superscript, None for neither."""
+        if self is FontVariantSubSuper.NOT_SUB_SUPER:
+            return None
+        if self is FontVariantSubSuper.SUBSCRIPT:
+            return BaselinePos.BELOW
+        if self is FontVariantSubSuper.SUPERSCRIPT:
+            return BaselinePos.ABOVE
+        assert_never(self)
+
 class TextLineFontKind(enum.Enum):
-    REGULAR = "regular"
-    ITALIC = "italic"
-    BOLD = "bold"
-    BOLD_ITALIC = "bold_italic"
-    SUBSCRIPT = "subscript"
-    SUPERSCRIPT = "superscript"
-    CODE = "code"
-    CODE_SUBSCRIPT = "code_subscript"
-    CODE_SUPERSCRIPT = "code_superscript"
+    def __init__(
+        self,
+        code: FontVariantCode,
+        bold: FontVariantBold,
+        italic: FontVariantItalic,
+        sub_super: FontVariantSubSuper,
+    ):
+        self.code = code
+        self.bold = bold
+        self.italic = italic
+        self.sub_super = sub_super
+
+    REGULAR = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \
+        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER
+    SUBSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \
+        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT
+    SUPERSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \
+        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT
+    ITALIC = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \
+        FontVariantItalic.ITALIC, FontVariantSubSuper.NOT_SUB_SUPER
+    ITALIC_SUBSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \
+        FontVariantItalic.ITALIC, FontVariantSubSuper.SUBSCRIPT
+    ITALIC_SUPERSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.NOT_BOLD, \
+        FontVariantItalic.ITALIC, FontVariantSubSuper.SUPERSCRIPT
+    BOLD = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \
+        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER
+    BOLD_SUBSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \
+        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT
+    BOLD_SUPERSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \
+        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT
+    BOLD_ITALIC = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \
+        FontVariantItalic.ITALIC, FontVariantSubSuper.NOT_SUB_SUPER
+    BOLD_ITALIC_SUBSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \
+        FontVariantItalic.ITALIC, FontVariantSubSuper.SUBSCRIPT
+    BOLD_ITALIC_SUPERSCRIPT = FontVariantCode.NOT_CODE, FontVariantBold.BOLD, \
+        FontVariantItalic.ITALIC, FontVariantSubSuper.SUPERSCRIPT
+    CODE = FontVariantCode.CODE, FontVariantBold.NOT_BOLD, \
+        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.NOT_SUB_SUPER
+    CODE_SUBSCRIPT = FontVariantCode.CODE, FontVariantBold.NOT_BOLD, \
+        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUBSCRIPT
+    CODE_SUPERSCRIPT = FontVariantCode.CODE, FontVariantBold.NOT_BOLD, \
+        FontVariantItalic.NOT_ITALIC, FontVariantSubSuper.SUPERSCRIPT
 
     @cached_property
     def text_line_tags(self) -> tuple[str, ...]:
-        match self:
-            case TextLineFontKind.REGULAR:
-                return ()
-            case TextLineFontKind.ITALIC:
-                return "i",
-            case TextLineFontKind.BOLD:
-                return "b",
-            case TextLineFontKind.BOLD_ITALIC:
-                return "b", "i"
-            case TextLineFontKind.SUBSCRIPT:
-                return "sub",
-            case TextLineFontKind.SUPERSCRIPT:
-                return "sup",
-            case TextLineFontKind.CODE:
-                return "code",
-            case TextLineFontKind.CODE_SUBSCRIPT:
-                return "code", "sub"
-            case TextLineFontKind.CODE_SUPERSCRIPT:
-                return "code", "sup"
-            case _:
-                assert_never(self)
+        return (*self.code.value, *self.bold.value, *self.italic.value, *self.sub_super.value)
 
 class PageParseError(Exception):
     pass
@@ -489,8 +529,8 @@ class ElementBodyBuilder:
 class InsnBitField:
     box_min_x: float
     box_max_x: float
-    name: None | ParsedTextLine
-    bit_number: None | ParsedTextLine
+    name: ParsedTextLine
+    bit_number: ParsedTextLine
 
     def __str__(self) -> str:
         return f"<InsnBitField: x={self.box_min_x}..{self.box_max_x} name={self.name} bit_number={self.bit_number}>"
@@ -591,6 +631,11 @@ class InsnSpRegsAltered:
         lines.append(f")")
         return "\n".join(lines)
 
+class _InsnParseSection(enum.Enum):
+    CODE = "code"
+    HEADER = "header"
+    DESC = "desc"
+
 CHAR_TO_EXPANDED = {
     "\ufb00": "ff",
     "\ufb01": "fi",
@@ -600,21 +645,33 @@ CHAR_TO_EXPANDED = {
 }
 
 @dataclass()
-class PageParser:
-    parser: Parser
+class Page:
     page_num: int
-    qt: QuadTree[Char | LTLine | LTRect] = field(default_factory=QuadTree)
-    unprocessed_chars: defaultdict[Font, SetById[Char]] = field(
-        default_factory=lambda: defaultdict(SetById[Char]))
-    unprocessed_non_text: SetById[LTLine | LTRect] = field(
-        default_factory=SetById[LTLine | LTRect])
+    qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]]
+    unprocessed_chars: defaultdict[TextSection, defaultdict[Font, SetById[Char]]]
+    unprocessed_non_text: SetById[LTLine | LTRect]
 
-    def parse_page(self, page: LTPage):
+    @staticmethod
+    def from_lt_page(page_num: int, page: LTPage) -> Page:
+        qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree)
+        unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char]))
+        unprocessed_non_text: SetById[LTLine | LTRect] = SetById()
         for component in page:
             if isinstance(component, (LTLine, LTRect)):
-                if isinstance(component, LTRect):
-                    print(component)
-                self.qt.insert(component.x0, component.y0, component)
+                if component.width > 100 and \
+                        component.x0 < COLUMN_SPLIT_X - 10 and \
+                        component.x1 > COLUMN_SPLIT_X + 10:
+                    print(f"wide component: {component}")
+                else:
+                    print(f"component: {component}")
+                text_section = TextSection.for_position(
+                    page_num=page_num,
+                    x=(component.x0 + component.x1) * 0.5,
+                    y=(component.y0 + component.y1) * 0.5,
+                )
+                if text_section is not None:
+                    qt[text_section].insert(component.x0, component.y0, component)
+                    unprocessed_non_text.add(component)
                 continue
             if not isinstance(component, LTTextBox):
                 print(f"ignoring: {component}")
@@ -623,6 +680,16 @@ class PageParser:
                 for element in text_line:
                     if not isinstance(element, LTChar):
                         continue
+                    text_section = TextSection.for_position(
+                        page_num=page_num,
+                        x=(element.x0 + element.x1) * 0.5,
+                        y=(element.y0 + element.y1) * 0.5,
+                    )
+                    if text_section is None:
+                        if PAGE_BODY_MIN_Y <= element.y0 <= PAGE_BODY_MAX_Y:
+                            raise AssertionError(
+                                f"char not in text section: {element}\npage_num={page_num}")
+                        continue
                     char = Char(
                         text=element.get_text(),
                         font=Font(font_name=element.fontname, size=round(element.size, 3)),
@@ -632,28 +699,367 @@ class PageParser:
                         max_x=element.x1,
                         max_y=element.y1,
                     )
-                    self.qt.insert(char.min_x, char.min_y, char)
-                    self.unprocessed_chars[char.font].add(char)
-        for i in self.unprocessed_chars.values():
-            i.sort(key=Char.top_down_left_to_right_sort_key)
+                    qt[text_section].insert(char.min_x, char.min_y, char)
+                    unprocessed_chars[text_section][char.font].add(char)
+        for i in unprocessed_chars.values():
+            for j in i.values():
+                j.sort(key=Char.top_down_left_to_right_sort_key)
         unknown_fonts=[]
         unknown_font_errors=[]
-        for font, chars in self.unprocessed_chars.items():
-            if font.known_name is None:
-                text = ""
-                char = None
-                for char in chars:
-                    text += char.text
-                unknown_fonts.append(repr(font) + ",")
-                unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}")
+        for i in unprocessed_chars.values():
+            for font, chars in i.items():
+                if font.known_name is None:
+                    text = ""
+                    char = None
+                    for char in chars:
+                        text += char.text
+                    unknown_fonts.append(repr(font) + ",")
+                    unknown_font_errors.append(f"unknown font {font}\nlast char: {char}\ntext: {text!r}")
         unknown_fonts.sort()
         if len(unknown_fonts) != 0:
             raise AssertionError("\nunknown fonts:\n" + "\n".join(unknown_fonts)
                 + "\n\n" + "\n".join(unknown_font_errors))
+        return Page(
+            page_num=page_num,
+            qt=qt,
+            unprocessed_chars=unprocessed_chars,
+            unprocessed_non_text=unprocessed_non_text,
+        )
+
+class Pages:
+    pages_gen: None | Generator[Page, None, None]
+    __pages: dict[int, Page]
+    __max_page_num: int
+
+    def __init__(self, pages_gen: None | Generator[Page, None, None]=None):
+        self.pages_gen = pages_gen
+        self.__pages = {}
+        self.__max_page_num = 0
+
+    def __enter__(self) -> Pages:
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
+    def close(self):
+        if self.pages_gen is not None:
+            gen = self.pages_gen
+            self.pages_gen = None
+            gen.close()
+
+    def is_past_end(self, page_num: int) -> bool:
+        while self.pages_gen is not None and page_num > self.__max_page_num:
+            self.__fill_page()
+        return page_num > self.__max_page_num
+
+    def __fill_page(self) -> bool:
+        if self.pages_gen is None:
+            return False
         try:
-            self.extract_insns()
+            page = self.pages_gen.send(None)
+        except StopIteration:
+            page = None
+        if page is None:
+            self.close()
+            return False
+        if page.page_num <= self.__max_page_num:
+            e = AssertionError(
+                f"page numbers must be a strictly-increasing positive integer sequence:\n"
+                f"got {page.page_num} which isn't more than {self.__max_page_num}")
+            if self.pages_gen is not None:
+                self.pages_gen.throw(e)
+            raise e  # either no generator or generator failed to propagate exception
+        self.__pages[page.page_num] = page
+        self.__max_page_num = page.page_num
+        return True
+
+    def get(self, page_num: int, default: _T=None) -> _T | Page:
+        """Return page `page_num`, loading more pages on demand, else `default`."""
+        while (page := self.__pages.get(page_num)) is None:
+            # Pages arrive in strictly increasing order, so once the newest
+            # loaded page number reaches `page_num` it can never appear later;
+            # `<=` (not `<`) also stops page_num=0 from loading a page in vain.
+            if self.pages_gen is None or page_num <= self.__max_page_num:
+                return default
+            self.__fill_page()
+        return page
+
+    def __contains__(self, page_num: int, /) -> bool:
+        return self.get(page_num) is not None
+
+    def __getitem__(self, page_num: int, /) -> Page:
+        retval = self.get(page_num)
+        if retval is None:
+            raise KeyError(page_num)
+        return retval
+
+@dataclass(unsafe_hash=True, frozen=True)
+class TextSection:
+    page_num: int
+    min_x: float
+    min_y: float
+    max_x: float
+    max_y: float
+
+    @classmethod
+    def first(cls) -> TextSection:
+        return cls.page_sections(page_num=1)[0]
+
+    @cached_property
+    def next(self) -> TextSection:
+        page_sections = self.page_sections(page_num=self.page_num)
+        index = page_sections.index(self)
+        if index + 1 < len(page_sections):
+            return page_sections[index + 1]
+        for page_num in range(self.page_num + 1, self.page_num + 100000):
+            page_sections = self.page_sections(page_num=page_num)
+            if len(page_sections) != 0:
+                return page_sections[0]
+        raise AssertionError(f"can't find next TextSection after {self}")
+
+    @classmethod
+    def left_column(
+        cls, *,
+        page_num: int,
+        min_y=PAGE_BODY_MIN_Y,
+        max_y=PAGE_BODY_MAX_Y,
+    ) -> TextSection:
+        return cls(
+            page_num=page_num,
+            min_x=PAGE_BODY_MIN_X,
+            min_y=min_y,
+            max_x=COLUMN_SPLIT_X,
+            max_y=max_y)
+
+    @classmethod
+    def right_column(
+        cls, *,
+        page_num: int,
+        min_y=PAGE_BODY_MIN_Y,
+        max_y=PAGE_BODY_MAX_Y,
+    ) -> TextSection:
+        return cls(
+            page_num=page_num,
+            min_x=COLUMN_SPLIT_X,
+            min_y=min_y,
+            max_x=PAGE_BODY_MAX_X,
+            max_y=max_y)
+
+    @classmethod
+    def columns(
+        cls, *,
+        page_num: int,
+        min_y=PAGE_BODY_MIN_Y,
+        max_y=PAGE_BODY_MAX_Y,
+    ) -> tuple[TextSection, TextSection]:
+        return (cls.left_column(page_num=page_num, min_y=min_y, max_y=max_y),
+                cls.right_column(page_num=page_num, min_y=min_y, max_y=max_y))
+
+    @classmethod
+    def full_page(
+        cls, *,
+        page_num: int,
+        min_y=PAGE_BODY_MIN_Y,
+        max_y=PAGE_BODY_MAX_Y,
+    ) -> TextSection:
+        return cls(
+            page_num=page_num,
+            min_x=PAGE_BODY_MIN_X,
+            min_y=min_y,
+            max_x=PAGE_BODY_MAX_X,
+            max_y=max_y)
+
+    __COLUMNS_THEN_FULL_PAGE: ClassVar = {
+        129: 438.992, 241: 512.419, 242: 408.077, 243: 488.509,
+        244: 437.518, 245: 444.522, 247: 352.082, 248: 356.723,
+        249: 365.944, 251: 334.553, 264: 184.67, 296: 267.29,
+        297: 200.043, 298: 440.64, 299: 197.356, 300: 160.076,
+        301: 364.924, 303: 330.055, 305: 344.867, 306: 335.403,
+        307: 336.897, 308: 365.233, 309: 364.735,
+    }
+
+    __FULL_PAGE_THEN_COLUMNS: ClassVar = {
+        246: 689.039,
+        250: 615.315,
+        266: 678.088,
+    }
+
+    __ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE: ClassVar = {
+        128: 301.55,
+    }
+
+    __TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE: ClassVar = {
+        304: 242.732,
+    }
+
+    __COLUMNS_THEN_COLUMNS: ClassVar = {
+        79: 621.66,
+        126: 519.89,
+    }
+
+    __ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS: ClassVar = {
+        130: 550.43,
+        162: 599.247,
+        194: 622.161,
+        196: 682.933,
+        204: 613.195,
+        215: 633.12,
+    }
+
+    __ONE_TITLE_LINE_THEN_COLUMNS: ClassVar = {
+        103, 104, 105, 121, 139, 143, 146, 151, 153, 158, 207, 213, 218,
+    }
+
+    __TWO_TITLE_LINES_THEN_COLUMNS: ClassVar = {
+        198, 206,
+    }
+
+    __FULL_PAGE: ClassVar = {
+        118, 157, 252, 254, 255, 257, 259, 260, 265, 268, 270, 271, 272,
+        *range(274, 286),
+    }
+
+    @classmethod
+    def __page_sections(cls, page_num: int) -> tuple[TextSection, ...]:
+        match page_num:
+            case _ if page_num in cls.__COLUMNS_THEN_COLUMNS:
+                split_y = cls.__COLUMNS_THEN_COLUMNS[page_num]
+                return (
+                    *cls.columns(page_num=page_num, min_y=split_y),
+                    *cls.columns(page_num=page_num, max_y=split_y),
+                )
+            case _ if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS:
+                return (
+                    cls.full_page(page_num=page_num, min_y=ONE_TITLE_LINE_SPLIT_Y),
+                    *cls.columns(page_num=page_num, max_y=ONE_TITLE_LINE_SPLIT_Y),
+                )
+            case _ if page_num in cls.__FULL_PAGE:
+                return cls.full_page(page_num=page_num),
+            case _ if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS:
+                split_y = cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_COLUMNS[page_num]
+                return (
+                    cls.full_page(page_num=page_num, min_y=ONE_TITLE_LINE_SPLIT_Y),
+                    *cls.columns(page_num=page_num, min_y=split_y, max_y=ONE_TITLE_LINE_SPLIT_Y),
+                    *cls.columns(page_num=page_num, max_y=split_y),
+                )
+            case _ if page_num in cls.__TWO_TITLE_LINES_THEN_COLUMNS:
+                return (
+                    cls.full_page(page_num=page_num, min_y=TWO_TITLE_LINES_SPLIT_Y),
+                    *cls.columns(page_num=page_num, max_y=TWO_TITLE_LINES_SPLIT_Y),
+                )
+            case _ if page_num in cls.__COLUMNS_THEN_FULL_PAGE:
+                split_y = cls.__COLUMNS_THEN_FULL_PAGE[page_num]
+                return (
+                    *cls.columns(page_num=page_num, min_y=split_y),
+                    cls.full_page(page_num=page_num, max_y=split_y),
+                )
+            case _ if page_num in cls.__FULL_PAGE_THEN_COLUMNS:
+                split_y = cls.__FULL_PAGE_THEN_COLUMNS[page_num]
+                return (
+                    cls.full_page(page_num=page_num, min_y=split_y),
+                    *cls.columns(page_num=page_num, max_y=split_y),
+                )
+            case _ if page_num in cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE:
+                split_y = cls.__ONE_TITLE_LINE_THEN_COLUMNS_THEN_FULL_PAGE[page_num]
+                return (
+                    cls.full_page(page_num=page_num, min_y=ONE_TITLE_LINE_SPLIT_Y),
+                    *cls.columns(page_num=page_num, min_y=split_y, max_y=ONE_TITLE_LINE_SPLIT_Y),
+                    cls.full_page(page_num=page_num, max_y=split_y),
+                )
+            case _ if page_num in cls.__TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE:
+                split_y = cls.__TWO_TITLE_LINES_THEN_COLUMNS_THEN_FULL_PAGE[page_num]
+                return (
+                    cls.full_page(page_num=page_num, min_y=TWO_TITLE_LINES_SPLIT_Y),
+                    *cls.columns(page_num=page_num, min_y=split_y, max_y=TWO_TITLE_LINES_SPLIT_Y),
+                    cls.full_page(page_num=page_num, max_y=split_y),
+                )
+            case 263:
+                return (
+                    cls.full_page(page_num=page_num, min_y=699.997),
+                    *cls.columns(page_num=page_num, min_y=366.396, max_y=699.997),
+                    *cls.columns(page_num=page_num, min_y=207, max_y=366.396),
+                    cls.full_page(page_num=page_num, max_y=207),
+                )
+            # TODO: checked up to page 309 (page named 273)
+            case _:
+                return cls.columns(page_num=page_num)
+
+    __PAGE_SECTIONS_CACHE: ClassVar[dict[int, tuple[TextSection, ...]]] = {}
+
+    @classmethod
+    def page_sections(cls, page_num: int) -> tuple[TextSection, ...]:
+        """Return the memoized tuple of sections that make up `page_num`."""
+        cache = cls.__PAGE_SECTIONS_CACHE
+        sections = cache.get(page_num)
+        if sections is None:
+            sections = cache[page_num] = cls.__page_sections(page_num=page_num)
+        return sections
+
+    @classmethod
+    def for_position(cls, page_num: int, x: float, y: float) -> None | TextSection:
+        """Return the first section of `page_num` containing (x, y), or None."""
+        hits = (s for s in cls.page_sections(page_num=page_num)
+                if s.min_x <= x <= s.max_x and s.min_y <= y <= s.max_y)
+        return next(hits, None)
+
+@dataclass()
+class Parser:
+    pages: Pages = field(default_factory=Pages)
+    text_section: TextSection = TextSection.first()
+
+    @property
+    def page(self) -> Page:
+        return self.pages[self.text_section.page_num]
+
+    @property
+    def unprocessed_chars(self) -> defaultdict[Font, SetById[Char]]:
+        return self.pages[self.text_section.page_num].unprocessed_chars[self.text_section]
+
+    @staticmethod
+    def __pages_gen(file: str, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
+        """Yield a `Page` for each selected 1-based page number (all if None)."""
+        if page_numbers is not None:
+            # De-duplicate: `page_numbers[i]` below assumes extract_pages yields
+            # each selected page exactly once, so a repeat would mislabel pages.
+            page_numbers = sorted({i - 1 for i in page_numbers})
+        for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
+            page_num = i + 1 if page_numbers is None else page_numbers[i] + 1
+            print(f"page {page_num}")
+            yield Page.from_lt_page(page_num=page_num, page=page)
+
+    def parse_pdf(self, file: str, page_numbers: Iterable[int] | None = None):
+        """Parse every text section of the selected pages of `file`, in order."""
+        self.pages = Pages(pages_gen=Parser.__pages_gen(
+            file=file, page_numbers=page_numbers))
+        self.text_section = TextSection.first()
+        # check/parse before advancing so TextSection.first() isn't skipped
+        while not self.pages.is_past_end(self.text_section.page_num):
+            if self.text_section.page_num in self.pages:
+                print(f"section {self.text_section}")
+                with self.note_text_section():
+                    self.parse_text_section()
+            self.text_section = self.text_section.next
+
+    @contextmanager
+    def note_text_section(self):
+        """Tag exceptions escaping the `with` body with the active section(s)."""
+        entered_in = self.text_section
+        try:
+            yield
+        except Exception as exc:
+            note = f"text_section={self.text_section}"
+            if self.text_section != entered_in:
+                note = f"start_text_section={entered_in}\n{note}"
+            if note not in getattr(exc, "__notes__", ()):  # skip duplicate notes
+                exc.add_note(note)
+            raise
+
+    def parse_text_section(self):
+        try:
+            with self.note_text_section():
+                self.extract_insns()
         except InsnParseError as e:
-            e.add_note(f"page_num={self.page_num}")
             print("".join(traceback.format_exception_only(e)), flush=True)
             traceback.print_exc()
 
@@ -666,7 +1072,7 @@ class PageParser:
         pred: None | Callable[[Char], bool] = None,
     ) -> None | Char:
         retval = None
-        for x, y, char in self.qt.range(
+        for x, y, char in self.page.qt[self.text_section].range(
             min_x=min_x,
             max_x=max_x,
             min_y=min_y,
@@ -700,7 +1106,12 @@ class PageParser:
         if start_char is not None:
             chars.append(start_char)
             chars_set.add(start_char)
-        for x, y, char in self.qt.range(
+        if start_char is not None and \
+                start_char.text == "*" and \
+                self.text_section.page_num == 168 and \
+                start_char.font in (fonts.subscript or ()):
+            start_min_y = start_char.max_y - fonts.regular[0].size
+        for x, y, char in self.page.qt[self.text_section].range(
             min_x=min_x - fonts.regular[0].size * 0.5,
             max_x=max_x,
             min_y=start_min_y - fonts.regular[0].size * 0.4,
@@ -715,10 +1126,18 @@ class PageParser:
         if len(chars) == 0:
             return None
         chars.sort(key=lambda char: (char.min_x, char.text))
+        regular_min_y = chars[0].min_y
+        regular_max_y = chars[0].max_y
+        for char in chars:
+            kind = fonts.get_kind(font=char.font, baseline_pos=BaselinePos.BELOW)
+            if kind is not None and kind.sub_super is FontVariantSubSuper.NOT_SUB_SUPER:
+                regular_min_y = char.min_y
+                regular_max_y = char.max_y
+                break
         retval = ParsedTextLine(
             element=ElementTree.Element("text-line"),
-            regular_min_y=chars[0].min_y,
-            regular_max_y=chars[0].max_y,
+            regular_min_y=regular_min_y,
+            regular_max_y=regular_max_y,
             fonts=fonts,
             chars=chars,
             preceding_blank_lines=preceding_blank_lines,
@@ -753,7 +1172,7 @@ class PageParser:
             space_width = char.min_x - last_max_x
             space_count_f = space_width / space_font[0].space_width
             space_count = round(space_count_f)
-            if space_count == 0 and space_count_f > 0.4:
+            if space_count == 0 and space_count_f > 0.35:
                 space_count = 1
             if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
                 print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
@@ -781,9 +1200,9 @@ class PageParser:
             self.unprocessed_chars[char.font].remove(char)
         if allowed_start_min_y_error is None:
             allowed_start_min_y_error = 0.01
-        assert abs(start_min_y - chars[0].min_y) < allowed_start_min_y_error, (
-            f"start_min_y={start_min_y} regular_min_y={chars[0].min_y}\n"
-            f"start_min_y error: {start_min_y - chars[0].min_y}\n"
+        assert abs(start_min_y - retval.regular_min_y) < allowed_start_min_y_error, (
+            f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n"
+            f"start_min_y error: {start_min_y - retval.regular_min_y}\n"
             f"allowed_start_min_y_error={allowed_start_min_y_error}")
         return retval
 
@@ -809,8 +1228,6 @@ class PageParser:
 
     def extract_insn_bit_fields(
         self,
-        min_x: float,
-        max_x: float,
         mnemonic_lines: list[ParsedTextLine],
     ) -> None | InsnBitFields:
         found_non_affix_line = False
@@ -820,9 +1237,9 @@ class PageParser:
         else:
             expected_non_affix_line_y = (mnemonic_lines[-1].regular_min_y
                 - INSN_BIT_FIELDS_TOP_PAD_HEIGHT)
-        for x, y, line in self.qt.range(
-            min_x=min_x - 5,
-            max_x=max_x + 5,
+        for x, y, line in self.page.qt[self.text_section].range(
+            min_x=self.text_section.min_x - 5,
+            max_x=self.text_section.max_x + 5,
             min_y=expected_non_affix_line_y - 5,
             max_y=expected_non_affix_line_y + 5,
         ):
@@ -833,17 +1250,16 @@ class PageParser:
                 break
         if found_non_affix_line:
             return self.extract_insn_bit_fields_box(
-                min_x=min_x,
-                max_x=max_x,
                 expected_box_max_y=expected_non_affix_line_y,
             )
         prefix_text = self.extract_text_line(
             start_min_y=mnemonic_lines[-1].regular_min_y
                 - INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT,
-            min_x=min_x,
-            max_x=max_x,
+            min_x=self.text_section.min_x,
+            max_x=self.text_section.max_x,
             fonts=TextLineFonts.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS,
             allowed_start_min_y_error=2,
+            skip_initial_spaces=True,
         )
         if prefix_text is None:
             raise InsnParseError("can't find insn prefix bit fields title")
@@ -852,8 +1268,6 @@ class PageParser:
             raise InsnParseError(
                 f"insn prefix bit fields title is not as expected: {prefix_text_str!r}")
         prefix_bit_fields = self.extract_insn_bit_fields_box(
-            min_x=min_x,
-            max_x=max_x,
             expected_box_max_y=prefix_text.regular_min_y
                 - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT,
         )
@@ -862,10 +1276,11 @@ class PageParser:
         suffix_text = self.extract_text_line(
             start_min_y=prefix_bit_fields.box_min_y
                 - INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT,
-            min_x=min_x,
-            max_x=max_x,
+            min_x=self.text_section.min_x,
+            max_x=self.text_section.max_x,
             fonts=TextLineFonts.INSN_BIT_FIELDS_AFFIX_TITLE_FONTS,
             allowed_start_min_y_error=2,
+            skip_initial_spaces=True,
         )
         if suffix_text is None:
             raise InsnParseError("can't find insn suffix bit fields title")
@@ -874,8 +1289,6 @@ class PageParser:
             raise InsnParseError(
                 f"insn suffix bit fields title is not as expected: {suffix_text_str!r}")
         suffix_bit_fields = self.extract_insn_bit_fields_box(
-            min_x=min_x,
-            max_x=max_x,
             expected_box_max_y=suffix_text.regular_min_y
                 - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT,
         )
@@ -900,15 +1313,13 @@ class PageParser:
 
     def extract_insn_bit_fields_box(
         self,
-        min_x: float,
-        max_x: float,
         expected_box_max_y: float,
     ) -> None | InsnBitFields:
         h_lines: list[LTLine] = []
         v_lines: list[LTLine] = []
-        for x, y, line in self.qt.range(
-            min_x=min_x - 5,
-            max_x=max_x + 5,
+        for x, y, line in self.page.qt[self.text_section].range(
+            min_x=self.text_section.min_x - 5,
+            max_x=self.text_section.max_x + 5,
             min_y=expected_box_max_y - INSN_BIT_FIELDS_BOX_HEIGHT - 5,
             max_y=expected_box_max_y + 5,
         ):
@@ -946,23 +1357,38 @@ class PageParser:
             right_line = v_lines[i + 1]
             field_box_min_x = left_line.x1
             field_box_max_x = right_line.x0
+            bit_field_name_start_min_y = box_mid_y + 3.288
+            bit_field_name=self.extract_text_line(
+                start_min_y=bit_field_name_start_min_y,
+                min_x=field_box_min_x,
+                max_x=field_box_max_x,
+                fonts=TextLineFonts.INSN_BIT_FIELD_NAME_FONTS,
+                skip_initial_spaces=True,
+                allowed_start_min_y_error=0.4,
+            )
+            if bit_field_name is None:
+                raise InsnParseError(f"instruction bit field name not found:\n"
+                    f"start_min_y={bit_field_name_start_min_y} "
+                    f"field_box_min_x={field_box_min_x} "
+                    f"field_box_max_x={field_box_max_x}")
+            bit_field_number_start_min_y = box_min_y + 3.487
+            bit_number=self.extract_text_line(
+                start_min_y=bit_field_number_start_min_y,
+                min_x=field_box_min_x,
+                max_x=field_box_max_x,
+                fonts=TextLineFonts.INSN_BIT_FIELD_BIT_NUMBER_FONTS,
+                skip_initial_spaces=True,
+            )
+            if bit_number is None:
+                raise InsnParseError(f"instruction bit field bit number not found:\n"
+                    f"start_min_y={bit_field_number_start_min_y} "
+                    f"field_box_min_x={field_box_min_x} "
+                    f"field_box_max_x={field_box_max_x}")
             fields.append(InsnBitField(
                 box_min_x=field_box_min_x,
                 box_max_x=field_box_max_x,
-                name=self.extract_text_line(
-                    start_min_y=box_mid_y + 3.288,
-                    min_x=field_box_min_x,
-                    max_x=field_box_max_x,
-                    fonts=TextLineFonts.INSN_BIT_FIELD_NAME_FONTS,
-                    skip_initial_spaces=True,
-                ),
-                bit_number=self.extract_text_line(
-                    start_min_y=box_min_y + 3.487,
-                    min_x=field_box_min_x,
-                    max_x=field_box_max_x,
-                    fonts=TextLineFonts.INSN_BIT_FIELD_BIT_NUMBER_FONTS,
-                    skip_initial_spaces=True,
-                ),
+                name=bit_field_name,
+                bit_number=bit_number,
             ))
         return InsnBitFields(
             prefix=None,
@@ -975,8 +1401,6 @@ class PageParser:
 
     def extract_insn_header_mnemonics_and_bit_fields(
         self,
-        column_min_x: float,
-        column_max_x: float,
         start_min_y: float,
         header_start_char: None | Char = None,
     ) -> None | tuple[list[ParsedTextLine], list[ParsedTextLine], InsnBitFields]:
@@ -985,8 +1409,8 @@ class PageParser:
         header_line = self.extract_text_line(
             start_char=header_start_char,
             start_min_y=start_min_y,
-            min_x=column_min_x,
-            max_x=column_max_x,
+            min_x=self.text_section.min_x,
+            max_x=self.text_section.max_x,
             fonts=TextLineFonts.INSN_HEADER_FONTS,
             skip_initial_spaces=True,
             allowed_start_min_y_error=6,
@@ -996,15 +1420,15 @@ class PageParser:
         print(f"found header line:\n{header_line}")
         header_lines = self.extract_following_text_lines(
             first_text_line=header_line,
-            min_x=column_min_x,
-            max_x=column_max_x,
+            min_x=self.text_section.min_x,
+            max_x=self.text_section.max_x,
             allowed_start_min_y_error=1.5,
         )
         print("insn header lines:")
         print("\n".join(map(str, header_lines)))
         mnemonic_start_char = self.find_top_left_char_in_range(
-            min_x=column_min_x - 5,
-            max_x=column_max_x + 5,
+            min_x=self.text_section.min_x - 5,
+            max_x=self.text_section.max_x + 5,
             min_y=header_lines[-1].regular_min_y - 50,
             max_y=header_lines[-1].regular_min_y - 5,
             allow_processed=False,
@@ -1014,8 +1438,8 @@ class PageParser:
         mnemonic_line = self.extract_text_line(
             start_char=mnemonic_start_char,
             start_min_y=mnemonic_start_char.min_y,
-            min_x=column_min_x,
-            max_x=column_max_x,
+            min_x=self.text_section.min_x,
+            max_x=self.text_section.max_x,
             fonts=TextLineFonts.INSN_MNEMONIC_FONTS,
             skip_initial_spaces=True,
         )
@@ -1024,13 +1448,11 @@ class PageParser:
         mnemonic_lines = self.extract_following_text_lines(
             first_text_line=mnemonic_line,
             min_x=mnemonic_line.chars[0].min_x,
-            max_x=column_max_x,
+            max_x=self.text_section.max_x,
         )
         print("insn mnemonic lines:")
         print("\n".join(map(str, mnemonic_lines)))
         insn_bit_fields = self.extract_insn_bit_fields(
-            min_x=column_min_x,
-            max_x=column_max_x,
             mnemonic_lines=mnemonic_lines,
         )
         print(insn_bit_fields)
@@ -1041,11 +1463,10 @@ class PageParser:
     def extract_insn_sp_regs_altered(
         self,
         sp_regs_altered_text: ParsedTextLine,
-        column_min_x: float,
-        column_max_x: float,
     ) -> InsnSpRegsAltered:
         sp_regs_altered_text.preceding_blank_lines = 0
         fonts = TextLineFonts.INSN_DESC_FONTS
+        column_min_x = sp_regs_altered_text.chars[0].min_x
         table_header_reg_char = self.find_top_left_char_in_range(
             min_x=column_min_x - 1,
             max_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 1,
@@ -1055,37 +1476,25 @@ class PageParser:
         )
         assert table_header_reg_char is not None, \
             "can't find special registers altered table's register-column's header"
+        KNOWN_SPECIAL_TEXTS = (
+            "None",
+            "Dependent on the system service",
+            "See above.",
+        )
         match table_header_reg_char.text:
-            case "N":
-                none_text = self.extract_text_line(
-                    start_char=table_header_reg_char,
-                    start_min_y=table_header_reg_char.min_y,
-                    min_x=column_min_x,
-                    max_x=column_max_x,
-                    fonts=fonts,
-                    skip_initial_spaces=True,
-                )
-                assert none_text is not None and none_text.element.text == "None", \
-                    f"can't find special-registers-altered None: none_text={none_text}"
-                return InsnSpRegsAltered(
-                    sp_regs_altered_text=sp_regs_altered_text,
-                    special_text=none_text,
-                    table_header_reg=None,
-                    table_header_fields=None,
-                    entries=(),
-                    final_regular_min_y=none_text.regular_min_y,
-                )
-            case "D":
+            case "R":
+                pass
+            case text if any(text == i[0] for i in KNOWN_SPECIAL_TEXTS):
                 special_text = self.extract_text_line(
                     start_char=table_header_reg_char,
                     start_min_y=table_header_reg_char.min_y,
                     min_x=column_min_x,
-                    max_x=column_max_x,
+                    max_x=self.text_section.max_x,
                     fonts=fonts,
                     skip_initial_spaces=True,
                 )
                 assert special_text is not None \
-                    and special_text.element.text == "Dependent on the system service", \
+                    and special_text.element.text in KNOWN_SPECIAL_TEXTS, \
                     f"can't find special-registers-altered special-text:\n{special_text}"
                 return InsnSpRegsAltered(
                     sp_regs_altered_text=sp_regs_altered_text,
@@ -1095,10 +1504,8 @@ class PageParser:
                     entries=(),
                     final_regular_min_y=special_text.regular_min_y,
                 )
-            case "R":
-                pass
             case text:
-                raise AssertionError(
+                raise InsnParseError(
                     f"unknown special-registers-altered special-text start character: {text!r}")
         table_header_fields_char = self.find_top_left_char_in_range(
             min_x=column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 10,
@@ -1116,7 +1523,7 @@ class PageParser:
             (table_header_reg_char.min_x, table_header_fields_char.min_x - 1),
             (table_header_fields_char.min_x,
                 column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X),
-            (column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, column_max_x),
+            (column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, self.text_section.max_x),
         )
         table_header_reg = self.extract_text_line(
             start_char=table_header_reg_char,
@@ -1207,14 +1614,7 @@ class PageParser:
     def extract_insn(self, header_start_char: Char):
         assert header_start_char.font == Font.INSN_HEADER
         print(header_start_char)
-        column_min_x = header_start_char.min_x
-        if column_min_x < COLUMN_SPLIT_X:
-            column_max_x = COLUMN_SPLIT_X
-        else:
-            column_max_x = 1000
         header = self.extract_insn_header_mnemonics_and_bit_fields(
-            column_min_x=column_min_x,
-            column_max_x=column_max_x,
             start_min_y=header_start_char.min_y,
             header_start_char=header_start_char,
         )
@@ -1228,39 +1628,40 @@ class PageParser:
         while True:
             search_min_y = next_start_min_y - 70
             next_char = self.find_top_left_char_in_range(
-                min_x=column_min_x - 5,
-                max_x=column_max_x + 5,
-                min_y=max(search_min_y, PAGE_BODY_MIN_Y),
+                min_x=self.text_section.min_x - 5,
+                max_x=self.text_section.max_x + 5,
+                min_y=max(search_min_y, self.text_section.min_y),
                 max_y=next_start_min_y,
                 allow_processed=False,
             )
             if next_char is None:
-                if column_max_x == COLUMN_SPLIT_X and search_min_y <= PAGE_BODY_MIN_Y:
-                    # go to other column
-                    column_min_x = COLUMN_SPLIT_X
-                    column_max_x = 1000
-                    next_start_min_y = PAGE_BODY_MAX_Y
+                if search_min_y <= self.text_section.min_y \
+                        and self.text_section.next is not None and \
+                        self.text_section.next.page_num in self.pages:
+                    # go to next section
+                    self.text_section = self.text_section.next
+                    next_start_min_y = self.text_section.max_y
                     continue
                 else:
                     raise InsnParseError("can't find insn code or description text")
             match next_char.font:
                 case font if font in TextLineFonts.INSN_CODE_FONTS.fonts:
-                    next_section = "code"
+                    next_section = _InsnParseSection.CODE
                 case font if font in TextLineFonts.INSN_DESC_FONTS.fonts:
-                    next_section = "desc"
+                    next_section = _InsnParseSection.DESC
                 case Font.INSN_HEADER:
-                    next_section = "header"
+                    next_section = _InsnParseSection.HEADER
                 case font:
                     raise InsnParseError(f"can't find insn code or description text\nfont={font}")
             match next_section:
-                case "code":
+                case _InsnParseSection.CODE:
                     if len(desc_lines) != 0:
                         break
                     code_line = self.extract_text_line(
                         start_char=next_char,
                         start_min_y=next_char.min_y,
                         min_x=next_char.min_x,
-                        max_x=column_max_x,
+                        max_x=self.text_section.max_x,
                         fonts=TextLineFonts.INSN_CODE_FONTS,
                         preceding_blank_lines=0 if len(code_lines) == 0 else 1,
                     )
@@ -1269,19 +1670,17 @@ class PageParser:
                     more_code_lines = self.extract_following_text_lines(
                         first_text_line=code_line,
                         min_x=code_line.chars[0].min_x,
-                        max_x=column_max_x,
+                        max_x=self.text_section.max_x,
                         allowed_start_min_y_error=0.05,
                     )
                     print("more insn code lines:")
                     print("\n".join(map(str, more_code_lines)))
                     code_lines.extend(more_code_lines)
                     next_start_min_y = code_lines[-1].regular_min_y - 5
-                case "header":
+                case _InsnParseSection.HEADER:
                     if len(code_lines) != 0 or len(desc_lines) != 0:
                         break
                     header = self.extract_insn_header_mnemonics_and_bit_fields(
-                        column_min_x=column_min_x,
-                        column_max_x=column_max_x,
                         start_min_y=next_char.min_y,
                         header_start_char=next_char,
                     )
@@ -1289,13 +1688,15 @@ class PageParser:
                         raise InsnParseError("can't find header text line")
                     headers.append(header)
                     next_start_min_y = header[2].box_min_y - 5
-                case "desc":
+                case _InsnParseSection.DESC:
                     desc_line = self.extract_text_line(
+                        start_char=next_char,
                         start_min_y=next_char.min_y,
                         min_x=next_char.min_x,
-                        max_x=column_max_x,
+                        max_x=self.text_section.max_x,
                         fonts=TextLineFonts.INSN_DESC_FONTS,
                         preceding_blank_lines=0 if len(desc_lines) == 0 else 1,
+                        allowed_start_min_y_error=3,
                     )
                     if desc_line is None:
                         raise InsnParseError("can't find insn desc text line")
@@ -1304,7 +1705,7 @@ class PageParser:
                             more_desc_lines = self.extract_following_text_lines(
                                 first_text_line=desc_line,
                                 min_x=desc_line.chars[0].min_x,
-                                max_x=column_max_x,
+                                max_x=self.text_section.max_x,
                                 allowed_start_min_y_error=3,
                             )
                             print("more insn desc lines:")
@@ -1314,8 +1715,6 @@ class PageParser:
                         case "Special Registers Altered:":
                             sp_regs_altered = self.extract_insn_sp_regs_altered(
                                 sp_regs_altered_text=desc_line,
-                                column_min_x=column_min_x,
-                                column_max_x=column_max_x,
                             )
                             next_start_min_y = sp_regs_altered.final_regular_min_y
                             break
@@ -1332,9 +1731,13 @@ class PageParser:
         # TODO: finish
 
     def extract_insns(self):
-        unprocessed_header_chars = self.unprocessed_chars[Font.INSN_HEADER]
-        while len(unprocessed_header_chars) != 0:
-            self.extract_insn(next(iter(unprocessed_header_chars)))
+        # Parse one instruction per pass until no unprocessed header chars remain;
+        # the lookup is redone on every pass rather than cached in a local.
+        while True:
+            header_start_char = next(iter(self.unprocessed_chars[Font.INSN_HEADER]), None)
+            if header_start_char is None:
+                break
+            self.extract_insn(header_start_char=header_start_char)
 
 def main():
     if 2 < len(sys.argv):