parse instruction bit fields

This commit is contained in:
Jacob Lifshay 2024-10-25 00:37:40 -07:00
parent 77b0ce2c3d
commit 08141ce560
Signed by: programmerjake
SSH key fingerprint: SHA256:B1iRVvUJkvd7upMIiMqn6OyxvD2SgJkAH3ZnUOj6z+c

View file

@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
from collections import defaultdict from collections import defaultdict
from dataclasses import dataclass, field from dataclasses import dataclass, field
import dataclasses
from functools import cached_property from functools import cached_property
import sys import sys
from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
@ -108,6 +109,11 @@ class Parser:
COLUMN_SPLIT_X = 300.0 COLUMN_SPLIT_X = 300.0
# Vertical measurements used by extract_instr_bit_fields to derive the
# y-range it searches for box lines below the last mnemonic line.
# NOTE(review): units are presumably PDF layout units (points) -- confirm.
# presumably the gap between the last mnemonic line and the top of the box
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 14.694
# presumably the height of the bit-fields box itself
INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317
# presumably the gap below the box
INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT = 24.657
# sum of the three heights above
INSTR_BIT_FIELDS_PADDED_HEIGHT = (INSTR_BIT_FIELDS_TOP_PAD_HEIGHT
+ INSTR_BIT_FIELDS_BOX_HEIGHT + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT)
@dataclass() @dataclass()
class ParsedTextLine: class ParsedTextLine:
@ -116,6 +122,22 @@ class ParsedTextLine:
fonts: TextLineFonts fonts: TextLineFonts
chars: list[Char] chars: list[Char]
def __repr__(self) -> str:
    """Return a multi-line, field-by-field debug representation.

    The ``element`` field is rendered as serialized XML (via
    ``ElementTree.tostring``) instead of its default object repr. Any
    other dataclass field whose attribute is missing on the instance is
    shown as ``<unset>`` rather than raising ``AttributeError``.
    """
    # dedicated sentinel: can never compare identical to a real field value
    unset = object()
    parts: list[str] = []
    for f in dataclasses.fields(self):
        if f.name == "element":
            parts.append(
                f.name + "=" + ElementTree.tostring(self.element, encoding="unicode"))
            continue
        value = getattr(self, f.name, unset)
        if value is unset:
            parts.append(f.name + "=<unset>")
            continue
        parts.append(f.name + "=" + repr(value))
    sep = ",\n "
    # type(self) (not the __class__ cell) so subclasses report their own name
    return f"{type(self).__name__}({sep.join(parts)})"
def __str__(self) -> str:
    """Return this line's ``element`` serialized as an XML string."""
    xml_text = ElementTree.tostring(self.element, encoding="unicode")
    return xml_text
@ -234,6 +256,30 @@ class ElementBodyBuilder:
self.set_tag_stack(()) self.set_tag_stack(())
self.__flush_text_buffer() self.__flush_text_buffer()
@dataclass(unsafe_hash=True, frozen=True)
class InstrBitField:
    """One cell of an instruction's bit-fields box.

    Immutable record holding the horizontal extent of the cell and the
    text lines found inside it; either text line may be None when no
    matching text was extracted.
    """
    # horizontal extent of the cell
    box_min_x: float
    box_max_x: float
    # text line holding the field's name, if one was found
    name: None | ParsedTextLine
    # text line holding the field's starting bit number, if one was found
    start_bit: None | ParsedTextLine

    def __str__(self) -> str:
        """Return a compact one-line summary of the cell."""
        x_range = f"{self.box_min_x}..{self.box_max_x}"
        return f"<InstrBitField: x={x_range} name={self.name} start_bit={self.start_bit}>"
@dataclass(unsafe_hash=True, frozen=True)
class InstrBitFields:
    """The parsed bit-fields box of one instruction.

    Immutable record holding the bounding box of the whole diagram and
    the cells it contains, in the order they were collected.
    """
    # bounding box of the whole bit-fields diagram
    box_min_x: float
    box_min_y: float
    box_max_x: float
    box_max_y: float
    # the individual cells of the box
    fields: tuple[InstrBitField, ...]

    def __str__(self) -> str:
        """Return a multi-line summary: the bounding box, then one field per line."""
        sep = ",\n "
        joined_fields = sep.join(map(str, self.fields))
        return (f"<InstrBitFields: ({self.box_min_x},{self.box_min_y}).."
                f"({self.box_max_x},{self.box_max_y}) [\n"
                f" {joined_fields}]>")
@dataclass() @dataclass()
class PageParser: class PageParser:
parser: Parser parser: Parser
@ -278,7 +324,7 @@ class PageParser:
text += char.text text += char.text
print(repr(text)) print(repr(text))
assert font.known_name is not None, f"unknown font {font}\nlast char: {char}" assert font.known_name is not None, f"unknown font {font}\nlast char: {char}"
self.extract_instructions() self.extract_instrs()
def extract_text_line( def extract_text_line(
self, *, self, *,
@ -287,6 +333,7 @@ class PageParser:
min_x: float, min_x: float,
max_x: float, max_x: float,
fonts: TextLineFonts, fonts: TextLineFonts,
skip_initial_spaces=False,
) -> None | ParsedTextLine: ) -> None | ParsedTextLine:
chars: list[Char] = [] chars: list[Char] = []
if start_char is not None: if start_char is not None:
@ -319,6 +366,7 @@ class PageParser:
for char in chars: for char in chars:
kind = fonts.get_kind(char.font) kind = fonts.get_kind(char.font)
if kind is None: if kind is None:
print(f"font kind is None:\nfonts={fonts}\nchar={char}")
return None return None
if last_kind is None: if last_kind is None:
space_kind = kind space_kind = kind
@ -332,9 +380,10 @@ class PageParser:
space_count = round(space_count_f) space_count = round(space_count_f)
if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15: if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
print(f"spaces: space_count_f={space_count_f} space_width={space_width}") print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
if space_count > 0: if space_count > 0 and not skip_initial_spaces:
body_builder.set_tag_stack(space_kind.text_line_tags) body_builder.set_tag_stack(space_kind.text_line_tags)
body_builder.write_text(" " * space_count) body_builder.write_text(" " * space_count)
skip_initial_spaces = False
body_builder.set_tag_stack(kind.text_line_tags) body_builder.set_tag_stack(kind.text_line_tags)
body_builder.write_text(char.text) body_builder.write_text(char.text)
last_max_x = char.max_x last_max_x = char.max_x
@ -359,34 +408,140 @@ class PageParser:
) )
return retval return retval
def extract_instr_bit_fields(
    self,
    min_x: float,
    max_x: float,
    last_mnemonic_line_min_y: float,
) -> None | InstrBitFields:
    """Find and parse the bit-fields box below an instruction's mnemonic lines.

    Queries ``self.qt`` for `LTLine` objects in a window spanning
    ``min_x - 5 .. max_x + 5`` horizontally and, vertically, a band below
    ``last_mnemonic_line_min_y`` derived from the ``INSTR_BIT_FIELDS_*``
    constants. Lines wider than tall are treated as horizontal, all
    others as vertical. Each pair of adjacent vertical lines delimits one
    field, whose name and start-bit text lines are then extracted.

    Returns:
        The parsed box, or None when no lines at all are found in the
        search window.

    Raises:
        PageParseFailed: if the window does not contain exactly two
            horizontal lines, or contains fewer than two distinct
            vertical lines.
    """
    h_lines: list[LTLine] = []
    v_lines: list[LTLine] = []
    for _x, _y, line in self.qt.range(
        min_x=min_x - 5,
        max_x=max_x + 5,
        min_y=last_mnemonic_line_min_y
        - INSTR_BIT_FIELDS_PADDED_HEIGHT
        + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT / 2,
        max_y=last_mnemonic_line_min_y
        - INSTR_BIT_FIELDS_TOP_PAD_HEIGHT / 2,
    ):
        if not isinstance(line, LTLine):
            continue
        if line.width > line.height:
            h_lines.append(line)
        else:
            v_lines.append(line)
    h_lines.sort(key=lambda line: line.y0)
    v_lines.sort(key=lambda line: line.x0)
    # walk backwards so deleting an entry never shifts an index still to visit
    for i in reversed(range(len(v_lines) - 1)):
        if abs(v_lines[i].x0 - v_lines[i + 1].x0) < 0.5:
            del v_lines[i + 1]  # remove duplicates
    if len(h_lines) == 0 and len(v_lines) == 0:
        return None
    if len(h_lines) != 2:
        raise PageParseFailed(
            f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}")
    if len(v_lines) < 2:
        # report the vertical lines here (the original printed h_lines by mistake)
        raise PageParseFailed(
            f"instruction bit fields box has too few vertical lines:\n{v_lines}")
    # h_lines is sorted by ascending y0, so the lower-y line comes first
    bottom_line, top_line = h_lines
    box_min_x = v_lines[0].x0
    box_max_x = v_lines[-1].x0
    box_min_y = bottom_line.y0
    box_max_y = top_line.y1
    box_mid_y = (box_min_y + box_max_y) * 0.5
    print(f"bottom_line={bottom_line}")
    print(f"top_line={top_line}")
    print(v_lines)
    fields: list[InstrBitField] = []
    for i in range(len(v_lines) - 1):
        left_line = v_lines[i]
        right_line = v_lines[i + 1]
        field_box_min_x = left_line.x1
        field_box_max_x = right_line.x0
        fields.append(InstrBitField(
            box_min_x=field_box_min_x,
            box_max_x=field_box_max_x,
            # name text is searched starting just above the box's vertical middle
            name=self.extract_text_line(
                start_min_y=box_mid_y + 3,
                min_x=field_box_min_x,
                max_x=field_box_max_x,
                fonts=TextLineFonts(
                    regular=Font.INSTR_DESC,
                ),
                skip_initial_spaces=True,
            ),
            # start-bit text is searched starting just above the box's bottom edge
            start_bit=self.extract_text_line(
                start_min_y=box_min_y + 3,
                min_x=field_box_min_x,
                max_x=field_box_max_x,
                fonts=TextLineFonts(
                    regular=Font.INSTR_FIELD_BIT_NUMS,
                ),
                skip_initial_spaces=True,
            ),
        ))
    return InstrBitFields(
        box_min_x=box_min_x,
        box_min_y=box_min_y,
        box_max_x=box_max_x,
        box_max_y=box_max_y,
        fields=tuple(fields),
    )
def extract_instr(self, header_start_char: Char):
    """Extract one instruction starting at the given header character.

    Reads the header text lines, then the mnemonic text lines below
    them, then the bit-fields box below the mnemonics, printing each
    stage as it goes.

    Raises:
        PageParseFailed: if the header line or the first mnemonic line
            cannot be extracted.
    """
    assert header_start_char.font == Font.INSTR_HEADER
    column_min_x = header_start_char.min_x
    # the header's x position decides which column's right edge bounds the search
    column_max_x = COLUMN_SPLIT_X if column_min_x < COLUMN_SPLIT_X else 1000

    first_header_line = self.extract_text_line(
        start_char=header_start_char,
        start_min_y=header_start_char.min_y,
        min_x=column_min_x,
        max_x=column_max_x,
        fonts=TextLineFonts(regular=Font.INSTR_HEADER),
    )
    if first_header_line is None:
        raise PageParseFailed("can't find header text line")
    header_lines = self.extract_following_text_lines(
        first_text_line=first_header_line,
        min_x=column_min_x,
        max_x=column_max_x,
    )
    print("instr header lines:")
    print("\n".join(map(str, header_lines)))

    last_header_line = header_lines[-1]
    first_mnemonic_line = self.extract_text_line(
        # fixed vertical offset from the last header line to the first mnemonic line
        start_min_y=last_header_line.regular_min_y - 18.788,
        min_x=column_min_x,
        max_x=column_max_x,
        fonts=TextLineFonts(
            regular=Font.INSTR_DESC,
        ),
        skip_initial_spaces=True,
    )
    if first_mnemonic_line is None:
        raise PageParseFailed("can't find instr mnemonic text line")
    mnemonic_lines = self.extract_following_text_lines(
        first_text_line=first_mnemonic_line,
        # following lines must start at the first mnemonic character's x
        min_x=first_mnemonic_line.chars[0].min_x,
        max_x=column_max_x,
    )
    print("instr mnemonic lines:")
    print("\n".join(map(str, mnemonic_lines)))

    last_mnemonic_line = mnemonic_lines[-1]
    bit_fields = self.extract_instr_bit_fields(
        min_x=column_min_x,
        max_x=column_max_x,
        last_mnemonic_line_min_y=last_mnemonic_line.regular_min_y,
    )
    print(bit_fields)
    # TODO: finish
def extract_instrs(self):
    """Extract instructions until no unprocessed header characters remain.

    Each pass hands the first remaining `Font.INSTR_HEADER` character to
    `extract_instr`.
    NOTE(review): the loop only terminates if extraction removes
    characters from ``self.unprocessed_chars`` -- confirm that happens in
    the text-line extraction code.
    """
    pending_header_chars = self.unprocessed_chars[Font.INSTR_HEADER]
    while len(pending_header_chars) > 0:
        first_pending = next(iter(pending_header_chars))
        self.extract_instr(first_pending)
def main(): def main():
Parser().parse_pdf(sys.argv[1], page_numbers=range(76, 78)) Parser().parse_pdf(sys.argv[1], page_numbers=range(76, 78))