parse instruction bit fields
This commit is contained in:
parent
77b0ce2c3d
commit
08141ce560
|
@ -1,6 +1,7 @@
|
|||
from __future__ import annotations
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
import dataclasses
|
||||
from functools import cached_property
|
||||
import sys
|
||||
from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
|
||||
|
@ -108,6 +109,11 @@ class Parser:
|
|||
|
||||
|
||||
# x coordinate compared against a header's min_x to decide which page column
# an instruction belongs to.
COLUMN_SPLIT_X = 300.0

# Vertical layout of the instruction bit-fields box, measured downwards from
# the bottom of the last mnemonic line: padding above the box, the box itself,
# then padding below it.
# NOTE(review): units are presumably PDF points -- confirm against the parser.
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 14.694
INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317
INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT = 24.657

# Total height of the box including both paddings.
INSTR_BIT_FIELDS_PADDED_HEIGHT = (
    INSTR_BIT_FIELDS_TOP_PAD_HEIGHT
    + INSTR_BIT_FIELDS_BOX_HEIGHT
    + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT
)
|
||||
|
||||
@dataclass()
|
||||
class ParsedTextLine:
|
||||
|
@ -116,6 +122,22 @@ class ParsedTextLine:
|
|||
fonts: TextLineFonts
|
||||
chars: list[Char]
|
||||
|
||||
def __repr__(self) -> str:
    """Return a multi-line debug representation listing every dataclass field.

    The ``element`` field is rendered as serialized XML text instead of the
    default object repr.  Fields that have not been assigned on this
    instance are shown as ``<unset>`` rather than raising AttributeError.
    """
    # Dedicated sentinel: distinguishes "attribute missing" from any real
    # value, including None.
    unset = object()
    parts: list[str] = []
    for f in dataclasses.fields(self):
        if f.name == "element":
            xml_text = ElementTree.tostring(self.element, encoding="unicode")
            parts.append(f.name + "=" + xml_text)
            continue
        value = getattr(self, f.name, unset)
        if value is unset:
            parts.append(f.name + "=<unset>")
            continue
        parts.append(f.name + "=" + repr(value))
    sep = ",\n "
    # type(self) rather than __class__ so a subclass reports its own name.
    return f"{type(self).__name__}({sep.join(parts)})"
|
||||
|
||||
def __str__(self) -> str:
    """Return this line's ``element`` serialized as XML text."""
    xml_text = ElementTree.tostring(self.element, encoding="unicode")
    return xml_text
|
||||
|
||||
|
@ -234,6 +256,30 @@ class ElementBodyBuilder:
|
|||
self.set_tag_stack(())
|
||||
self.__flush_text_buffer()
|
||||
|
||||
@dataclass(frozen=True, unsafe_hash=True)
class InstrBitField:
    """One cell of an instruction's bit-fields box.

    Immutable record pairing the cell's horizontal extent with the text
    lines found inside it.
    """

    # Horizontal extent of the cell (between two adjacent vertical lines).
    box_min_x: float
    box_max_x: float
    # Field name text line, or None when no text line was found.
    name: None | ParsedTextLine
    # Bit-number text line, or None when no text line was found.
    start_bit: None | ParsedTextLine

    def __str__(self) -> str:
        """Return a compact one-line summary of the cell."""
        x_range = f"{self.box_min_x}..{self.box_max_x}"
        return (
            f"<InstrBitField: x={x_range} "
            f"name={self.name} start_bit={self.start_bit}>"
        )
|
||||
|
||||
@dataclass(frozen=True, unsafe_hash=True)
class InstrBitFields:
    """The whole bit-fields box of an instruction.

    Immutable record holding the box's bounding rectangle and its cells.
    """

    # Bounding rectangle of the box.
    box_min_x: float
    box_min_y: float
    box_max_x: float
    box_max_y: float
    # The cells of the box, in the order they were extracted.
    fields: tuple[InstrBitField, ...]

    def __str__(self):
        """Return a multi-line summary: the rectangle, then one cell per line."""
        low_corner = f"({self.box_min_x},{self.box_min_y})"
        high_corner = f"({self.box_max_x},{self.box_max_y})"
        listing = ",\n ".join(str(item) for item in self.fields)
        return f"<InstrBitFields: {low_corner}..{high_corner} [\n {listing}]>"
|
||||
|
||||
@dataclass()
|
||||
class PageParser:
|
||||
parser: Parser
|
||||
|
@ -278,7 +324,7 @@ class PageParser:
|
|||
text += char.text
|
||||
print(repr(text))
|
||||
assert font.known_name is not None, f"unknown font {font}\nlast char: {char}"
|
||||
self.extract_instructions()
|
||||
self.extract_instrs()
|
||||
|
||||
def extract_text_line(
|
||||
self, *,
|
||||
|
@ -287,6 +333,7 @@ class PageParser:
|
|||
min_x: float,
|
||||
max_x: float,
|
||||
fonts: TextLineFonts,
|
||||
skip_initial_spaces=False,
|
||||
) -> None | ParsedTextLine:
|
||||
chars: list[Char] = []
|
||||
if start_char is not None:
|
||||
|
@ -319,6 +366,7 @@ class PageParser:
|
|||
for char in chars:
|
||||
kind = fonts.get_kind(char.font)
|
||||
if kind is None:
|
||||
print(f"font kind is None:\nfonts={fonts}\nchar={char}")
|
||||
return None
|
||||
if last_kind is None:
|
||||
space_kind = kind
|
||||
|
@ -332,9 +380,10 @@ class PageParser:
|
|||
space_count = round(space_count_f)
|
||||
if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
|
||||
print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
|
||||
if space_count > 0:
|
||||
if space_count > 0 and not skip_initial_spaces:
|
||||
body_builder.set_tag_stack(space_kind.text_line_tags)
|
||||
body_builder.write_text(" " * space_count)
|
||||
skip_initial_spaces = False
|
||||
body_builder.set_tag_stack(kind.text_line_tags)
|
||||
body_builder.write_text(char.text)
|
||||
last_max_x = char.max_x
|
||||
|
@ -359,34 +408,140 @@ class PageParser:
|
|||
)
|
||||
return retval
|
||||
|
||||
def extract_instruction(self, header_start_char: Char):
|
||||
def extract_instr_bit_fields(
    self,
    min_x: float,
    max_x: float,
    last_mnemonic_line_min_y: float,
) -> None | InstrBitFields:
    """Locate and parse the bit-fields box below an instruction's mnemonics.

    Collects the ``LTLine`` objects returned by ``self.qt.range`` for a
    window below ``last_mnemonic_line_min_y``, splits them into horizontal
    and vertical lines, and reads one field from each gap between adjacent
    vertical lines.

    Args:
        min_x: Left edge of the column; the search extends 5 units past it.
        max_x: Right edge of the column; the search extends 5 units past it.
        last_mnemonic_line_min_y: Bottom y of the last mnemonic text line.

    Returns:
        The parsed box, or None when the window contains no lines at all.

    Raises:
        PageParseFailed: If the window does not contain exactly two
            horizontal lines, or contains fewer than two distinct vertical
            lines.
    """
    # The window starts halfway into the top padding and ends halfway into
    # the bottom padding, so it covers the box with some slack.
    search_min_y = (last_mnemonic_line_min_y
                    - INSTR_BIT_FIELDS_PADDED_HEIGHT
                    + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT / 2)
    search_max_y = (last_mnemonic_line_min_y
                    - INSTR_BIT_FIELDS_TOP_PAD_HEIGHT / 2)

    h_lines: list[LTLine] = []
    v_lines: list[LTLine] = []
    for _x, _y, line in self.qt.range(
        min_x=min_x - 5,
        max_x=max_x + 5,
        min_y=search_min_y,
        max_y=search_max_y,
    ):
        if not isinstance(line, LTLine):
            continue
        # Wider than tall counts as horizontal; everything else as vertical.
        if line.width > line.height:
            h_lines.append(line)
        else:
            v_lines.append(line)

    h_lines.sort(key=lambda line: line.y0)
    v_lines.sort(key=lambda line: line.x0)
    # Drop vertical lines within 0.5 of their left neighbour.  Walking
    # backwards keeps the remaining indices valid while deleting.
    for i in reversed(range(len(v_lines) - 1)):
        if abs(v_lines[i].x0 - v_lines[i + 1].x0) < 0.5:
            del v_lines[i + 1]  # remove duplicates

    if not h_lines and not v_lines:
        return None
    if len(h_lines) != 2:
        raise PageParseFailed(
            f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}")
    if len(v_lines) < 2:
        # Report the vertical lines here; the horizontal ones are not the problem.
        raise PageParseFailed(
            f"instruction bit fields box has too few vertical lines:\n{v_lines}")

    bottom_line, top_line = h_lines
    box_min_x = v_lines[0].x0
    box_max_x = v_lines[-1].x0
    box_min_y = bottom_line.y0
    box_max_y = top_line.y1
    box_mid_y = (box_min_y + box_max_y) * 0.5
    print(f"bottom_line={bottom_line}")
    print(f"top_line={top_line}")
    print(v_lines)

    fields: list[InstrBitField] = []
    # Each pair of adjacent vertical lines bounds one field.
    for left_line, right_line in zip(v_lines, v_lines[1:]):
        field_box_min_x = left_line.x1
        field_box_max_x = right_line.x0
        # The name is searched for just above the box's vertical middle.
        name = self.extract_text_line(
            start_min_y=box_mid_y + 3,
            min_x=field_box_min_x,
            max_x=field_box_max_x,
            fonts=TextLineFonts(
                regular=Font.INSTR_DESC,
            ),
            skip_initial_spaces=True,
        )
        # The bit number is searched for just above the box's bottom edge.
        start_bit = self.extract_text_line(
            start_min_y=box_min_y + 3,
            min_x=field_box_min_x,
            max_x=field_box_max_x,
            fonts=TextLineFonts(
                regular=Font.INSTR_FIELD_BIT_NUMS,
            ),
            skip_initial_spaces=True,
        )
        fields.append(InstrBitField(
            box_min_x=field_box_min_x,
            box_max_x=field_box_max_x,
            name=name,
            start_bit=start_bit,
        ))

    return InstrBitFields(
        box_min_x=box_min_x,
        box_min_y=box_min_y,
        box_max_x=box_max_x,
        box_max_y=box_max_y,
        fields=tuple(fields),
    )
|
||||
|
||||
def extract_instr(self, header_start_char: Char):
    """Extract one instruction starting at its header's first character.

    Reads the header text lines, then the mnemonic text lines below them,
    then the bit-fields box below the mnemonics, printing each as it goes.
    Extraction of the rest of the instruction is not implemented yet.

    Raises:
        PageParseFailed: If the header line or the first mnemonic line
            cannot be found.
    """
    assert header_start_char.font == Font.INSTR_HEADER

    # The column runs from the header's left edge to the split line, or to
    # a large right bound when the header is already past the split.
    column_min_x = header_start_char.min_x
    column_max_x = COLUMN_SPLIT_X if column_min_x < COLUMN_SPLIT_X else 1000

    header_line = self.extract_text_line(
        start_char=header_start_char,
        start_min_y=header_start_char.min_y,
        min_x=column_min_x,
        max_x=column_max_x,
        fonts=TextLineFonts(regular=Font.INSTR_HEADER),
    )
    if header_line is None:
        raise PageParseFailed("can't find header text line")
    header_lines = self.extract_following_text_lines(
        first_text_line=header_line,
        min_x=column_min_x,
        max_x=column_max_x,
    )
    print("instr header lines:")
    print(*header_lines, sep="\n")

    # The first mnemonic line is searched for a fixed distance below the
    # last header line.
    mnemonic_start_y = header_lines[-1].regular_min_y - 18.788
    mnemonic_line = self.extract_text_line(
        start_min_y=mnemonic_start_y,
        min_x=column_min_x,
        max_x=column_max_x,
        fonts=TextLineFonts(
            regular=Font.INSTR_DESC,
        ),
        skip_initial_spaces=True,
    )
    if mnemonic_line is None:
        raise PageParseFailed("can't find instr mnemonic text line")
    mnemonic_lines = self.extract_following_text_lines(
        first_text_line=mnemonic_line,
        min_x=mnemonic_line.chars[0].min_x,
        max_x=column_max_x,
    )
    print("instr mnemonic lines:")
    print(*mnemonic_lines, sep="\n")

    instr_bit_fields = self.extract_instr_bit_fields(
        min_x=column_min_x,
        max_x=column_max_x,
        last_mnemonic_line_min_y=mnemonic_lines[-1].regular_min_y,
    )
    print(instr_bit_fields)
    # TODO: finish
|
||||
|
||||
def extract_instructions(self):
|
||||
def extract_instrs(self):
    """Extract instructions until no instruction-header characters remain.

    Repeatedly passes the first remaining header character to
    ``extract_instr``.
    """
    pending = self.unprocessed_chars[Font.INSTR_HEADER]
    # NOTE(review): termination relies on extract_instr removing characters
    # from this collection; presumably that happens during text extraction --
    # confirm.
    while len(pending) > 0:
        first_char = next(iter(pending))
        self.extract_instr(first_char)
|
||||
|
||||
def main():
    """Parse pages 76 and 77 of the PDF named by the first command-line argument."""
    parser = Parser()
    parser.parse_pdf(sys.argv[1], page_numbers=range(76, 78))
|
Loading…
Reference in a new issue