parse instruction bit fields
This commit is contained in:
parent
77b0ce2c3d
commit
08141ce560
|
@ -1,6 +1,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
import dataclasses
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
import sys
|
import sys
|
||||||
from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
|
from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload
|
||||||
|
@ -108,6 +109,11 @@ class Parser:
|
||||||
|
|
||||||
|
|
||||||
COLUMN_SPLIT_X = 300.0
|
COLUMN_SPLIT_X = 300.0
|
||||||
|
INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 14.694
|
||||||
|
INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317
|
||||||
|
INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT = 24.657
|
||||||
|
INSTR_BIT_FIELDS_PADDED_HEIGHT = (INSTR_BIT_FIELDS_TOP_PAD_HEIGHT
|
||||||
|
+ INSTR_BIT_FIELDS_BOX_HEIGHT + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT)
|
||||||
|
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class ParsedTextLine:
|
class ParsedTextLine:
|
||||||
|
@ -116,6 +122,22 @@ class ParsedTextLine:
|
||||||
fonts: TextLineFonts
|
fonts: TextLineFonts
|
||||||
chars: list[Char]
|
chars: list[Char]
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
    """Return a multi-line, field-by-field debug representation.

    The ``element`` field is rendered as serialized XML (via
    ``ElementTree.tostring``); every other field is rendered with
    ``repr()``.  Any field that has no value on this instance -- including
    ``element`` -- is shown as ``<unset>`` instead of raising, so the repr
    stays usable on a partially initialised object.
    """
    # dedicated sentinel so "attribute missing" can't be confused with any
    # real field value (None included)
    unset = object()
    parts: list[str] = []
    for fld in dataclasses.fields(self):
        value = getattr(self, fld.name, unset)
        if value is unset:
            parts.append(f"{fld.name}=<unset>")
        elif fld.name == "element":
            parts.append(
                f"{fld.name}="
                + ElementTree.tostring(value, encoding="unicode"))
        else:
            parts.append(f"{fld.name}={value!r}")
    sep = ",\n "
    # type(self) so a subclass reports its own name
    return f"{type(self).__name__}({sep.join(parts)})"
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
return ElementTree.tostring(self.element, encoding="unicode")
|
return ElementTree.tostring(self.element, encoding="unicode")
|
||||||
|
|
||||||
|
@ -234,6 +256,30 @@ class ElementBodyBuilder:
|
||||||
self.set_tag_stack(())
|
self.set_tag_stack(())
|
||||||
self.__flush_text_buffer()
|
self.__flush_text_buffer()
|
||||||
|
|
||||||
|
@dataclass(unsafe_hash=True, frozen=True)
class InstrBitField:
    """One horizontal cell of an instruction's bit-fields box.

    Immutable value object holding the cell's x extent together with the
    text lines extracted for it.
    """

    # left / right x bounds of the cell
    box_min_x: float
    box_max_x: float
    # text line holding the field's name, or None if none was extracted
    name: None | ParsedTextLine
    # text line holding the field's starting bit number, or None if none
    # was extracted
    start_bit: None | ParsedTextLine

    def __str__(self) -> str:
        """Return a compact one-line summary of the field."""
        x_range = f"{self.box_min_x}..{self.box_max_x}"
        return (f"<InstrBitField: x={x_range} "
                f"name={self.name} start_bit={self.start_bit}>")
|
||||||
|
|
||||||
|
@dataclass(unsafe_hash=True, frozen=True)
class InstrBitFields:
    """The bit-fields box of one instruction: its bounds plus its fields.

    Immutable value object.
    """

    # bounding box of the whole bit-fields box
    box_min_x: float
    box_min_y: float
    box_max_x: float
    box_max_y: float
    # the individual fields contained in the box
    fields: tuple[InstrBitField, ...]

    def __str__(self) -> str:
        """Return a multi-line summary: the box corners, then one field per line."""
        min_corner = f"({self.box_min_x},{self.box_min_y})"
        max_corner = f"({self.box_max_x},{self.box_max_y})"
        listing = ",\n ".join(str(item) for item in self.fields)
        return f"<InstrBitFields: {min_corner}..{max_corner} [\n {listing}]>"
|
||||||
|
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class PageParser:
|
class PageParser:
|
||||||
parser: Parser
|
parser: Parser
|
||||||
|
@ -278,7 +324,7 @@ class PageParser:
|
||||||
text += char.text
|
text += char.text
|
||||||
print(repr(text))
|
print(repr(text))
|
||||||
assert font.known_name is not None, f"unknown font {font}\nlast char: {char}"
|
assert font.known_name is not None, f"unknown font {font}\nlast char: {char}"
|
||||||
self.extract_instructions()
|
self.extract_instrs()
|
||||||
|
|
||||||
def extract_text_line(
|
def extract_text_line(
|
||||||
self, *,
|
self, *,
|
||||||
|
@ -287,6 +333,7 @@ class PageParser:
|
||||||
min_x: float,
|
min_x: float,
|
||||||
max_x: float,
|
max_x: float,
|
||||||
fonts: TextLineFonts,
|
fonts: TextLineFonts,
|
||||||
|
skip_initial_spaces=False,
|
||||||
) -> None | ParsedTextLine:
|
) -> None | ParsedTextLine:
|
||||||
chars: list[Char] = []
|
chars: list[Char] = []
|
||||||
if start_char is not None:
|
if start_char is not None:
|
||||||
|
@ -319,6 +366,7 @@ class PageParser:
|
||||||
for char in chars:
|
for char in chars:
|
||||||
kind = fonts.get_kind(char.font)
|
kind = fonts.get_kind(char.font)
|
||||||
if kind is None:
|
if kind is None:
|
||||||
|
print(f"font kind is None:\nfonts={fonts}\nchar={char}")
|
||||||
return None
|
return None
|
||||||
if last_kind is None:
|
if last_kind is None:
|
||||||
space_kind = kind
|
space_kind = kind
|
||||||
|
@ -332,9 +380,10 @@ class PageParser:
|
||||||
space_count = round(space_count_f)
|
space_count = round(space_count_f)
|
||||||
if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
|
if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15:
|
||||||
print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
|
print(f"spaces: space_count_f={space_count_f} space_width={space_width}")
|
||||||
if space_count > 0:
|
if space_count > 0 and not skip_initial_spaces:
|
||||||
body_builder.set_tag_stack(space_kind.text_line_tags)
|
body_builder.set_tag_stack(space_kind.text_line_tags)
|
||||||
body_builder.write_text(" " * space_count)
|
body_builder.write_text(" " * space_count)
|
||||||
|
skip_initial_spaces = False
|
||||||
body_builder.set_tag_stack(kind.text_line_tags)
|
body_builder.set_tag_stack(kind.text_line_tags)
|
||||||
body_builder.write_text(char.text)
|
body_builder.write_text(char.text)
|
||||||
last_max_x = char.max_x
|
last_max_x = char.max_x
|
||||||
|
@ -359,34 +408,140 @@ class PageParser:
|
||||||
)
|
)
|
||||||
return retval
|
return retval
|
||||||
|
|
||||||
def extract_instruction(self, header_start_char: Char):
|
def extract_instr_bit_fields(
    self,
    min_x: float,
    max_x: float,
    last_mnemonic_line_min_y: float,
) -> None | InstrBitFields:
    """Find and parse the bit-fields box below an instruction's mnemonic lines.

    Collects the ``LTLine`` objects that ``self.qt.range`` yields for a
    window placed below ``last_mnemonic_line_min_y``, splits them into
    horizontal and vertical lines, and treats each gap between adjacent
    vertical lines as one field whose name and start-bit text lines are
    extracted with ``extract_text_line``.

    Args:
        min_x: left bound of the search window (widened by 5 units).
        max_x: right bound of the search window (widened by 5 units).
        last_mnemonic_line_min_y: y value the search window is placed below.

    Returns:
        The parsed ``InstrBitFields``, or ``None`` when the window contains
        no lines at all.

    Raises:
        PageParseFailed: if lines were found but there are not exactly two
            horizontal lines, or fewer than two distinct vertical lines.
    """
    h_lines: list[LTLine] = []
    v_lines: list[LTLine] = []
    # The window starts half the top padding below the mnemonic line and
    # ends half the bottom padding above the bottom of the padded box.
    search_min_y = (last_mnemonic_line_min_y
                    - INSTR_BIT_FIELDS_PADDED_HEIGHT
                    + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT / 2)
    search_max_y = (last_mnemonic_line_min_y
                    - INSTR_BIT_FIELDS_TOP_PAD_HEIGHT / 2)
    for _x, _y, line in self.qt.range(
        min_x=min_x - 5,
        max_x=max_x + 5,
        min_y=search_min_y,
        max_y=search_max_y,
    ):
        if not isinstance(line, LTLine):
            continue
        # wider than tall => horizontal, otherwise vertical
        if line.width > line.height:
            h_lines.append(line)
        else:
            v_lines.append(line)
    h_lines.sort(key=lambda line: line.y0)
    v_lines.sort(key=lambda line: line.x0)
    # Drop vertical lines that sit within 0.5 of their left neighbour
    # (duplicates); iterate backwards so deletions don't shift the
    # indices still to be visited.
    for i in reversed(range(len(v_lines) - 1)):
        if abs(v_lines[i].x0 - v_lines[i + 1].x0) < 0.5:
            del v_lines[i + 1]
    if not h_lines and not v_lines:
        return None
    if len(h_lines) != 2:
        raise PageParseFailed(
            f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}")
    if len(v_lines) < 2:
        # report the vertical lines -- they are what this check is about
        raise PageParseFailed(
            f"instruction bit fields box has too few vertical lines:\n{v_lines}")
    # h_lines is sorted by ascending y0, so the first one is the bottom edge
    bottom_line, top_line = h_lines
    box_min_x = v_lines[0].x0
    box_max_x = v_lines[-1].x0
    box_min_y = bottom_line.y0
    box_max_y = top_line.y1
    box_mid_y = (box_min_y + box_max_y) * 0.5
    print(f"bottom_line={bottom_line}")
    print(f"top_line={top_line}")
    print(v_lines)
    fields: list[InstrBitField] = []
    # each pair of neighbouring vertical lines delimits one field
    for left_line, right_line in zip(v_lines, v_lines[1:]):
        field_box_min_x = left_line.x1
        field_box_max_x = right_line.x0
        fields.append(InstrBitField(
            box_min_x=field_box_min_x,
            box_max_x=field_box_max_x,
            # the name is looked up just above the box's vertical midpoint
            name=self.extract_text_line(
                start_min_y=box_mid_y + 3,
                min_x=field_box_min_x,
                max_x=field_box_max_x,
                fonts=TextLineFonts(
                    regular=Font.INSTR_DESC,
                ),
                skip_initial_spaces=True,
            ),
            # the start bit is looked up just above the box's bottom edge
            start_bit=self.extract_text_line(
                start_min_y=box_min_y + 3,
                min_x=field_box_min_x,
                max_x=field_box_max_x,
                fonts=TextLineFonts(
                    regular=Font.INSTR_FIELD_BIT_NUMS,
                ),
                skip_initial_spaces=True,
            ),
        ))
    return InstrBitFields(
        box_min_x=box_min_x,
        box_min_y=box_min_y,
        box_max_x=box_max_x,
        box_max_y=box_max_y,
        fields=tuple(fields),
    )
|
||||||
|
|
||||||
|
def extract_instr(self, header_start_char: Char):
|
||||||
assert header_start_char.font == Font.INSTR_HEADER
|
assert header_start_char.font == Font.INSTR_HEADER
|
||||||
if header_start_char.min_x < COLUMN_SPLIT_X:
|
column_min_x = header_start_char.min_x
|
||||||
|
if column_min_x < COLUMN_SPLIT_X:
|
||||||
column_max_x = COLUMN_SPLIT_X
|
column_max_x = COLUMN_SPLIT_X
|
||||||
else:
|
else:
|
||||||
column_max_x = 1000
|
column_max_x = 1000
|
||||||
header_text_line = self.extract_text_line(
|
header_line = self.extract_text_line(
|
||||||
start_char=header_start_char,
|
start_char=header_start_char,
|
||||||
start_min_y=header_start_char.min_y,
|
start_min_y=header_start_char.min_y,
|
||||||
min_x=header_start_char.min_x,
|
min_x=column_min_x,
|
||||||
max_x=column_max_x,
|
max_x=column_max_x,
|
||||||
fonts=TextLineFonts(regular=Font.INSTR_HEADER),
|
fonts=TextLineFonts(regular=Font.INSTR_HEADER),
|
||||||
)
|
)
|
||||||
if header_text_line is None:
|
if header_line is None:
|
||||||
raise PageParseFailed("can't find header text line")
|
raise PageParseFailed("can't find header text line")
|
||||||
print(header_text_line)
|
|
||||||
header_lines = self.extract_following_text_lines(
|
header_lines = self.extract_following_text_lines(
|
||||||
first_text_line=header_text_line,
|
first_text_line=header_line,
|
||||||
min_x=header_start_char.min_x,
|
min_x=column_min_x,
|
||||||
max_x=column_max_x,
|
max_x=column_max_x,
|
||||||
)
|
)
|
||||||
print(*header_lines)
|
print("instr header lines:")
|
||||||
|
print("\n".join(map(str, header_lines)))
|
||||||
|
mnemonic_line = self.extract_text_line(
|
||||||
|
start_min_y=header_lines[-1].regular_min_y - 18.788,
|
||||||
|
min_x=column_min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
fonts=TextLineFonts(
|
||||||
|
regular=Font.INSTR_DESC,
|
||||||
|
),
|
||||||
|
skip_initial_spaces=True,
|
||||||
|
)
|
||||||
|
if mnemonic_line is None:
|
||||||
|
raise PageParseFailed("can't find instr mnemonic text line")
|
||||||
|
mnemonic_lines = self.extract_following_text_lines(
|
||||||
|
first_text_line=mnemonic_line,
|
||||||
|
min_x=mnemonic_line.chars[0].min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
)
|
||||||
|
print("instr mnemonic lines:")
|
||||||
|
print("\n".join(map(str, mnemonic_lines)))
|
||||||
|
instr_bit_fields = self.extract_instr_bit_fields(
|
||||||
|
min_x=column_min_x,
|
||||||
|
max_x=column_max_x,
|
||||||
|
last_mnemonic_line_min_y=mnemonic_lines[-1].regular_min_y,
|
||||||
|
)
|
||||||
|
print(instr_bit_fields)
|
||||||
# TODO: finish
|
# TODO: finish
|
||||||
|
|
||||||
def extract_instructions(self):
|
def extract_instrs(self):
|
||||||
unprocessed_header_chars = self.unprocessed_chars[Font.INSTR_HEADER]
|
unprocessed_header_chars = self.unprocessed_chars[Font.INSTR_HEADER]
|
||||||
while len(unprocessed_header_chars) != 0:
|
while len(unprocessed_header_chars) != 0:
|
||||||
self.extract_instruction(next(iter(unprocessed_header_chars)))
|
self.extract_instr(next(iter(unprocessed_header_chars)))
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
Parser().parse_pdf(sys.argv[1], page_numbers=range(76, 78))
|
Parser().parse_pdf(sys.argv[1], page_numbers=range(76, 78))
|
Loading…
Reference in a new issue