diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index 8810ae7..c42118e 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections import defaultdict from dataclasses import dataclass, field +import dataclasses from functools import cached_property import sys from typing import ClassVar, Iterable, Iterator, NewType, TypeAlias, TypeVar, assert_never, overload @@ -108,6 +109,11 @@ class Parser: COLUMN_SPLIT_X = 300.0 +INSTR_BIT_FIELDS_TOP_PAD_HEIGHT = 14.694 +INSTR_BIT_FIELDS_BOX_HEIGHT = 22.317 +INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT = 24.657 +INSTR_BIT_FIELDS_PADDED_HEIGHT = (INSTR_BIT_FIELDS_TOP_PAD_HEIGHT + + INSTR_BIT_FIELDS_BOX_HEIGHT + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT) @dataclass() class ParsedTextLine: @@ -116,6 +122,22 @@ class ParsedTextLine: fonts: TextLineFonts chars: list[Char] + def __repr__(self) -> str: + fields = [] + for i in dataclasses.fields(self): + if i.name == "element": + fields.append( + i.name + "=" + ElementTree.tostring(self.element, encoding="unicode")) + continue + # use fields as a handy object we know isn't a field + v = getattr(self, i.name, fields) + if v is fields: + fields.append(i.name + "=") + continue + fields.append(i.name + "=" + repr(v)) + sep = ",\n " + return f"{__class__.__name__}({sep.join(fields)})" + def __str__(self) -> str: return ElementTree.tostring(self.element, encoding="unicode") @@ -234,6 +256,30 @@ class ElementBodyBuilder: self.set_tag_stack(()) self.__flush_text_buffer() +@dataclass(unsafe_hash=True, frozen=True) +class InstrBitField: + box_min_x: float + box_max_x: float + name: None | ParsedTextLine + start_bit: None | ParsedTextLine + + def __str__(self) -> str: + return f"" + +@dataclass(unsafe_hash=True, frozen=True) +class InstrBitFields: + box_min_x: float + box_min_y: float + box_max_x: float + box_max_y: float + fields: tuple[InstrBitField, ...] + + def __str__(self): + sep = ",\n " + return (f"") + @dataclass() class PageParser: parser: Parser @@ -278,7 +324,7 @@ class PageParser: text += char.text print(repr(text)) assert font.known_name is not None, f"unknown font {font}\nlast char: {char}" - self.extract_instructions() + self.extract_instrs() def extract_text_line( self, *, @@ -287,6 +333,7 @@ class PageParser: min_x: float, max_x: float, fonts: TextLineFonts, + skip_initial_spaces=False, ) -> None | ParsedTextLine: chars: list[Char] = [] if start_char is not None: @@ -319,6 +366,7 @@ class PageParser: for char in chars: kind = fonts.get_kind(char.font) if kind is None: + print(f"font kind is None:\nfonts={fonts}\nchar={char}") return None if last_kind is None: space_kind = kind @@ -332,9 +380,10 @@ class PageParser: space_count = round(space_count_f) if space_count_f > 0.25 and abs(space_count - space_count_f) > 0.15: print(f"spaces: space_count_f={space_count_f} space_width={space_width}") - if space_count > 0: + if space_count > 0 and not skip_initial_spaces: body_builder.set_tag_stack(space_kind.text_line_tags) body_builder.write_text(" " * space_count) + skip_initial_spaces = False body_builder.set_tag_stack(kind.text_line_tags) body_builder.write_text(char.text) last_max_x = char.max_x @@ -359,34 +408,140 @@ class PageParser: ) return retval - def extract_instruction(self, header_start_char: Char): + def extract_instr_bit_fields( + self, + min_x: float, + max_x: float, + last_mnemonic_line_min_y: float, + ) -> None | InstrBitFields: + h_lines: list[LTLine] = [] + v_lines: list[LTLine] = [] + for x, y, line in self.qt.range( + min_x=min_x - 5, + max_x=max_x + 5, + min_y=last_mnemonic_line_min_y + - INSTR_BIT_FIELDS_PADDED_HEIGHT + + INSTR_BIT_FIELDS_BOTTOM_PAD_HEIGHT / 2, + max_y=last_mnemonic_line_min_y + - INSTR_BIT_FIELDS_TOP_PAD_HEIGHT / 2, + ): + if not isinstance(line, LTLine): + continue + if line.width > line.height: + h_lines.append(line) + else: + v_lines.append(line) + h_lines.sort(key=lambda line: line.y0, reverse=False) + v_lines.sort(key=lambda line: line.x0) + for i in reversed(range(len(v_lines) - 1)): + if abs(v_lines[i].x0 - v_lines[i + 1].x0) < 0.5: + del v_lines[i + 1] # remove duplicates + if len(h_lines) == 0 and len(v_lines) == 0: + return None + if len(h_lines) != 2: + raise PageParseFailed( + f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}") + if len(v_lines) < 2: + raise PageParseFailed( + f"instruction bit fields box has too few vertical lines:\n{h_lines}") + bottom_line, top_line = h_lines + box_min_x = v_lines[0].x0 + box_max_x = v_lines[-1].x0 + box_min_y = bottom_line.y0 + box_max_y = top_line.y1 + box_mid_y = (box_min_y + box_max_y) * 0.5 + print(f"bottom_line={bottom_line}") + print(f"top_line={top_line}") + print(v_lines) + fields: list[InstrBitField] = [] + for i in range(len(v_lines) - 1): + left_line = v_lines[i] + right_line = v_lines[i + 1] + field_box_min_x = left_line.x1 + field_box_max_x = right_line.x0 + fields.append(InstrBitField( + box_min_x=field_box_min_x, + box_max_x=field_box_max_x, + name=self.extract_text_line( + start_min_y=box_mid_y + 3, + min_x=field_box_min_x, + max_x=field_box_max_x, + fonts=TextLineFonts( + regular=Font.INSTR_DESC, + ), + skip_initial_spaces=True, + ), + start_bit=self.extract_text_line( + start_min_y=box_min_y + 3, + min_x=field_box_min_x, + max_x=field_box_max_x, + fonts=TextLineFonts( + regular=Font.INSTR_FIELD_BIT_NUMS, + ), + skip_initial_spaces=True, + ), + )) + return InstrBitFields( + box_min_x=box_min_x, + box_min_y=box_min_y, + box_max_x=box_max_x, + box_max_y=box_max_y, + fields=tuple(fields), + ) + + def extract_instr(self, header_start_char: Char): assert header_start_char.font == Font.INSTR_HEADER - if header_start_char.min_x < COLUMN_SPLIT_X: + column_min_x = header_start_char.min_x + if column_min_x < COLUMN_SPLIT_X: column_max_x = COLUMN_SPLIT_X else: column_max_x = 1000 - header_text_line = self.extract_text_line( + header_line = self.extract_text_line( start_char=header_start_char, start_min_y=header_start_char.min_y, - min_x=header_start_char.min_x, + min_x=column_min_x, max_x=column_max_x, fonts=TextLineFonts(regular=Font.INSTR_HEADER), ) - if header_text_line is None: + if header_line is None: raise PageParseFailed("can't find header text line") - print(header_text_line) header_lines = self.extract_following_text_lines( - first_text_line=header_text_line, - min_x=header_start_char.min_x, + first_text_line=header_line, + min_x=column_min_x, max_x=column_max_x, ) - print(*header_lines) + print("instr header lines:") + print("\n".join(map(str, header_lines))) + mnemonic_line = self.extract_text_line( + start_min_y=header_lines[-1].regular_min_y - 18.788, + min_x=column_min_x, + max_x=column_max_x, + fonts=TextLineFonts( + regular=Font.INSTR_DESC, + ), + skip_initial_spaces=True, + ) + if mnemonic_line is None: + raise PageParseFailed("can't find instr mnemonic text line") + mnemonic_lines = self.extract_following_text_lines( + first_text_line=mnemonic_line, + min_x=mnemonic_line.chars[0].min_x, + max_x=column_max_x, + ) + print("instr mnemonic lines:") + print("\n".join(map(str, mnemonic_lines))) + instr_bit_fields = self.extract_instr_bit_fields( + min_x=column_min_x, + max_x=column_max_x, + last_mnemonic_line_min_y=mnemonic_lines[-1].regular_min_y, + ) + print(instr_bit_fields) # TODO: finish - def extract_instructions(self): + def extract_instrs(self): unprocessed_header_chars = self.unprocessed_chars[Font.INSTR_HEADER] while len(unprocessed_header_chars) != 0: - self.extract_instruction(next(iter(unprocessed_header_chars))) + self.extract_instr(next(iter(unprocessed_header_chars))) def main(): Parser().parse_pdf(sys.argv[1], page_numbers=range(76, 78)) \ No newline at end of file