generate xml output

This commit is contained in:
Jacob Lifshay 2024-10-28 00:10:42 -07:00
parent 21b97c0064
commit 25f47227d8
Signed by: programmerjake
SSH key fingerprint: SHA256:B1iRVvUJkvd7upMIiMqn6OyxvD2SgJkAH3ZnUOj6z+c
3 changed files with 209 additions and 20 deletions

3
.gitignore vendored
View file

@ -2,4 +2,5 @@
/.vscode
*.egg-info
__pycache__
*.log
*.log
/powerisa-instructions.xml

View file

@ -10,6 +10,8 @@ from typing import ClassVar, TypeVar, assert_never
from xml.etree import ElementTree
import enum
import traceback
from copy import deepcopy
from pathlib import Path
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
@ -256,6 +258,39 @@ class ParsedTextLine:
def __str__(self) -> str:
return "\n" * self.preceding_blank_lines + ElementTree.tostring(self.element, encoding="unicode")
def write_xml(self, parent: ElementTree.Element, trailing_nl: bool):
    """Append this line's content to `parent`.

    Emits one `<br>` (with a newline tail) per preceding blank line, then
    the leading text of `self.element` followed by deep copies of its
    children, and finally another `<br>` when `trailing_nl` is true.
    """
    for _blank in range(self.preceding_blank_lines):
        blank_break = ElementTree.SubElement(parent, "br")
        blank_break.tail = "\n"
    leading_text = self.element.text
    if leading_text is not None:
        if len(parent) != 0:
            # Text that follows a child element lives in that child's tail.
            last_child = parent[-1]
            last_child.tail = (last_child.tail or "") + leading_text
        else:
            parent.text = (parent.text or "") + leading_text
    for child in self.element:
        # Copy so that the parsed line's own element tree is left untouched.
        parent.append(deepcopy(child))
    if trailing_nl:
        closing_break = ElementTree.SubElement(parent, "br")
        closing_break.tail = "\n"
@staticmethod
def write_xml_lines(
lines: Iterable[ParsedTextLine],
parent: ElementTree.Element,
trailing_nl: bool,
preceding_nl: bool=False,
):
if preceding_nl:
ElementTree.SubElement(parent, "br").tail = "\n"
first = True
for line in lines:
if first:
first = False
else:
ElementTree.SubElement(parent, "br").tail = "\n"
line.write_xml(parent, trailing_nl=False)
if trailing_nl:
ElementTree.SubElement(parent, "br").tail = "\n"
_T = TypeVar("_T")
class BaselinePos(enum.Enum):
@ -535,6 +570,17 @@ class InsnBitField:
def __str__(self) -> str:
return f"<InsnBitField: x={self.box_min_x}..{self.box_max_x} name={self.name} bit_number={self.bit_number}>"
def write_xml(self, parent: ElementTree.Element):
    """Append a `<field>` element to `parent`.

    The element contains a `<name>` child and a `<bit-number>` child,
    each filled in from the corresponding text line.
    """
    field_elm = ElementTree.SubElement(parent, "field")
    field_elm.text = "\n"
    field_elm.tail = "\n"
    children = (("name", self.name), ("bit-number", self.bit_number))
    for tag, text_line in children:
        child = ElementTree.SubElement(field_elm, tag)
        child.tail = "\n"
        text_line.write_xml(child, trailing_nl=False)
@dataclass(unsafe_hash=True, frozen=True)
class InsnBitFieldsPrefix:
box_min_x: float
@ -555,6 +601,18 @@ class InsnBitFieldsPrefix:
f" ]\n"
f" suffix_text={self.suffix_text}>")
def write_xml(self, parent: ElementTree.Element):
    """Append a `<prefix>` element to `parent`.

    Children are written in this order: `<prefix-text>`, the `<fields>`
    list produced by `InsnBitFields.write_xml_fields`, `<suffix-text>`.
    """
    prefix_elm = ElementTree.SubElement(parent, "prefix")
    prefix_elm.text = "\n"
    prefix_elm.tail = "\n"

    def add_text_child(tag, text_line):
        child = ElementTree.SubElement(prefix_elm, tag)
        child.tail = "\n"
        text_line.write_xml(child, trailing_nl=False)

    add_text_child("prefix-text", self.prefix_text)
    InsnBitFields.write_xml_fields(self.fields, prefix_elm)
    add_text_child("suffix-text", self.suffix_text)
@dataclass(unsafe_hash=True, frozen=True)
class InsnBitFields:
prefix: None | InsnBitFieldsPrefix
@ -573,6 +631,23 @@ class InsnBitFields:
f"({self.box_max_x},{self.box_max_y}) [\n"
f" {sep.join(map(str, self.fields))},\n]>")
@staticmethod
def write_xml_fields(fields: Iterable[InsnBitField], parent: ElementTree.Element):
fields_elm = ElementTree.SubElement(parent, "fields")
fields_elm.text = "\n"
fields_elm.tail = "\n"
for field in fields:
field.write_xml(fields_elm)
def write_xml(self, parent: ElementTree.Element):
    """Append a `<bit-fields>` element to `parent`.

    The optional prefix is written first (when present), followed by the
    `<fields>` list.
    """
    container = ElementTree.SubElement(parent, "bit-fields")
    container.text = "\n"
    container.tail = "\n"
    prefix = self.prefix
    if prefix is not None:
        prefix.write_xml(container)
    InsnBitFields.write_xml_fields(self.fields, container)
@dataclass(unsafe_hash=True, frozen=True)
class InsnSpRegsAlteredEntry:
reg: ParsedTextLine
@ -600,6 +675,20 @@ class InsnSpRegsAlteredEntry:
f"{indent} conds={conds},\n"
f"{indent})")
def write_xml(self, parent: ElementTree.Element):
    """Append an `<entry>` element to `parent`.

    The entry holds a single-line `<register>` child followed by the
    multi-line `<fields>` and `<conditions>` children.
    """
    entry_elm = ElementTree.SubElement(parent, "entry")
    entry_elm.text = "\n"
    entry_elm.tail = "\n"

    register_elm = ElementTree.SubElement(entry_elm, "register")
    register_elm.tail = "\n"
    self.reg.write_xml(register_elm, trailing_nl=False)

    for tag, text_lines in (("fields", self.fields), ("conditions", self.conds)):
        child = ElementTree.SubElement(entry_elm, tag)
        child.tail = "\n"
        ParsedTextLine.write_xml_lines(text_lines, child, trailing_nl=False)
@dataclass(unsafe_hash=True, frozen=True)
class InsnSpRegsAltered:
sp_regs_altered_text: ParsedTextLine
@ -631,6 +720,28 @@ class InsnSpRegsAltered:
lines.append(f")")
return "\n".join(lines)
def write_xml(self, parent: ElementTree.Element):
    """Append a `<special-registers-altered>` element to `parent`.

    A `<title>` child is always written.  `<special-text>`,
    `<table-header-register>` and `<table-header-fields>` are written only
    when the corresponding attribute is not None.  Every entry is then
    written after them.
    """
    root_elm = ElementTree.SubElement(parent, "special-registers-altered")
    root_elm.text = "\n"
    root_elm.tail = "\n"

    def add_text_child(tag, text_line):
        child = ElementTree.SubElement(root_elm, tag)
        child.tail = "\n"
        text_line.write_xml(child, trailing_nl=False)

    add_text_child("title", self.sp_regs_altered_text)
    optional_children = (
        ("special-text", self.special_text),
        ("table-header-register", self.table_header_reg),
        ("table-header-fields", self.table_header_fields),
    )
    for tag, text_line in optional_children:
        if text_line is None:
            continue
        add_text_child(tag, text_line)
    for entry in self.entries:
        entry.write_xml(root_elm)
class _InsnParseSection(enum.Enum):
CODE = "code"
HEADER = "header"
@ -687,8 +798,9 @@ class Page:
)
if text_section is None:
if PAGE_BODY_MIN_Y <= element.y0 <= PAGE_BODY_MAX_Y:
raise AssertionError(
f"char not in text section: {element}\npage_num={page_num}")
if page_num != 1072: # page 1072 has characters in the margins
raise AssertionError(
f"char not in text section: {element}\npage_num={page_num}")
continue
char = Char(
text=element.get_text(),
@ -1003,10 +1115,57 @@ class TextSection:
return i
return None
@dataclass(frozen=True)
class InsnHeader:
    """One instruction header: title lines, mnemonic lines and bit fields."""

    header_lines: tuple[ParsedTextLine, ...]
    mnemonic_lines: tuple[ParsedTextLine, ...]
    bit_fields: InsnBitFields

    @property
    def min_y(self) -> float:
        """The `box_min_y` of this header's bit fields."""
        return self.bit_fields.box_min_y

    def write_xml(self, parent: ElementTree.Element):
        """Append a `<header>` element to `parent`.

        The header contains `<title>` and `<mnemonics>` children (each a
        `<br>`-separated run of lines) followed by the bit fields.
        """
        header_elm = ElementTree.SubElement(parent, "header")
        header_elm.text = "\n"
        header_elm.tail = "\n"
        sections = (
            ("title", self.header_lines),
            ("mnemonics", self.mnemonic_lines),
        )
        for tag, text_lines in sections:
            section = ElementTree.SubElement(header_elm, tag)
            section.tail = "\n"
            ParsedTextLine.write_xml_lines(text_lines, section, trailing_nl=False)
        self.bit_fields.write_xml(header_elm)
@dataclass(frozen=True)
class Insn:
    """A parsed instruction: headers, code, description and altered registers."""

    headers: tuple[InsnHeader, ...]
    code_lines: tuple[ParsedTextLine, ...]
    desc_lines: tuple[ParsedTextLine, ...]
    sp_regs_altered: None | InsnSpRegsAltered

    def write_xml(self, parent: ElementTree.Element):
        """Append an `<instruction>` element to `parent`.

        All headers are written first.  `<code>` and `<description>` are
        written only when they have at least one line, and the special
        registers altered section only when it is not None.
        """
        insn_elm = ElementTree.SubElement(parent, "instruction")
        insn_elm.text = "\n"
        insn_elm.tail = "\n"
        for insn_header in self.headers:
            insn_header.write_xml(insn_elm)
        sections = (
            ("code", self.code_lines),
            ("description", self.desc_lines),
        )
        for tag, text_lines in sections:
            if len(text_lines) == 0:
                continue
            section = ElementTree.SubElement(insn_elm, tag)
            section.tail = "\n"
            ParsedTextLine.write_xml_lines(text_lines, section, trailing_nl=False)
        altered = self.sp_regs_altered
        if altered is not None:
            altered.write_xml(insn_elm)
@dataclass()
class Parser:
pages: Pages = field(default_factory=Pages)
text_section: TextSection = TextSection.first()
insns: list[Insn] = field(default_factory=list)
@property
def page(self) -> Page:
@ -1017,7 +1176,7 @@ class Parser:
return self.pages[self.text_section.page_num].unprocessed_chars[self.text_section]
@staticmethod
def __pages_gen(file: str, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
if page_numbers is not None:
page_numbers = sorted(i - 1 for i in page_numbers)
for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
@ -1028,7 +1187,7 @@ class Parser:
print(f"page {page_num}")
yield Page.from_lt_page(page_num=page_num, page=page)
def parse_pdf(self, file: str, page_numbers: Iterable[int] | None = None):
def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None):
self.pages = Pages(pages_gen=Parser.__pages_gen(
file=file, page_numbers=page_numbers))
self.text_section = TextSection.first()
@ -1059,7 +1218,7 @@ class Parser:
try:
with self.note_text_section():
self.extract_insns()
except InsnParseError as e:
except (InsnParseError, PageParseError) as e:
print("".join(traceback.format_exception_only(e)), flush=True)
traceback.print_exc()
@ -1200,10 +1359,11 @@ class Parser:
self.unprocessed_chars[char.font].remove(char)
if allowed_start_min_y_error is None:
allowed_start_min_y_error = 0.01
assert abs(start_min_y - retval.regular_min_y) < allowed_start_min_y_error, (
f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n"
f"start_min_y error: {start_min_y - retval.regular_min_y}\n"
f"allowed_start_min_y_error={allowed_start_min_y_error}")
if abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error:
raise PageParseError(
f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n"
f"start_min_y error: {start_min_y - retval.regular_min_y}\n"
f"allowed_start_min_y_error={allowed_start_min_y_error}")
return retval
def extract_following_text_lines(
@ -1403,7 +1563,7 @@ class Parser:
self,
start_min_y: float,
header_start_char: None | Char = None,
) -> None | tuple[list[ParsedTextLine], list[ParsedTextLine], InsnBitFields]:
) -> None | InsnHeader:
assert header_start_char is None or \
header_start_char.font == Font.INSN_HEADER
header_line = self.extract_text_line(
@ -1458,7 +1618,11 @@ class Parser:
print(insn_bit_fields)
if insn_bit_fields is None:
raise InsnParseError("can't find insn bit fields")
return header_lines, mnemonic_lines, insn_bit_fields
return InsnHeader(
header_lines=tuple(header_lines),
mnemonic_lines=tuple(mnemonic_lines),
bit_fields=insn_bit_fields,
)
def extract_insn_sp_regs_altered(
self,
@ -1474,12 +1638,14 @@ class Parser:
max_y=sp_regs_altered_text.regular_min_y - 5,
allow_processed=False,
)
assert table_header_reg_char is not None, \
"can't find special registers altered table's register-column's header"
if table_header_reg_char is None:
raise InsnParseError(
"can't find special registers altered table's register-column's header")
KNOWN_SPECIAL_TEXTS = (
"None",
"Dependent on the system service",
"See above.",
"See Table 5.1",
)
match table_header_reg_char.text:
case "R":
@ -1611,7 +1777,7 @@ class Parser:
final_regular_min_y=regular_min_y,
)
def extract_insn(self, header_start_char: Char):
def extract_insn(self, header_start_char: Char) -> Insn:
assert header_start_char.font == Font.INSN_HEADER
print(header_start_char)
header = self.extract_insn_header_mnemonics_and_bit_fields(
@ -1620,7 +1786,7 @@ class Parser:
)
if header is None:
raise PageParseError("can't find header text line")
next_start_min_y = header[2].box_min_y - 5
next_start_min_y = header.min_y - 5
headers = [header]
code_lines: list[ParsedTextLine] = []
desc_lines: list[ParsedTextLine] = []
@ -1687,7 +1853,7 @@ class Parser:
if header is None:
raise InsnParseError("can't find header text line")
headers.append(header)
next_start_min_y = header[2].box_min_y - 5
next_start_min_y = header.min_y - 5
case _InsnParseSection.DESC:
desc_line = self.extract_text_line(
start_char=next_char,
@ -1706,7 +1872,7 @@ class Parser:
first_text_line=desc_line,
min_x=desc_line.chars[0].min_x,
max_x=self.text_section.max_x,
allowed_start_min_y_error=3,
allowed_start_min_y_error=3.5,
)
print("more insn desc lines:")
print("\n".join(map(str, more_desc_lines)))
@ -1729,6 +1895,12 @@ class Parser:
print("sp_regs_altered:")
print(sp_regs_altered)
# TODO: finish
return Insn(
headers=tuple(headers),
code_lines=tuple(code_lines),
desc_lines=tuple(desc_lines),
sp_regs_altered=sp_regs_altered,
)
def extract_insns(self):
while True:
@ -1737,7 +1909,7 @@ class Parser:
self.unprocessed_chars[Font.INSN_HEADER]))
except StopIteration:
break
self.extract_insn(header_start_char=header_start_char)
self.insns.append(self.extract_insn(header_start_char=header_start_char))
def main():
if 2 < len(sys.argv):
@ -1747,4 +1919,19 @@ def main():
page_numbers = tuple(int(i) for i in sys.argv[2].split(","))
else:
page_numbers = None
Parser().parse_pdf(sys.argv[1], page_numbers=page_numbers)
parser = Parser()
file_name = Path(sys.argv[1])
parser.parse_pdf(file_name, page_numbers=page_numbers)
insns = ElementTree.Element("instructions", attrib={"is-subset": str(page_numbers is not None)})
insns.text = "\n"
insns.tail = "\n"
comment = ElementTree.Comment(f" Automatically generated from {file_name.name} ")
comment.tail = "\n"
insns.append(comment)
for insn in parser.insns:
insn.write_xml(insns)
ElementTree.ElementTree(insns).write(
"powerisa-instructions.xml",
encoding="utf-8",
xml_declaration=True,
)

View file

@ -8,6 +8,7 @@ version = "0.0.0"
dependencies = [
"pdfminer.six == 20240706"
]
requires-python = ">= 3.11"
[project.scripts]
parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main"