generate xml output
This commit is contained in:
parent
21b97c0064
commit
25f47227d8
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -2,4 +2,5 @@
|
|||
/.vscode
|
||||
*.egg-info
|
||||
__pycache__
|
||||
*.log
|
||||
*.log
|
||||
/powerisa-instructions.xml
|
|
@ -10,6 +10,8 @@ from typing import ClassVar, TypeVar, assert_never
|
|||
from xml.etree import ElementTree
|
||||
import enum
|
||||
import traceback
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTChar, LTLine, LTPage, LTRect, LTTextBox
|
||||
|
@ -256,6 +258,39 @@ class ParsedTextLine:
|
|||
def __str__(self) -> str:
|
||||
return "\n" * self.preceding_blank_lines + ElementTree.tostring(self.element, encoding="unicode")
|
||||
|
||||
def write_xml(self, parent: ElementTree.Element, trailing_nl: bool):
    """Append this line's content to `parent` as inline XML.

    Output order:
      1. one `<br>` element (newline tail) per preceding blank line;
      2. this line's leading text, merged into `parent.text` when `parent`
         has no children yet, otherwise into the tail of its last child;
      3. a deep copy of every child of `self.element`;
      4. a final `<br>` (newline tail) when `trailing_nl` is true.
    """
    for _ in range(self.preceding_blank_lines):
        blank = ElementTree.SubElement(parent, "br")
        blank.tail = "\n"

    leading_text = self.element.text
    if leading_text is not None:
        if len(parent) > 0:
            # Text that follows an existing child lives in that child's tail.
            last_child = parent[-1]
            last_child.tail = (last_child.tail or "") + leading_text
        else:
            parent.text = (parent.text or "") + leading_text

    # Copy rather than move so self.element keeps its own children.
    for child in self.element:
        parent.append(deepcopy(child))

    if trailing_nl:
        closing_break = ElementTree.SubElement(parent, "br")
        closing_break.tail = "\n"
|
||||
|
||||
@staticmethod
|
||||
def write_xml_lines(
|
||||
lines: Iterable[ParsedTextLine],
|
||||
parent: ElementTree.Element,
|
||||
trailing_nl: bool,
|
||||
preceding_nl: bool=False,
|
||||
):
|
||||
if preceding_nl:
|
||||
ElementTree.SubElement(parent, "br").tail = "\n"
|
||||
first = True
|
||||
for line in lines:
|
||||
if first:
|
||||
first = False
|
||||
else:
|
||||
ElementTree.SubElement(parent, "br").tail = "\n"
|
||||
line.write_xml(parent, trailing_nl=False)
|
||||
if trailing_nl:
|
||||
ElementTree.SubElement(parent, "br").tail = "\n"
|
||||
|
||||
|
||||
_T = TypeVar("_T")
|
||||
|
||||
class BaselinePos(enum.Enum):
|
||||
|
@ -535,6 +570,17 @@ class InsnBitField:
|
|||
def __str__(self) -> str:
|
||||
return f"<InsnBitField: x={self.box_min_x}..{self.box_max_x} name={self.name} bit_number={self.bit_number}>"
|
||||
|
||||
def write_xml(self, parent: ElementTree.Element):
    """Append a `<field>` element describing this bit field to `parent`.

    The `<field>` gets a `<name>` child and a `<bit-number>` child, each
    filled by the corresponding attribute's `write_xml`. Newline text/tails
    are set so the serialized output puts each element on its own line.
    """
    field_elm = ElementTree.SubElement(parent, "field")
    field_elm.text = field_elm.tail = "\n"
    for tag, line in (("name", self.name), ("bit-number", self.bit_number)):
        child = ElementTree.SubElement(field_elm, tag)
        child.tail = "\n"
        line.write_xml(child, trailing_nl=False)
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
|
||||
class InsnBitFieldsPrefix:
|
||||
box_min_x: float
|
||||
|
@ -555,6 +601,18 @@ class InsnBitFieldsPrefix:
|
|||
f" ]\n"
|
||||
f" suffix_text={self.suffix_text}>")
|
||||
|
||||
def write_xml(self, parent: ElementTree.Element):
    """Append a `<prefix>` element for this prefix to `parent`.

    Children, in order: `<prefix-text>`, the `<fields>` element produced by
    `InsnBitFields.write_xml_fields`, then `<suffix-text>`.
    """
    prefix_elm = ElementTree.SubElement(parent, "prefix")
    prefix_elm.text = prefix_elm.tail = "\n"

    def add_text_child(tag, line):
        # One newline-terminated child holding a single parsed text line.
        child = ElementTree.SubElement(prefix_elm, tag)
        child.tail = "\n"
        line.write_xml(child, trailing_nl=False)

    add_text_child("prefix-text", self.prefix_text)
    InsnBitFields.write_xml_fields(self.fields, prefix_elm)
    add_text_child("suffix-text", self.suffix_text)
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
|
||||
class InsnBitFields:
|
||||
prefix: None | InsnBitFieldsPrefix
|
||||
|
@ -573,6 +631,23 @@ class InsnBitFields:
|
|||
f"({self.box_max_x},{self.box_max_y}) [\n"
|
||||
f" {sep.join(map(str, self.fields))},\n]>")
|
||||
|
||||
@staticmethod
|
||||
def write_xml_fields(fields: Iterable[InsnBitField], parent: ElementTree.Element):
|
||||
fields_elm = ElementTree.SubElement(parent, "fields")
|
||||
fields_elm.text = "\n"
|
||||
fields_elm.tail = "\n"
|
||||
for field in fields:
|
||||
field.write_xml(fields_elm)
|
||||
|
||||
def write_xml(self, parent: ElementTree.Element):
    """Append a `<bit-fields>` element to `parent`.

    The optional prefix is written first (when present), followed by the
    `<fields>` element for this object's own fields.
    """
    container = ElementTree.SubElement(parent, "bit-fields")
    container.text = container.tail = "\n"
    prefix = self.prefix
    if prefix is not None:
        prefix.write_xml(container)
    InsnBitFields.write_xml_fields(self.fields, container)
|
||||
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
|
||||
class InsnSpRegsAlteredEntry:
|
||||
reg: ParsedTextLine
|
||||
|
@ -600,6 +675,20 @@ class InsnSpRegsAlteredEntry:
|
|||
f"{indent} conds={conds},\n"
|
||||
f"{indent})")
|
||||
|
||||
def write_xml(self, parent: ElementTree.Element):
    """Append an `<entry>` element for this table row to `parent`.

    Children, in order: `<register>` (from `self.reg`), `<fields>` (from
    `self.fields`) and `<conditions>` (from `self.conds`); the latter two
    are written as multi-line text via `ParsedTextLine.write_xml_lines`.
    """
    entry_elm = ElementTree.SubElement(parent, "entry")
    entry_elm.text = entry_elm.tail = "\n"

    register_elm = ElementTree.SubElement(entry_elm, "register")
    register_elm.tail = "\n"
    self.reg.write_xml(register_elm, trailing_nl=False)

    for tag, lines in (("fields", self.fields), ("conditions", self.conds)):
        group = ElementTree.SubElement(entry_elm, tag)
        group.tail = "\n"
        ParsedTextLine.write_xml_lines(lines, group, trailing_nl=False)
|
||||
|
||||
@dataclass(unsafe_hash=True, frozen=True)
|
||||
class InsnSpRegsAltered:
|
||||
sp_regs_altered_text: ParsedTextLine
|
||||
|
@ -631,6 +720,28 @@ class InsnSpRegsAltered:
|
|||
lines.append(f")")
|
||||
return "\n".join(lines)
|
||||
|
||||
def write_xml(self, parent: ElementTree.Element):
    """Append a `<special-registers-altered>` element to `parent`.

    Always writes a `<title>` child from `self.sp_regs_altered_text`. The
    `<special-text>`, `<table-header-register>` and `<table-header-fields>`
    children are written only when the matching attribute is not None.
    Every entry in `self.entries` then writes itself into the element.
    """
    root = ElementTree.SubElement(parent, "special-registers-altered")
    root.text = root.tail = "\n"

    def add_line(tag, line):
        # One newline-terminated child holding a single parsed text line.
        child = ElementTree.SubElement(root, tag)
        child.tail = "\n"
        line.write_xml(child, trailing_nl=False)

    add_line("title", self.sp_regs_altered_text)

    optional_sections = (
        ("special-text", self.special_text),
        ("table-header-register", self.table_header_reg),
        ("table-header-fields", self.table_header_fields),
    )
    for tag, line in optional_sections:
        if line is not None:
            add_line(tag, line)

    for entry in self.entries:
        entry.write_xml(root)
|
||||
|
||||
class _InsnParseSection(enum.Enum):
|
||||
CODE = "code"
|
||||
HEADER = "header"
|
||||
|
@ -687,8 +798,9 @@ class Page:
|
|||
)
|
||||
if text_section is None:
|
||||
if PAGE_BODY_MIN_Y <= element.y0 <= PAGE_BODY_MAX_Y:
|
||||
raise AssertionError(
|
||||
f"char not in text section: {element}\npage_num={page_num}")
|
||||
if page_num != 1072: # page 1072 has characters in the margins
|
||||
raise AssertionError(
|
||||
f"char not in text section: {element}\npage_num={page_num}")
|
||||
continue
|
||||
char = Char(
|
||||
text=element.get_text(),
|
||||
|
@ -1003,10 +1115,57 @@ class TextSection:
|
|||
return i
|
||||
return None
|
||||
|
||||
@dataclass(frozen=True)
class InsnHeader:
    """One instruction header: title lines, mnemonic lines and bit fields."""

    header_lines: tuple[ParsedTextLine, ...]
    mnemonic_lines: tuple[ParsedTextLine, ...]
    bit_fields: InsnBitFields

    @property
    def min_y(self) -> float:
        """The `box_min_y` of this header's bit fields."""
        return self.bit_fields.box_min_y

    def write_xml(self, parent: ElementTree.Element):
        """Append a `<header>` element to `parent`.

        Children, in order: `<title>` (from `header_lines`), `<mnemonics>`
        (from `mnemonic_lines`), then whatever `bit_fields.write_xml` adds.
        """
        header_elm = ElementTree.SubElement(parent, "header")
        header_elm.text = header_elm.tail = "\n"
        sections = (
            ("title", self.header_lines),
            ("mnemonics", self.mnemonic_lines),
        )
        for tag, lines in sections:
            section_elm = ElementTree.SubElement(header_elm, tag)
            section_elm.tail = "\n"
            ParsedTextLine.write_xml_lines(lines, section_elm, trailing_nl=False)
        self.bit_fields.write_xml(header_elm)
|
||||
|
||||
@dataclass(frozen=True)
class Insn:
    """A parsed instruction: headers, code, description and special registers altered."""

    headers: tuple[InsnHeader, ...]
    code_lines: tuple[ParsedTextLine, ...]
    desc_lines: tuple[ParsedTextLine, ...]
    sp_regs_altered: None | InsnSpRegsAltered

    def write_xml(self, parent: ElementTree.Element):
        """Append an `<instruction>` element to `parent`.

        Every header writes itself first. `<code>` and `<description>` are
        emitted only when their line tuples are non-empty, and the
        special-registers-altered section only when it is not None.
        """
        insn_elm = ElementTree.SubElement(parent, "instruction")
        insn_elm.text = insn_elm.tail = "\n"

        for header in self.headers:
            header.write_xml(insn_elm)

        text_sections = (
            ("code", self.code_lines),
            ("description", self.desc_lines),
        )
        for tag, lines in text_sections:
            if not lines:
                continue
            section_elm = ElementTree.SubElement(insn_elm, tag)
            section_elm.tail = "\n"
            ParsedTextLine.write_xml_lines(lines, section_elm, trailing_nl=False)

        altered = self.sp_regs_altered
        if altered is not None:
            altered.write_xml(insn_elm)
|
||||
|
||||
@dataclass()
|
||||
class Parser:
|
||||
pages: Pages = field(default_factory=Pages)
|
||||
text_section: TextSection = TextSection.first()
|
||||
insns: list[Insn] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def page(self) -> Page:
|
||||
|
@ -1017,7 +1176,7 @@ class Parser:
|
|||
return self.pages[self.text_section.page_num].unprocessed_chars[self.text_section]
|
||||
|
||||
@staticmethod
|
||||
def __pages_gen(file: str, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
|
||||
def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
|
||||
if page_numbers is not None:
|
||||
page_numbers = sorted(i - 1 for i in page_numbers)
|
||||
for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
|
||||
|
@ -1028,7 +1187,7 @@ class Parser:
|
|||
print(f"page {page_num}")
|
||||
yield Page.from_lt_page(page_num=page_num, page=page)
|
||||
|
||||
def parse_pdf(self, file: str, page_numbers: Iterable[int] | None = None):
|
||||
def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None):
|
||||
self.pages = Pages(pages_gen=Parser.__pages_gen(
|
||||
file=file, page_numbers=page_numbers))
|
||||
self.text_section = TextSection.first()
|
||||
|
@ -1059,7 +1218,7 @@ class Parser:
|
|||
try:
|
||||
with self.note_text_section():
|
||||
self.extract_insns()
|
||||
except InsnParseError as e:
|
||||
except (InsnParseError, PageParseError) as e:
|
||||
print("".join(traceback.format_exception_only(e)), flush=True)
|
||||
traceback.print_exc()
|
||||
|
||||
|
@ -1200,10 +1359,11 @@ class Parser:
|
|||
self.unprocessed_chars[char.font].remove(char)
|
||||
if allowed_start_min_y_error is None:
|
||||
allowed_start_min_y_error = 0.01
|
||||
assert abs(start_min_y - retval.regular_min_y) < allowed_start_min_y_error, (
|
||||
f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n"
|
||||
f"start_min_y error: {start_min_y - retval.regular_min_y}\n"
|
||||
f"allowed_start_min_y_error={allowed_start_min_y_error}")
|
||||
if abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error:
|
||||
raise PageParseError(
|
||||
f"start_min_y={start_min_y} regular_min_y={retval.regular_min_y}\n"
|
||||
f"start_min_y error: {start_min_y - retval.regular_min_y}\n"
|
||||
f"allowed_start_min_y_error={allowed_start_min_y_error}")
|
||||
return retval
|
||||
|
||||
def extract_following_text_lines(
|
||||
|
@ -1403,7 +1563,7 @@ class Parser:
|
|||
self,
|
||||
start_min_y: float,
|
||||
header_start_char: None | Char = None,
|
||||
) -> None | tuple[list[ParsedTextLine], list[ParsedTextLine], InsnBitFields]:
|
||||
) -> None | InsnHeader:
|
||||
assert header_start_char is None or \
|
||||
header_start_char.font == Font.INSN_HEADER
|
||||
header_line = self.extract_text_line(
|
||||
|
@ -1458,7 +1618,11 @@ class Parser:
|
|||
print(insn_bit_fields)
|
||||
if insn_bit_fields is None:
|
||||
raise InsnParseError("can't find insn bit fields")
|
||||
return header_lines, mnemonic_lines, insn_bit_fields
|
||||
return InsnHeader(
|
||||
header_lines=tuple(header_lines),
|
||||
mnemonic_lines=tuple(mnemonic_lines),
|
||||
bit_fields=insn_bit_fields,
|
||||
)
|
||||
|
||||
def extract_insn_sp_regs_altered(
|
||||
self,
|
||||
|
@ -1474,12 +1638,14 @@ class Parser:
|
|||
max_y=sp_regs_altered_text.regular_min_y - 5,
|
||||
allow_processed=False,
|
||||
)
|
||||
assert table_header_reg_char is not None, \
|
||||
"can't find special registers altered table's register-column's header"
|
||||
if table_header_reg_char is None:
|
||||
raise InsnParseError(
|
||||
"can't find special registers altered table's register-column's header")
|
||||
KNOWN_SPECIAL_TEXTS = (
|
||||
"None",
|
||||
"Dependent on the system service",
|
||||
"See above.",
|
||||
"See Table 5.1",
|
||||
)
|
||||
match table_header_reg_char.text:
|
||||
case "R":
|
||||
|
@ -1611,7 +1777,7 @@ class Parser:
|
|||
final_regular_min_y=regular_min_y,
|
||||
)
|
||||
|
||||
def extract_insn(self, header_start_char: Char):
|
||||
def extract_insn(self, header_start_char: Char) -> Insn:
|
||||
assert header_start_char.font == Font.INSN_HEADER
|
||||
print(header_start_char)
|
||||
header = self.extract_insn_header_mnemonics_and_bit_fields(
|
||||
|
@ -1620,7 +1786,7 @@ class Parser:
|
|||
)
|
||||
if header is None:
|
||||
raise PageParseError("can't find header text line")
|
||||
next_start_min_y = header[2].box_min_y - 5
|
||||
next_start_min_y = header.min_y - 5
|
||||
headers = [header]
|
||||
code_lines: list[ParsedTextLine] = []
|
||||
desc_lines: list[ParsedTextLine] = []
|
||||
|
@ -1687,7 +1853,7 @@ class Parser:
|
|||
if header is None:
|
||||
raise InsnParseError("can't find header text line")
|
||||
headers.append(header)
|
||||
next_start_min_y = header[2].box_min_y - 5
|
||||
next_start_min_y = header.min_y - 5
|
||||
case _InsnParseSection.DESC:
|
||||
desc_line = self.extract_text_line(
|
||||
start_char=next_char,
|
||||
|
@ -1706,7 +1872,7 @@ class Parser:
|
|||
first_text_line=desc_line,
|
||||
min_x=desc_line.chars[0].min_x,
|
||||
max_x=self.text_section.max_x,
|
||||
allowed_start_min_y_error=3,
|
||||
allowed_start_min_y_error=3.5,
|
||||
)
|
||||
print("more insn desc lines:")
|
||||
print("\n".join(map(str, more_desc_lines)))
|
||||
|
@ -1729,6 +1895,12 @@ class Parser:
|
|||
print("sp_regs_altered:")
|
||||
print(sp_regs_altered)
|
||||
# TODO: finish
|
||||
return Insn(
|
||||
headers=tuple(headers),
|
||||
code_lines=tuple(code_lines),
|
||||
desc_lines=tuple(desc_lines),
|
||||
sp_regs_altered=sp_regs_altered,
|
||||
)
|
||||
|
||||
def extract_insns(self):
|
||||
while True:
|
||||
|
@ -1737,7 +1909,7 @@ class Parser:
|
|||
self.unprocessed_chars[Font.INSN_HEADER]))
|
||||
except StopIteration:
|
||||
break
|
||||
self.extract_insn(header_start_char=header_start_char)
|
||||
self.insns.append(self.extract_insn(header_start_char=header_start_char))
|
||||
|
||||
def main():
|
||||
if 2 < len(sys.argv):
|
||||
|
@ -1747,4 +1919,19 @@ def main():
|
|||
page_numbers = tuple(int(i) for i in sys.argv[2].split(","))
|
||||
else:
|
||||
page_numbers = None
|
||||
Parser().parse_pdf(sys.argv[1], page_numbers=page_numbers)
|
||||
parser = Parser()
|
||||
file_name = Path(sys.argv[1])
|
||||
parser.parse_pdf(file_name, page_numbers=page_numbers)
|
||||
insns = ElementTree.Element("instructions", attrib={"is-subset": str(page_numbers is not None)})
|
||||
insns.text = "\n"
|
||||
insns.tail = "\n"
|
||||
comment = ElementTree.Comment(f" Automatically generated from {file_name.name} ")
|
||||
comment.tail = "\n"
|
||||
insns.append(comment)
|
||||
for insn in parser.insns:
|
||||
insn.write_xml(insns)
|
||||
ElementTree.ElementTree(insns).write(
|
||||
"powerisa-instructions.xml",
|
||||
encoding="utf-8",
|
||||
xml_declaration=True,
|
||||
)
|
|
@ -8,6 +8,7 @@ version = "0.0.0"
|
|||
dependencies = [
|
||||
"pdfminer.six == 20240706"
|
||||
]
|
||||
requires-python = ">= 3.11"
|
||||
|
||||
[project.scripts]
|
||||
parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main"
|
Loading…
Reference in a new issue