diff --git a/.forgejo/workflows/test.yml b/.forgejo/workflows/test.yml index c98f5b2..bed4795 100644 --- a/.forgejo/workflows/test.yml +++ b/.forgejo/workflows/test.yml @@ -2,25 +2,45 @@ # See Notices.txt for copyright information on: [push, pull_request] +env: + PDF_HASH: 56372d23ece7e9e2c1b381a639443982a3e16e38109df1c141d655b779b61fdb + OUTPUT_XML_HASH: c0b4592cbd0a3e59b9b2931a6a75a3d87ebf23bf453e8587a1522dd157f15ee9 + jobs: test: runs-on: debian-12 + container: + image: git.libre-chip.org/libre-chip/fayalite-deps:latest steps: - uses: actions/checkout@v3 with: fetch-depth: 0 - run: | scripts/check-copyright.sh + - uses: https://git.libre-chip.org/mirrors/rust-cache@v2 + with: + save-if: ${{ github.ref == 'refs/heads/master' }} - run: | apt-get update -qq apt-get install -qq python3-venv wget # copy of https://files.openpower.foundation/s/9izgC5Rogi5Ywmm/download/OPF_PowerISA_v3.1C.pdf wget -O OPF_PowerISA_v3.1C.pdf https://libre-chip.org/OPF_PowerISA_v3.1C.pdf - echo "56372d23ece7e9e2c1b381a639443982a3e16e38109df1c141d655b779b61fdb OPF_PowerISA_v3.1C.pdf" | sha256sum -c + echo "$PDF_HASH OPF_PowerISA_v3.1C.pdf" | sha256sum -c + - run: | + cargo test + - run: | + cargo build --release + - run: | + cargo run --release -- OPF_PowerISA_v3.1C.pdf &> >(tee out.log | grep '^page ') || { tail -n1000 out.log; false; } + echo "expected output (not all instructions are decoded yet, change when the output is improved):" + echo "$OUTPUT_XML_HASH powerisa-instructions.xml" | sha256sum -c + mv powerisa-instructions.xml powerisa-instructions-rust.xml + - run: | python3 -m venv --upgrade-deps .venv . .venv/bin/activate pip install -e . parse_powerisa_pdf OPF_PowerISA_v3.1C.pdf &> >(tee out.log | grep '^page ') || { tail -n1000 out.log; false; } echo "expected output (not all instructions are decoded yet, change when the output is improved):" - echo "c0b4592cbd0a3e59b9b2931a6a75a3d87ebf23bf453e8587a1522dd157f15ee9 powerisa-instructions.xml" | sha256sum -c + echo "$OUTPUT_XML_HASH powerisa-instructions.xml" | sha256sum -c + mv powerisa-instructions.xml powerisa-instructions-python.xml diff --git a/.gitignore b/.gitignore index 37b3676..454d147 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ __pycache__ *.log /powerisa-instructions.xml /*.pdf +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..4de1b68 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,337 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "bindgen" +version = "0.71.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", +] + +[[package]] +name = "bitflags" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" + +[[package]] +name = "cc" +version = "1.2.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a0aeaff4ff1a90589618835a598e545176939b97874f7abc7851caa0618f203" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "find-msvc-tools" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645cbb3a84e60b7531617d5ae4e57f7e27308f6445f5abf653209ea76dec8dff" + +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "indexmap" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "libc" +version = "0.2.178" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" + +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "mupdf-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e9a0d4e844ab50315d43312f3d62f72c77205b07c8ee21cbd4b52bdc2a9910" +dependencies = [ + "bindgen", + "cc", + "pkg-config", + "regex", + "zerocopy", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "parse_powerisa_pdf" +version = "0.1.0" +dependencies = [ + "indexmap", + "libm", + "mupdf-sys", + "quick-xml", + "serde", +] + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "proc-macro2" +version = "1.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quote" +version = "1.0.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "syn" +version = "2.0.112" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21f182278bf2d2bcb3c88b1b08a37df029d71ce3d3ae26168e3c653b213b99d4" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "zerocopy" +version = "0.8.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..09de0ba --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information +[package] +name = "parse_powerisa_pdf" +version = "0.1.0" +license = "LGPL-3.0-or-later" +edition = "2024" +repository = "" +keywords = [] +categories = [] +rust-version = "1.89.0" + +[dependencies] +indexmap = "2.12.1" +libm = "0.2.15" +mupdf-sys = { version = "0.5.0", default-features = false } +quick-xml = { version = "0.38.4", features = ["serialize"] } +serde = { version = "1.0.228", features = ["derive"] } diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index a4afd09..3c2afe5 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -765,7 +765,7 @@ class Page: unprocessed_non_text: SetById[LTLine | LTRect] @staticmethod - def from_lt_page(page_num: int, page: LTPage) -> Page: + def from_lt_page(page_num: int, page: LTPage, first_seen_fonts: defaultdict[str, set[float]]) -> Page: qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree) unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char])) unprocessed_non_text: SetById[LTLine | LTRect] = SetById() @@ -804,20 +804,25 @@ class Page: raise AssertionError( f"char not in text section: {element}\npage_num={page_num}") continue + font_size = round(element.size, 3) char = Char( text=element.get_text(), - font=Font(font_name=element.fontname, size=round(element.size, 3)), + font=Font(font_name=element.fontname, size=font_size), adv=element.adv, min_x=element.x0, min_y=element.y0, max_x=element.x1, max_y=element.y1, ) + if font_size not in first_seen_fonts[element.fontname]: + first_seen_fonts[element.fontname].add(font_size) + print(f"first seen font: {element.fontname!r} {font_size}: page {page_num} {char!r}") qt[text_section].insert(char.min_x, char.min_y, char) unprocessed_chars[text_section][char.font].add(char) - for i in unprocessed_chars.values(): - for j in i.values(): - j.sort(key=Char.top_down_left_to_right_sort_key) + for text_section, i in unprocessed_chars.items(): + for chars in i.values(): + chars.sort(key=Char.top_down_left_to_right_sort_key) + print(f"first char: {text_section!r}: {next(iter(chars), None)!r}") unknown_fonts=[] unknown_font_errors=[] for i in unprocessed_chars.values(): @@ -1181,13 +1186,14 @@ class Parser: def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: if page_numbers is not None: page_numbers = sorted(i - 1 for i in page_numbers) + first_seen_fonts = defaultdict(set) for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): if page_numbers is not None: page_num = page_numbers[i] + 1 else: page_num = i + 1 print(f"page {page_num}") - yield Page.from_lt_page(page_num=page_num, page=page) + yield Page.from_lt_page(page_num=page_num, page=page, first_seen_fonts=first_seen_fonts) def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None): self.pages = Pages(pages_gen=Parser.__pages_gen( @@ -1503,7 +1509,7 @@ class Parser: f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}") if len(v_lines) < 2: raise InsnParseError( - f"instruction bit fields box has too few vertical lines:\n{h_lines}") + f"instruction bit fields box has too few vertical lines:\n{v_lines}") bottom_line, top_line = h_lines box_min_x = v_lines[0].x0 box_max_x = v_lines[-1].x0 diff --git a/pyproject.toml b/pyproject.toml index 68c4029..0f3e887 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,4 +13,7 @@ dependencies = [ requires-python = ">= 3.11" [project.scripts] -parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main" \ No newline at end of file +parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main" + +[tool.setuptools] +packages = ["parse_powerisa_pdf"] diff --git a/scripts/check-copyright.sh b/scripts/check-copyright.sh index 640fb4b..a9dc2bc 100755 --- a/scripts/check-copyright.sh +++ b/scripts/check-copyright.sh @@ -32,6 +32,7 @@ function check_file() } POUND_HEADER=('^"# SPDX-License-Identifier: LGPL-3.0-or-later"$' '^"# See Notices.txt for copyright information"$') +SLASH_HEADER=('^"// SPDX-License-Identifier: LGPL-3.0-or-later"$' '^"// See Notices.txt for copyright information"$') MD_HEADER=('^"