diff --git a/.forgejo/workflows/test.yml b/.forgejo/workflows/test.yml new file mode 100644 index 0000000..bed4795 --- /dev/null +++ b/.forgejo/workflows/test.yml @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information +on: [push, pull_request] + +env: + PDF_HASH: 56372d23ece7e9e2c1b381a639443982a3e16e38109df1c141d655b779b61fdb + OUTPUT_XML_HASH: c0b4592cbd0a3e59b9b2931a6a75a3d87ebf23bf453e8587a1522dd157f15ee9 + +jobs: + test: + runs-on: debian-12 + container: + image: git.libre-chip.org/libre-chip/fayalite-deps:latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - run: | + scripts/check-copyright.sh + - uses: https://git.libre-chip.org/mirrors/rust-cache@v2 + with: + save-if: ${{ github.ref == 'refs/heads/master' }} + - run: | + apt-get update -qq + apt-get install -qq python3-venv wget + # copy of https://files.openpower.foundation/s/9izgC5Rogi5Ywmm/download/OPF_PowerISA_v3.1C.pdf + wget -O OPF_PowerISA_v3.1C.pdf https://libre-chip.org/OPF_PowerISA_v3.1C.pdf + echo "$PDF_HASH OPF_PowerISA_v3.1C.pdf" | sha256sum -c + - run: | + cargo test + - run: | + cargo build --release + - run: | + cargo run --release -- OPF_PowerISA_v3.1C.pdf &> >(tee out.log | grep '^page ') || { tail -n1000 out.log; false; } + echo "expected output (not all instructions are decoded yet, change when the output is improved):" + echo "$OUTPUT_XML_HASH powerisa-instructions.xml" | sha256sum -c + mv powerisa-instructions.xml powerisa-instructions-rust.xml + - run: | + python3 -m venv --upgrade-deps .venv + . .venv/bin/activate + pip install -e . + parse_powerisa_pdf OPF_PowerISA_v3.1C.pdf &> >(tee out.log | grep '^page ') || { tail -n1000 out.log; false; } + echo "expected output (not all instructions are decoded yet, change when the output is improved):" + echo "$OUTPUT_XML_HASH powerisa-instructions.xml" | sha256sum -c + mv powerisa-instructions.xml powerisa-instructions-python.xml + diff --git a/.gitignore b/.gitignore index 50e4eb1..454d147 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information /.venv /.vscode *.egg-info __pycache__ *.log -/powerisa-instructions.xml \ No newline at end of file +/powerisa-instructions.xml +/*.pdf +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..4321809 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,305 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "bindgen" +version = "0.71.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", +] + +[[package]] +name = "bitflags" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" + +[[package]] +name = "cc" +version = "1.2.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a0aeaff4ff1a90589618835a598e545176939b97874f7abc7851caa0618f203" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "find-msvc-tools" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645cbb3a84e60b7531617d5ae4e57f7e27308f6445f5abf653209ea76dec8dff" + +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "indexmap" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "libc" +version = "0.2.178" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" + +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "mupdf-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e9a0d4e844ab50315d43312f3d62f72c77205b07c8ee21cbd4b52bdc2a9910" +dependencies = [ + "bindgen", + "cc", + "pkg-config", + "regex", + "zerocopy", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "parse_powerisa_pdf" +version = "0.1.0" +dependencies = [ + "indexmap", + "libm", + "mupdf-sys", + "quick-xml", +] + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "proc-macro2" +version = "1.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "syn" +version = "2.0.112" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21f182278bf2d2bcb3c88b1b08a37df029d71ce3d3ae26168e3c653b213b99d4" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "zerocopy" +version = "0.8.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..21175d6 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information +[package] +name = "parse_powerisa_pdf" +version = "0.1.0" +license = "LGPL-3.0-or-later" +edition = "2024" +repository = "" +keywords = [] +categories = [] +rust-version = "1.89.0" + +[dependencies] +indexmap = "2.12.1" +libm = "0.2.15" +mupdf-sys = { version = "0.5.0", default-features = false } +quick-xml = "0.38.4" diff --git a/README.md b/README.md index f8fae5d..f589559 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,38 @@ + parser for the OPF PowerISA 3.1C pdf to attempt to extract all instructions' pseudo-code including subscripts/superscripts and other formatting +# Using the new Rust code: + +Usage: +* Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from + +* Install Rust -- you need version 1.89.0 or later. + + Getting it from https://rustup.rs/ is recommended. + +* Install required build dependencies: + + On Debian 12: + + ```bash + sudo apt update + sudo apt install build-essential clang unzip + ``` + +* Compile and run: + + ```bash + cargo run -- path/to/downloaded/OPF_PowerISA_v3.1C.pdf > out.log + ``` + +* This will spit out lots of errors and then successfully create + the output file -- `powerisa-instructions.xml` in the current directory. + +# Using the old Python code: + Usage: * Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from * Obtain CPython 3.11 (the default `python3` in [Debian Bookworm](https://www.debian.org/releases/bookworm/)) diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index c7187d1..3c2afe5 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information from __future__ import annotations from collections import defaultdict from collections.abc import Generator, Iterable, Iterator, Callable @@ -763,7 +765,7 @@ class Page: unprocessed_non_text: SetById[LTLine | LTRect] @staticmethod - def from_lt_page(page_num: int, page: LTPage) -> Page: + def from_lt_page(page_num: int, page: LTPage, first_seen_fonts: defaultdict[str, set[float]]) -> Page: qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree) unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char])) unprocessed_non_text: SetById[LTLine | LTRect] = SetById() @@ -802,20 +804,25 @@ class Page: raise AssertionError( f"char not in text section: {element}\npage_num={page_num}") continue + font_size = round(element.size, 3) char = Char( text=element.get_text(), - font=Font(font_name=element.fontname, size=round(element.size, 3)), + font=Font(font_name=element.fontname, size=font_size), adv=element.adv, min_x=element.x0, min_y=element.y0, max_x=element.x1, max_y=element.y1, ) + if font_size not in first_seen_fonts[element.fontname]: + first_seen_fonts[element.fontname].add(font_size) + print(f"first seen font: {element.fontname!r} {font_size}: page {page_num} {char!r}") qt[text_section].insert(char.min_x, char.min_y, char) unprocessed_chars[text_section][char.font].add(char) - for i in unprocessed_chars.values(): - for j in i.values(): - j.sort(key=Char.top_down_left_to_right_sort_key) + for text_section, i in unprocessed_chars.items(): + for chars in i.values(): + chars.sort(key=Char.top_down_left_to_right_sort_key) + print(f"first char: {text_section!r}: {next(iter(chars), None)!r}") unknown_fonts=[] unknown_font_errors=[] for i in unprocessed_chars.values(): @@ -1179,13 +1186,14 @@ class Parser: def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: if page_numbers is not None: page_numbers = sorted(i - 1 for i in page_numbers) + first_seen_fonts = defaultdict(set) for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): if page_numbers is not None: page_num = page_numbers[i] + 1 else: page_num = i + 1 print(f"page {page_num}") - yield Page.from_lt_page(page_num=page_num, page=page) + yield Page.from_lt_page(page_num=page_num, page=page, first_seen_fonts=first_seen_fonts) def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None): self.pages = Pages(pages_gen=Parser.__pages_gen( @@ -1501,7 +1509,7 @@ class Parser: f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}") if len(v_lines) < 2: raise InsnParseError( - f"instruction bit fields box has too few vertical lines:\n{h_lines}") + f"instruction bit fields box has too few vertical lines:\n{v_lines}") bottom_line, top_line = h_lines box_min_x = v_lines[0].x0 box_max_x = v_lines[-1].x0 diff --git a/parse_powerisa_pdf/quad_tree.py b/parse_powerisa_pdf/quad_tree.py index 34343e8..bee9d76 100644 --- a/parse_powerisa_pdf/quad_tree.py +++ b/parse_powerisa_pdf/quad_tree.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information from __future__ import annotations from typing import Callable, Generic, Iterable, Iterator, TypeVar from math import frexp, isfinite, isnan, ldexp diff --git a/parse_powerisa_pdf/set_by_id.py b/parse_powerisa_pdf/set_by_id.py index 444741b..969f8d7 100644 --- a/parse_powerisa_pdf/set_by_id.py +++ b/parse_powerisa_pdf/set_by_id.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information from collections import abc from typing import Callable, Generic, Iterable, Iterator, Protocol, TypeAlias, TypeVar, overload diff --git a/pyproject.toml b/pyproject.toml index c2ec3e0..0f3e887 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information [build-system] requires = ["setuptools >= 61.0"] build-backend = "setuptools.build_meta" @@ -11,4 +13,7 @@ dependencies = [ requires-python = ">= 3.11" [project.scripts] -parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main" \ No newline at end of file +parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main" + +[tool.setuptools] +packages = ["parse_powerisa_pdf"] diff --git a/scripts/check-copyright.sh b/scripts/check-copyright.sh new file mode 100755 index 0000000..a9dc2bc --- /dev/null +++ b/scripts/check-copyright.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information +set -e + +function fail() +{ + local error="$1" + echo "error: $error" >&2 + exit 1 +} + +function fail_file() +{ + local file="$1" line="$2" error="$3" + fail "$file:$((line + 1)): $error" +} + +function check_file() +{ + local file="$1" regexes=("${@:2}") + local lines + mapfile -t lines < "$file" + if (("${#lines[@]}" == 0)); then + return # empty file, no copyright needed + fi + local line + for line in "${!regexes[@]}"; do + eval '[[ "${lines[i]}" =~ '"${regexes[i]}"' ]]' || + fail_file "$file" "$line" "doesn't match regex: ${regexes[i]}" + done +} + +POUND_HEADER=('^"# SPDX-License-Identifier: LGPL-3.0-or-later"$' '^"# See Notices.txt for copyright information"$') +SLASH_HEADER=('^"// SPDX-License-Identifier: LGPL-3.0-or-later"$' '^"// See Notices.txt for copyright information"$') +MD_HEADER=('^"