diff --git a/.forgejo/workflows/test.yml b/.forgejo/workflows/test.yml deleted file mode 100644 index bed4795..0000000 --- a/.forgejo/workflows/test.yml +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: LGPL-3.0-or-later -# See Notices.txt for copyright information -on: [push, pull_request] - -env: - PDF_HASH: 56372d23ece7e9e2c1b381a639443982a3e16e38109df1c141d655b779b61fdb - OUTPUT_XML_HASH: c0b4592cbd0a3e59b9b2931a6a75a3d87ebf23bf453e8587a1522dd157f15ee9 - -jobs: - test: - runs-on: debian-12 - container: - image: git.libre-chip.org/libre-chip/fayalite-deps:latest - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - run: | - scripts/check-copyright.sh - - uses: https://git.libre-chip.org/mirrors/rust-cache@v2 - with: - save-if: ${{ github.ref == 'refs/heads/master' }} - - run: | - apt-get update -qq - apt-get install -qq python3-venv wget - # copy of https://files.openpower.foundation/s/9izgC5Rogi5Ywmm/download/OPF_PowerISA_v3.1C.pdf - wget -O OPF_PowerISA_v3.1C.pdf https://libre-chip.org/OPF_PowerISA_v3.1C.pdf - echo "$PDF_HASH OPF_PowerISA_v3.1C.pdf" | sha256sum -c - - run: | - cargo test - - run: | - cargo build --release - - run: | - cargo run --release -- OPF_PowerISA_v3.1C.pdf &> >(tee out.log | grep '^page ') || { tail -n1000 out.log; false; } - echo "expected output (not all instructions are decoded yet, change when the output is improved):" - echo "$OUTPUT_XML_HASH powerisa-instructions.xml" | sha256sum -c - mv powerisa-instructions.xml powerisa-instructions-rust.xml - - run: | - python3 -m venv --upgrade-deps .venv - . .venv/bin/activate - pip install -e . - parse_powerisa_pdf OPF_PowerISA_v3.1C.pdf &> >(tee out.log | grep '^page ') || { tail -n1000 out.log; false; } - echo "expected output (not all instructions are decoded yet, change when the output is improved):" - echo "$OUTPUT_XML_HASH powerisa-instructions.xml" | sha256sum -c - mv powerisa-instructions.xml powerisa-instructions-python.xml - diff --git a/.gitignore b/.gitignore index 454d147..e11e257 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,7 @@ -# SPDX-License-Identifier: LGPL-3.0-or-later -# See Notices.txt for copyright information /.venv /.vscode *.egg-info __pycache__ *.log /powerisa-instructions.xml -/*.pdf /target diff --git a/Cargo.lock b/Cargo.lock index 4321809..10be7b1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3,56 +3,10 @@ version = 4 [[package]] -name = "aho-corasick" -version = "1.1.4" +name = "adler2" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" -dependencies = [ - "memchr", -] - -[[package]] -name = "bindgen" -version = "0.71.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" -dependencies = [ - "bitflags", - "cexpr", - "clang-sys", - "itertools", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn", -] - -[[package]] -name = "bitflags" -version = "2.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" - -[[package]] -name = "cc" -version = "1.2.51" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0aeaff4ff1a90589618835a598e545176939b97874f7abc7851caa0618f203" -dependencies = [ - "find-msvc-tools", - "shlex", -] - -[[package]] -name = "cexpr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" -dependencies = [ - "nom", -] +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] name = "cfg-if" @@ -61,16 +15,39 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] -name = "clang-sys" -version = "1.8.1" +name = "crc32fast" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ - "glob", - "libc", - "libloading", + "cfg-if", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "either" version = "1.15.0" @@ -78,228 +55,55 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] -name = "equivalent" -version = "1.0.2" +name = "flate2" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" - -[[package]] -name = "find-msvc-tools" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645cbb3a84e60b7531617d5ae4e57f7e27308f6445f5abf653209ea76dec8dff" - -[[package]] -name = "glob" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" - -[[package]] -name = "hashbrown" -version = "0.16.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" - -[[package]] -name = "indexmap" -version = "2.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" dependencies = [ - "equivalent", - "hashbrown", + "crc32fast", + "miniz_oxide", ] [[package]] -name = "itertools" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" -dependencies = [ - "either", -] - -[[package]] -name = "libc" -version = "0.2.178" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" - -[[package]] -name = "libloading" +name = "miniz_oxide" version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ - "cfg-if", - "windows-link", -] - -[[package]] -name = "libm" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" - -[[package]] -name = "memchr" -version = "2.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" - -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - -[[package]] -name = "mupdf-sys" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e9a0d4e844ab50315d43312f3d62f72c77205b07c8ee21cbd4b52bdc2a9910" -dependencies = [ - "bindgen", - "cc", - "pkg-config", - "regex", - "zerocopy", -] - -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", + "adler2", + "simd-adler32", ] [[package]] name = "parse_powerisa_pdf" version = "0.1.0" dependencies = [ - "indexmap", - "libm", - "mupdf-sys", - "quick-xml", + "flate2", + "rayon", ] [[package]] -name = "pkg-config" -version = "0.3.32" +name = "rayon" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" - -[[package]] -name = "proc-macro2" -version = "1.0.104" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" dependencies = [ - "unicode-ident", + "either", + "rayon-core", ] [[package]] -name = "quick-xml" -version = "0.38.4" +name = "rayon-core" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" dependencies = [ - "memchr", + "crossbeam-deque", + "crossbeam-utils", ] [[package]] -name = "quote" -version = "1.0.42" +name = "simd-adler32" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "regex" -version = "1.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" - -[[package]] -name = "rustc-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - -[[package]] -name = "shlex" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" - -[[package]] -name = "syn" -version = "2.0.112" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21f182278bf2d2bcb3c88b1b08a37df029d71ce3d3ae26168e3c653b213b99d4" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "unicode-ident" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" - -[[package]] -name = "windows-link" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" - -[[package]] -name = "zerocopy" -version = "0.8.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.8.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" diff --git a/Cargo.toml b/Cargo.toml index 21175d6..20ecf46 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,17 +1,10 @@ -# SPDX-License-Identifier: LGPL-3.0-or-later -# See Notices.txt for copyright information [package] name = "parse_powerisa_pdf" version = "0.1.0" -license = "LGPL-3.0-or-later" edition = "2024" -repository = "" -keywords = [] -categories = [] -rust-version = "1.89.0" +license = "LGPL-3.0-or-later" [dependencies] -indexmap = "2.12.1" -libm = "0.2.15" -mupdf-sys = { version = "0.5.0", default-features = false } -quick-xml = "0.38.4" +flate2 = "1.1.5" +rayon = "1.11.0" + diff --git a/README.md b/README.md index f589559..f8fae5d 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,5 @@ - parser for the OPF PowerISA 3.1C pdf to attempt to extract all instructions' pseudo-code including subscripts/superscripts and other formatting -# Using the new Rust code: - -Usage: -* Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from - -* Install Rust -- you need version 1.89.0 or later. - - Getting it from https://rustup.rs/ is recommended. - -* Install required build dependencies: - - On Debian 12: - - ```bash - sudo apt update - sudo apt install build-essential clang unzip - ``` - -* Compile and run: - - ```bash - cargo run -- path/to/downloaded/OPF_PowerISA_v3.1C.pdf > out.log - ``` - -* This will spit out lots of errors and then successfully create - the output file -- `powerisa-instructions.xml` in the current directory. - -# Using the old Python code: - Usage: * Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from * Obtain CPython 3.11 (the default `python3` in [Debian Bookworm](https://www.debian.org/releases/bookworm/)) diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index 3c2afe5..c7187d1 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -1,5 +1,3 @@ -# SPDX-License-Identifier: LGPL-3.0-or-later -# See Notices.txt for copyright information from __future__ import annotations from collections import defaultdict from collections.abc import Generator, Iterable, Iterator, Callable @@ -765,7 +763,7 @@ class Page: unprocessed_non_text: SetById[LTLine | LTRect] @staticmethod - def from_lt_page(page_num: int, page: LTPage, first_seen_fonts: defaultdict[str, set[float]]) -> Page: + def from_lt_page(page_num: int, page: LTPage) -> Page: qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree) unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char])) unprocessed_non_text: SetById[LTLine | LTRect] = SetById() @@ -804,25 +802,20 @@ class Page: raise AssertionError( f"char not in text section: {element}\npage_num={page_num}") continue - font_size = round(element.size, 3) char = Char( text=element.get_text(), - font=Font(font_name=element.fontname, size=font_size), + font=Font(font_name=element.fontname, size=round(element.size, 3)), adv=element.adv, min_x=element.x0, min_y=element.y0, max_x=element.x1, max_y=element.y1, ) - if font_size not in first_seen_fonts[element.fontname]: - first_seen_fonts[element.fontname].add(font_size) - print(f"first seen font: {element.fontname!r} {font_size}: page {page_num} {char!r}") qt[text_section].insert(char.min_x, char.min_y, char) unprocessed_chars[text_section][char.font].add(char) - for text_section, i in unprocessed_chars.items(): - for chars in i.values(): - chars.sort(key=Char.top_down_left_to_right_sort_key) - print(f"first char: {text_section!r}: {next(iter(chars), None)!r}") + for i in unprocessed_chars.values(): + for j in i.values(): + j.sort(key=Char.top_down_left_to_right_sort_key) unknown_fonts=[] unknown_font_errors=[] for i in unprocessed_chars.values(): @@ -1186,14 +1179,13 @@ class Parser: def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: if page_numbers is not None: page_numbers = sorted(i - 1 for i in page_numbers) - first_seen_fonts = defaultdict(set) for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): if page_numbers is not None: page_num = page_numbers[i] + 1 else: page_num = i + 1 print(f"page {page_num}") - yield Page.from_lt_page(page_num=page_num, page=page, first_seen_fonts=first_seen_fonts) + yield Page.from_lt_page(page_num=page_num, page=page) def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None): self.pages = Pages(pages_gen=Parser.__pages_gen( @@ -1509,7 +1501,7 @@ class Parser: f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}") if len(v_lines) < 2: raise InsnParseError( - f"instruction bit fields box has too few vertical lines:\n{v_lines}") + f"instruction bit fields box has too few vertical lines:\n{h_lines}") bottom_line, top_line = h_lines box_min_x = v_lines[0].x0 box_max_x = v_lines[-1].x0 diff --git a/parse_powerisa_pdf/quad_tree.py b/parse_powerisa_pdf/quad_tree.py index bee9d76..34343e8 100644 --- a/parse_powerisa_pdf/quad_tree.py +++ b/parse_powerisa_pdf/quad_tree.py @@ -1,5 +1,3 @@ -# SPDX-License-Identifier: LGPL-3.0-or-later -# See Notices.txt for copyright information from __future__ import annotations from typing import Callable, Generic, Iterable, Iterator, TypeVar from math import frexp, isfinite, isnan, ldexp diff --git a/parse_powerisa_pdf/set_by_id.py b/parse_powerisa_pdf/set_by_id.py index 969f8d7..444741b 100644 --- a/parse_powerisa_pdf/set_by_id.py +++ b/parse_powerisa_pdf/set_by_id.py @@ -1,5 +1,3 @@ -# SPDX-License-Identifier: LGPL-3.0-or-later -# See Notices.txt for copyright information from collections import abc from typing import Callable, Generic, Iterable, Iterator, Protocol, TypeAlias, TypeVar, overload diff --git a/pyproject.toml b/pyproject.toml index 0f3e887..c2ec3e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,3 @@ -# SPDX-License-Identifier: LGPL-3.0-or-later -# See Notices.txt for copyright information [build-system] requires = ["setuptools >= 61.0"] build-backend = "setuptools.build_meta" @@ -13,7 +11,4 @@ dependencies = [ requires-python = ">= 3.11" [project.scripts] -parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main" - -[tool.setuptools] -packages = ["parse_powerisa_pdf"] +parse_powerisa_pdf = "parse_powerisa_pdf.parse_powerisa_pdf:main" \ No newline at end of file diff --git a/scripts/check-copyright.sh b/scripts/check-copyright.sh deleted file mode 100755 index a9dc2bc..0000000 --- a/scripts/check-copyright.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: LGPL-3.0-or-later -# See Notices.txt for copyright information -set -e - -function fail() -{ - local error="$1" - echo "error: $error" >&2 - exit 1 -} - -function fail_file() -{ - local file="$1" line="$2" error="$3" - fail "$file:$((line + 1)): $error" -} - -function check_file() -{ - local file="$1" regexes=("${@:2}") - local lines - mapfile -t lines < "$file" - if (("${#lines[@]}" == 0)); then - return # empty file, no copyright needed - fi - local line - for line in "${!regexes[@]}"; do - eval '[[ "${lines[i]}" =~ '"${regexes[i]}"' ]]' || - fail_file "$file" "$line" "doesn't match regex: ${regexes[i]}" - done -} - -POUND_HEADER=('^"# SPDX-License-Identifier: LGPL-3.0-or-later"$' '^"# See Notices.txt for copyright information"$') -SLASH_HEADER=('^"// SPDX-License-Identifier: LGPL-3.0-or-later"$' '^"// See Notices.txt for copyright information"$') -MD_HEADER=('^"