From 76438b727cc80d42f15660d4449c01e577eb5a2e Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 6 Jan 2026 15:21:46 -0800 Subject: [PATCH 1/5] rename src/main.rs -> src/lib.rs --- src/{main.rs => lib.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{main.rs => lib.rs} (100%) diff --git a/src/main.rs b/src/lib.rs similarity index 100% rename from src/main.rs rename to src/lib.rs From b68cb274da7f1100855f5b35dfb15015077d6305 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 6 Jan 2026 15:04:47 -0800 Subject: [PATCH 2/5] change to a library --- src/lib.rs | 25 +++++++++++++++++++------ src/main.rs | 6 ++++++ 2 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 src/main.rs diff --git a/src/lib.rs b/src/lib.rs index 67d9dd9..adb3be6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2016,7 +2016,7 @@ struct Parser<'ctx> { } #[derive(Debug)] -struct Error(String, Backtrace); +pub struct Error(String, Backtrace); impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -2026,6 +2026,8 @@ impl fmt::Display for Error { } } +impl std::error::Error for Error {} + trait IntoError: fmt::Display {} impl From for Error { @@ -3831,10 +3833,22 @@ fn main_inner() -> Result<(), Error> { } else { None }; + + std::fs::write( + "powerisa-instructions.xml", + parse_powerisa_pdf_and_generate_xml(&args[1], page_numbers, dump_mupdf_page_xml)?, + )?; + Ok(()) +} + +pub fn parse_powerisa_pdf_and_generate_xml( + file_name: &str, + page_numbers: Option>>>, + dump_mupdf_page_xml: bool, +) -> Result { mupdf_ffi::Context::with(|ctx| { let mut parser = Parser::new(); let is_subset = page_numbers.is_some(); - let file_name = &args[1]; parser.parse_pdf(ctx, file_name, page_numbers, dump_mupdf_page_xml)?; let mut insns = xml_tree::Element::new( "instructions".into(), @@ -3858,16 +3872,15 @@ fn main_inner() -> Result<(), Error> { } let mut output = Vec::new(); insns.write(&mut output, true)?; - std::fs::write("powerisa-instructions.xml", output)?; - Ok(()) + Ok(String::from_utf8(output).expect("known to generate valid utf-8")) }) } -fn main() -> std::process::ExitCode { +pub fn main() -> std::process::ExitCode { match main_inner() { Ok(()) => std::process::ExitCode::SUCCESS, Err(e) => { - println!("Error: {e}"); + eprintln!("Error: {e}"); std::process::ExitCode::FAILURE } } diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..23e46da --- /dev/null +++ b/src/main.rs @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +// See Notices.txt for copyright information + +fn main() -> std::process::ExitCode { + parse_powerisa_pdf::main() +} From bc550be1227869d173f563825409279ab1d43614 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 6 Jan 2026 15:29:14 -0800 Subject: [PATCH 3/5] clean up dead code --- Cargo.lock | 32 ------------------- Cargo.toml | 3 +- src/lib.rs | 81 +++++++----------------------------------------- src/mupdf_ffi.rs | 5 +++ 4 files changed, 17 insertions(+), 104 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4de1b68..4321809 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -185,7 +185,6 @@ dependencies = [ "libm", "mupdf-sys", "quick-xml", - "serde", ] [[package]] @@ -210,7 +209,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" dependencies = [ "memchr", - "serde", ] [[package]] @@ -257,36 +255,6 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", - "serde_derive", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "shlex" version = "1.3.0" diff --git a/Cargo.toml b/Cargo.toml index 09de0ba..21175d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,5 +14,4 @@ rust-version = "1.89.0" indexmap = "2.12.1" libm = "0.2.15" mupdf-sys = { version = "0.5.0", default-features = false } -quick-xml = { version = "0.38.4", features = ["serialize"] } -serde = { version = "1.0.228", features = ["derive"] } +quick-xml = "0.38.4" diff --git a/src/lib.rs b/src/lib.rs index adb3be6..0ea6a19 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,6 @@ use mupdf_sys::{fz_matrix, fz_point, fz_text_item}; use non_nan_float::NonNaNF32; use std::{ backtrace::Backtrace, - borrow::Cow, cell::RefCell, collections::{BTreeMap, BTreeSet, HashMap, HashSet}, convert::Infallible, @@ -154,6 +153,7 @@ macro_rules! make_enum_font { _ => None, } } + #[allow(dead_code)] fn new(font_name: &str, size: NonNaNF32) -> Self { if let Some(v) = Self::new_known(font_name, size) { v @@ -610,9 +610,11 @@ struct Char { } impl Char { + #[allow(dead_code)] fn width(&self) -> f32 { self.max_x.get() - self.min_x.get() } + #[allow(dead_code)] fn height(&self) -> f32 { self.max_y.get() - self.min_y.get() } @@ -634,6 +636,7 @@ const INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT: f32 = 20.971; const INSN_BIT_FIELDS_TOP_PAD_HEIGHT: f32 = 20.175; const INSN_BIT_FIELDS_TOP_PAD_HEIGHT2: f32 = 14.694; const INSN_BIT_FIELDS_BOX_HEIGHT: f32 = 22.317; +#[allow(dead_code)] const INSN_SP_REGS_ALTERED_REGISTER_COLUMN_X: f32 = 34.405; const INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X: f32 = 86.692; const INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X: f32 = 188.74; @@ -649,6 +652,7 @@ struct ParsedTextLine { } impl ParsedTextLine { + #[allow(dead_code)] fn regular_height(&self) -> f32 { self.regular_max_y - self.regular_min_y } @@ -1522,6 +1526,7 @@ impl LineOrRect { Self::Rect(v) => v.width(), } } + #[allow(dead_code)] fn height(self) -> f32 { match self { Self::Line(v) => v.height(), @@ -1606,6 +1611,7 @@ struct Page { qt: BTreeMap>, unprocessed_chars: Rc>>>>>>, + #[allow(dead_code)] unprocessed_non_text: Rc>>, } @@ -3162,7 +3168,10 @@ impl<'ctx> Parser<'ctx> { Some(header_text) if header_text == "Special Registers Altered:" => { let new_sp_regs_altered = self.extract_insn_sp_regs_altered(desc_line)?; - next_start_min_y = new_sp_regs_altered.final_regular_min_y; + #[allow(unused_assignments)] + { + next_start_min_y = new_sp_regs_altered.final_regular_min_y; + } sp_regs_altered = Some(new_sp_regs_altered); break; } @@ -3671,74 +3680,6 @@ impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice<'_> { } } -#[derive(serde::Deserialize, Debug)] -enum MuPdfXml<'a> { - #[serde(rename = "page")] - Page(MuPdfXmlPage<'a>), -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlPage<'a> { - #[serde(rename = "@id")] - id: Cow<'a, str>, - #[serde(rename = "@width")] - width: f32, - #[serde(rename = "@height")] - height: f32, - block: Vec>, -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlBlock<'a> { - #[serde(rename = "@bbox")] - bbox: [f32; 4], - #[serde(rename = "@justify")] - justify: Cow<'a, str>, - line: Vec>, -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlLine<'a> { - #[serde(rename = "@bbox")] - bbox: [f32; 4], - #[serde(rename = "@wmode")] - wmode: u8, - #[serde(rename = "@dir")] - dir: [f32; 2], - #[serde(rename = "@text")] - text: Cow<'a, str>, - font: Vec>, -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlFont<'a> { - #[serde(rename = "@name")] - name: Cow<'a, str>, - #[serde(rename = "@size")] - size: f32, - char: Vec>, -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlChar<'a> { - #[serde(rename = "@quad")] - quad: [f32; 8], - #[serde(rename = "@x")] - x: f32, - #[serde(rename = "@y")] - y: f32, - #[serde(rename = "@bidi")] - bidi: u16, - #[serde(rename = "@color")] - color: Cow<'a, str>, - #[serde(rename = "@alpha")] - alpha: Cow<'a, str>, - #[serde(rename = "@flags")] - flags: u32, - #[serde(rename = "@c")] - c: Cow<'a, str>, -} - impl Page { fn from_mupdf_page( page_num: u32, diff --git a/src/mupdf_ffi.rs b/src/mupdf_ffi.rs index 19d7564..809ee44 100644 --- a/src/mupdf_ffi.rs +++ b/src/mupdf_ffi.rs @@ -765,6 +765,7 @@ pub(crate) enum WriteMode { } impl<'a, 'ctx> TextSpanRef<'a, 'ctx> { + #[allow(dead_code)] pub(crate) fn get(self) -> &'a UnsafeCell { self.ptr } @@ -802,6 +803,7 @@ pub(crate) struct FontRef<'a, 'ctx> { } impl<'a, 'ctx> FontRef<'a, 'ctx> { + #[allow(dead_code)] pub(crate) fn get(self) -> &'a UnsafeCell { self.ptr } @@ -810,9 +812,11 @@ impl<'a, 'ctx> FontRef<'a, 'ctx> { .to_str() .expect("font name isn't valid UTF-8") } + #[allow(dead_code)] pub(crate) fn is_bold(self) -> bool { unsafe { fz_font_is_bold(self.ctx.0.get(), self.ptr.get()) != 0 } } + #[allow(dead_code)] pub(crate) fn is_italic(self) -> bool { unsafe { fz_font_is_italic(self.ctx.0.get(), self.ptr.get()) != 0 } } @@ -824,6 +828,7 @@ impl<'a, 'ctx> FontRef<'a, 'ctx> { } } +#[allow(dead_code)] pub(crate) fn transform_point(point: fz_point, m: fz_matrix) -> fz_point { unsafe { fz_transform_point(point, m) } } From 4177a58c8d6bfab0e9075d6b940856310b38491a Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 6 Jan 2026 16:05:02 -0800 Subject: [PATCH 4/5] add rust code to readme --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index 7fd652f..9fba609 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,26 @@ See Notices.txt for copyright information --> parser for the OPF PowerISA 3.1C pdf to attempt to extract all instructions' pseudo-code including subscripts/superscripts and other formatting +# Using the new Rust code: + +Usage: +* Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from + +* Install Rust -- you need version 1.89.0 or later. + + Getting it from https://rustup.rs/ is recommended. + +* Compile and run: + + ```bash + cargo run -- path/to/downloaded/OPF_PowerISA_v3.1C.pdf > out.log + ``` + +* This will spit out lots of errors and then successfully create + the output file -- `powerisa-instructions.xml` in the current directory. + +# Using the old Python code: + Usage: * Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from * Obtain CPython 3.11 (the default `python3` in [Debian Bookworm](https://www.debian.org/releases/bookworm/)) From 38a1fb328bd44f26389c28fbf66716154f4113dc Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 6 Jan 2026 16:13:36 -0800 Subject: [PATCH 5/5] add build dependencies to readme --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 9fba609..f589559 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,15 @@ Usage: Getting it from https://rustup.rs/ is recommended. +* Install required build dependencies: + + On Debian 12: + + ```bash + sudo apt update + sudo apt install build-essential clang unzip + ``` + * Compile and run: ```bash