diff --git a/Cargo.lock b/Cargo.lock index 4de1b68..4321809 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -185,7 +185,6 @@ dependencies = [ "libm", "mupdf-sys", "quick-xml", - "serde", ] [[package]] @@ -210,7 +209,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" dependencies = [ "memchr", - "serde", ] [[package]] @@ -257,36 +255,6 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", - "serde_derive", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "shlex" version = "1.3.0" diff --git a/Cargo.toml b/Cargo.toml index 09de0ba..21175d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,5 +14,4 @@ rust-version = "1.89.0" indexmap = "2.12.1" libm = "0.2.15" mupdf-sys = { version = "0.5.0", default-features = false } -quick-xml = { version = "0.38.4", features = ["serialize"] } -serde = { version = "1.0.228", features = ["derive"] } +quick-xml = "0.38.4" diff --git a/README.md b/README.md index 7fd652f..f589559 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,35 @@ See Notices.txt for copyright information --> parser for the OPF PowerISA 3.1C pdf to attempt to extract all instructions' pseudo-code including subscripts/superscripts and other formatting +# Using the new Rust code: + +Usage: +* Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from + +* Install Rust -- you need version 1.89.0 or later. + + Getting it from https://rustup.rs/ is recommended. + +* Install required build dependencies: + + On Debian 12: + + ```bash + sudo apt update + sudo apt install build-essential clang unzip + ``` + +* Compile and run: + + ```bash + cargo run -- path/to/downloaded/OPF_PowerISA_v3.1C.pdf > out.log + ``` + +* This will spit out lots of errors and then successfully create + the output file -- `powerisa-instructions.xml` in the current directory. + +# Using the old Python code: + Usage: * Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from * Obtain CPython 3.11 (the default `python3` in [Debian Bookworm](https://www.debian.org/releases/bookworm/)) diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..0ea6a19 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,3828 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +// See Notices.txt for copyright information + +use crate::{ + mupdf_ffi::{ + MuPdfError, WriteMode, add_points, point_max_components, point_min_components, + transform_vector, + }, + quad_tree::QuadTree, +}; +use indexmap::IndexSet; +use mupdf_sys::{fz_matrix, fz_point, fz_text_item}; +use non_nan_float::NonNaNF32; +use std::{ + backtrace::Backtrace, + cell::RefCell, + collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + convert::Infallible, + fmt, + num::NonZero, + ops::ControlFlow, + rc::Rc, + sync::OnceLock, +}; + +mod mupdf_ffi; +mod quad_tree; +mod xml_tree; + +mod non_nan_float { + #[derive(Default, PartialEq, PartialOrd, Clone, Copy)] + pub(crate) struct NonNaNF32(f32); + + impl std::fmt::Debug for NonNaNF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } + } + + impl std::fmt::Display for NonNaNF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } + } + + impl NonNaNF32 { + pub(crate) const fn new(v: f32) -> Option { + if v.is_nan() { None } else { Some(Self(v)) } + } + pub(crate) const fn get(self) -> f32 { + self.0 + } + pub(crate) const fn min(self, other: Self) -> Self { + Self(self.0.min(other.0)) + } + pub(crate) const fn max(self, other: Self) -> Self { + Self(self.0.max(other.0)) + } + } + + impl std::hash::Hash for NonNaNF32 { + fn hash(&self, state: &mut H) { + if self.0 == 0.0 { 0.0 } else { self.0 } + .to_bits() + .hash(state); + } + } + + impl Eq for NonNaNF32 {} + + impl Ord for NonNaNF32 { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.partial_cmp(other).expect("known to be non-NaN") + } + } + + impl std::ops::Neg for NonNaNF32 { + type Output = Self; + + fn neg(self) -> Self::Output { + Self(-self.0) + } + } +} + +const fn str_eq(a: &str, b: &str) -> bool { + let a = a.as_bytes(); + let b = b.as_bytes(); + if a.len() != b.len() { + return false; + } + let mut i = 0; + while i < a.len() { + if a[i] != b[i] { + return false; + } + i += 1; + } + true +} + +macro_rules! make_enum_font { + ( + enum $Font:ident { + #[other] + $Other:ident $other_body:tt, + $(#[group] + $KnownFontGroup:ident { + $(#[name_with_tag = $known_font_name_with_tag:literal, size = $known_font_size:literal] + $KnownFont:ident,)* + },)* + } + ) => { + #[derive(Hash, PartialEq, Eq, PartialOrd, Ord, Debug, Clone)] + enum $Font { + $Other $other_body, + $($($KnownFont,)*)* + } + + #[derive(Hash, PartialEq, Eq, PartialOrd, Ord, Debug, Copy, Clone)] + enum KnownFontGroup { + $($KnownFontGroup,)* + } + + impl KnownFontGroup { + const fn fonts(self) -> &'static [Font] { + match self { + $(Self::$KnownFontGroup => &[$(Font::$KnownFont,)*],)* + } + } + const INSN_CODE_FONT_GROUPS: &[Self] = &[Self::InsnCode, Self::InsnCodeSubscript]; + } + + impl $Font { + const fn extract_font_name_from_font_name_with_tag(font_name_with_tag: &str) -> &str { + if let [b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'+',_,..] = font_name_with_tag.as_bytes() { + font_name_with_tag.split_at(7).1 + } else { + panic!("invalid font name with id") + } + } + const fn known_from_name_with_tag(font_name_with_tag: &str, size: NonNaNF32) -> Option { + match size.get() { + $($($known_font_size if str_eq(font_name_with_tag, $known_font_name_with_tag) => Some(Self::$KnownFont),)*)* + _ => None, + } + } + const fn new_known(font_name: &str, size: NonNaNF32) -> Option { + match size.get() { + $($($known_font_size if str_eq(font_name, const { + Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) + }) => Some(Self::$KnownFont),)*)* + _ => None, + } + } + #[allow(dead_code)] + fn new(font_name: &str, size: NonNaNF32) -> Self { + if let Some(v) = Self::new_known(font_name, size) { + v + } else { + Self::Other { + font_name: Box::from(font_name), + size, + } + } + } + const fn size(&self) -> f32 { + match *self { + Self::$Other { size, .. } => size.get(), + $($(Self::$KnownFont => $known_font_size,)*)* + } + } + const fn font_name(&self) -> &str { + match self { + Self::$Other { font_name, .. } => font_name, + $($(Self::$KnownFont => const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) },)*)* + } + } + const fn known_font_group(&self) -> Option { + match self { + Self::$Other { .. } => None, + $($(Self::$KnownFont => Some(KnownFontGroup::$KnownFontGroup),)*)* + } + } + const fn line_height(&self) -> f32 { + match self { + Self::$Other { .. } => self.line_height_helper(), + $($(Self::$KnownFont => const { Self::$KnownFont.line_height_helper() },)*)* + } + } + } + + const _: () = { + $($( + let (known_font_name, known_font) = const { + let known_font_name = Font::extract_font_name_from_font_name_with_tag($known_font_name_with_tag); + (known_font_name, &Font::new_known(known_font_name, NonNaNF32::new($known_font_size).unwrap()).unwrap()) + }; + assert!(str_eq(known_font_name, known_font.font_name())); + assert!(matches!(known_font, Font::$KnownFont)); + )*)* + }; + }; +} + +make_enum_font! { + enum Font { + #[other] + Other { + font_name: Box, + size: NonNaNF32, + }, + #[group] + InsnHeader { + #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 9.963] + InsnHeader, + }, + #[group] + RtlFnHeader { + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 9.963] + RtlFnHeader, + }, + #[group] + PageHeader { + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 9.963] + PageHeader, + }, + #[group] + PageFooter { + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.981] + PageFooter, + }, + #[group] + InsnDesc { + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.966] + InsnDesc0, + #[name_with_tag = "FZTIYT+CMMI9", size = 8.966] + InsnDesc1, + #[name_with_tag = "ONUAYC+CMSSI9", size = 8.966] + InsnDesc2, + #[name_with_tag = "TNGBFZ+CMSY9", size = 8.966] + InsnDesc3, + #[name_with_tag = "WHMZPU+CMEX9", size = 8.966] + InsnDesc4, + #[name_with_tag = "ZJTMSG+CMSS9", size = 8.966] + InsnDesc5, + }, + #[group] + InsnDescMisc { + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 2.377] + InsnDescMisc0, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 2.561] + InsnDescMisc1, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.492] + InsnDescMisc2, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.641] + InsnDescMisc3, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.772] + InsnDescMisc4, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.864] + InsnDescMisc5, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.925] + InsnDescMisc6, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.097] + InsnDescMisc7, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.123] + InsnDescMisc8, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.131] + InsnDescMisc9, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.516] + InsnDescMisc10, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.604] + InsnDescMisc11, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.634] + InsnDescMisc12, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.906] + InsnDescMisc13, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.033] + InsnDescMisc14, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.068] + InsnDescMisc15, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213] + InsnDescMisc16, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.238] + InsnDescMisc17, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252] + InsnDescMisc18, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962] + InsnDescMisc19, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977] + InsnDescMisc20, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.506] + InsnDescMisc21, + }, + #[group] + InsnDescCode { + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 6.974] + InsnDescCode, + }, + #[group] + InsnDescCodeMisc { + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 3.587] + InsnDescCodeMisc0, + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 4.483] + InsnDescCodeMisc1, + }, + #[group] + InsnDescItalic { + #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 8.966] + InsnDescItalic, + }, + #[group] + InsnDescBold { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.966] + InsnDescBold, + }, + #[group] + InsnDescBoldItalic { + #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 8.966] + InsnDescBoldItalic, + }, + #[group] + InsnDescSmall { + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.97] + InsnDescSmall, + }, + #[group] + InsnDescSmallItalic { + #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 7.97] + InsnDescSmallItalic, + }, + #[group] + InsnDescSmallBold { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 7.97] + InsnDescSmallBold, + }, + #[group] + InsnDescSmallBoldItalic { + #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 7.97] + InsnDescSmallBoldItalic, + }, + #[group] + InsnDescBoldMisc { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.21] + InsnDescBoldMisc0, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.399] + InsnDescBoldMisc1, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.763] + InsnDescBoldMisc2, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.946] + InsnDescBoldMisc3, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.949] + InsnDescBoldMisc4, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.999] + InsnDescBoldMisc5, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.065] + InsnDescBoldMisc6, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.086] + InsnDescBoldMisc7, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.183] + InsnDescBoldMisc8, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.686] + InsnDescBoldMisc9, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.744] + InsnDescBoldMisc10, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.825] + InsnDescBoldMisc11, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.842] + InsnDescBoldMisc12, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.857] + InsnDescBoldMisc13, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.979] + InsnDescBoldMisc14, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.032] + InsnDescBoldMisc15, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.112] + InsnDescBoldMisc16, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.161] + InsnDescBoldMisc17, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.206] + InsnDescBoldMisc18, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.353] + InsnDescBoldMisc19, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.378] + InsnDescBoldMisc20, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.434] + InsnDescBoldMisc21, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.595] + InsnDescBoldMisc22, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.619] + InsnDescBoldMisc23, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.647] + InsnDescBoldMisc24, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.68] + InsnDescBoldMisc25, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.693] + InsnDescBoldMisc26, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.736] + InsnDescBoldMisc27, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.781] + InsnDescBoldMisc28, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.802] + InsnDescBoldMisc29, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.995] + InsnDescBoldMisc30, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.201] + InsnDescBoldMisc31, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.258] + InsnDescBoldMisc32, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.363] + InsnDescBoldMisc33, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.442] + InsnDescBoldMisc34, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.473] + InsnDescBoldMisc35, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.485] + InsnDescBoldMisc36, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.512] + InsnDescBoldMisc37, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.543] + InsnDescBoldMisc38, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.613] + InsnDescBoldMisc39, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.744] + InsnDescBoldMisc40, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.774] + InsnDescBoldMisc41, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.809] + InsnDescBoldMisc42, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.849] + InsnDescBoldMisc43, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.911] + InsnDescBoldMisc44, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.92] + InsnDescBoldMisc45, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.962] + InsnDescBoldMisc46, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.981] + InsnDescBoldMisc47, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.146] + InsnDescBoldMisc48, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.213] + InsnDescBoldMisc49, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.221] + InsnDescBoldMisc50, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.243] + InsnDescBoldMisc51, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.55] + InsnDescBoldMisc52, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.62] + InsnDescBoldMisc53, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.699] + InsnDescBoldMisc54, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.725] + InsnDescBoldMisc55, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.751] + InsnDescBoldMisc56, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.856] + InsnDescBoldMisc57, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.029] + InsnDescBoldMisc58, + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.406] + InsnDescBoldMisc59, + }, + #[group] + InsnDescSubscript { + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.978] + InsnDescSubscript, + }, + #[group] + InsnDescBoldSubscript { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.978] + InsnDescBoldSubscript, + }, + #[group] + InsnDescItalicSubscript { + #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 5.978] + InsnDescItalicSubscript, + }, + #[group] + InsnDescBoldItalicSubscript { + #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 5.978] + InsnDescBoldItalicSubscript, + }, + #[group] + InsnExtMnemonic { + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 8.966] + InsnExtMnemonic, + }, + #[group] + InsnCode { + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 7.97] + InsnCode0, + #[name_with_tag = "RRFUNA+CMSY8", size = 7.97] + InsnCode1, + #[name_with_tag = "HPXOZC+CMSS8", size = 7.97] + InsnCode2, + }, + #[group] + InsnCodeSubscript { + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 5.978] + InsnCodeSubscript0, + #[name_with_tag = "DBQTKF+CMSY6", size = 5.978] + InsnCodeSubscript1, + }, + #[group] + TitlePageBig { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 24.787] + TitlePageBig, + }, + #[group] + TitlePageVersion { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 9.963] + TitlePageVersion, + }, + #[group] + TitlePageTm { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.974] + TitlePageTm, + }, + #[group] + TitlePageRev { + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.974] + TitlePageRev, + }, + #[group] + TitlePageBook { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 20.663] + TitlePageBook, + }, + #[group] + LegalPageItalic { + #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 9.963] + LegalPageItalic, + }, + #[group] + ChangeSummaryPageBold { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 11.955] + ChangeSummaryPageBold, + }, + #[group] + ChapterTitle { + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 17.215] + ChapterTitle, + }, + #[group] + MathMisc { + #[name_with_tag = "AAJMKT+CMMI6", size = 5.978] + MathMisc0, + #[name_with_tag = "CUTMFD+CMSSI8", size = 5.978] + MathMisc1, + #[name_with_tag = "CUTMFD+CMSSI8", size = 7.97] + MathMisc2, + #[name_with_tag = "FZTIYT+CMMI9", size = 5.734] + MathMisc3, + #[name_with_tag = "FZTIYT+CMMI9", size = 7.168] + MathMisc4, + #[name_with_tag = "HONFQS+CMMI8", size = 7.97] + MathMisc5, + #[name_with_tag = "HPXOZC+CMSS8", size = 5.978] + MathMisc6, + #[name_with_tag = "LLVRDD+CMSY10", size = 11.955] + MathMisc7, + #[name_with_tag = "ZJTMSG+CMSS9", size = 7.168] + MathMisc8, + }, + } +} + +impl Font { + const fn space_width(&self) -> f32 { + self.size() * const { 3.985 / Font::InsnCode0.size() } + } + const fn line_height_helper(&self) -> f32 { + let font_name = self.font_name(); + let mut i = 0; + while i < KnownFontGroup::INSN_CODE_FONT_GROUPS.len() { + let fonts = KnownFontGroup::INSN_CODE_FONT_GROUPS[i].fonts(); + let mut j = 0; + while j < fonts.len() { + if str_eq(font_name, fonts[j].font_name()) { + return 9.464 * self.size() / Font::InsnCode0.size(); + } + j += 1; + } + i += 1; + } + let group = self.known_font_group(); + if matches!(group, Some(KnownFontGroup::InsnDesc)) + || str_eq(font_name, Font::InsnDesc0.font_name()) + || str_eq(font_name, Font::InsnDescBold.font_name()) + || str_eq(font_name, Font::InsnDescItalic.font_name()) + || str_eq(font_name, Font::InsnDescBoldItalic.font_name()) + || matches!(group, Some(KnownFontGroup::MathMisc)) + { + return 10.959 * self.size() / Font::InsnDesc0.size(); + } + panic!("no line height") + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct Char { + font: Font, + text: String, + min_x: NonNaNF32, + min_y: NonNaNF32, + max_x: NonNaNF32, + max_y: NonNaNF32, +} + +impl Char { + #[allow(dead_code)] + fn width(&self) -> f32 { + self.max_x.get() - self.min_x.get() + } + #[allow(dead_code)] + fn height(&self) -> f32 { + self.max_y.get() - self.min_y.get() + } + fn top_down_left_to_right_sort_key(&self) -> impl Ord + use<> { + (-self.min_y, self.min_x) + } +} + +const COLUMN_SPLIT_X: f32 = 300.0; +const PAGE_BODY_MAX_X: f32 = 600.0; +const PAGE_BODY_MIN_X: f32 = 50.0; +const PAGE_BODY_MAX_Y: f32 = 780.0; +const PAGE_BODY_MIN_Y: f32 = 45.0; +const ONE_TITLE_LINE_SPLIT_Y: f32 = 734.0; +const TWO_TITLE_LINES_SPLIT_Y: f32 = 715.0; +const INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT: f32 = 29.938; +const INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT: f32 = 9.278; +const INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT: f32 = 20.971; +const INSN_BIT_FIELDS_TOP_PAD_HEIGHT: f32 = 20.175; +const INSN_BIT_FIELDS_TOP_PAD_HEIGHT2: f32 = 14.694; +const INSN_BIT_FIELDS_BOX_HEIGHT: f32 = 22.317; +#[allow(dead_code)] +const INSN_SP_REGS_ALTERED_REGISTER_COLUMN_X: f32 = 34.405; +const INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X: f32 = 86.692; +const INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X: f32 = 188.74; + +#[derive(Clone)] +struct ParsedTextLine { + element: xml_tree::Element, + regular_min_y: f32, + regular_max_y: f32, + fonts: TextLineFonts, + chars: Vec, + preceding_blank_lines: u32, +} + +impl ParsedTextLine { + #[allow(dead_code)] + fn regular_height(&self) -> f32 { + self.regular_max_y - self.regular_min_y + } + fn get_header_text(&self) -> Option { + assert_eq!(self.fonts, TextLineFonts::InsnDescFonts); + if !self.element.text.trim().is_empty() { + return None; + } + if !self.element.tail.trim().is_empty() { + return None; + } + let [b] = &*self.element.children else { + return None; + }; + if b.tag.normal() != Some("b") { + return None; + } + if b.children.len() != 0 { + return None; + } + let text = self.element.inner_text(); + // should also check titlecase, but rust doesn't include that in std + if text.ends_with(":") && text.chars().next().is_some_and(|ch| ch.is_uppercase()) { + Some(text) + } else { + None + } + } + fn write_xml(&self, parent: &mut xml_tree::Element, trailing_nl: bool) { + for _ in 0..self.preceding_blank_lines { + parent.sub_element("br".into(), []).tail = "\n".into(); + } + if let Some(last_child) = parent.children.last_mut() { + last_child.tail += &self.element.text; + } else { + parent.text += &self.element.text; + } + parent.children.extend_from_slice(&self.element.children); + if trailing_nl { + parent.sub_element("br".into(), []).tail = "\n".into(); + } + } + fn write_xml_lines( + lines: impl IntoIterator>, + parent: &mut xml_tree::Element, + trailing_nl: bool, + preceding_nl: bool, + ) { + if preceding_nl { + parent.sub_element("br".into(), []).tail = "\n".into(); + } + let mut first = true; + for line in lines { + let line = std::borrow::Borrow::borrow(&line); + if first { + first = false; + } else { + parent.sub_element("br".into(), []).tail = "\n".into(); + } + line.write_xml(parent, false); + } + if trailing_nl { + parent.sub_element("br".into(), []).tail = "\n".into(); + } + } +} + +impl fmt::Debug for ParsedTextLine { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + element, + regular_min_y, + regular_max_y, + fonts, + chars, + preceding_blank_lines, + } = self; + f.debug_struct("ParsedTextLine") + .field("element", &format_args!("{element}")) + .field("regular_min_y", regular_min_y) + .field("regular_max_y", regular_max_y) + .field("fonts", fonts) + .field("chars", chars) + .field("preceding_blank_lines", preceding_blank_lines) + .finish() + } +} + +impl fmt::Display for ParsedTextLine { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for _ in 0..self.preceding_blank_lines { + f.write_str("\n")?; + } + self.element.fmt(f) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +enum BaselinePos { + Above, + Below, +} + +macro_rules! make_enum_with_values { + ( + $(#[$enum_meta:meta])* + enum $Enum:ident { + $($Variant:ident,)* + } + ) => { + $(#[$enum_meta])* + enum $Enum { + $($Variant,)* + } + + impl $Enum { + const VALUES: &[Self] = &[$(Self::$Variant,)*]; + } + }; +} + +make_enum_with_values! { + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] + enum TextLineFonts { + InsnMnemonicFonts, + InsnHeaderFonts, + InsnBitFieldBitNumberFonts, + InsnBitFieldNameFonts, + InsnBitFieldsAffixTitleFonts, + InsnCodeFonts, + InsnDescFonts, + } +} + +impl TextLineFonts { + fn regular(self) -> &'static [Font] { + match self { + TextLineFonts::InsnMnemonicFonts => KnownFontGroup::InsnDesc.fonts(), + TextLineFonts::InsnHeaderFonts => &[Font::InsnHeader], + TextLineFonts::InsnBitFieldBitNumberFonts => &[Font::InsnDescSmall, Font::TitlePageRev], + TextLineFonts::InsnBitFieldNameFonts => KnownFontGroup::InsnDesc.fonts(), + TextLineFonts::InsnBitFieldsAffixTitleFonts => &[Font::InsnDescSmall], + TextLineFonts::InsnCodeFonts => KnownFontGroup::InsnCode.fonts(), + TextLineFonts::InsnDescFonts => { + static FONTS: OnceLock> = OnceLock::new(); + FONTS.get_or_init(|| { + Box::from_iter( + KnownFontGroup::InsnDesc + .fonts() + .iter() + .cloned() + .chain([Font::InsnDescSmall]), + ) + }) + } + } + } + fn italic(self) -> Option<&'static [Font]> { + match self { + TextLineFonts::InsnMnemonicFonts => None, + TextLineFonts::InsnHeaderFonts => None, + TextLineFonts::InsnBitFieldBitNumberFonts => None, + TextLineFonts::InsnBitFieldNameFonts => None, + TextLineFonts::InsnBitFieldsAffixTitleFonts => None, + TextLineFonts::InsnCodeFonts => None, + TextLineFonts::InsnDescFonts => { + Some(&[Font::InsnDescItalic, Font::InsnDescSmallItalic]) + } + } + } + fn bold(self) -> Option<&'static [Font]> { + match self { + TextLineFonts::InsnMnemonicFonts => None, + TextLineFonts::InsnHeaderFonts => None, + TextLineFonts::InsnBitFieldBitNumberFonts => None, + TextLineFonts::InsnBitFieldNameFonts => None, + TextLineFonts::InsnBitFieldsAffixTitleFonts => Some(&[Font::InsnDescSmallBold]), + TextLineFonts::InsnCodeFonts => None, + TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescBold, Font::InsnDescSmallBold]), + } + } + fn bold_italic(self) -> Option<&'static [Font]> { + match self { + TextLineFonts::InsnMnemonicFonts => None, + TextLineFonts::InsnHeaderFonts => None, + TextLineFonts::InsnBitFieldBitNumberFonts => None, + TextLineFonts::InsnBitFieldNameFonts => None, + TextLineFonts::InsnBitFieldsAffixTitleFonts => None, + TextLineFonts::InsnCodeFonts => None, + TextLineFonts::InsnDescFonts => { + Some(&[Font::InsnDescBoldItalic, Font::InsnDescSmallBoldItalic]) + } + } + } + fn subscript(self) -> Option<&'static [Font]> { + match self { + TextLineFonts::InsnMnemonicFonts => None, + TextLineFonts::InsnHeaderFonts => None, + TextLineFonts::InsnBitFieldBitNumberFonts => None, + TextLineFonts::InsnBitFieldNameFonts => Some(&[Font::InsnDescSubscript]), + TextLineFonts::InsnBitFieldsAffixTitleFonts => None, + TextLineFonts::InsnCodeFonts => Some(KnownFontGroup::InsnCodeSubscript.fonts()), + TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescSubscript]), + } + } + fn bold_subscript(self) -> Option<&'static [Font]> { + match self { + TextLineFonts::InsnMnemonicFonts => None, + TextLineFonts::InsnHeaderFonts => None, + TextLineFonts::InsnBitFieldBitNumberFonts => None, + TextLineFonts::InsnBitFieldNameFonts => None, + TextLineFonts::InsnBitFieldsAffixTitleFonts => None, + TextLineFonts::InsnCodeFonts => None, + TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescBoldSubscript]), + } + } + fn italic_subscript(self) -> Option<&'static [Font]> { + match self { + TextLineFonts::InsnMnemonicFonts => None, + TextLineFonts::InsnHeaderFonts => None, + TextLineFonts::InsnBitFieldBitNumberFonts => None, + TextLineFonts::InsnBitFieldNameFonts => None, + TextLineFonts::InsnBitFieldsAffixTitleFonts => None, + TextLineFonts::InsnCodeFonts => None, + TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescItalicSubscript]), + } + } + fn bold_italic_subscript(self) -> Option<&'static [Font]> { + match self { + TextLineFonts::InsnMnemonicFonts => None, + TextLineFonts::InsnHeaderFonts => None, + TextLineFonts::InsnBitFieldBitNumberFonts => None, + TextLineFonts::InsnBitFieldNameFonts => None, + TextLineFonts::InsnBitFieldsAffixTitleFonts => None, + TextLineFonts::InsnCodeFonts => None, + TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescBoldItalicSubscript]), + } + } + fn code(self) -> Option<&'static [Font]> { + match self { + TextLineFonts::InsnMnemonicFonts => None, + TextLineFonts::InsnHeaderFonts => None, + TextLineFonts::InsnBitFieldBitNumberFonts => None, + TextLineFonts::InsnBitFieldNameFonts => None, + TextLineFonts::InsnBitFieldsAffixTitleFonts => None, + TextLineFonts::InsnCodeFonts => None, + TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescCode, Font::InsnExtMnemonic]), + } + } + fn code_subscript(self) -> Option<&'static [Font]> { + match self { + TextLineFonts::InsnMnemonicFonts => None, + TextLineFonts::InsnHeaderFonts => None, + TextLineFonts::InsnBitFieldBitNumberFonts => None, + TextLineFonts::InsnBitFieldNameFonts => None, + TextLineFonts::InsnBitFieldsAffixTitleFonts => None, + TextLineFonts::InsnCodeFonts => None, + TextLineFonts::InsnDescFonts => Some(KnownFontGroup::InsnCodeSubscript.fonts()), + } + } + fn get_fonts( + self, + part_kind: TextLineFontKind, + ) -> Option<(&'static [Font], Option)> { + let fonts = match part_kind { + TextLineFontKind::Regular => self.regular(), + TextLineFontKind::Italic => self.italic()?, + TextLineFontKind::Bold => self.bold()?, + TextLineFontKind::BoldItalic => self.bold_italic()?, + TextLineFontKind::Subscript => self.subscript()?, + TextLineFontKind::Superscript => self.subscript()?, + TextLineFontKind::BoldSubscript => self.bold_subscript()?, + TextLineFontKind::BoldSuperscript => self.bold_subscript()?, + TextLineFontKind::ItalicSubscript => self.italic_subscript()?, + TextLineFontKind::ItalicSuperscript => self.italic_subscript()?, + TextLineFontKind::BoldItalicSubscript => self.bold_italic_subscript()?, + TextLineFontKind::BoldItalicSuperscript => self.bold_italic_subscript()?, + TextLineFontKind::Code => self.code()?, + TextLineFontKind::CodeSubscript => self.code_subscript()?, + TextLineFontKind::CodeSuperscript => self.code_subscript()?, + }; + Some((fonts, part_kind.sub_super().baseline_pos())) + } + fn font_to_kind_map(self) -> &'static HashMap<(Font, Option), TextLineFontKind> { + static MAPS: OnceLock< + HashMap), TextLineFontKind>>, + > = OnceLock::new(); + &MAPS.get_or_init(|| { + Self::VALUES + .iter() + .map(|&this: &TextLineFonts| { + let mut map = HashMap::new(); + for &kind in TextLineFontKind::VALUES { + let Some((fonts, baseline_pos)) = this.get_fonts(kind) else { + continue; + }; + for font in fonts { + let old_kind = map.insert((font.clone(), baseline_pos), kind); + assert!( + old_kind.is_none(), + "duplicate font: kind={kind:?} old_kind={old_kind:?} font={font:?}" + ); + } + } + (this, map) + }) + .collect() + })[&self] + } + fn fonts(self) -> &'static HashSet { + static SETS: OnceLock>> = OnceLock::new(); + &SETS.get_or_init(|| { + Self::VALUES + .iter() + .map(|&this: &TextLineFonts| { + let mut set = HashSet::new(); + for &kind in TextLineFontKind::VALUES { + let Some((fonts, _baseline_pos)) = this.get_fonts(kind) else { + continue; + }; + set.extend(fonts.iter().cloned()); + } + (this, set) + }) + .collect() + })[&self] + } + fn get_kind(self, font: Font, baseline_pos: BaselinePos) -> Option { + let font_to_kind_map = self.font_to_kind_map(); + font_to_kind_map + .get(&(font.clone(), Some(baseline_pos))) + .or_else(|| font_to_kind_map.get(&(font, None))) + .copied() + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +enum FontVariantCode { + Code, + NotCode, +} + +impl FontVariantCode { + const fn value(self) -> &'static [&'static str] { + match self { + Self::Code => &["code"], + Self::NotCode => &[], + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +enum FontVariantBold { + Bold, + NotBold, +} + +impl FontVariantBold { + const fn value(self) -> &'static [&'static str] { + match self { + Self::Bold => &["b"], + Self::NotBold => &[], + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +enum FontVariantItalic { + Italic, + NotItalic, +} + +impl FontVariantItalic { + const fn value(self) -> &'static [&'static str] { + match self { + Self::Italic => &["i"], + Self::NotItalic => &[], + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +enum FontVariantSubSuper { + NotSubSuper, + Subscript, + Superscript, +} + +impl FontVariantSubSuper { + const fn value(self) -> &'static [&'static str] { + match self { + Self::NotSubSuper => &[], + Self::Subscript => &["sub"], + Self::Superscript => &["sup"], + } + } +} + +impl FontVariantSubSuper { + fn baseline_pos(self) -> Option { + match self { + FontVariantSubSuper::NotSubSuper => None, + FontVariantSubSuper::Subscript => Some(BaselinePos::Below), + FontVariantSubSuper::Superscript => Some(BaselinePos::Above), + } + } +} + +make_enum_with_values! { + #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] + enum TextLineFontKind { + Regular, + Subscript, + Superscript, + Italic, + ItalicSubscript, + ItalicSuperscript, + Bold, + BoldSubscript, + BoldSuperscript, + BoldItalic, + BoldItalicSubscript, + BoldItalicSuperscript, + Code, + CodeSubscript, + CodeSuperscript, + } +} + +impl TextLineFontKind { + fn code(self) -> FontVariantCode { + match self { + Self::Regular + | Self::Subscript + | Self::Superscript + | Self::Italic + | Self::ItalicSubscript + | Self::ItalicSuperscript + | Self::Bold + | Self::BoldSubscript + | Self::BoldSuperscript + | Self::BoldItalic + | Self::BoldItalicSubscript + | Self::BoldItalicSuperscript => FontVariantCode::NotCode, + Self::Code | Self::CodeSubscript | Self::CodeSuperscript => FontVariantCode::Code, + } + } + fn bold(self) -> FontVariantBold { + match self { + Self::Regular + | Self::Subscript + | Self::Superscript + | Self::Italic + | Self::ItalicSubscript + | Self::ItalicSuperscript => FontVariantBold::NotBold, + Self::Bold + | Self::BoldSubscript + | Self::BoldSuperscript + | Self::BoldItalic + | Self::BoldItalicSubscript + | Self::BoldItalicSuperscript => FontVariantBold::Bold, + Self::Code | Self::CodeSubscript | Self::CodeSuperscript => FontVariantBold::NotBold, + } + } + fn italic(self) -> FontVariantItalic { + match self { + Self::Regular | Self::Subscript | Self::Superscript => FontVariantItalic::NotItalic, + Self::Italic | Self::ItalicSubscript | Self::ItalicSuperscript => { + FontVariantItalic::Italic + } + Self::Bold | Self::BoldSubscript | Self::BoldSuperscript => { + FontVariantItalic::NotItalic + } + Self::BoldItalic | Self::BoldItalicSubscript | Self::BoldItalicSuperscript => { + FontVariantItalic::Italic + } + Self::Code | Self::CodeSubscript | Self::CodeSuperscript => { + FontVariantItalic::NotItalic + } + } + } + fn sub_super(self) -> FontVariantSubSuper { + match self { + Self::Regular => FontVariantSubSuper::NotSubSuper, + Self::Subscript => FontVariantSubSuper::Subscript, + Self::Superscript => FontVariantSubSuper::Superscript, + Self::Italic => FontVariantSubSuper::NotSubSuper, + Self::ItalicSubscript => FontVariantSubSuper::Subscript, + Self::ItalicSuperscript => FontVariantSubSuper::Superscript, + Self::Bold => FontVariantSubSuper::NotSubSuper, + Self::BoldSubscript => FontVariantSubSuper::Subscript, + Self::BoldSuperscript => FontVariantSubSuper::Superscript, + Self::BoldItalic => FontVariantSubSuper::NotSubSuper, + Self::BoldItalicSubscript => FontVariantSubSuper::Subscript, + Self::BoldItalicSuperscript => FontVariantSubSuper::Superscript, + Self::Code => FontVariantSubSuper::NotSubSuper, + Self::CodeSubscript => FontVariantSubSuper::Subscript, + Self::CodeSuperscript => FontVariantSubSuper::Superscript, + } + } + fn text_line_tags(self) -> impl Clone + Iterator { + self.code() + .value() + .iter() + .copied() + .chain(self.bold().value().iter().copied()) + .chain(self.italic().value().iter().copied()) + .chain(self.sub_super().value().iter().copied()) + } +} + +#[derive(Debug)] +struct ElementBodyBuilder<'a> { + containing_element: &'a mut xml_tree::Element, + stack: Vec, +} + +impl<'a> ElementBodyBuilder<'a> { + fn new(containing_element: &'a mut xml_tree::Element) -> Self { + Self { + containing_element, + stack: Vec::with_capacity(5), + } + } + fn shrink_stack(&mut self, new_len: usize) { + while new_len < self.stack.len() { + let Some(element) = self.stack.pop() else { + unreachable!(); + }; + self.insert_point().children.push(element); + } + } + fn set_tag_stack<'b>(&mut self, tag_stack: impl IntoIterator) { + let mut new_len = 0; + for (i, tag) in tag_stack.into_iter().enumerate() { + new_len = i + 1; + if i >= self.stack.len() { + self.stack.push(xml_tree::Element::new(tag.into(), [])); + } else if self.stack[i].tag.normal() != Some(tag) { + self.shrink_stack(new_len); + } + } + self.shrink_stack(new_len); + } + fn write_text(&mut self, text: impl std::borrow::Borrow) { + let text = std::borrow::Borrow::borrow(&text); + let insert_point = self.insert_point(); + if let Some(child) = insert_point.children.last_mut() { + child.tail += text; + } else { + insert_point.text += text; + } + } + fn insert_point(&mut self) -> &mut xml_tree::Element { + self.stack.last_mut().unwrap_or(self.containing_element) + } + fn scope(&mut self, f: impl FnOnce(&mut Self) -> R) -> R { + let retval = f(self); + self.flush(); + retval + } + fn flush(&mut self) { + self.set_tag_stack([]); + } +} + +#[derive(Clone, Debug)] +struct InsnBitField { + box_min_x: f32, + box_max_x: f32, + name: ParsedTextLine, + bit_number: ParsedTextLine, +} + +impl fmt::Display for InsnBitField { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + box_min_x, + box_max_x, + name, + bit_number, + } = self; + write!( + f, + "" + ) + } +} + +impl InsnBitField { + fn write_xml(&self, parent: &mut xml_tree::Element) { + let field = parent.sub_element("field".into(), []); + field.text = "\n".into(); + field.tail = "\n".into(); + let name = field.sub_element("name".into(), []); + name.tail = "\n".into(); + self.name.write_xml(name, false); + let bit_number = field.sub_element("bit-number".into(), []); + bit_number.tail = "\n".into(); + self.bit_number.write_xml(bit_number, false); + } +} + +#[derive(Clone, Debug)] +struct InsnBitFieldsPrefix { + box_min_x: f32, + box_min_y: f32, + box_max_x: f32, + box_max_y: f32, + prefix_text: ParsedTextLine, + fields: Vec, + suffix_text: ParsedTextLine, +} + +impl fmt::Display for InsnBitFieldsPrefix { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + box_min_x, + box_min_y, + box_max_x, + box_max_y, + prefix_text, + fields, + suffix_text, + } = self; + writeln!( + f, + "") + } +} + +impl InsnBitFieldsPrefix { + fn write_xml(&self, parent: &mut xml_tree::Element) { + let prefix_elm = parent.sub_element("prefix".into(), []); + prefix_elm.text = "\n".into(); + prefix_elm.tail = "\n".into(); + let prefix_text = prefix_elm.sub_element("prefix-text".into(), []); + prefix_text.tail = "\n".into(); + self.prefix_text.write_xml(prefix_text, false); + InsnBitFields::write_xml_fields(&self.fields, prefix_elm); + let suffix_text = prefix_elm.sub_element("suffix-text".into(), []); + suffix_text.tail = "\n".into(); + self.suffix_text.write_xml(suffix_text, false); + } +} + +#[derive(Clone, Debug)] +struct InsnBitFields { + prefix: Option, + box_min_x: f32, + box_min_y: f32, + box_max_x: f32, + box_max_y: f32, + fields: Vec, +} + +impl fmt::Display for InsnBitFields { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + prefix, + box_min_x, + box_min_y, + box_max_x, + box_max_y, + fields, + } = self; + if let Some(prefix) = prefix { + prefix.fmt(f)?; + } + writeln!( + f, + "") + } +} + +impl InsnBitFields { + fn write_xml_fields( + fields: impl IntoIterator>, + parent: &mut xml_tree::Element, + ) { + let fields_elm = parent.sub_element("fields".into(), []); + fields_elm.text = "\n".into(); + fields_elm.tail = "\n".into(); + for field in fields { + std::borrow::Borrow::borrow(&field).write_xml(fields_elm); + } + } + fn write_xml(&self, parent: &mut xml_tree::Element) { + let bit_fields = parent.sub_element("bit-fields".into(), []); + bit_fields.text = "\n".into(); + bit_fields.tail = "\n".into(); + if let Some(prefix) = &self.prefix { + prefix.write_xml(bit_fields); + } + Self::write_xml_fields(&self.fields, bit_fields) + } +} + +#[derive(Clone, Debug)] +struct InsnSpRegsAlteredEntry { + reg: ParsedTextLine, + fields: Vec, + conds: Vec, +} + +impl InsnSpRegsAlteredEntry { + fn display_fmt_with_indent(&self, f: &mut fmt::Formatter<'_>, indent: &str) -> fmt::Result { + let Self { reg, fields, conds } = self; + writeln!(f, "Entry(")?; + writeln!(f, "{indent} reg={reg},")?; + write!(f, "{indent} fields=")?; + if fields.is_empty() { + write!(f, "()")?; + } else { + writeln!(f, "(")?; + for field in fields { + writeln!(f, "{indent} {field},")?; + } + write!(f, "{indent} )")?; + } + writeln!(f, ",")?; + writeln!(f, "{indent} conds=")?; + if conds.is_empty() { + write!(f, "()")?; + } else { + writeln!(f, "(")?; + for cond in conds { + writeln!(f, "{indent} {cond},")?; + } + write!(f, "{indent} )")?; + } + writeln!(f, ",")?; + write!(f, "{indent})") + } + fn write_xml(&self, parent: &mut xml_tree::Element) { + let entry = parent.sub_element("entry".into(), []); + entry.text = "\n".into(); + entry.tail = "\n".into(); + let reg = entry.sub_element("register".into(), []); + reg.tail = "\n".into(); + self.reg.write_xml(reg, false); + let fields = entry.sub_element("fields".into(), []); + fields.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.fields, fields, false, false); + let conds = entry.sub_element("conditions".into(), []); + conds.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.conds, conds, false, false); + } +} + +impl fmt::Display for InsnSpRegsAlteredEntry { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.display_fmt_with_indent(f, "") + } +} + +#[derive(Clone, Debug)] +struct InsnSpRegsAltered { + sp_regs_altered_text: ParsedTextLine, + special_text: Option, + table_header_reg: Option, + table_header_fields: Option, + entries: Vec, + final_regular_min_y: f32, +} + +impl fmt::Display for InsnSpRegsAltered { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + sp_regs_altered_text, + special_text, + table_header_reg, + table_header_fields, + entries, + final_regular_min_y, + } = self; + writeln!(f, "InsnSpRegsAltered(")?; + writeln!(f, " sp_regs_altered_text={sp_regs_altered_text},")?; + if let Some(special_text) = special_text { + writeln!(f, " special_text={special_text},")?; + } + if let Some(table_header_reg) = table_header_reg { + writeln!(f, " table_header_reg={table_header_reg},")?; + } + if let Some(table_header_fields) = table_header_fields { + writeln!(f, " table_header_fields={table_header_fields},")?; + } + if self.entries.is_empty() { + writeln!(f, " entries=(),")?; + } else { + writeln!(f, " entries=(")?; + for entry in entries { + write!(f, " ")?; + entry.display_fmt_with_indent(f, " ")?; + writeln!(f, ",")?; + } + writeln!(f, " ),")?; + } + writeln!(f, " final_regular_min_y={final_regular_min_y},")?; + write!(f, ")") + } +} + +impl InsnSpRegsAltered { + fn write_xml(&self, parent: &mut xml_tree::Element) { + let sp_regs_altered = parent.sub_element("special-registers-altered".into(), []); + sp_regs_altered.text = "\n".into(); + sp_regs_altered.tail = "\n".into(); + let title = sp_regs_altered.sub_element("title".into(), []); + title.tail = "\n".into(); + self.sp_regs_altered_text.write_xml(title, false); + if let Some(special_text) = &self.special_text { + let special_text_el = sp_regs_altered.sub_element("special-text".into(), []); + special_text_el.tail = "\n".into(); + special_text.write_xml(special_text_el, false); + } + if let Some(table_header_reg) = &self.table_header_reg { + let table_header_reg_el = + sp_regs_altered.sub_element("table-header-register".into(), []); + table_header_reg_el.tail = "\n".into(); + table_header_reg.write_xml(table_header_reg_el, false); + } + if let Some(table_header_fields) = &self.table_header_fields { + let table_header_fields_el = + sp_regs_altered.sub_element("table-header-fields".into(), []); + table_header_fields_el.tail = "\n".into(); + table_header_fields.write_xml(table_header_fields_el, false); + } + for entry in &self.entries { + entry.write_xml(sp_regs_altered); + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +enum InsnParseSection { + Code, + Header, + Desc, +} + +#[derive(Clone, Debug)] +enum PageItem { + Char(Char), + LineOrRect(LineOrRect), +} + +#[derive(Copy, Clone, Debug)] +enum LineOrRect { + Line(Line), + Rect(Rect), +} + +impl LineOrRect { + fn width(self) -> f32 { + match self { + Self::Line(v) => v.width(), + Self::Rect(v) => v.width(), + } + } + #[allow(dead_code)] + fn height(self) -> f32 { + match self { + Self::Line(v) => v.height(), + Self::Rect(v) => v.height(), + } + } + fn min_x(self) -> NonNaNF32 { + match self { + Self::Line(v) => v.min_x(), + Self::Rect(v) => v.min_x, + } + } + fn max_x(self) -> NonNaNF32 { + match self { + Self::Line(v) => v.max_x(), + Self::Rect(v) => v.max_x, + } + } + fn min_y(self) -> NonNaNF32 { + match self { + Self::Line(v) => v.min_y(), + Self::Rect(v) => v.min_y, + } + } + fn max_y(self) -> NonNaNF32 { + match self { + Self::Line(v) => v.max_y(), + Self::Rect(v) => v.max_y, + } + } +} + +#[derive(Copy, Clone, Debug)] +struct Line { + p0_x: NonNaNF32, + p0_y: NonNaNF32, + p1_x: NonNaNF32, + p1_y: NonNaNF32, +} + +impl Line { + fn width(self) -> f32 { + f32::abs(self.p0_x.get() - self.p1_x.get()) + } + fn height(self) -> f32 { + f32::abs(self.p0_y.get() - self.p1_y.get()) + } + fn min_x(self) -> NonNaNF32 { + self.p0_x.min(self.p1_x) + } + fn max_x(self) -> NonNaNF32 { + self.p0_x.max(self.p1_x) + } + fn min_y(self) -> NonNaNF32 { + self.p0_y.min(self.p1_y) + } + fn max_y(self) -> NonNaNF32 { + self.p0_y.max(self.p1_y) + } +} + +#[derive(Copy, Clone, Debug)] +struct Rect { + min_x: NonNaNF32, + max_x: NonNaNF32, + min_y: NonNaNF32, + max_y: NonNaNF32, +} + +impl Rect { + fn width(self) -> f32 { + self.max_x.get() - self.min_x.get() + } + fn height(self) -> f32 { + self.max_y.get() - self.min_y.get() + } +} + +#[derive(Debug)] +struct Page { + page_num: u32, + qt: BTreeMap>, + unprocessed_chars: + Rc>>>>>>, + #[allow(dead_code)] + unprocessed_non_text: Rc>>, +} + +struct Pages<'ctx> { + pages_gen: Option> + 'ctx>>, + pages: BTreeMap>, + max_page_num: u32, +} + +impl<'ctx> fmt::Debug for Pages<'ctx> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + pages_gen, + pages, + max_page_num, + } = self; + f.debug_struct("Pages") + .field( + "pages_gen", + &pages_gen.is_some().then_some(format_args!("...")), + ) + .field("pages", pages) + .field("max_page_num", max_page_num) + .finish() + } +} + +impl<'ctx> Pages<'ctx> { + fn new(pages_gen: Option> + 'ctx>>) -> Self { + Self { + pages_gen, + pages: BTreeMap::new(), + max_page_num: 0, + } + } + fn close(&mut self) { + self.pages_gen = None; + } + fn is_past_end(&mut self, page_num: u32) -> Result { + while self.pages_gen.is_some() && page_num > self.max_page_num { + self.fill_page()?; + } + Ok(page_num > self.max_page_num) + } + fn fill_page(&mut self) -> Result { + let Some(pages_gen) = &mut self.pages_gen else { + return Ok(false); + }; + let page = pages_gen.next(); + let Some(page) = page else { + self.close(); + return Ok(false); + }; + let page = page?; + let page_num = page.page_num; + assert!( + page_num > self.max_page_num, + "page numbers must be a strictly-increasing positive integer sequence:\n\ + got {page_num} which isn't more than {}", + self.max_page_num + ); + self.pages.insert(page_num, Rc::new(page)); + self.max_page_num = page_num; + Ok(true) + } + fn get(&mut self, page_num: u32) -> Result>, Error> { + loop { + if let Some(page) = self.pages.get(&page_num) { + return Ok(Some(page.clone())); + } + if self.pages_gen.is_none() { + return Ok(None); + } + if page_num < self.max_page_num { + return Ok(None); + } + self.fill_page()?; + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct TextSection { + page_num: u32, + min_x: NonNaNF32, + min_y: NonNaNF32, + max_x: NonNaNF32, + max_y: NonNaNF32, +} + +struct TextSectionPagesData { + columns_then_full_page: BTreeMap, + full_page_then_columns: BTreeMap, + one_title_line_then_columns_then_full_page: BTreeMap, + two_title_lines_then_columns_then_full_page: BTreeMap, + columns_then_columns: BTreeMap, + one_title_line_then_columns_then_columns: BTreeMap, + one_title_line_then_columns: BTreeSet, + two_title_lines_then_columns: BTreeSet, + full_page: BTreeSet, +} + +impl TextSectionPagesData { + fn get() -> &'static Self { + static DATA: OnceLock = OnceLock::new(); + DATA.get_or_init(|| Self { + columns_then_full_page: FromIterator::from_iter([ + (129, 438.992), + (241, 512.419), + (242, 408.077), + (243, 488.509), + (244, 437.518), + (245, 444.522), + (247, 352.082), + (248, 356.723), + (249, 365.944), + (251, 334.553), + (264, 184.67), + (296, 267.29), + (297, 200.043), + (298, 440.64), + (299, 197.356), + (300, 160.076), + (301, 364.924), + (303, 330.055), + (305, 344.867), + (306, 335.403), + (307, 336.897), + (308, 365.233), + (309, 364.735), + ]), + full_page_then_columns: FromIterator::from_iter([ + (246, 689.039), + (250, 615.315), + (266, 678.088), + ]), + one_title_line_then_columns_then_full_page: FromIterator::from_iter([(128, 301.55)]), + two_title_lines_then_columns_then_full_page: FromIterator::from_iter([(304, 242.732)]), + columns_then_columns: FromIterator::from_iter([(79, 621.66), (126, 519.89)]), + one_title_line_then_columns_then_columns: FromIterator::from_iter([ + (130, 550.43), + (162, 599.247), + (194, 622.161), + (196, 682.933), + (204, 613.195), + (215, 633.12), + ]), + one_title_line_then_columns: FromIterator::from_iter([ + 103, 104, 105, 121, 139, 143, 146, 151, 153, 158, 207, 213, 218, + ]), + two_title_lines_then_columns: FromIterator::from_iter([198, 206]), + full_page: FromIterator::from_iter( + [ + 118, 157, 252, 254, 255, 257, 259, 260, 265, 268, 270, 271, 272, + ] + .into_iter() + .chain(274..286), + ), + }) + } +} + +impl TextSection { + fn first() -> TextSection { + Self::page_sections(1)[0] + } + + fn next(self) -> TextSection { + let page_sections = Self::page_sections(self.page_num); + let Some(index) = page_sections.iter().position(|v| *v == self) else { + panic!("not a known TextSection: {self:?}"); + }; + if let Some(&retval) = page_sections.get(index + 1) { + return retval; + } + for page_num in self.page_num + 1..self.page_num + 100000 { + let page_sections = Self::page_sections(page_num); + if let Some(&retval) = page_sections.get(0) { + return retval; + } + } + panic!("can't find next TextSection after {self:?}") + } + + fn new(page_num: u32, min_x: f32, min_y: f32, max_x: f32, max_y: f32) -> Self { + Self { + page_num, + min_x: NonNaNF32::new(min_x).expect("invalid min_x"), + min_y: NonNaNF32::new(min_y).expect("invalid min_y"), + max_x: NonNaNF32::new(max_x).expect("invalid max_x"), + max_y: NonNaNF32::new(max_y).expect("invalid max_y"), + } + } + + fn left_column(page_num: u32, min_y: f32, max_y: f32) -> TextSection { + Self::new(page_num, PAGE_BODY_MIN_X, min_y, COLUMN_SPLIT_X, max_y) + } + + fn right_column(page_num: u32, min_y: f32, max_y: f32) -> TextSection { + Self::new(page_num, COLUMN_SPLIT_X, min_y, PAGE_BODY_MAX_X, max_y) + } + + fn columns(page_num: u32, min_y: f32, max_y: f32) -> [TextSection; 2] { + [ + Self::left_column(page_num, min_y, max_y), + Self::right_column(page_num, min_y, max_y), + ] + } + + fn full_page(page_num: u32, min_y: f32, max_y: f32) -> TextSection { + Self::new(page_num, PAGE_BODY_MIN_X, min_y, PAGE_BODY_MAX_X, max_y) + } + + fn page_sections_helper(page_num: u32) -> Box<[TextSection]> { + let TextSectionPagesData { + columns_then_full_page, + full_page_then_columns, + one_title_line_then_columns_then_full_page, + two_title_lines_then_columns_then_full_page, + columns_then_columns, + one_title_line_then_columns_then_columns, + one_title_line_then_columns, + two_title_lines_then_columns, + full_page, + } = TextSectionPagesData::get(); + if let Some(split_y) = columns_then_columns.get(&page_num) { + return Box::from_iter( + Self::columns(page_num, *split_y, PAGE_BODY_MAX_Y) + .into_iter() + .chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)), + ); + } + if one_title_line_then_columns.contains(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + ONE_TITLE_LINE_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns( + page_num, + PAGE_BODY_MIN_Y, + ONE_TITLE_LINE_SPLIT_Y, + )), + ); + } + if full_page.contains(&page_num) { + return Box::new([Self::full_page(page_num, PAGE_BODY_MIN_Y, PAGE_BODY_MAX_Y)]); + } + if let Some(split_y) = one_title_line_then_columns_then_columns.get(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + ONE_TITLE_LINE_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns(page_num, *split_y, ONE_TITLE_LINE_SPLIT_Y)) + .chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)), + ); + } + if two_title_lines_then_columns.contains(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + TWO_TITLE_LINES_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns( + page_num, + PAGE_BODY_MIN_Y, + TWO_TITLE_LINES_SPLIT_Y, + )), + ); + } + if let Some(split_y) = columns_then_full_page.get(&page_num) { + return Box::from_iter( + Self::columns(page_num, *split_y, PAGE_BODY_MAX_Y) + .into_iter() + .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]), + ); + } + if let Some(split_y) = full_page_then_columns.get(&page_num) { + return Box::from_iter( + [Self::full_page(page_num, *split_y, PAGE_BODY_MAX_Y)] + .into_iter() + .chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)), + ); + } + if let Some(split_y) = one_title_line_then_columns_then_full_page.get(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + ONE_TITLE_LINE_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns(page_num, *split_y, ONE_TITLE_LINE_SPLIT_Y)) + .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]), + ); + } + if let Some(split_y) = two_title_lines_then_columns_then_full_page.get(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + TWO_TITLE_LINES_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns(page_num, *split_y, TWO_TITLE_LINES_SPLIT_Y)) + .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]), + ); + } + if page_num == 263 { + return Box::from_iter( + [Self::full_page(page_num, 699.997, PAGE_BODY_MAX_Y)] + .into_iter() + .chain(Self::columns(page_num, 366.396, 699.997)) + .chain(Self::columns(page_num, 207.0, 366.396)) + .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, 207.0)]), + ); + } + // TODO: checked up to page 309 (page named 273) + Box::new(Self::columns(page_num, PAGE_BODY_MIN_Y, PAGE_BODY_MAX_Y)) + } + fn page_sections(page_num: u32) -> &'static [TextSection] { + static CACHE: [OnceLock>; 2000] = [const { OnceLock::new() }; _]; + CACHE + .get(page_num as usize) + .expect("page_num out of range") + .get_or_init(|| Self::page_sections_helper(page_num)) + } + fn for_position(page_num: u32, x: f32, y: f32) -> Option { + for &i in Self::page_sections(page_num) { + if i.min_x.get() <= x && x <= i.max_x.get() && i.min_y.get() <= y && y <= i.max_y.get() + { + return Some(i); + } + } + None + } +} + +#[derive(Debug, Clone)] +struct InsnHeader { + header_lines: Vec, + mnemonic_lines: Vec, + bit_fields: InsnBitFields, +} + +impl InsnHeader { + fn min_y(&self) -> f32 { + self.bit_fields.box_min_y + } + fn write_xml(&self, parent: &mut xml_tree::Element) { + let header = parent.sub_element("header".into(), []); + header.text = "\n".into(); + header.tail = "\n".into(); + let title = header.sub_element("title".into(), []); + title.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.header_lines, title, false, false); + let mnemonics = header.sub_element("mnemonics".into(), []); + mnemonics.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.mnemonic_lines, mnemonics, false, false); + self.bit_fields.write_xml(header); + } +} + +#[derive(Debug, Clone)] +struct Insn { + headers: Vec, + code_lines: Vec, + desc_lines: Vec, + sp_regs_altered: Option, +} + +impl Insn { + fn write_xml(&self, parent: &mut xml_tree::Element) { + let insn = parent.sub_element("instruction".into(), []); + insn.text = "\n".into(); + insn.tail = "\n".into(); + for header in &self.headers { + header.write_xml(insn); + } + if !self.code_lines.is_empty() { + let code = insn.sub_element("code".into(), []); + code.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.code_lines, code, false, false); + } + if !self.desc_lines.is_empty() { + let desc = insn.sub_element("description".into(), []); + desc.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.desc_lines, desc, false, false); + } + if let Some(sp_regs_altered) = &self.sp_regs_altered { + sp_regs_altered.write_xml(insn); + } + } +} + +#[derive(Debug)] +struct Parser<'ctx> { + pages: Pages<'ctx>, + text_section: TextSection, + insns: Vec, +} + +#[derive(Debug)] +pub struct Error(String, Backtrace); + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0)?; + f.write_str("\n")?; + fmt::Display::fmt(&self.1, f) + } +} + +impl std::error::Error for Error {} + +trait IntoError: fmt::Display {} + +impl From for Error { + fn from(value: T) -> Self { + Error(value.to_string(), Backtrace::capture()) + } +} + +impl IntoError for &'_ str {} +impl IntoError for String {} +impl IntoError for MuPdfError {} +impl IntoError for std::ffi::NulError {} +impl IntoError for std::num::ParseIntError {} +impl IntoError for std::io::Error {} +impl IntoError for ErrorWithNote {} + +enum ExtractInsnsError { + InsnParseError(String, Backtrace), + PageParseError(String, Backtrace), + Other(Error), +} + +impl fmt::Display for ExtractInsnsError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let backtrace = match self { + ExtractInsnsError::InsnParseError(msg, backtrace) => { + writeln!(f, "instruction parse error: {msg}")?; + backtrace + } + ExtractInsnsError::PageParseError(msg, backtrace) => { + writeln!(f, "page parse error: {msg}")?; + backtrace + } + ExtractInsnsError::Other(e) => return fmt::Display::fmt(&e, f), + }; + backtrace.fmt(f) + } +} + +#[derive(Clone, Debug)] +struct ErrorWithNote { + error: E, + note: String, +} + +impl fmt::Display for ErrorWithNote { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { error, note } = self; + fmt::Display::fmt(error, f)?; + write!(f, "\nnote: {note}") + } +} + +impl std::error::Error for ErrorWithNote {} + +impl<'ctx> Parser<'ctx> { + fn new() -> Self { + Self { + pages: Pages::new(None), + text_section: TextSection::first(), + insns: Vec::new(), + } + } + fn page(&mut self) -> Result, Error> { + Ok(self + .pages + .get(self.text_section.page_num)? + .ok_or("page_num is out of range")?) + } + fn unprocessed_chars(&mut self) -> Result>>>, Error> { + Ok(self + .page()? + .unprocessed_chars + .borrow_mut() + .entry(self.text_section) + .or_default() + .clone()) + } + fn pages_gen( + ctx: impl Into>, + file: &str, + page_numbers: Option>>, + dump_mupdf_page_xml: bool, + ) -> Result> + 'ctx>, Error> { + let ctx = ctx.into(); + let page_indexes = page_numbers.map(|page_numbers| { + let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1)); + retval.sort(); + retval + }); + let document = mupdf_ffi::Document::open(ctx, &std::ffi::CString::new(file)?)?; + let page_count = document.page_count()?; + let page_indexes = page_indexes.unwrap_or_else(|| (0..page_count).collect()); + let mut first_seen_fonts = BTreeMap::new(); + Ok(Box::new(page_indexes.into_iter().map(move |page_index| { + let page_num = page_index as u32 + 1; + println!("page {page_num}"); + let page = document + .load_page(page_index) + .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?; + Ok( + Page::from_mupdf_page(page_num, &page, &mut first_seen_fonts, dump_mupdf_page_xml) + .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?, + ) + }))) + } + fn parse_pdf>>( + &mut self, + ctx: impl Into>, + file: &str, + page_numbers: Option, + dump_mupdf_page_xml: bool, + ) -> Result<(), Error> { + self.pages = Pages::new(Some(Self::pages_gen( + ctx, + file, + page_numbers.map(|v| v.into_iter().collect()), + dump_mupdf_page_xml, + )?)); + self.text_section = TextSection::first(); + loop { + self.text_section = self.text_section.next(); + if self.pages.is_past_end(self.text_section.page_num)? { + return Ok(()); + } + if self.pages.get(self.text_section.page_num)?.is_some() { + println!("section {:?}", self.text_section); + self.note_text_section(Self::parse_text_section)?; + } + } + } + fn note_text_section( + &mut self, + f: impl FnOnce(&mut Self) -> Result<(), E>, + ) -> Result<(), ErrorWithNote> { + let start_text_section = self.text_section; + match f(self) { + Ok(()) => Ok(()), + Err(error) => { + let note = if self.text_section == start_text_section { + format!("text_section={:?}", self.text_section) + } else { + format!( + "start_text_section={start_text_section:?}\ntext_section={:?}", + self.text_section + ) + }; + Err(ErrorWithNote { error, note }) + } + } + } + fn parse_text_section(&mut self) -> Result<(), ErrorWithNote> { + match self.note_text_section(Self::extract_insns) { + Ok(()) => Ok(()), + Err( + e @ ErrorWithNote { + error: + ExtractInsnsError::InsnParseError(..) | ExtractInsnsError::PageParseError(..), + .. + }, + ) => { + println!("{e}"); + Ok(()) + } + Err(ErrorWithNote { + error: ExtractInsnsError::Other(error), + note, + }) => Err(ErrorWithNote { error, note }), + } + } + fn find_top_left_char_in_range( + &mut self, + min_x: f32, + max_x: f32, + min_y: f32, + max_y: f32, + allow_processed: bool, + ) -> Result, Error> { + let mut retval = None; + let page = self.page()?; + let unprocessed_chars = self.unprocessed_chars()?; + let ControlFlow::::Continue(()) = + page.qt[&self.text_section].range(min_x, max_x, min_y, max_y, |_x, _y, ch| { + let PageItem::Char(ch) = ch else { + return ControlFlow::Continue(()); + }; + if !allow_processed && !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch) + { + return ControlFlow::Continue(()); + } + match &mut retval { + None => retval = Some(ch.clone()), + Some(retval) + if ch.min_x.get() - ch.min_y.get() + < retval.min_x.get() - retval.min_y.get() => + { + *retval = ch.clone(); + } + Some(_) => {} + } + ControlFlow::Continue(()) + }); + Ok(retval) + } + fn extract_text_line( + &mut self, + start_char: Option, + mut start_min_y: f32, + min_x: f32, + max_x: f32, + fonts: TextLineFonts, + preceding_blank_lines: u32, + mut skip_initial_spaces: bool, + allowed_start_min_y_error: Option, + ) -> Result, ExtractInsnsError> { + let mut chars: Vec = Vec::new(); + let mut chars_set: IndexSet = IndexSet::new(); + if let Some(start_char) = start_char.clone() { + chars.push(start_char.clone()); + chars_set.insert(start_char); + } + if let Some(start_char) = start_char + && start_char.text == "*" + && self.text_section.page_num == 168 + && fonts + .subscript() + .is_some_and(|v| v.contains(&start_char.font)) + { + start_min_y = start_char.max_y.get() - fonts.regular()[0].size(); + } + let page = self.page().map_err(ExtractInsnsError::Other)?; + let unprocessed_chars = self.unprocessed_chars().map_err(ExtractInsnsError::Other)?; + let ControlFlow::::Continue(()) = page.qt[&self.text_section].range( + min_x - fonts.regular()[0].size() * 0.5, + max_x, + start_min_y - fonts.regular()[0].size() * 0.4, + start_min_y + fonts.regular()[0].size() * 0.6, + |_x, _y, ch| { + let PageItem::Char(ch) = ch else { + return ControlFlow::Continue(()); + }; + if !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch) + || chars_set.contains(ch) + { + return ControlFlow::Continue(()); + } + chars_set.insert(ch.clone()); + chars.push(ch.clone()); + ControlFlow::Continue(()) + }, + ); + if chars.is_empty() { + return Ok(None); + } + chars.sort_by(|a, b| (a.min_x, &a.text).cmp(&(b.min_x, &b.text))); + let mut regular_min_y = chars[0].min_y.get(); + let mut regular_max_y = chars[0].max_y.get(); + for ch in &chars { + let Some(kind) = fonts.get_kind(ch.font.clone(), BaselinePos::Below) else { + continue; + }; + if kind.sub_super() == FontVariantSubSuper::NotSubSuper { + regular_min_y = ch.min_y.get(); + regular_max_y = ch.max_y.get(); + break; + } + } + let mut retval = ParsedTextLine { + element: xml_tree::Element::new("text-line".into(), []), + regular_min_y, + regular_max_y, + fonts, + chars, + preceding_blank_lines, + }; + let mut text_and_tag_stacks: Vec<(String, Vec<&str>)> = Vec::new(); + let mut last_max_x = min_x; + let mut last_kind = None; + let mut last_char: Option = None; + for ch in &retval.chars { + let baseline_pos = if (ch.max_y.get() + ch.min_y.get()) * 0.5 + > (retval.regular_max_y + retval.regular_min_y) * 0.5 + { + BaselinePos::Above + } else { + BaselinePos::Below + }; + let Some(kind) = fonts.get_kind(ch.font.clone(), baseline_pos) else { + println!( + "font kind is None:\n\ + regular_min_y={}\n\ + fonts={fonts:?}\n\ + ch={ch:?}\n\ + baseline_pos={baseline_pos:?}\n\ + chars[0]={:?}", + retval.regular_min_y, retval.chars[0], + ); + return Ok(None); + }; + let space_kind = match last_kind { + None => kind, + Some(last_kind) if last_kind != kind => TextLineFontKind::Regular, + _ => kind, + }; + let (space_fonts, _) = fonts + .get_fonts(space_kind) + .unwrap_or((fonts.regular(), None)); + let space_width = ch.min_x.get() - last_max_x; + let space_count_f = space_width / space_fonts[0].space_width(); + let mut space_count = space_count_f.round() as usize; + if space_count == 0 && space_count_f > 0.35 { + space_count = 1 + } + if space_count_f > 0.25 && f32::abs(space_count as f32 - space_count_f) > 0.15 { + println!("spaces: space_count_f={space_count_f} space_width={space_width}"); + } + if space_count > 0 && !skip_initial_spaces { + text_and_tag_stacks.push(( + " ".repeat(space_count), + space_kind.text_line_tags().collect(), + )); + } + skip_initial_spaces = false; + if ch.text == "\u{0338}" + && let Some(last_char) = last_char + && last_char.text == "=" + && f32::abs(ch.min_x.get() - last_char.min_x.get()) < 0.01 + && f32::abs(ch.min_y.get() - last_char.min_y.get()) < 0.01 + { + *text_and_tag_stacks + .last_mut() + .expect("known to be non-empty") = ("\u{2260}".into(), Vec::new()); + last_max_x = last_char.max_x.get(); + } else { + let char_text = match &*ch.text { + "\u{fb00}" => "ff", + "\u{fb01}" => "fi", + "\u{fb02}" => "fl", + "\u{fb03}" => "ffi", + "\u{fb04}" => "ffl", + v => v, + }; + if char_text.chars().skip(1).next().is_some() { + dbg!(&ch); + } + text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect())); + last_max_x = ch.max_x.get(); + } + last_kind = Some(kind); + last_char = Some(ch.clone()); + } + ElementBodyBuilder::scope( + &mut ElementBodyBuilder::new(&mut retval.element), + |body_builder| { + for (text, tag_stack) in text_and_tag_stacks { + body_builder.set_tag_stack(tag_stack); + body_builder.write_text(text) + } + }, + ); + for ch in &retval.chars { + RefCell::borrow_mut(&*unprocessed_chars) + .get_mut(&ch.font) + .expect("known to exist") + .shift_remove(ch); + } + let allowed_start_min_y_error = allowed_start_min_y_error.unwrap_or(0.01); + if f32::abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error { + return Err(ExtractInsnsError::PageParseError( + format!( + "start_min_y={start_min_y} regular_min_y={}\n\ + start_min_y error: {}\n\ + allowed_start_min_y_error={allowed_start_min_y_error}", + retval.regular_min_y, + start_min_y - retval.regular_min_y, + ), + Backtrace::capture(), + )); + } + Ok(Some(retval)) + } + fn extract_following_text_lines( + &mut self, + first_text_line: ParsedTextLine, + min_x: f32, + max_x: f32, + allowed_start_min_y_error: Option, + ) -> Result, ExtractInsnsError> { + let mut retval = Vec::new(); + let fonts = first_text_line.fonts; + let mut line = Some(first_text_line); + while let Some(cur_line) = line { + let start_min_y = cur_line.regular_min_y - fonts.regular()[0].line_height(); + retval.push(cur_line); + line = self.extract_text_line( + None, + start_min_y, + min_x, + max_x, + fonts, + 0, + false, + allowed_start_min_y_error, + )?; + } + return Ok(retval); + } + fn extract_insn_bit_fields( + &mut self, + mnemonic_lines: &[ParsedTextLine], + ) -> Result, ExtractInsnsError> { + let mut found_non_affix_line = false; + let [.., last_mnemonic_line] = mnemonic_lines else { + unreachable!(); + }; + let expected_non_affix_line_y = last_mnemonic_line.regular_min_y + - if mnemonic_lines.len() > 1 { + INSN_BIT_FIELDS_TOP_PAD_HEIGHT2 + } else { + INSN_BIT_FIELDS_TOP_PAD_HEIGHT + }; + let page = self.page().map_err(ExtractInsnsError::Other)?; + let _ = page.qt[&self.text_section].range( + self.text_section.min_x.get() - 5.0, + self.text_section.max_x.get() + 5.0, + expected_non_affix_line_y - 5.0, + expected_non_affix_line_y + 5.0, + |_x, _y, line| { + let PageItem::LineOrRect(LineOrRect::Line(line)) = line else { + return ControlFlow::Continue(()); + }; + if line.width() > line.height() { + found_non_affix_line = true; + return ControlFlow::Break(()); + } + ControlFlow::Continue(()) + }, + ); + if found_non_affix_line { + return self.extract_insn_bit_fields_box(expected_non_affix_line_y); + }; + let prefix_text = self.extract_text_line( + None, + last_mnemonic_line.regular_min_y - INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + TextLineFonts::InsnBitFieldsAffixTitleFonts, + 0, + true, + Some(2.0), + )?; + let Some(prefix_text) = prefix_text else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn prefix bit fields title".into(), + Backtrace::capture(), + )); + }; + let prefix_text_str = prefix_text.element.inner_text(); + if prefix_text_str != "Prefix:" { + return Err(ExtractInsnsError::InsnParseError( + format!("insn prefix bit fields title is not as expected: {prefix_text_str:?}"), + Backtrace::capture(), + )); + } + let prefix_bit_fields = self.extract_insn_bit_fields_box( + prefix_text.regular_min_y - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, + )?; + let Some(prefix_bit_fields) = prefix_bit_fields else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn prefix bit fields".into(), + Backtrace::capture(), + )); + }; + let suffix_text = self.extract_text_line( + None, + prefix_bit_fields.box_min_y - INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + TextLineFonts::InsnBitFieldsAffixTitleFonts, + 0, + true, + Some(2.0), + )?; + let Some(suffix_text) = suffix_text else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn suffix bit fields title".into(), + Backtrace::capture(), + )); + }; + let suffix_text_str = suffix_text.element.inner_text(); + if suffix_text_str != "Suffix:" { + return Err(ExtractInsnsError::InsnParseError( + format!("insn suffix bit fields title is not as expected: {suffix_text_str:?}"), + Backtrace::capture(), + )); + } + let suffix_bit_fields = self.extract_insn_bit_fields_box( + suffix_text.regular_min_y - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, + )?; + let Some(suffix_bit_fields) = suffix_bit_fields else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn suffix bit fields".into(), + Backtrace::capture(), + )); + }; + return Ok(Some(InsnBitFields { + prefix: Some(InsnBitFieldsPrefix { + box_min_x: prefix_bit_fields.box_min_x, + box_min_y: prefix_bit_fields.box_min_y, + box_max_x: prefix_bit_fields.box_max_x, + box_max_y: prefix_bit_fields.box_max_y, + prefix_text: prefix_text, + fields: prefix_bit_fields.fields, + suffix_text: suffix_text, + }), + box_min_x: suffix_bit_fields.box_min_x, + box_min_y: suffix_bit_fields.box_min_y, + box_max_x: suffix_bit_fields.box_max_x, + box_max_y: suffix_bit_fields.box_max_y, + fields: suffix_bit_fields.fields, + })); + } + fn extract_insn_bit_fields_box( + &mut self, + expected_box_max_y: f32, + ) -> Result, ExtractInsnsError> { + let mut h_lines = Vec::new(); + let mut v_lines = Vec::new(); + let page = self.page().map_err(ExtractInsnsError::Other)?; + let ControlFlow::::Continue(()) = page.qt[&self.text_section].range( + self.text_section.min_x.get() - 5.0, + self.text_section.max_x.get() + 5.0, + expected_box_max_y - INSN_BIT_FIELDS_BOX_HEIGHT - 5.0, + expected_box_max_y + 5.0, + |_x, _y, line| { + let PageItem::LineOrRect(LineOrRect::Line(line)) = *line else { + return ControlFlow::Continue(()); + }; + if line.width() > line.height() { + h_lines.push(line); + } else { + v_lines.push(line); + } + ControlFlow::Continue(()) + }, + ); + h_lines.sort_by_key(|line| line.min_y()); + v_lines.sort_by_key(|line| line.min_x()); + for i in (0..v_lines.len().saturating_sub(1)).rev() { + if f32::abs(v_lines[i].min_x().get() - v_lines[i + 1].min_x().get()) < 0.5 { + v_lines.remove(i + 1); // remove duplicates + } + } + if h_lines.is_empty() && v_lines.is_empty() { + return Ok(None); + } + let [bottom_line, top_line] = &*h_lines else { + return Err(ExtractInsnsError::InsnParseError( + format!( + "instruction bit fields box has wrong number of horizontal lines:\n{h_lines:?}" + ), + Backtrace::capture(), + )); + }; + let [leftmost_line, .., rightmost_line] = &*v_lines else { + return Err(ExtractInsnsError::InsnParseError( + format!("instruction bit fields box has too few vertical lines:\n{v_lines:?}"), + Backtrace::capture(), + )); + }; + let box_min_x = leftmost_line.min_x().get(); + let box_max_x = rightmost_line.min_x().get(); + let box_min_y = bottom_line.min_y().get(); + let box_max_y = top_line.max_y().get(); + let box_mid_y = (box_min_y + box_max_y) * 0.5; + println!("bottom_line={bottom_line:?}"); + println!("top_line={top_line:?}"); + println!("{v_lines:?}"); + let mut fields = Vec::new(); + for i in 0..v_lines.len() - 1 { + let left_line = v_lines[i]; + let right_line = v_lines[i + 1]; + let field_box_min_x = left_line.max_x().get(); + let field_box_max_x = right_line.min_x().get(); + let bit_field_name_start_min_y = box_mid_y + 3.288; + let bit_field_name = self.extract_text_line( + None, + bit_field_name_start_min_y, + field_box_min_x, + field_box_max_x, + TextLineFonts::InsnBitFieldNameFonts, + 0, + true, + Some(0.4), + )?; + let Some(bit_field_name) = bit_field_name else { + return Err(ExtractInsnsError::InsnParseError( + format!( + "instruction bit field name not found:\n\ + start_min_y={bit_field_name_start_min_y} \ + field_box_min_x={field_box_min_x} \ + field_box_max_x={field_box_max_x}" + ), + Backtrace::capture(), + )); + }; + let bit_field_number_start_min_y = box_min_y + 3.487; + let bit_number = self.extract_text_line( + None, + bit_field_number_start_min_y, + field_box_min_x, + field_box_max_x, + TextLineFonts::InsnBitFieldBitNumberFonts, + 0, + true, + None, + )?; + let Some(bit_number) = bit_number else { + return Err(ExtractInsnsError::InsnParseError( + format!( + "instruction bit field bit number not found:\n\ + start_min_y={bit_field_number_start_min_y} \ + field_box_min_x={field_box_min_x} \ + field_box_max_x={field_box_max_x}" + ), + Backtrace::capture(), + )); + }; + fields.push(InsnBitField { + box_min_x: field_box_min_x, + box_max_x: field_box_max_x, + name: bit_field_name, + bit_number: bit_number, + }); + } + return Ok(Some(InsnBitFields { + prefix: None, + box_min_x, + box_min_y, + box_max_x, + box_max_y, + fields, + })); + } + fn extract_insn_header_mnemonics_and_bit_fields( + &mut self, + start_min_y: f32, + header_start_char: Option, + ) -> Result, ExtractInsnsError> { + assert!( + header_start_char + .as_ref() + .is_none_or(|v| v.font == Font::InsnHeader) + ); + let Some(header_line) = self.extract_text_line( + header_start_char, + start_min_y, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + TextLineFonts::InsnHeaderFonts, + 0, + true, + Some(6.0), + )? + else { + return Ok(None); + }; + println!("found header line:\n{header_line}"); + let header_lines = self.extract_following_text_lines( + header_line, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + Some(1.5), + )?; + println!("insn header lines:"); + for header_line in &header_lines { + println!("{header_line}"); + } + let [.., last_header_line] = &*header_lines else { + unreachable!(); + }; + let Some(mnemonic_start_char) = self + .find_top_left_char_in_range( + self.text_section.min_x.get() - 5.0, + self.text_section.max_x.get() + 5.0, + last_header_line.regular_min_y - 50.0, + last_header_line.regular_min_y - 5.0, + false, + ) + .map_err(ExtractInsnsError::Other)? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn mnemonic text line".into(), + Backtrace::capture(), + )); + }; + let mnemonic_start_char_min_y = mnemonic_start_char.min_y.get(); + let Some(mnemonic_line) = self.extract_text_line( + Some(mnemonic_start_char), + mnemonic_start_char_min_y, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + TextLineFonts::InsnMnemonicFonts, + 0, + true, + None, + )? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn mnemonic text line".into(), + Backtrace::capture(), + )); + }; + let mnemonic_line_first_char_min_x = mnemonic_line.chars[0].min_x.get(); + let mnemonic_lines = self.extract_following_text_lines( + mnemonic_line, + mnemonic_line_first_char_min_x, + self.text_section.max_x.get(), + None, + )?; + println!("insn mnemonic lines:"); + for mnemonic_line in &mnemonic_lines { + println!("{mnemonic_line}"); + } + let Some(insn_bit_fields) = self.extract_insn_bit_fields(&mnemonic_lines)? else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn bit fields".into(), + Backtrace::capture(), + )); + }; + println!("{insn_bit_fields}"); + return Ok(Some(InsnHeader { + header_lines, + mnemonic_lines, + bit_fields: insn_bit_fields, + })); + } + fn extract_insn_sp_regs_altered( + &mut self, + mut sp_regs_altered_text: ParsedTextLine, + ) -> Result { + sp_regs_altered_text.preceding_blank_lines = 0; + let fonts = TextLineFonts::InsnDescFonts; + let column_min_x = sp_regs_altered_text.chars[0].min_x.get(); + let Some(table_header_reg_char) = self + .find_top_left_char_in_range( + column_min_x - 1.0, + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 1.0, + sp_regs_altered_text.regular_min_y - 30.0, + sp_regs_altered_text.regular_min_y - 5.0, + false, + ) + .map_err(ExtractInsnsError::Other)? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find special registers altered table's register-column's header".into(), + Backtrace::capture(), + )); + }; + const KNOWN_SPECIAL_TEXTS: &[&str] = &[ + "None", + "Dependent on the system service", + "See above.", + "See Table 5.1", + ]; + match &*table_header_reg_char.text { + "R" => {} + text if KNOWN_SPECIAL_TEXTS.iter().any(|i| text == &i[..1]) => { + let start_min_y = table_header_reg_char.min_y.get(); + let special_text = self.extract_text_line( + Some(table_header_reg_char), + start_min_y, + column_min_x, + self.text_section.max_x.get(), + fonts, + 0, + true, + None, + )?; + let special_text = match special_text { + Some(special_text) + if KNOWN_SPECIAL_TEXTS.contains(&&*special_text.element.text) => + { + special_text + } + _ => return Err(ExtractInsnsError::Other( + format!( + "can't find special-registers-altered special-text:\n{special_text:?}" + ) + .into(), + )), + }; + let final_regular_min_y = special_text.regular_min_y; + return Ok(InsnSpRegsAltered { + sp_regs_altered_text, + special_text: Some(special_text), + table_header_reg: None, + table_header_fields: None, + entries: vec![], + final_regular_min_y, + }); + } + text => { + return Err(ExtractInsnsError::InsnParseError( + format!( + "unknown special-registers-altered special-text start character: {text:?}" + ), + Backtrace::capture(), + )); + } + } + let Some(table_header_fields_char) = self + .find_top_left_char_in_range( + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 10.0, + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, + table_header_reg_char.min_y.get() - 5.0, + table_header_reg_char.min_y.get() + 5.0, + false, + ) + .map_err(ExtractInsnsError::Other)? + else { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's fields-column's header".into(), + )); + }; + if table_header_fields_char.text != "F" { + return Err(ExtractInsnsError::Other( + format!( + "can't find special registers altered table's fields-column's header:\n\ + table_header_fields_char={table_header_fields_char:?}" + ) + .into(), + )); + } + let columns_x_bounds = [ + ( + table_header_reg_char.min_x.get(), + table_header_fields_char.min_x.get() - 1.0, + ), + ( + table_header_fields_char.min_x.get(), + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, + ), + ( + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, + self.text_section.max_x.get(), + ), + ]; + let start_min_y = table_header_reg_char.min_y.get(); + let Some(table_header_reg) = self.extract_text_line( + Some(table_header_reg_char), + start_min_y, + columns_x_bounds[0].0, + columns_x_bounds[0].1, + fonts, + 0, + false, + None, + )? + else { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's register-column's header".into(), + )); + }; + let table_header_reg_text = table_header_reg.element.inner_text(); + if table_header_reg_text != "Register" { + return Err(ExtractInsnsError::Other( + format!( + "can't find special registers altered table's register-column's header:\n\ + table_header_reg_text={table_header_reg_text:?}" + ) + .into(), + )); + } + let start_min_y = table_header_fields_char.min_y.get(); + let Some(table_header_fields) = self.extract_text_line( + Some(table_header_fields_char), + start_min_y, + columns_x_bounds[1].0, + columns_x_bounds[1].1, + fonts, + 0, + false, + None, + )? + else { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's fields-column's header".into(), + )); + }; + let table_header_fields_text = table_header_fields.element.inner_text(); + if table_header_fields_text != "Field(s)" { + return Err(ExtractInsnsError::Other( + format!( + "can't find special registers altered table's fields-column's header:\n\ + table_header_fields_text={table_header_fields_text:?}" + ) + .into(), + )); + } + let mut regular_min_y = table_header_reg.regular_min_y; + let mut entries = Vec::new(); + let mut cur_reg = None; + let mut cur_fields = Vec::new(); + let mut cur_conds = Vec::new(); + loop { + let mut row = [None, None, None]; + let mut next_regular_min_y = None; + for (i, (min_x, max_x)) in columns_x_bounds.into_iter().enumerate() { + row[i] = self.extract_text_line( + None, + regular_min_y - fonts.regular()[0].line_height(), + min_x, + max_x, + fonts, + 0, + true, + Some(2.0), + )?; + if let Some(cell) = &row[i] + && next_regular_min_y.is_none() + { + next_regular_min_y = Some(cell.regular_min_y); + } + } + match next_regular_min_y { + Some(v) => regular_min_y = v, + None => break, + } + let [cur_reg_cell, cur_fields_cell, cur_conds_cell] = row; + if cur_reg_cell.is_none() { + if cur_reg.is_none() { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's first register".into(), + )); + } + cur_fields.extend(cur_fields_cell); + cur_conds.extend(cur_conds_cell); + continue; + } + if let Some(cur_reg) = cur_reg { + entries.push(InsnSpRegsAlteredEntry { + reg: cur_reg, + fields: cur_fields, + conds: cur_conds, + }); + cur_fields = Vec::new(); + cur_conds = Vec::new(); + } + cur_reg = cur_reg_cell; + cur_fields.extend(cur_fields_cell); + cur_conds.extend(cur_conds_cell); + } + let Some(cur_reg) = cur_reg else { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's first register".into(), + )); + }; + entries.push(InsnSpRegsAlteredEntry { + reg: cur_reg, + fields: cur_fields, + conds: cur_conds, + }); + return Ok(InsnSpRegsAltered { + sp_regs_altered_text: sp_regs_altered_text, + special_text: None, + table_header_reg: Some(table_header_reg), + table_header_fields: Some(table_header_fields), + entries, + final_regular_min_y: regular_min_y, + }); + } + fn extract_insn(&mut self, header_start_char: Char) -> Result { + assert_eq!(header_start_char.font, Font::InsnHeader); + println!("{header_start_char:?}"); + let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields( + header_start_char.min_y.get(), + Some(header_start_char), + )? + else { + return Err(ExtractInsnsError::PageParseError( + "can't find header text line".into(), + Backtrace::capture(), + )); + }; + let mut next_start_min_y = header.min_y() - 5.0; + let mut headers = vec![header]; + let mut code_lines: Vec = Vec::new(); + let mut desc_lines: Vec = Vec::new(); + let mut sp_regs_altered = None; + loop { + let search_min_y = next_start_min_y - 70.0; + let Some(next_char) = self + .find_top_left_char_in_range( + self.text_section.min_x.get() - 5.0, + self.text_section.max_x.get() + 5.0, + search_min_y.max(self.text_section.min_y.get()), + next_start_min_y, + false, + ) + .map_err(ExtractInsnsError::Other)? + else { + if search_min_y <= self.text_section.min_y.get() + && self + .pages + .get(self.text_section.next().page_num) + .map_err(ExtractInsnsError::Other)? + .is_some() + { + // go to next section + self.text_section = self.text_section.next(); + next_start_min_y = self.text_section.max_y.get(); + continue; + } else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn code or description text".into(), + Backtrace::capture(), + )); + } + }; + let next_section = match &next_char.font { + font if TextLineFonts::InsnCodeFonts.fonts().contains(font) => { + InsnParseSection::Code + } + font if TextLineFonts::InsnDescFonts.fonts().contains(font) => { + InsnParseSection::Desc + } + Font::InsnHeader => InsnParseSection::Header, + font => { + return Err(ExtractInsnsError::InsnParseError( + format!("can't find insn code or description text\nfont={font:?}"), + Backtrace::capture(), + )); + } + }; + match next_section { + InsnParseSection::Code => { + if !desc_lines.is_empty() { + break; + } + let start_min_y = next_char.min_y.get(); + let min_x = next_char.min_x.get(); + let Some(code_line) = self.extract_text_line( + Some(next_char), + start_min_y, + min_x, + self.text_section.max_x.get(), + TextLineFonts::InsnCodeFonts, + if code_lines.is_empty() { 0 } else { 1 }, + false, + None, + )? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn code text line".into(), + Backtrace::capture(), + )); + }; + let min_x = code_line.chars[0].min_x.get(); + let more_code_lines = self.extract_following_text_lines( + code_line, + min_x, + self.text_section.max_x.get(), + Some(0.05), + )?; + println!("more insn code lines:"); + for i in &more_code_lines { + println!("{i}"); + } + code_lines.extend(more_code_lines); + let Some(last) = code_lines.last() else { + unreachable!() + }; + next_start_min_y = last.regular_min_y - 5.0; + } + InsnParseSection::Header => { + if !(code_lines.is_empty() && desc_lines.is_empty()) { + break; + } + let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields( + next_char.min_y.get(), + Some(next_char), + )? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find header text line".into(), + Backtrace::capture(), + )); + }; + next_start_min_y = header.min_y() - 5.0; + headers.push(header); + } + InsnParseSection::Desc => { + let start_min_y = next_char.min_y.get(); + let min_x = next_char.min_x.get(); + let Some(desc_line) = self.extract_text_line( + Some(next_char), + start_min_y, + min_x, + self.text_section.max_x.get(), + TextLineFonts::InsnDescFonts, + if desc_lines.is_empty() { 0 } else { 1 }, + false, + Some(3.0), + )? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn desc text line".into(), + Backtrace::capture(), + )); + }; + match desc_line.get_header_text() { + None => { + let min_x = desc_line.chars[0].min_x.get(); + let more_desc_lines = self.extract_following_text_lines( + desc_line, + min_x, + self.text_section.max_x.get(), + Some(3.5), + )?; + println!("more insn desc lines:"); + for i in &more_desc_lines { + println!("{i}"); + } + desc_lines.extend(more_desc_lines); + next_start_min_y = desc_lines + .last() + .expect("known to be non-empty") + .regular_min_y + - 5.0; + } + Some(header_text) if header_text == "Special Registers Altered:" => { + let new_sp_regs_altered = + self.extract_insn_sp_regs_altered(desc_line)?; + #[allow(unused_assignments)] + { + next_start_min_y = new_sp_regs_altered.final_regular_min_y; + } + sp_regs_altered = Some(new_sp_regs_altered); + break; + } + Some(header_text) => { + return Err(ExtractInsnsError::Other( + format!("unhandled header text: {header_text:?}\n{desc_line}") + .into(), + )); + } + } + } + } + } + println!("insn code lines:"); + for i in &code_lines { + println!("{i}"); + } + println!("insn desc lines:"); + for i in &desc_lines { + println!("{i}"); + } + println!("sp_regs_altered:"); + println!("{sp_regs_altered:?}"); + // TODO: finish + return Ok(Insn { + headers, + code_lines, + desc_lines, + sp_regs_altered, + }); + } + fn extract_insns(&mut self) -> Result<(), ExtractInsnsError> { + loop { + let Some(header_start_char) = + RefCell::borrow(&*self.unprocessed_chars().map_err(ExtractInsnsError::Other)?) + .get(&Font::InsnHeader) + .and_then(|v| v.first().cloned()) + else { + return Ok(()); + }; + let insn = self.extract_insn(header_start_char)?; + self.insns.push(insn); + } + } +} + +#[derive(Debug)] +struct MyDevice<'a> { + page_num: u32, + qt: RefCell>>, + unprocessed_chars: + Rc>>>>>>, + unprocessed_non_text: Rc>>, + first_seen_fonts: RefCell<&'a mut BTreeMap>>, + error: RefCell>, +} + +impl<'a> MyDevice<'a> { + fn new(page_num: u32, first_seen_fonts: &'a mut BTreeMap>) -> Self { + Self { + page_num, + qt: Default::default(), + unprocessed_chars: Default::default(), + unprocessed_non_text: Default::default(), + first_seen_fonts: RefCell::new(first_seen_fonts), + error: RefCell::new(Ok(())), + } + } + fn path(&self, path: &mupdf_ffi::Path<'_>, ctm: fz_matrix) { + if self.error.borrow().is_err() { + return; + } + enum Walker { + Empty, + Moved { x: f32, y: f32 }, + Line(Line), + Rect { x1: f32, y1: f32, x2: f32, y2: f32 }, + NotRecognized, + } + fn new_line(p0_x: f32, p0_y: f32, p1_x: f32, p1_y: f32) -> Option { + Some(Line { + p0_x: NonNaNF32::new(p0_x)?, + p0_y: NonNaNF32::new(p0_y)?, + p1_x: NonNaNF32::new(p1_x)?, + p1_y: NonNaNF32::new(p1_y)?, + }) + } + impl<'ctx> mupdf_ffi::PathWalker<'ctx> for Walker { + fn move_to(&mut self, _ctx: mupdf_ffi::ContextRef<'ctx>, x: f32, y: f32) { + *self = match *self { + Walker::Empty | Walker::Moved { .. } => Walker::Moved { x, y }, + Walker::Line(_) | Walker::Rect { .. } | Walker::NotRecognized => { + Walker::NotRecognized + } + }; + } + fn line_to(&mut self, _ctx: mupdf_ffi::ContextRef<'ctx>, x: f32, y: f32) { + *self = match *self { + Walker::Empty => Walker::NotRecognized, + Walker::Moved { x: p0_x, y: p0_y } => new_line(p0_x, p0_y, x, y) + .map(Walker::Line) + .unwrap_or(Walker::NotRecognized), + Walker::Line(_) | Walker::Rect { .. } | Walker::NotRecognized => { + Walker::NotRecognized + } + }; + } + fn curve_to( + &mut self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + _cx1: f32, + _cy1: f32, + _cx2: f32, + _cy2: f32, + _ex: f32, + _ey: f32, + ) { + *self = Walker::NotRecognized; + } + fn close_path(&mut self, _ctx: mupdf_ffi::ContextRef<'ctx>) {} + fn rect_to( + &mut self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + x1: f32, + y1: f32, + x2: f32, + y2: f32, + ) { + *self = match *self { + Walker::Empty => Walker::Rect { x1, y1, x2, y2 }, + Walker::Moved { .. } + | Walker::Line(..) + | Walker::Rect { .. } + | Walker::NotRecognized => Walker::NotRecognized, + }; + } + } + let mut walker = Walker::Empty; + path.walk(&mut walker); + let component = match walker { + Walker::Empty | Walker::Moved { .. } | Walker::NotRecognized => return, + Walker::Line(Line { + p0_x, + p0_y, + p1_x, + p1_y, + }) => { + let mupdf_sys::fz_point { x: p0_x, y: p0_y } = + mupdf_ffi::transform_point_xy(p0_x.get(), p0_y.get(), ctm); + let mupdf_sys::fz_point { x: p1_x, y: p1_y } = + mupdf_ffi::transform_point_xy(p1_x.get(), p1_y.get(), ctm); + let Some(line) = new_line(p0_x, p0_y, p1_x, p1_y) else { + return; + }; + LineOrRect::Line(line) + } + Walker::Rect { x1, y1, x2, y2 } => { + let p1 = mupdf_ffi::transform_point_xy(x1, y1, ctm); + let p2 = mupdf_ffi::transform_point_xy(x2, y1, ctm); + let p3 = mupdf_ffi::transform_point_xy(x2, y2, ctm); + let p4 = mupdf_ffi::transform_point_xy(x1, y2, ctm); + let min_x = NonNaNF32::new(p1.x.min(p2.x).min(p3.x).min(p4.x)); + let max_x = NonNaNF32::new(p1.x.max(p2.x).max(p3.x).max(p4.x)); + let min_y = NonNaNF32::new(p1.y.min(p2.y).min(p3.y).min(p4.y)); + let max_y = NonNaNF32::new(p1.y.max(p2.y).max(p3.y).max(p4.y)); + let (Some(min_x), Some(max_x), Some(min_y), Some(max_y)) = + (min_x, max_x, min_y, max_y) + else { + return; + }; + LineOrRect::Rect(Rect { + min_x, + max_x, + min_y, + max_y, + }) + } + }; + if component.width() > 100.0 + && component.min_x().get() < COLUMN_SPLIT_X - 10.0 + && component.max_x().get() > COLUMN_SPLIT_X + 10.0 + { + println!("wide component: {component:?}"); + } else { + println!("component: {component:?}"); + } + let text_section = TextSection::for_position( + self.page_num, + (component.min_x().get() + component.max_x().get()) * 0.5, + (component.min_y().get() + component.max_y().get()) * 0.5, + ); + if let Some(text_section) = text_section { + self.qt + .borrow_mut() + .entry(text_section) + .or_default() + .insert( + component.min_x().get(), + component.min_y().get(), + PageItem::LineOrRect(component), + ); + } + } + fn text(&self, text: &mupdf_ffi::Text<'_>, ctm: fz_matrix) { + if self.error.borrow().is_err() { + return; + } + let mut first_seen_fonts = self.first_seen_fonts.borrow_mut(); + for span in text.spans() { + let tm = span.trm(); + const ROUND_FACTOR: f32 = 1000.0; + let font_size = (mupdf_ffi::matrix_expansion(tm) * ROUND_FACTOR).round() / ROUND_FACTOR; + let Some(font_size) = NonNaNF32::new(font_size) else { + continue; + }; + let font_name_with_tag = span.font().name(); + let font_name_with_tag = match font_name_with_tag { + "CGMSHV+DejaVuSansCondensed-Obli" => "CGMSHV+DejaVuSansCondensed-Oblique", + "YDJYQV+DejaVuSansCondensed-Bold" => "YDJYQV+DejaVuSansCondensed-BoldOblique", + "NHUPPK+DejaVuSansCondensed-Bold" => "NHUPPK+DejaVuSansCondensed-Bold", + _ if font_name_with_tag.len() == 31 => { + let _ = self.error.replace(Err(format!( + "probably truncated font name: {font_name_with_tag:?}" + ) + .into())); + return; + } + _ => font_name_with_tag, + }; + let mut flush_char = |char: Char| -> Result<(), ()> { + let Some(text_section) = TextSection::for_position( + self.page_num, + (char.min_x.get() + char.max_x.get()) * 0.5, + (char.min_y.get() + char.max_y.get()) * 0.5, + ) else { + if PAGE_BODY_MIN_Y <= char.min_y.get() && char.min_y.get() <= PAGE_BODY_MAX_Y { + if self.page_num != 1072 { + // page 1072 has characters in the margins + let _ = self.error.replace(Err(format!( + "char not in text section: {:?}\npage_num={}", + char.text, self.page_num, + ) + .into())); + return Err(()); + } + } + return Ok(()); + }; + let set = match first_seen_fonts.get_mut(font_name_with_tag) { + Some(v) => v, + None => first_seen_fonts + .entry(String::from(font_name_with_tag)) + .or_default(), + }; + if set.insert(font_size) { + println!( + "first seen font: {font_name_with_tag:?} {font_size}: page {} {char:?}", + self.page_num, + ); + } + self.qt + .borrow_mut() + .entry(text_section) + .or_default() + .insert( + char.min_x.get(), + char.min_y.get(), + PageItem::Char(char.clone()), + ); + self.unprocessed_chars + .borrow_mut() + .entry(text_section) + .or_default() + .borrow_mut() + .entry(char.font.clone()) + .or_default() + .insert(char); + Ok(()) + }; + let mut last_char = None; + for &fz_text_item { + x, + y, + adv, + gid, + ucs, + cid: _, + } in span.items() + { + let adv = if gid >= 0 { adv } else { 0.0 }; + let tm = fz_matrix { e: x, f: y, ..tm }; + let trm = mupdf_ffi::concat(tm, ctm); + let dir = match span.write_mode() { + WriteMode::Horizontal => fz_point { x: 1.0, y: 0.0 }, + WriteMode::Vertical => fz_point { x: 0.0, y: -1.0 }, + }; + let dir = mupdf_ffi::transform_vector(dir, trm); + let glyph_start; + let glyph_stop; + let mut glyph_ascender; + let glyph_descender; + match span.write_mode() { + WriteMode::Horizontal => { + glyph_start = fz_point { x: trm.e, y: trm.f }; + glyph_stop = fz_point { + x: trm.e + adv * dir.x, + y: trm.f + adv * dir.y, + }; + glyph_ascender = fz_point { + x: 0.0, + y: span.font().ascender(), + }; + glyph_descender = fz_point { + x: 0.0, + y: span.font().descender(), + }; + if glyph_ascender.y == glyph_descender.y { + glyph_ascender.y += 1.0; + } + } + WriteMode::Vertical => { + glyph_start = fz_point { + x: trm.e - adv * dir.x, + y: trm.f - adv * dir.y, + }; + glyph_stop = fz_point { x: trm.e, y: trm.f }; + glyph_ascender = fz_point { x: 1.0, y: 0.0 }; + glyph_descender = fz_point { x: 0.0, y: 0.0 }; + } + }; + let glyph_ascender = transform_vector(glyph_ascender, trm); + let glyph_descender = transform_vector(glyph_descender, trm); + let points = [ + add_points(glyph_start, glyph_descender), + add_points(glyph_start, glyph_ascender), + add_points(glyph_stop, glyph_descender), + add_points(glyph_stop, glyph_ascender), + ]; + let min = point_min_components( + point_min_components(point_min_components(points[0], points[1]), points[2]), + points[3], + ); + let max = point_max_components( + point_max_components(point_max_components(points[0], points[1]), points[2]), + points[3], + ); + let Some(ch) = u32::try_from(ucs).ok().and_then(|v| char::try_from(v).ok()) else { + continue; + }; + let text = String::from(ch); + if text.trim().is_empty() { + continue; + } + let font = Font::known_from_name_with_tag(font_name_with_tag, font_size) + .unwrap_or_else(|| Font::Other { + font_name: font_name_with_tag.into(), + size: font_size, + }); + let (Some(min_x), Some(min_y), Some(max_x), Some(max_y)) = ( + NonNaNF32::new(min.x), + NonNaNF32::new(min.y), + NonNaNF32::new(max.x), + NonNaNF32::new(max.y), + ) else { + let _ = self + .error + .replace(Err("char position shouldn't be NaN".into())); + return; + }; + if gid < 0 + && last_char + .as_ref() + .is_some_and(|last_char: &Char| last_char.font == font) + { + if let Some(Char { + font, + text: last_text, + min_x: last_min_x, + min_y: last_min_y, + max_x: last_max_x, + max_y: last_max_y, + }) = last_char.take() + { + last_char = Some(Char { + font, + text: last_text + &text, + min_x: last_min_x.min(min_x), + min_y: last_min_y.min(min_y), + max_x: last_max_x.max(max_x), + max_y: last_max_y.max(max_y), + }); + continue; + } + } + if let Some(last_char) = last_char.take() { + match flush_char(last_char) { + Ok(()) => {} + Err(()) => return, + } + } + last_char = Some(Char { + font, + text, + min_x, + min_y, + max_x, + max_y, + }); + } + if let Some(last_char) = last_char { + match flush_char(last_char) { + Ok(()) => {} + Err(()) => return, + } + } + } + } +} + +impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice<'_> { + fn fill_path( + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + path: &mupdf_ffi::Path<'ctx>, + _even_odd: bool, + ctm: fz_matrix, + ) { + self.path(path, ctm); + } + + fn stroke_path( + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + path: &mupdf_ffi::Path<'ctx>, + ctm: fz_matrix, + ) { + self.path(path, ctm); + } + + fn clip_path( + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + path: &mupdf_ffi::Path<'ctx>, + _even_odd: bool, + ctm: fz_matrix, + _scissor: mupdf_sys::fz_rect, + ) { + self.path(path, ctm); + } + + fn clip_stroke_path( + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + path: &mupdf_ffi::Path<'ctx>, + ctm: fz_matrix, + _scissor: mupdf_sys::fz_rect, + ) { + self.path(path, ctm); + } + + fn fill_text( + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + ctm: fz_matrix, + ) { + self.text(text, ctm); + } + + fn stroke_text( + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + ctm: fz_matrix, + ) { + self.text(text, ctm); + } + + fn clip_text( + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + ctm: fz_matrix, + _scissor: mupdf_sys::fz_rect, + ) { + self.text(text, ctm); + } + + fn clip_stroke_text( + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + ctm: fz_matrix, + _scissor: mupdf_sys::fz_rect, + ) { + self.text(text, ctm); + } + + fn ignore_text( + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + ctm: fz_matrix, + ) { + self.text(text, ctm); + } +} + +impl Page { + fn from_mupdf_page( + page_num: u32, + page: &mupdf_ffi::Page<'_>, + first_seen_fonts: &mut BTreeMap>, + dump_mupdf_page_xml: bool, + ) -> Result { + if dump_mupdf_page_xml { + println!("{}", page.to_xml()?); + } + let Some(pdf_page) = page.pdf_page() else { + return Err("page is not from a pdf".into()); + }; + let device = mupdf_ffi::Device::new( + page.ctx(), + Box::new(MyDevice::new(page_num, first_seen_fonts)), + )?; + page.run(&device, pdf_page.transform()?)?; + let MyDevice { + page_num: _, + qt, + unprocessed_chars, + unprocessed_non_text, + first_seen_fonts: _, + error, + } = device.get(); + error.replace(Ok(()))?; + for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() { + for chars in i.borrow_mut().values_mut() { + chars.sort_by_key(Char::top_down_left_to_right_sort_key); + println!("first char: {text_section:?}: {:?}", chars.first()); + } + } + let mut unknown_fonts = Vec::new(); + let mut unknown_font_errors = Vec::new(); + for i in RefCell::borrow(&unprocessed_chars).values() { + for (font, chars) in RefCell::borrow(i).iter() { + if font.known_font_group().is_none() { + let mut text = String::new(); + for char in chars { + text += &char.text; + } + unknown_fonts.push(format!("{font:?},")); + unknown_font_errors.push(format!( + "unknown font {font:?}\nlast char: {:?}\ntext: {text:?}", + chars.last() + )); + } + } + } + unknown_fonts.sort(); + if !unknown_fonts.is_empty() { + return Err(format!( + "\nunknown fonts:\n{}\n\n{}", + unknown_fonts.join("\n"), + unknown_font_errors.join("\n") + ) + .into()); + } + Ok(Self { + page_num, + qt: qt.take(), + unprocessed_chars: unprocessed_chars.clone(), + unprocessed_non_text: unprocessed_non_text.clone(), + }) + } +} + +fn main_inner() -> Result<(), Error> { + let mut args: Vec = std::env::args().collect(); + let dump_mupdf_page_xml = if args.get(1).is_some_and(|v| v == "--dump-mupdf-page-xml") { + args.remove(1); + true + } else { + false + }; + let page_numbers: Option>>> = if 2 < args.len() { + Some(if let Some((start, end)) = args[2].split_once(":") { + let start: NonZero = start.trim().parse()?; + let end: NonZero = end.trim().parse()?; + Box::new( + (start.get()..end.get()).map(|v| NonZero::new(v).expect("known to be non-zero")), + ) + } else { + Box::new( + Result::>, _>::from_iter( + args[2].split(",").map(|v| v.trim().parse()), + )? + .into_iter(), + ) + }) + } else { + None + }; + + std::fs::write( + "powerisa-instructions.xml", + parse_powerisa_pdf_and_generate_xml(&args[1], page_numbers, dump_mupdf_page_xml)?, + )?; + Ok(()) +} + +pub fn parse_powerisa_pdf_and_generate_xml( + file_name: &str, + page_numbers: Option>>>, + dump_mupdf_page_xml: bool, +) -> Result { + mupdf_ffi::Context::with(|ctx| { + let mut parser = Parser::new(); + let is_subset = page_numbers.is_some(); + parser.parse_pdf(ctx, file_name, page_numbers, dump_mupdf_page_xml)?; + let mut insns = xml_tree::Element::new( + "instructions".into(), + [( + "is-subset".into(), + if is_subset { + "True".into() + } else { + "False".into() + }, + )], + ); + insns.text = "\n".into(); + insns.tail = "\n".into(); + let mut comment = + xml_tree::Element::comment(format!(" Automatically generated from {file_name} ")); + comment.tail = "\n".into(); + insns.children.push(comment); + for insn in parser.insns { + insn.write_xml(&mut insns); + } + let mut output = Vec::new(); + insns.write(&mut output, true)?; + Ok(String::from_utf8(output).expect("known to generate valid utf-8")) + }) +} + +pub fn main() -> std::process::ExitCode { + match main_inner() { + Ok(()) => std::process::ExitCode::SUCCESS, + Err(e) => { + eprintln!("Error: {e}"); + std::process::ExitCode::FAILURE + } + } +} diff --git a/src/main.rs b/src/main.rs index 67d9dd9..23e46da 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3874 +1,6 @@ // SPDX-License-Identifier: LGPL-3.0-or-later // See Notices.txt for copyright information -use crate::{ - mupdf_ffi::{ - MuPdfError, WriteMode, add_points, point_max_components, point_min_components, - transform_vector, - }, - quad_tree::QuadTree, -}; -use indexmap::IndexSet; -use mupdf_sys::{fz_matrix, fz_point, fz_text_item}; -use non_nan_float::NonNaNF32; -use std::{ - backtrace::Backtrace, - borrow::Cow, - cell::RefCell, - collections::{BTreeMap, BTreeSet, HashMap, HashSet}, - convert::Infallible, - fmt, - num::NonZero, - ops::ControlFlow, - rc::Rc, - sync::OnceLock, -}; - -mod mupdf_ffi; -mod quad_tree; -mod xml_tree; - -mod non_nan_float { - #[derive(Default, PartialEq, PartialOrd, Clone, Copy)] - pub(crate) struct NonNaNF32(f32); - - impl std::fmt::Debug for NonNaNF32 { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) - } - } - - impl std::fmt::Display for NonNaNF32 { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) - } - } - - impl NonNaNF32 { - pub(crate) const fn new(v: f32) -> Option { - if v.is_nan() { None } else { Some(Self(v)) } - } - pub(crate) const fn get(self) -> f32 { - self.0 - } - pub(crate) const fn min(self, other: Self) -> Self { - Self(self.0.min(other.0)) - } - pub(crate) const fn max(self, other: Self) -> Self { - Self(self.0.max(other.0)) - } - } - - impl std::hash::Hash for NonNaNF32 { - fn hash(&self, state: &mut H) { - if self.0 == 0.0 { 0.0 } else { self.0 } - .to_bits() - .hash(state); - } - } - - impl Eq for NonNaNF32 {} - - impl Ord for NonNaNF32 { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.partial_cmp(other).expect("known to be non-NaN") - } - } - - impl std::ops::Neg for NonNaNF32 { - type Output = Self; - - fn neg(self) -> Self::Output { - Self(-self.0) - } - } -} - -const fn str_eq(a: &str, b: &str) -> bool { - let a = a.as_bytes(); - let b = b.as_bytes(); - if a.len() != b.len() { - return false; - } - let mut i = 0; - while i < a.len() { - if a[i] != b[i] { - return false; - } - i += 1; - } - true -} - -macro_rules! make_enum_font { - ( - enum $Font:ident { - #[other] - $Other:ident $other_body:tt, - $(#[group] - $KnownFontGroup:ident { - $(#[name_with_tag = $known_font_name_with_tag:literal, size = $known_font_size:literal] - $KnownFont:ident,)* - },)* - } - ) => { - #[derive(Hash, PartialEq, Eq, PartialOrd, Ord, Debug, Clone)] - enum $Font { - $Other $other_body, - $($($KnownFont,)*)* - } - - #[derive(Hash, PartialEq, Eq, PartialOrd, Ord, Debug, Copy, Clone)] - enum KnownFontGroup { - $($KnownFontGroup,)* - } - - impl KnownFontGroup { - const fn fonts(self) -> &'static [Font] { - match self { - $(Self::$KnownFontGroup => &[$(Font::$KnownFont,)*],)* - } - } - const INSN_CODE_FONT_GROUPS: &[Self] = &[Self::InsnCode, Self::InsnCodeSubscript]; - } - - impl $Font { - const fn extract_font_name_from_font_name_with_tag(font_name_with_tag: &str) -> &str { - if let [b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'+',_,..] = font_name_with_tag.as_bytes() { - font_name_with_tag.split_at(7).1 - } else { - panic!("invalid font name with id") - } - } - const fn known_from_name_with_tag(font_name_with_tag: &str, size: NonNaNF32) -> Option { - match size.get() { - $($($known_font_size if str_eq(font_name_with_tag, $known_font_name_with_tag) => Some(Self::$KnownFont),)*)* - _ => None, - } - } - const fn new_known(font_name: &str, size: NonNaNF32) -> Option { - match size.get() { - $($($known_font_size if str_eq(font_name, const { - Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) - }) => Some(Self::$KnownFont),)*)* - _ => None, - } - } - fn new(font_name: &str, size: NonNaNF32) -> Self { - if let Some(v) = Self::new_known(font_name, size) { - v - } else { - Self::Other { - font_name: Box::from(font_name), - size, - } - } - } - const fn size(&self) -> f32 { - match *self { - Self::$Other { size, .. } => size.get(), - $($(Self::$KnownFont => $known_font_size,)*)* - } - } - const fn font_name(&self) -> &str { - match self { - Self::$Other { font_name, .. } => font_name, - $($(Self::$KnownFont => const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) },)*)* - } - } - const fn known_font_group(&self) -> Option { - match self { - Self::$Other { .. } => None, - $($(Self::$KnownFont => Some(KnownFontGroup::$KnownFontGroup),)*)* - } - } - const fn line_height(&self) -> f32 { - match self { - Self::$Other { .. } => self.line_height_helper(), - $($(Self::$KnownFont => const { Self::$KnownFont.line_height_helper() },)*)* - } - } - } - - const _: () = { - $($( - let (known_font_name, known_font) = const { - let known_font_name = Font::extract_font_name_from_font_name_with_tag($known_font_name_with_tag); - (known_font_name, &Font::new_known(known_font_name, NonNaNF32::new($known_font_size).unwrap()).unwrap()) - }; - assert!(str_eq(known_font_name, known_font.font_name())); - assert!(matches!(known_font, Font::$KnownFont)); - )*)* - }; - }; -} - -make_enum_font! { - enum Font { - #[other] - Other { - font_name: Box, - size: NonNaNF32, - }, - #[group] - InsnHeader { - #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 9.963] - InsnHeader, - }, - #[group] - RtlFnHeader { - #[name_with_tag = "APUYSQ+zcoN-Regular", size = 9.963] - RtlFnHeader, - }, - #[group] - PageHeader { - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 9.963] - PageHeader, - }, - #[group] - PageFooter { - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.981] - PageFooter, - }, - #[group] - InsnDesc { - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.966] - InsnDesc0, - #[name_with_tag = "FZTIYT+CMMI9", size = 8.966] - InsnDesc1, - #[name_with_tag = "ONUAYC+CMSSI9", size = 8.966] - InsnDesc2, - #[name_with_tag = "TNGBFZ+CMSY9", size = 8.966] - InsnDesc3, - #[name_with_tag = "WHMZPU+CMEX9", size = 8.966] - InsnDesc4, - #[name_with_tag = "ZJTMSG+CMSS9", size = 8.966] - InsnDesc5, - }, - #[group] - InsnDescMisc { - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 2.377] - InsnDescMisc0, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 2.561] - InsnDescMisc1, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.492] - InsnDescMisc2, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.641] - InsnDescMisc3, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.772] - InsnDescMisc4, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.864] - InsnDescMisc5, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.925] - InsnDescMisc6, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.097] - InsnDescMisc7, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.123] - InsnDescMisc8, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.131] - InsnDescMisc9, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.516] - InsnDescMisc10, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.604] - InsnDescMisc11, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.634] - InsnDescMisc12, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.906] - InsnDescMisc13, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.033] - InsnDescMisc14, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.068] - InsnDescMisc15, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213] - InsnDescMisc16, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.238] - InsnDescMisc17, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252] - InsnDescMisc18, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962] - InsnDescMisc19, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977] - InsnDescMisc20, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.506] - InsnDescMisc21, - }, - #[group] - InsnDescCode { - #[name_with_tag = "APUYSQ+zcoN-Regular", size = 6.974] - InsnDescCode, - }, - #[group] - InsnDescCodeMisc { - #[name_with_tag = "APUYSQ+zcoN-Regular", size = 3.587] - InsnDescCodeMisc0, - #[name_with_tag = "APUYSQ+zcoN-Regular", size = 4.483] - InsnDescCodeMisc1, - }, - #[group] - InsnDescItalic { - #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 8.966] - InsnDescItalic, - }, - #[group] - InsnDescBold { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.966] - InsnDescBold, - }, - #[group] - InsnDescBoldItalic { - #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 8.966] - InsnDescBoldItalic, - }, - #[group] - InsnDescSmall { - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.97] - InsnDescSmall, - }, - #[group] - InsnDescSmallItalic { - #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 7.97] - InsnDescSmallItalic, - }, - #[group] - InsnDescSmallBold { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 7.97] - InsnDescSmallBold, - }, - #[group] - InsnDescSmallBoldItalic { - #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 7.97] - InsnDescSmallBoldItalic, - }, - #[group] - InsnDescBoldMisc { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.21] - InsnDescBoldMisc0, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.399] - InsnDescBoldMisc1, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.763] - InsnDescBoldMisc2, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.946] - InsnDescBoldMisc3, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.949] - InsnDescBoldMisc4, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.999] - InsnDescBoldMisc5, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.065] - InsnDescBoldMisc6, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.086] - InsnDescBoldMisc7, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.183] - InsnDescBoldMisc8, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.686] - InsnDescBoldMisc9, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.744] - InsnDescBoldMisc10, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.825] - InsnDescBoldMisc11, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.842] - InsnDescBoldMisc12, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.857] - InsnDescBoldMisc13, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.979] - InsnDescBoldMisc14, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.032] - InsnDescBoldMisc15, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.112] - InsnDescBoldMisc16, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.161] - InsnDescBoldMisc17, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.206] - InsnDescBoldMisc18, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.353] - InsnDescBoldMisc19, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.378] - InsnDescBoldMisc20, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.434] - InsnDescBoldMisc21, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.595] - InsnDescBoldMisc22, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.619] - InsnDescBoldMisc23, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.647] - InsnDescBoldMisc24, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.68] - InsnDescBoldMisc25, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.693] - InsnDescBoldMisc26, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.736] - InsnDescBoldMisc27, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.781] - InsnDescBoldMisc28, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.802] - InsnDescBoldMisc29, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.995] - InsnDescBoldMisc30, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.201] - InsnDescBoldMisc31, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.258] - InsnDescBoldMisc32, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.363] - InsnDescBoldMisc33, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.442] - InsnDescBoldMisc34, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.473] - InsnDescBoldMisc35, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.485] - InsnDescBoldMisc36, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.512] - InsnDescBoldMisc37, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.543] - InsnDescBoldMisc38, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.613] - InsnDescBoldMisc39, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.744] - InsnDescBoldMisc40, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.774] - InsnDescBoldMisc41, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.809] - InsnDescBoldMisc42, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.849] - InsnDescBoldMisc43, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.911] - InsnDescBoldMisc44, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.92] - InsnDescBoldMisc45, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.962] - InsnDescBoldMisc46, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.981] - InsnDescBoldMisc47, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.146] - InsnDescBoldMisc48, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.213] - InsnDescBoldMisc49, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.221] - InsnDescBoldMisc50, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.243] - InsnDescBoldMisc51, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.55] - InsnDescBoldMisc52, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.62] - InsnDescBoldMisc53, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.699] - InsnDescBoldMisc54, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.725] - InsnDescBoldMisc55, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.751] - InsnDescBoldMisc56, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.856] - InsnDescBoldMisc57, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.029] - InsnDescBoldMisc58, - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.406] - InsnDescBoldMisc59, - }, - #[group] - InsnDescSubscript { - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.978] - InsnDescSubscript, - }, - #[group] - InsnDescBoldSubscript { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.978] - InsnDescBoldSubscript, - }, - #[group] - InsnDescItalicSubscript { - #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 5.978] - InsnDescItalicSubscript, - }, - #[group] - InsnDescBoldItalicSubscript { - #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 5.978] - InsnDescBoldItalicSubscript, - }, - #[group] - InsnExtMnemonic { - #[name_with_tag = "APUYSQ+zcoN-Regular", size = 8.966] - InsnExtMnemonic, - }, - #[group] - InsnCode { - #[name_with_tag = "APUYSQ+zcoN-Regular", size = 7.97] - InsnCode0, - #[name_with_tag = "RRFUNA+CMSY8", size = 7.97] - InsnCode1, - #[name_with_tag = "HPXOZC+CMSS8", size = 7.97] - InsnCode2, - }, - #[group] - InsnCodeSubscript { - #[name_with_tag = "APUYSQ+zcoN-Regular", size = 5.978] - InsnCodeSubscript0, - #[name_with_tag = "DBQTKF+CMSY6", size = 5.978] - InsnCodeSubscript1, - }, - #[group] - TitlePageBig { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 24.787] - TitlePageBig, - }, - #[group] - TitlePageVersion { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 9.963] - TitlePageVersion, - }, - #[group] - TitlePageTm { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.974] - TitlePageTm, - }, - #[group] - TitlePageRev { - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.974] - TitlePageRev, - }, - #[group] - TitlePageBook { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 20.663] - TitlePageBook, - }, - #[group] - LegalPageItalic { - #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 9.963] - LegalPageItalic, - }, - #[group] - ChangeSummaryPageBold { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 11.955] - ChangeSummaryPageBold, - }, - #[group] - ChapterTitle { - #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 17.215] - ChapterTitle, - }, - #[group] - MathMisc { - #[name_with_tag = "AAJMKT+CMMI6", size = 5.978] - MathMisc0, - #[name_with_tag = "CUTMFD+CMSSI8", size = 5.978] - MathMisc1, - #[name_with_tag = "CUTMFD+CMSSI8", size = 7.97] - MathMisc2, - #[name_with_tag = "FZTIYT+CMMI9", size = 5.734] - MathMisc3, - #[name_with_tag = "FZTIYT+CMMI9", size = 7.168] - MathMisc4, - #[name_with_tag = "HONFQS+CMMI8", size = 7.97] - MathMisc5, - #[name_with_tag = "HPXOZC+CMSS8", size = 5.978] - MathMisc6, - #[name_with_tag = "LLVRDD+CMSY10", size = 11.955] - MathMisc7, - #[name_with_tag = "ZJTMSG+CMSS9", size = 7.168] - MathMisc8, - }, - } -} - -impl Font { - const fn space_width(&self) -> f32 { - self.size() * const { 3.985 / Font::InsnCode0.size() } - } - const fn line_height_helper(&self) -> f32 { - let font_name = self.font_name(); - let mut i = 0; - while i < KnownFontGroup::INSN_CODE_FONT_GROUPS.len() { - let fonts = KnownFontGroup::INSN_CODE_FONT_GROUPS[i].fonts(); - let mut j = 0; - while j < fonts.len() { - if str_eq(font_name, fonts[j].font_name()) { - return 9.464 * self.size() / Font::InsnCode0.size(); - } - j += 1; - } - i += 1; - } - let group = self.known_font_group(); - if matches!(group, Some(KnownFontGroup::InsnDesc)) - || str_eq(font_name, Font::InsnDesc0.font_name()) - || str_eq(font_name, Font::InsnDescBold.font_name()) - || str_eq(font_name, Font::InsnDescItalic.font_name()) - || str_eq(font_name, Font::InsnDescBoldItalic.font_name()) - || matches!(group, Some(KnownFontGroup::MathMisc)) - { - return 10.959 * self.size() / Font::InsnDesc0.size(); - } - panic!("no line height") - } -} - -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -struct Char { - font: Font, - text: String, - min_x: NonNaNF32, - min_y: NonNaNF32, - max_x: NonNaNF32, - max_y: NonNaNF32, -} - -impl Char { - fn width(&self) -> f32 { - self.max_x.get() - self.min_x.get() - } - fn height(&self) -> f32 { - self.max_y.get() - self.min_y.get() - } - fn top_down_left_to_right_sort_key(&self) -> impl Ord + use<> { - (-self.min_y, self.min_x) - } -} - -const COLUMN_SPLIT_X: f32 = 300.0; -const PAGE_BODY_MAX_X: f32 = 600.0; -const PAGE_BODY_MIN_X: f32 = 50.0; -const PAGE_BODY_MAX_Y: f32 = 780.0; -const PAGE_BODY_MIN_Y: f32 = 45.0; -const ONE_TITLE_LINE_SPLIT_Y: f32 = 734.0; -const TWO_TITLE_LINES_SPLIT_Y: f32 = 715.0; -const INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT: f32 = 29.938; -const INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT: f32 = 9.278; -const INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT: f32 = 20.971; -const INSN_BIT_FIELDS_TOP_PAD_HEIGHT: f32 = 20.175; -const INSN_BIT_FIELDS_TOP_PAD_HEIGHT2: f32 = 14.694; -const INSN_BIT_FIELDS_BOX_HEIGHT: f32 = 22.317; -const INSN_SP_REGS_ALTERED_REGISTER_COLUMN_X: f32 = 34.405; -const INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X: f32 = 86.692; -const INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X: f32 = 188.74; - -#[derive(Clone)] -struct ParsedTextLine { - element: xml_tree::Element, - regular_min_y: f32, - regular_max_y: f32, - fonts: TextLineFonts, - chars: Vec, - preceding_blank_lines: u32, -} - -impl ParsedTextLine { - fn regular_height(&self) -> f32 { - self.regular_max_y - self.regular_min_y - } - fn get_header_text(&self) -> Option { - assert_eq!(self.fonts, TextLineFonts::InsnDescFonts); - if !self.element.text.trim().is_empty() { - return None; - } - if !self.element.tail.trim().is_empty() { - return None; - } - let [b] = &*self.element.children else { - return None; - }; - if b.tag.normal() != Some("b") { - return None; - } - if b.children.len() != 0 { - return None; - } - let text = self.element.inner_text(); - // should also check titlecase, but rust doesn't include that in std - if text.ends_with(":") && text.chars().next().is_some_and(|ch| ch.is_uppercase()) { - Some(text) - } else { - None - } - } - fn write_xml(&self, parent: &mut xml_tree::Element, trailing_nl: bool) { - for _ in 0..self.preceding_blank_lines { - parent.sub_element("br".into(), []).tail = "\n".into(); - } - if let Some(last_child) = parent.children.last_mut() { - last_child.tail += &self.element.text; - } else { - parent.text += &self.element.text; - } - parent.children.extend_from_slice(&self.element.children); - if trailing_nl { - parent.sub_element("br".into(), []).tail = "\n".into(); - } - } - fn write_xml_lines( - lines: impl IntoIterator>, - parent: &mut xml_tree::Element, - trailing_nl: bool, - preceding_nl: bool, - ) { - if preceding_nl { - parent.sub_element("br".into(), []).tail = "\n".into(); - } - let mut first = true; - for line in lines { - let line = std::borrow::Borrow::borrow(&line); - if first { - first = false; - } else { - parent.sub_element("br".into(), []).tail = "\n".into(); - } - line.write_xml(parent, false); - } - if trailing_nl { - parent.sub_element("br".into(), []).tail = "\n".into(); - } - } -} - -impl fmt::Debug for ParsedTextLine { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Self { - element, - regular_min_y, - regular_max_y, - fonts, - chars, - preceding_blank_lines, - } = self; - f.debug_struct("ParsedTextLine") - .field("element", &format_args!("{element}")) - .field("regular_min_y", regular_min_y) - .field("regular_max_y", regular_max_y) - .field("fonts", fonts) - .field("chars", chars) - .field("preceding_blank_lines", preceding_blank_lines) - .finish() - } -} - -impl fmt::Display for ParsedTextLine { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for _ in 0..self.preceding_blank_lines { - f.write_str("\n")?; - } - self.element.fmt(f) - } -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -enum BaselinePos { - Above, - Below, -} - -macro_rules! make_enum_with_values { - ( - $(#[$enum_meta:meta])* - enum $Enum:ident { - $($Variant:ident,)* - } - ) => { - $(#[$enum_meta])* - enum $Enum { - $($Variant,)* - } - - impl $Enum { - const VALUES: &[Self] = &[$(Self::$Variant,)*]; - } - }; -} - -make_enum_with_values! { - #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] - enum TextLineFonts { - InsnMnemonicFonts, - InsnHeaderFonts, - InsnBitFieldBitNumberFonts, - InsnBitFieldNameFonts, - InsnBitFieldsAffixTitleFonts, - InsnCodeFonts, - InsnDescFonts, - } -} - -impl TextLineFonts { - fn regular(self) -> &'static [Font] { - match self { - TextLineFonts::InsnMnemonicFonts => KnownFontGroup::InsnDesc.fonts(), - TextLineFonts::InsnHeaderFonts => &[Font::InsnHeader], - TextLineFonts::InsnBitFieldBitNumberFonts => &[Font::InsnDescSmall, Font::TitlePageRev], - TextLineFonts::InsnBitFieldNameFonts => KnownFontGroup::InsnDesc.fonts(), - TextLineFonts::InsnBitFieldsAffixTitleFonts => &[Font::InsnDescSmall], - TextLineFonts::InsnCodeFonts => KnownFontGroup::InsnCode.fonts(), - TextLineFonts::InsnDescFonts => { - static FONTS: OnceLock> = OnceLock::new(); - FONTS.get_or_init(|| { - Box::from_iter( - KnownFontGroup::InsnDesc - .fonts() - .iter() - .cloned() - .chain([Font::InsnDescSmall]), - ) - }) - } - } - } - fn italic(self) -> Option<&'static [Font]> { - match self { - TextLineFonts::InsnMnemonicFonts => None, - TextLineFonts::InsnHeaderFonts => None, - TextLineFonts::InsnBitFieldBitNumberFonts => None, - TextLineFonts::InsnBitFieldNameFonts => None, - TextLineFonts::InsnBitFieldsAffixTitleFonts => None, - TextLineFonts::InsnCodeFonts => None, - TextLineFonts::InsnDescFonts => { - Some(&[Font::InsnDescItalic, Font::InsnDescSmallItalic]) - } - } - } - fn bold(self) -> Option<&'static [Font]> { - match self { - TextLineFonts::InsnMnemonicFonts => None, - TextLineFonts::InsnHeaderFonts => None, - TextLineFonts::InsnBitFieldBitNumberFonts => None, - TextLineFonts::InsnBitFieldNameFonts => None, - TextLineFonts::InsnBitFieldsAffixTitleFonts => Some(&[Font::InsnDescSmallBold]), - TextLineFonts::InsnCodeFonts => None, - TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescBold, Font::InsnDescSmallBold]), - } - } - fn bold_italic(self) -> Option<&'static [Font]> { - match self { - TextLineFonts::InsnMnemonicFonts => None, - TextLineFonts::InsnHeaderFonts => None, - TextLineFonts::InsnBitFieldBitNumberFonts => None, - TextLineFonts::InsnBitFieldNameFonts => None, - TextLineFonts::InsnBitFieldsAffixTitleFonts => None, - TextLineFonts::InsnCodeFonts => None, - TextLineFonts::InsnDescFonts => { - Some(&[Font::InsnDescBoldItalic, Font::InsnDescSmallBoldItalic]) - } - } - } - fn subscript(self) -> Option<&'static [Font]> { - match self { - TextLineFonts::InsnMnemonicFonts => None, - TextLineFonts::InsnHeaderFonts => None, - TextLineFonts::InsnBitFieldBitNumberFonts => None, - TextLineFonts::InsnBitFieldNameFonts => Some(&[Font::InsnDescSubscript]), - TextLineFonts::InsnBitFieldsAffixTitleFonts => None, - TextLineFonts::InsnCodeFonts => Some(KnownFontGroup::InsnCodeSubscript.fonts()), - TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescSubscript]), - } - } - fn bold_subscript(self) -> Option<&'static [Font]> { - match self { - TextLineFonts::InsnMnemonicFonts => None, - TextLineFonts::InsnHeaderFonts => None, - TextLineFonts::InsnBitFieldBitNumberFonts => None, - TextLineFonts::InsnBitFieldNameFonts => None, - TextLineFonts::InsnBitFieldsAffixTitleFonts => None, - TextLineFonts::InsnCodeFonts => None, - TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescBoldSubscript]), - } - } - fn italic_subscript(self) -> Option<&'static [Font]> { - match self { - TextLineFonts::InsnMnemonicFonts => None, - TextLineFonts::InsnHeaderFonts => None, - TextLineFonts::InsnBitFieldBitNumberFonts => None, - TextLineFonts::InsnBitFieldNameFonts => None, - TextLineFonts::InsnBitFieldsAffixTitleFonts => None, - TextLineFonts::InsnCodeFonts => None, - TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescItalicSubscript]), - } - } - fn bold_italic_subscript(self) -> Option<&'static [Font]> { - match self { - TextLineFonts::InsnMnemonicFonts => None, - TextLineFonts::InsnHeaderFonts => None, - TextLineFonts::InsnBitFieldBitNumberFonts => None, - TextLineFonts::InsnBitFieldNameFonts => None, - TextLineFonts::InsnBitFieldsAffixTitleFonts => None, - TextLineFonts::InsnCodeFonts => None, - TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescBoldItalicSubscript]), - } - } - fn code(self) -> Option<&'static [Font]> { - match self { - TextLineFonts::InsnMnemonicFonts => None, - TextLineFonts::InsnHeaderFonts => None, - TextLineFonts::InsnBitFieldBitNumberFonts => None, - TextLineFonts::InsnBitFieldNameFonts => None, - TextLineFonts::InsnBitFieldsAffixTitleFonts => None, - TextLineFonts::InsnCodeFonts => None, - TextLineFonts::InsnDescFonts => Some(&[Font::InsnDescCode, Font::InsnExtMnemonic]), - } - } - fn code_subscript(self) -> Option<&'static [Font]> { - match self { - TextLineFonts::InsnMnemonicFonts => None, - TextLineFonts::InsnHeaderFonts => None, - TextLineFonts::InsnBitFieldBitNumberFonts => None, - TextLineFonts::InsnBitFieldNameFonts => None, - TextLineFonts::InsnBitFieldsAffixTitleFonts => None, - TextLineFonts::InsnCodeFonts => None, - TextLineFonts::InsnDescFonts => Some(KnownFontGroup::InsnCodeSubscript.fonts()), - } - } - fn get_fonts( - self, - part_kind: TextLineFontKind, - ) -> Option<(&'static [Font], Option)> { - let fonts = match part_kind { - TextLineFontKind::Regular => self.regular(), - TextLineFontKind::Italic => self.italic()?, - TextLineFontKind::Bold => self.bold()?, - TextLineFontKind::BoldItalic => self.bold_italic()?, - TextLineFontKind::Subscript => self.subscript()?, - TextLineFontKind::Superscript => self.subscript()?, - TextLineFontKind::BoldSubscript => self.bold_subscript()?, - TextLineFontKind::BoldSuperscript => self.bold_subscript()?, - TextLineFontKind::ItalicSubscript => self.italic_subscript()?, - TextLineFontKind::ItalicSuperscript => self.italic_subscript()?, - TextLineFontKind::BoldItalicSubscript => self.bold_italic_subscript()?, - TextLineFontKind::BoldItalicSuperscript => self.bold_italic_subscript()?, - TextLineFontKind::Code => self.code()?, - TextLineFontKind::CodeSubscript => self.code_subscript()?, - TextLineFontKind::CodeSuperscript => self.code_subscript()?, - }; - Some((fonts, part_kind.sub_super().baseline_pos())) - } - fn font_to_kind_map(self) -> &'static HashMap<(Font, Option), TextLineFontKind> { - static MAPS: OnceLock< - HashMap), TextLineFontKind>>, - > = OnceLock::new(); - &MAPS.get_or_init(|| { - Self::VALUES - .iter() - .map(|&this: &TextLineFonts| { - let mut map = HashMap::new(); - for &kind in TextLineFontKind::VALUES { - let Some((fonts, baseline_pos)) = this.get_fonts(kind) else { - continue; - }; - for font in fonts { - let old_kind = map.insert((font.clone(), baseline_pos), kind); - assert!( - old_kind.is_none(), - "duplicate font: kind={kind:?} old_kind={old_kind:?} font={font:?}" - ); - } - } - (this, map) - }) - .collect() - })[&self] - } - fn fonts(self) -> &'static HashSet { - static SETS: OnceLock>> = OnceLock::new(); - &SETS.get_or_init(|| { - Self::VALUES - .iter() - .map(|&this: &TextLineFonts| { - let mut set = HashSet::new(); - for &kind in TextLineFontKind::VALUES { - let Some((fonts, _baseline_pos)) = this.get_fonts(kind) else { - continue; - }; - set.extend(fonts.iter().cloned()); - } - (this, set) - }) - .collect() - })[&self] - } - fn get_kind(self, font: Font, baseline_pos: BaselinePos) -> Option { - let font_to_kind_map = self.font_to_kind_map(); - font_to_kind_map - .get(&(font.clone(), Some(baseline_pos))) - .or_else(|| font_to_kind_map.get(&(font, None))) - .copied() - } -} - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] -enum FontVariantCode { - Code, - NotCode, -} - -impl FontVariantCode { - const fn value(self) -> &'static [&'static str] { - match self { - Self::Code => &["code"], - Self::NotCode => &[], - } - } -} - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] -enum FontVariantBold { - Bold, - NotBold, -} - -impl FontVariantBold { - const fn value(self) -> &'static [&'static str] { - match self { - Self::Bold => &["b"], - Self::NotBold => &[], - } - } -} - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] -enum FontVariantItalic { - Italic, - NotItalic, -} - -impl FontVariantItalic { - const fn value(self) -> &'static [&'static str] { - match self { - Self::Italic => &["i"], - Self::NotItalic => &[], - } - } -} - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] -enum FontVariantSubSuper { - NotSubSuper, - Subscript, - Superscript, -} - -impl FontVariantSubSuper { - const fn value(self) -> &'static [&'static str] { - match self { - Self::NotSubSuper => &[], - Self::Subscript => &["sub"], - Self::Superscript => &["sup"], - } - } -} - -impl FontVariantSubSuper { - fn baseline_pos(self) -> Option { - match self { - FontVariantSubSuper::NotSubSuper => None, - FontVariantSubSuper::Subscript => Some(BaselinePos::Below), - FontVariantSubSuper::Superscript => Some(BaselinePos::Above), - } - } -} - -make_enum_with_values! { - #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] - enum TextLineFontKind { - Regular, - Subscript, - Superscript, - Italic, - ItalicSubscript, - ItalicSuperscript, - Bold, - BoldSubscript, - BoldSuperscript, - BoldItalic, - BoldItalicSubscript, - BoldItalicSuperscript, - Code, - CodeSubscript, - CodeSuperscript, - } -} - -impl TextLineFontKind { - fn code(self) -> FontVariantCode { - match self { - Self::Regular - | Self::Subscript - | Self::Superscript - | Self::Italic - | Self::ItalicSubscript - | Self::ItalicSuperscript - | Self::Bold - | Self::BoldSubscript - | Self::BoldSuperscript - | Self::BoldItalic - | Self::BoldItalicSubscript - | Self::BoldItalicSuperscript => FontVariantCode::NotCode, - Self::Code | Self::CodeSubscript | Self::CodeSuperscript => FontVariantCode::Code, - } - } - fn bold(self) -> FontVariantBold { - match self { - Self::Regular - | Self::Subscript - | Self::Superscript - | Self::Italic - | Self::ItalicSubscript - | Self::ItalicSuperscript => FontVariantBold::NotBold, - Self::Bold - | Self::BoldSubscript - | Self::BoldSuperscript - | Self::BoldItalic - | Self::BoldItalicSubscript - | Self::BoldItalicSuperscript => FontVariantBold::Bold, - Self::Code | Self::CodeSubscript | Self::CodeSuperscript => FontVariantBold::NotBold, - } - } - fn italic(self) -> FontVariantItalic { - match self { - Self::Regular | Self::Subscript | Self::Superscript => FontVariantItalic::NotItalic, - Self::Italic | Self::ItalicSubscript | Self::ItalicSuperscript => { - FontVariantItalic::Italic - } - Self::Bold | Self::BoldSubscript | Self::BoldSuperscript => { - FontVariantItalic::NotItalic - } - Self::BoldItalic | Self::BoldItalicSubscript | Self::BoldItalicSuperscript => { - FontVariantItalic::Italic - } - Self::Code | Self::CodeSubscript | Self::CodeSuperscript => { - FontVariantItalic::NotItalic - } - } - } - fn sub_super(self) -> FontVariantSubSuper { - match self { - Self::Regular => FontVariantSubSuper::NotSubSuper, - Self::Subscript => FontVariantSubSuper::Subscript, - Self::Superscript => FontVariantSubSuper::Superscript, - Self::Italic => FontVariantSubSuper::NotSubSuper, - Self::ItalicSubscript => FontVariantSubSuper::Subscript, - Self::ItalicSuperscript => FontVariantSubSuper::Superscript, - Self::Bold => FontVariantSubSuper::NotSubSuper, - Self::BoldSubscript => FontVariantSubSuper::Subscript, - Self::BoldSuperscript => FontVariantSubSuper::Superscript, - Self::BoldItalic => FontVariantSubSuper::NotSubSuper, - Self::BoldItalicSubscript => FontVariantSubSuper::Subscript, - Self::BoldItalicSuperscript => FontVariantSubSuper::Superscript, - Self::Code => FontVariantSubSuper::NotSubSuper, - Self::CodeSubscript => FontVariantSubSuper::Subscript, - Self::CodeSuperscript => FontVariantSubSuper::Superscript, - } - } - fn text_line_tags(self) -> impl Clone + Iterator { - self.code() - .value() - .iter() - .copied() - .chain(self.bold().value().iter().copied()) - .chain(self.italic().value().iter().copied()) - .chain(self.sub_super().value().iter().copied()) - } -} - -#[derive(Debug)] -struct ElementBodyBuilder<'a> { - containing_element: &'a mut xml_tree::Element, - stack: Vec, -} - -impl<'a> ElementBodyBuilder<'a> { - fn new(containing_element: &'a mut xml_tree::Element) -> Self { - Self { - containing_element, - stack: Vec::with_capacity(5), - } - } - fn shrink_stack(&mut self, new_len: usize) { - while new_len < self.stack.len() { - let Some(element) = self.stack.pop() else { - unreachable!(); - }; - self.insert_point().children.push(element); - } - } - fn set_tag_stack<'b>(&mut self, tag_stack: impl IntoIterator) { - let mut new_len = 0; - for (i, tag) in tag_stack.into_iter().enumerate() { - new_len = i + 1; - if i >= self.stack.len() { - self.stack.push(xml_tree::Element::new(tag.into(), [])); - } else if self.stack[i].tag.normal() != Some(tag) { - self.shrink_stack(new_len); - } - } - self.shrink_stack(new_len); - } - fn write_text(&mut self, text: impl std::borrow::Borrow) { - let text = std::borrow::Borrow::borrow(&text); - let insert_point = self.insert_point(); - if let Some(child) = insert_point.children.last_mut() { - child.tail += text; - } else { - insert_point.text += text; - } - } - fn insert_point(&mut self) -> &mut xml_tree::Element { - self.stack.last_mut().unwrap_or(self.containing_element) - } - fn scope(&mut self, f: impl FnOnce(&mut Self) -> R) -> R { - let retval = f(self); - self.flush(); - retval - } - fn flush(&mut self) { - self.set_tag_stack([]); - } -} - -#[derive(Clone, Debug)] -struct InsnBitField { - box_min_x: f32, - box_max_x: f32, - name: ParsedTextLine, - bit_number: ParsedTextLine, -} - -impl fmt::Display for InsnBitField { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Self { - box_min_x, - box_max_x, - name, - bit_number, - } = self; - write!( - f, - "" - ) - } -} - -impl InsnBitField { - fn write_xml(&self, parent: &mut xml_tree::Element) { - let field = parent.sub_element("field".into(), []); - field.text = "\n".into(); - field.tail = "\n".into(); - let name = field.sub_element("name".into(), []); - name.tail = "\n".into(); - self.name.write_xml(name, false); - let bit_number = field.sub_element("bit-number".into(), []); - bit_number.tail = "\n".into(); - self.bit_number.write_xml(bit_number, false); - } -} - -#[derive(Clone, Debug)] -struct InsnBitFieldsPrefix { - box_min_x: f32, - box_min_y: f32, - box_max_x: f32, - box_max_y: f32, - prefix_text: ParsedTextLine, - fields: Vec, - suffix_text: ParsedTextLine, -} - -impl fmt::Display for InsnBitFieldsPrefix { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Self { - box_min_x, - box_min_y, - box_max_x, - box_max_y, - prefix_text, - fields, - suffix_text, - } = self; - writeln!( - f, - "") - } -} - -impl InsnBitFieldsPrefix { - fn write_xml(&self, parent: &mut xml_tree::Element) { - let prefix_elm = parent.sub_element("prefix".into(), []); - prefix_elm.text = "\n".into(); - prefix_elm.tail = "\n".into(); - let prefix_text = prefix_elm.sub_element("prefix-text".into(), []); - prefix_text.tail = "\n".into(); - self.prefix_text.write_xml(prefix_text, false); - InsnBitFields::write_xml_fields(&self.fields, prefix_elm); - let suffix_text = prefix_elm.sub_element("suffix-text".into(), []); - suffix_text.tail = "\n".into(); - self.suffix_text.write_xml(suffix_text, false); - } -} - -#[derive(Clone, Debug)] -struct InsnBitFields { - prefix: Option, - box_min_x: f32, - box_min_y: f32, - box_max_x: f32, - box_max_y: f32, - fields: Vec, -} - -impl fmt::Display for InsnBitFields { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Self { - prefix, - box_min_x, - box_min_y, - box_max_x, - box_max_y, - fields, - } = self; - if let Some(prefix) = prefix { - prefix.fmt(f)?; - } - writeln!( - f, - "") - } -} - -impl InsnBitFields { - fn write_xml_fields( - fields: impl IntoIterator>, - parent: &mut xml_tree::Element, - ) { - let fields_elm = parent.sub_element("fields".into(), []); - fields_elm.text = "\n".into(); - fields_elm.tail = "\n".into(); - for field in fields { - std::borrow::Borrow::borrow(&field).write_xml(fields_elm); - } - } - fn write_xml(&self, parent: &mut xml_tree::Element) { - let bit_fields = parent.sub_element("bit-fields".into(), []); - bit_fields.text = "\n".into(); - bit_fields.tail = "\n".into(); - if let Some(prefix) = &self.prefix { - prefix.write_xml(bit_fields); - } - Self::write_xml_fields(&self.fields, bit_fields) - } -} - -#[derive(Clone, Debug)] -struct InsnSpRegsAlteredEntry { - reg: ParsedTextLine, - fields: Vec, - conds: Vec, -} - -impl InsnSpRegsAlteredEntry { - fn display_fmt_with_indent(&self, f: &mut fmt::Formatter<'_>, indent: &str) -> fmt::Result { - let Self { reg, fields, conds } = self; - writeln!(f, "Entry(")?; - writeln!(f, "{indent} reg={reg},")?; - write!(f, "{indent} fields=")?; - if fields.is_empty() { - write!(f, "()")?; - } else { - writeln!(f, "(")?; - for field in fields { - writeln!(f, "{indent} {field},")?; - } - write!(f, "{indent} )")?; - } - writeln!(f, ",")?; - writeln!(f, "{indent} conds=")?; - if conds.is_empty() { - write!(f, "()")?; - } else { - writeln!(f, "(")?; - for cond in conds { - writeln!(f, "{indent} {cond},")?; - } - write!(f, "{indent} )")?; - } - writeln!(f, ",")?; - write!(f, "{indent})") - } - fn write_xml(&self, parent: &mut xml_tree::Element) { - let entry = parent.sub_element("entry".into(), []); - entry.text = "\n".into(); - entry.tail = "\n".into(); - let reg = entry.sub_element("register".into(), []); - reg.tail = "\n".into(); - self.reg.write_xml(reg, false); - let fields = entry.sub_element("fields".into(), []); - fields.tail = "\n".into(); - ParsedTextLine::write_xml_lines(&self.fields, fields, false, false); - let conds = entry.sub_element("conditions".into(), []); - conds.tail = "\n".into(); - ParsedTextLine::write_xml_lines(&self.conds, conds, false, false); - } -} - -impl fmt::Display for InsnSpRegsAlteredEntry { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.display_fmt_with_indent(f, "") - } -} - -#[derive(Clone, Debug)] -struct InsnSpRegsAltered { - sp_regs_altered_text: ParsedTextLine, - special_text: Option, - table_header_reg: Option, - table_header_fields: Option, - entries: Vec, - final_regular_min_y: f32, -} - -impl fmt::Display for InsnSpRegsAltered { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Self { - sp_regs_altered_text, - special_text, - table_header_reg, - table_header_fields, - entries, - final_regular_min_y, - } = self; - writeln!(f, "InsnSpRegsAltered(")?; - writeln!(f, " sp_regs_altered_text={sp_regs_altered_text},")?; - if let Some(special_text) = special_text { - writeln!(f, " special_text={special_text},")?; - } - if let Some(table_header_reg) = table_header_reg { - writeln!(f, " table_header_reg={table_header_reg},")?; - } - if let Some(table_header_fields) = table_header_fields { - writeln!(f, " table_header_fields={table_header_fields},")?; - } - if self.entries.is_empty() { - writeln!(f, " entries=(),")?; - } else { - writeln!(f, " entries=(")?; - for entry in entries { - write!(f, " ")?; - entry.display_fmt_with_indent(f, " ")?; - writeln!(f, ",")?; - } - writeln!(f, " ),")?; - } - writeln!(f, " final_regular_min_y={final_regular_min_y},")?; - write!(f, ")") - } -} - -impl InsnSpRegsAltered { - fn write_xml(&self, parent: &mut xml_tree::Element) { - let sp_regs_altered = parent.sub_element("special-registers-altered".into(), []); - sp_regs_altered.text = "\n".into(); - sp_regs_altered.tail = "\n".into(); - let title = sp_regs_altered.sub_element("title".into(), []); - title.tail = "\n".into(); - self.sp_regs_altered_text.write_xml(title, false); - if let Some(special_text) = &self.special_text { - let special_text_el = sp_regs_altered.sub_element("special-text".into(), []); - special_text_el.tail = "\n".into(); - special_text.write_xml(special_text_el, false); - } - if let Some(table_header_reg) = &self.table_header_reg { - let table_header_reg_el = - sp_regs_altered.sub_element("table-header-register".into(), []); - table_header_reg_el.tail = "\n".into(); - table_header_reg.write_xml(table_header_reg_el, false); - } - if let Some(table_header_fields) = &self.table_header_fields { - let table_header_fields_el = - sp_regs_altered.sub_element("table-header-fields".into(), []); - table_header_fields_el.tail = "\n".into(); - table_header_fields.write_xml(table_header_fields_el, false); - } - for entry in &self.entries { - entry.write_xml(sp_regs_altered); - } - } -} - -#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] -enum InsnParseSection { - Code, - Header, - Desc, -} - -#[derive(Clone, Debug)] -enum PageItem { - Char(Char), - LineOrRect(LineOrRect), -} - -#[derive(Copy, Clone, Debug)] -enum LineOrRect { - Line(Line), - Rect(Rect), -} - -impl LineOrRect { - fn width(self) -> f32 { - match self { - Self::Line(v) => v.width(), - Self::Rect(v) => v.width(), - } - } - fn height(self) -> f32 { - match self { - Self::Line(v) => v.height(), - Self::Rect(v) => v.height(), - } - } - fn min_x(self) -> NonNaNF32 { - match self { - Self::Line(v) => v.min_x(), - Self::Rect(v) => v.min_x, - } - } - fn max_x(self) -> NonNaNF32 { - match self { - Self::Line(v) => v.max_x(), - Self::Rect(v) => v.max_x, - } - } - fn min_y(self) -> NonNaNF32 { - match self { - Self::Line(v) => v.min_y(), - Self::Rect(v) => v.min_y, - } - } - fn max_y(self) -> NonNaNF32 { - match self { - Self::Line(v) => v.max_y(), - Self::Rect(v) => v.max_y, - } - } -} - -#[derive(Copy, Clone, Debug)] -struct Line { - p0_x: NonNaNF32, - p0_y: NonNaNF32, - p1_x: NonNaNF32, - p1_y: NonNaNF32, -} - -impl Line { - fn width(self) -> f32 { - f32::abs(self.p0_x.get() - self.p1_x.get()) - } - fn height(self) -> f32 { - f32::abs(self.p0_y.get() - self.p1_y.get()) - } - fn min_x(self) -> NonNaNF32 { - self.p0_x.min(self.p1_x) - } - fn max_x(self) -> NonNaNF32 { - self.p0_x.max(self.p1_x) - } - fn min_y(self) -> NonNaNF32 { - self.p0_y.min(self.p1_y) - } - fn max_y(self) -> NonNaNF32 { - self.p0_y.max(self.p1_y) - } -} - -#[derive(Copy, Clone, Debug)] -struct Rect { - min_x: NonNaNF32, - max_x: NonNaNF32, - min_y: NonNaNF32, - max_y: NonNaNF32, -} - -impl Rect { - fn width(self) -> f32 { - self.max_x.get() - self.min_x.get() - } - fn height(self) -> f32 { - self.max_y.get() - self.min_y.get() - } -} - -#[derive(Debug)] -struct Page { - page_num: u32, - qt: BTreeMap>, - unprocessed_chars: - Rc>>>>>>, - unprocessed_non_text: Rc>>, -} - -struct Pages<'ctx> { - pages_gen: Option> + 'ctx>>, - pages: BTreeMap>, - max_page_num: u32, -} - -impl<'ctx> fmt::Debug for Pages<'ctx> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Self { - pages_gen, - pages, - max_page_num, - } = self; - f.debug_struct("Pages") - .field( - "pages_gen", - &pages_gen.is_some().then_some(format_args!("...")), - ) - .field("pages", pages) - .field("max_page_num", max_page_num) - .finish() - } -} - -impl<'ctx> Pages<'ctx> { - fn new(pages_gen: Option> + 'ctx>>) -> Self { - Self { - pages_gen, - pages: BTreeMap::new(), - max_page_num: 0, - } - } - fn close(&mut self) { - self.pages_gen = None; - } - fn is_past_end(&mut self, page_num: u32) -> Result { - while self.pages_gen.is_some() && page_num > self.max_page_num { - self.fill_page()?; - } - Ok(page_num > self.max_page_num) - } - fn fill_page(&mut self) -> Result { - let Some(pages_gen) = &mut self.pages_gen else { - return Ok(false); - }; - let page = pages_gen.next(); - let Some(page) = page else { - self.close(); - return Ok(false); - }; - let page = page?; - let page_num = page.page_num; - assert!( - page_num > self.max_page_num, - "page numbers must be a strictly-increasing positive integer sequence:\n\ - got {page_num} which isn't more than {}", - self.max_page_num - ); - self.pages.insert(page_num, Rc::new(page)); - self.max_page_num = page_num; - Ok(true) - } - fn get(&mut self, page_num: u32) -> Result>, Error> { - loop { - if let Some(page) = self.pages.get(&page_num) { - return Ok(Some(page.clone())); - } - if self.pages_gen.is_none() { - return Ok(None); - } - if page_num < self.max_page_num { - return Ok(None); - } - self.fill_page()?; - } - } -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] -struct TextSection { - page_num: u32, - min_x: NonNaNF32, - min_y: NonNaNF32, - max_x: NonNaNF32, - max_y: NonNaNF32, -} - -struct TextSectionPagesData { - columns_then_full_page: BTreeMap, - full_page_then_columns: BTreeMap, - one_title_line_then_columns_then_full_page: BTreeMap, - two_title_lines_then_columns_then_full_page: BTreeMap, - columns_then_columns: BTreeMap, - one_title_line_then_columns_then_columns: BTreeMap, - one_title_line_then_columns: BTreeSet, - two_title_lines_then_columns: BTreeSet, - full_page: BTreeSet, -} - -impl TextSectionPagesData { - fn get() -> &'static Self { - static DATA: OnceLock = OnceLock::new(); - DATA.get_or_init(|| Self { - columns_then_full_page: FromIterator::from_iter([ - (129, 438.992), - (241, 512.419), - (242, 408.077), - (243, 488.509), - (244, 437.518), - (245, 444.522), - (247, 352.082), - (248, 356.723), - (249, 365.944), - (251, 334.553), - (264, 184.67), - (296, 267.29), - (297, 200.043), - (298, 440.64), - (299, 197.356), - (300, 160.076), - (301, 364.924), - (303, 330.055), - (305, 344.867), - (306, 335.403), - (307, 336.897), - (308, 365.233), - (309, 364.735), - ]), - full_page_then_columns: FromIterator::from_iter([ - (246, 689.039), - (250, 615.315), - (266, 678.088), - ]), - one_title_line_then_columns_then_full_page: FromIterator::from_iter([(128, 301.55)]), - two_title_lines_then_columns_then_full_page: FromIterator::from_iter([(304, 242.732)]), - columns_then_columns: FromIterator::from_iter([(79, 621.66), (126, 519.89)]), - one_title_line_then_columns_then_columns: FromIterator::from_iter([ - (130, 550.43), - (162, 599.247), - (194, 622.161), - (196, 682.933), - (204, 613.195), - (215, 633.12), - ]), - one_title_line_then_columns: FromIterator::from_iter([ - 103, 104, 105, 121, 139, 143, 146, 151, 153, 158, 207, 213, 218, - ]), - two_title_lines_then_columns: FromIterator::from_iter([198, 206]), - full_page: FromIterator::from_iter( - [ - 118, 157, 252, 254, 255, 257, 259, 260, 265, 268, 270, 271, 272, - ] - .into_iter() - .chain(274..286), - ), - }) - } -} - -impl TextSection { - fn first() -> TextSection { - Self::page_sections(1)[0] - } - - fn next(self) -> TextSection { - let page_sections = Self::page_sections(self.page_num); - let Some(index) = page_sections.iter().position(|v| *v == self) else { - panic!("not a known TextSection: {self:?}"); - }; - if let Some(&retval) = page_sections.get(index + 1) { - return retval; - } - for page_num in self.page_num + 1..self.page_num + 100000 { - let page_sections = Self::page_sections(page_num); - if let Some(&retval) = page_sections.get(0) { - return retval; - } - } - panic!("can't find next TextSection after {self:?}") - } - - fn new(page_num: u32, min_x: f32, min_y: f32, max_x: f32, max_y: f32) -> Self { - Self { - page_num, - min_x: NonNaNF32::new(min_x).expect("invalid min_x"), - min_y: NonNaNF32::new(min_y).expect("invalid min_y"), - max_x: NonNaNF32::new(max_x).expect("invalid max_x"), - max_y: NonNaNF32::new(max_y).expect("invalid max_y"), - } - } - - fn left_column(page_num: u32, min_y: f32, max_y: f32) -> TextSection { - Self::new(page_num, PAGE_BODY_MIN_X, min_y, COLUMN_SPLIT_X, max_y) - } - - fn right_column(page_num: u32, min_y: f32, max_y: f32) -> TextSection { - Self::new(page_num, COLUMN_SPLIT_X, min_y, PAGE_BODY_MAX_X, max_y) - } - - fn columns(page_num: u32, min_y: f32, max_y: f32) -> [TextSection; 2] { - [ - Self::left_column(page_num, min_y, max_y), - Self::right_column(page_num, min_y, max_y), - ] - } - - fn full_page(page_num: u32, min_y: f32, max_y: f32) -> TextSection { - Self::new(page_num, PAGE_BODY_MIN_X, min_y, PAGE_BODY_MAX_X, max_y) - } - - fn page_sections_helper(page_num: u32) -> Box<[TextSection]> { - let TextSectionPagesData { - columns_then_full_page, - full_page_then_columns, - one_title_line_then_columns_then_full_page, - two_title_lines_then_columns_then_full_page, - columns_then_columns, - one_title_line_then_columns_then_columns, - one_title_line_then_columns, - two_title_lines_then_columns, - full_page, - } = TextSectionPagesData::get(); - if let Some(split_y) = columns_then_columns.get(&page_num) { - return Box::from_iter( - Self::columns(page_num, *split_y, PAGE_BODY_MAX_Y) - .into_iter() - .chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)), - ); - } - if one_title_line_then_columns.contains(&page_num) { - return Box::from_iter( - [Self::full_page( - page_num, - ONE_TITLE_LINE_SPLIT_Y, - PAGE_BODY_MAX_Y, - )] - .into_iter() - .chain(Self::columns( - page_num, - PAGE_BODY_MIN_Y, - ONE_TITLE_LINE_SPLIT_Y, - )), - ); - } - if full_page.contains(&page_num) { - return Box::new([Self::full_page(page_num, PAGE_BODY_MIN_Y, PAGE_BODY_MAX_Y)]); - } - if let Some(split_y) = one_title_line_then_columns_then_columns.get(&page_num) { - return Box::from_iter( - [Self::full_page( - page_num, - ONE_TITLE_LINE_SPLIT_Y, - PAGE_BODY_MAX_Y, - )] - .into_iter() - .chain(Self::columns(page_num, *split_y, ONE_TITLE_LINE_SPLIT_Y)) - .chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)), - ); - } - if two_title_lines_then_columns.contains(&page_num) { - return Box::from_iter( - [Self::full_page( - page_num, - TWO_TITLE_LINES_SPLIT_Y, - PAGE_BODY_MAX_Y, - )] - .into_iter() - .chain(Self::columns( - page_num, - PAGE_BODY_MIN_Y, - TWO_TITLE_LINES_SPLIT_Y, - )), - ); - } - if let Some(split_y) = columns_then_full_page.get(&page_num) { - return Box::from_iter( - Self::columns(page_num, *split_y, PAGE_BODY_MAX_Y) - .into_iter() - .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]), - ); - } - if let Some(split_y) = full_page_then_columns.get(&page_num) { - return Box::from_iter( - [Self::full_page(page_num, *split_y, PAGE_BODY_MAX_Y)] - .into_iter() - .chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)), - ); - } - if let Some(split_y) = one_title_line_then_columns_then_full_page.get(&page_num) { - return Box::from_iter( - [Self::full_page( - page_num, - ONE_TITLE_LINE_SPLIT_Y, - PAGE_BODY_MAX_Y, - )] - .into_iter() - .chain(Self::columns(page_num, *split_y, ONE_TITLE_LINE_SPLIT_Y)) - .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]), - ); - } - if let Some(split_y) = two_title_lines_then_columns_then_full_page.get(&page_num) { - return Box::from_iter( - [Self::full_page( - page_num, - TWO_TITLE_LINES_SPLIT_Y, - PAGE_BODY_MAX_Y, - )] - .into_iter() - .chain(Self::columns(page_num, *split_y, TWO_TITLE_LINES_SPLIT_Y)) - .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]), - ); - } - if page_num == 263 { - return Box::from_iter( - [Self::full_page(page_num, 699.997, PAGE_BODY_MAX_Y)] - .into_iter() - .chain(Self::columns(page_num, 366.396, 699.997)) - .chain(Self::columns(page_num, 207.0, 366.396)) - .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, 207.0)]), - ); - } - // TODO: checked up to page 309 (page named 273) - Box::new(Self::columns(page_num, PAGE_BODY_MIN_Y, PAGE_BODY_MAX_Y)) - } - fn page_sections(page_num: u32) -> &'static [TextSection] { - static CACHE: [OnceLock>; 2000] = [const { OnceLock::new() }; _]; - CACHE - .get(page_num as usize) - .expect("page_num out of range") - .get_or_init(|| Self::page_sections_helper(page_num)) - } - fn for_position(page_num: u32, x: f32, y: f32) -> Option { - for &i in Self::page_sections(page_num) { - if i.min_x.get() <= x && x <= i.max_x.get() && i.min_y.get() <= y && y <= i.max_y.get() - { - return Some(i); - } - } - None - } -} - -#[derive(Debug, Clone)] -struct InsnHeader { - header_lines: Vec, - mnemonic_lines: Vec, - bit_fields: InsnBitFields, -} - -impl InsnHeader { - fn min_y(&self) -> f32 { - self.bit_fields.box_min_y - } - fn write_xml(&self, parent: &mut xml_tree::Element) { - let header = parent.sub_element("header".into(), []); - header.text = "\n".into(); - header.tail = "\n".into(); - let title = header.sub_element("title".into(), []); - title.tail = "\n".into(); - ParsedTextLine::write_xml_lines(&self.header_lines, title, false, false); - let mnemonics = header.sub_element("mnemonics".into(), []); - mnemonics.tail = "\n".into(); - ParsedTextLine::write_xml_lines(&self.mnemonic_lines, mnemonics, false, false); - self.bit_fields.write_xml(header); - } -} - -#[derive(Debug, Clone)] -struct Insn { - headers: Vec, - code_lines: Vec, - desc_lines: Vec, - sp_regs_altered: Option, -} - -impl Insn { - fn write_xml(&self, parent: &mut xml_tree::Element) { - let insn = parent.sub_element("instruction".into(), []); - insn.text = "\n".into(); - insn.tail = "\n".into(); - for header in &self.headers { - header.write_xml(insn); - } - if !self.code_lines.is_empty() { - let code = insn.sub_element("code".into(), []); - code.tail = "\n".into(); - ParsedTextLine::write_xml_lines(&self.code_lines, code, false, false); - } - if !self.desc_lines.is_empty() { - let desc = insn.sub_element("description".into(), []); - desc.tail = "\n".into(); - ParsedTextLine::write_xml_lines(&self.desc_lines, desc, false, false); - } - if let Some(sp_regs_altered) = &self.sp_regs_altered { - sp_regs_altered.write_xml(insn); - } - } -} - -#[derive(Debug)] -struct Parser<'ctx> { - pages: Pages<'ctx>, - text_section: TextSection, - insns: Vec, -} - -#[derive(Debug)] -struct Error(String, Backtrace); - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&self.0)?; - f.write_str("\n")?; - fmt::Display::fmt(&self.1, f) - } -} - -trait IntoError: fmt::Display {} - -impl From for Error { - fn from(value: T) -> Self { - Error(value.to_string(), Backtrace::capture()) - } -} - -impl IntoError for &'_ str {} -impl IntoError for String {} -impl IntoError for MuPdfError {} -impl IntoError for std::ffi::NulError {} -impl IntoError for std::num::ParseIntError {} -impl IntoError for std::io::Error {} -impl IntoError for ErrorWithNote {} - -enum ExtractInsnsError { - InsnParseError(String, Backtrace), - PageParseError(String, Backtrace), - Other(Error), -} - -impl fmt::Display for ExtractInsnsError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let backtrace = match self { - ExtractInsnsError::InsnParseError(msg, backtrace) => { - writeln!(f, "instruction parse error: {msg}")?; - backtrace - } - ExtractInsnsError::PageParseError(msg, backtrace) => { - writeln!(f, "page parse error: {msg}")?; - backtrace - } - ExtractInsnsError::Other(e) => return fmt::Display::fmt(&e, f), - }; - backtrace.fmt(f) - } -} - -#[derive(Clone, Debug)] -struct ErrorWithNote { - error: E, - note: String, -} - -impl fmt::Display for ErrorWithNote { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Self { error, note } = self; - fmt::Display::fmt(error, f)?; - write!(f, "\nnote: {note}") - } -} - -impl std::error::Error for ErrorWithNote {} - -impl<'ctx> Parser<'ctx> { - fn new() -> Self { - Self { - pages: Pages::new(None), - text_section: TextSection::first(), - insns: Vec::new(), - } - } - fn page(&mut self) -> Result, Error> { - Ok(self - .pages - .get(self.text_section.page_num)? - .ok_or("page_num is out of range")?) - } - fn unprocessed_chars(&mut self) -> Result>>>, Error> { - Ok(self - .page()? - .unprocessed_chars - .borrow_mut() - .entry(self.text_section) - .or_default() - .clone()) - } - fn pages_gen( - ctx: impl Into>, - file: &str, - page_numbers: Option>>, - dump_mupdf_page_xml: bool, - ) -> Result> + 'ctx>, Error> { - let ctx = ctx.into(); - let page_indexes = page_numbers.map(|page_numbers| { - let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1)); - retval.sort(); - retval - }); - let document = mupdf_ffi::Document::open(ctx, &std::ffi::CString::new(file)?)?; - let page_count = document.page_count()?; - let page_indexes = page_indexes.unwrap_or_else(|| (0..page_count).collect()); - let mut first_seen_fonts = BTreeMap::new(); - Ok(Box::new(page_indexes.into_iter().map(move |page_index| { - let page_num = page_index as u32 + 1; - println!("page {page_num}"); - let page = document - .load_page(page_index) - .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?; - Ok( - Page::from_mupdf_page(page_num, &page, &mut first_seen_fonts, dump_mupdf_page_xml) - .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?, - ) - }))) - } - fn parse_pdf>>( - &mut self, - ctx: impl Into>, - file: &str, - page_numbers: Option, - dump_mupdf_page_xml: bool, - ) -> Result<(), Error> { - self.pages = Pages::new(Some(Self::pages_gen( - ctx, - file, - page_numbers.map(|v| v.into_iter().collect()), - dump_mupdf_page_xml, - )?)); - self.text_section = TextSection::first(); - loop { - self.text_section = self.text_section.next(); - if self.pages.is_past_end(self.text_section.page_num)? { - return Ok(()); - } - if self.pages.get(self.text_section.page_num)?.is_some() { - println!("section {:?}", self.text_section); - self.note_text_section(Self::parse_text_section)?; - } - } - } - fn note_text_section( - &mut self, - f: impl FnOnce(&mut Self) -> Result<(), E>, - ) -> Result<(), ErrorWithNote> { - let start_text_section = self.text_section; - match f(self) { - Ok(()) => Ok(()), - Err(error) => { - let note = if self.text_section == start_text_section { - format!("text_section={:?}", self.text_section) - } else { - format!( - "start_text_section={start_text_section:?}\ntext_section={:?}", - self.text_section - ) - }; - Err(ErrorWithNote { error, note }) - } - } - } - fn parse_text_section(&mut self) -> Result<(), ErrorWithNote> { - match self.note_text_section(Self::extract_insns) { - Ok(()) => Ok(()), - Err( - e @ ErrorWithNote { - error: - ExtractInsnsError::InsnParseError(..) | ExtractInsnsError::PageParseError(..), - .. - }, - ) => { - println!("{e}"); - Ok(()) - } - Err(ErrorWithNote { - error: ExtractInsnsError::Other(error), - note, - }) => Err(ErrorWithNote { error, note }), - } - } - fn find_top_left_char_in_range( - &mut self, - min_x: f32, - max_x: f32, - min_y: f32, - max_y: f32, - allow_processed: bool, - ) -> Result, Error> { - let mut retval = None; - let page = self.page()?; - let unprocessed_chars = self.unprocessed_chars()?; - let ControlFlow::::Continue(()) = - page.qt[&self.text_section].range(min_x, max_x, min_y, max_y, |_x, _y, ch| { - let PageItem::Char(ch) = ch else { - return ControlFlow::Continue(()); - }; - if !allow_processed && !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch) - { - return ControlFlow::Continue(()); - } - match &mut retval { - None => retval = Some(ch.clone()), - Some(retval) - if ch.min_x.get() - ch.min_y.get() - < retval.min_x.get() - retval.min_y.get() => - { - *retval = ch.clone(); - } - Some(_) => {} - } - ControlFlow::Continue(()) - }); - Ok(retval) - } - fn extract_text_line( - &mut self, - start_char: Option, - mut start_min_y: f32, - min_x: f32, - max_x: f32, - fonts: TextLineFonts, - preceding_blank_lines: u32, - mut skip_initial_spaces: bool, - allowed_start_min_y_error: Option, - ) -> Result, ExtractInsnsError> { - let mut chars: Vec = Vec::new(); - let mut chars_set: IndexSet = IndexSet::new(); - if let Some(start_char) = start_char.clone() { - chars.push(start_char.clone()); - chars_set.insert(start_char); - } - if let Some(start_char) = start_char - && start_char.text == "*" - && self.text_section.page_num == 168 - && fonts - .subscript() - .is_some_and(|v| v.contains(&start_char.font)) - { - start_min_y = start_char.max_y.get() - fonts.regular()[0].size(); - } - let page = self.page().map_err(ExtractInsnsError::Other)?; - let unprocessed_chars = self.unprocessed_chars().map_err(ExtractInsnsError::Other)?; - let ControlFlow::::Continue(()) = page.qt[&self.text_section].range( - min_x - fonts.regular()[0].size() * 0.5, - max_x, - start_min_y - fonts.regular()[0].size() * 0.4, - start_min_y + fonts.regular()[0].size() * 0.6, - |_x, _y, ch| { - let PageItem::Char(ch) = ch else { - return ControlFlow::Continue(()); - }; - if !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch) - || chars_set.contains(ch) - { - return ControlFlow::Continue(()); - } - chars_set.insert(ch.clone()); - chars.push(ch.clone()); - ControlFlow::Continue(()) - }, - ); - if chars.is_empty() { - return Ok(None); - } - chars.sort_by(|a, b| (a.min_x, &a.text).cmp(&(b.min_x, &b.text))); - let mut regular_min_y = chars[0].min_y.get(); - let mut regular_max_y = chars[0].max_y.get(); - for ch in &chars { - let Some(kind) = fonts.get_kind(ch.font.clone(), BaselinePos::Below) else { - continue; - }; - if kind.sub_super() == FontVariantSubSuper::NotSubSuper { - regular_min_y = ch.min_y.get(); - regular_max_y = ch.max_y.get(); - break; - } - } - let mut retval = ParsedTextLine { - element: xml_tree::Element::new("text-line".into(), []), - regular_min_y, - regular_max_y, - fonts, - chars, - preceding_blank_lines, - }; - let mut text_and_tag_stacks: Vec<(String, Vec<&str>)> = Vec::new(); - let mut last_max_x = min_x; - let mut last_kind = None; - let mut last_char: Option = None; - for ch in &retval.chars { - let baseline_pos = if (ch.max_y.get() + ch.min_y.get()) * 0.5 - > (retval.regular_max_y + retval.regular_min_y) * 0.5 - { - BaselinePos::Above - } else { - BaselinePos::Below - }; - let Some(kind) = fonts.get_kind(ch.font.clone(), baseline_pos) else { - println!( - "font kind is None:\n\ - regular_min_y={}\n\ - fonts={fonts:?}\n\ - ch={ch:?}\n\ - baseline_pos={baseline_pos:?}\n\ - chars[0]={:?}", - retval.regular_min_y, retval.chars[0], - ); - return Ok(None); - }; - let space_kind = match last_kind { - None => kind, - Some(last_kind) if last_kind != kind => TextLineFontKind::Regular, - _ => kind, - }; - let (space_fonts, _) = fonts - .get_fonts(space_kind) - .unwrap_or((fonts.regular(), None)); - let space_width = ch.min_x.get() - last_max_x; - let space_count_f = space_width / space_fonts[0].space_width(); - let mut space_count = space_count_f.round() as usize; - if space_count == 0 && space_count_f > 0.35 { - space_count = 1 - } - if space_count_f > 0.25 && f32::abs(space_count as f32 - space_count_f) > 0.15 { - println!("spaces: space_count_f={space_count_f} space_width={space_width}"); - } - if space_count > 0 && !skip_initial_spaces { - text_and_tag_stacks.push(( - " ".repeat(space_count), - space_kind.text_line_tags().collect(), - )); - } - skip_initial_spaces = false; - if ch.text == "\u{0338}" - && let Some(last_char) = last_char - && last_char.text == "=" - && f32::abs(ch.min_x.get() - last_char.min_x.get()) < 0.01 - && f32::abs(ch.min_y.get() - last_char.min_y.get()) < 0.01 - { - *text_and_tag_stacks - .last_mut() - .expect("known to be non-empty") = ("\u{2260}".into(), Vec::new()); - last_max_x = last_char.max_x.get(); - } else { - let char_text = match &*ch.text { - "\u{fb00}" => "ff", - "\u{fb01}" => "fi", - "\u{fb02}" => "fl", - "\u{fb03}" => "ffi", - "\u{fb04}" => "ffl", - v => v, - }; - if char_text.chars().skip(1).next().is_some() { - dbg!(&ch); - } - text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect())); - last_max_x = ch.max_x.get(); - } - last_kind = Some(kind); - last_char = Some(ch.clone()); - } - ElementBodyBuilder::scope( - &mut ElementBodyBuilder::new(&mut retval.element), - |body_builder| { - for (text, tag_stack) in text_and_tag_stacks { - body_builder.set_tag_stack(tag_stack); - body_builder.write_text(text) - } - }, - ); - for ch in &retval.chars { - RefCell::borrow_mut(&*unprocessed_chars) - .get_mut(&ch.font) - .expect("known to exist") - .shift_remove(ch); - } - let allowed_start_min_y_error = allowed_start_min_y_error.unwrap_or(0.01); - if f32::abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error { - return Err(ExtractInsnsError::PageParseError( - format!( - "start_min_y={start_min_y} regular_min_y={}\n\ - start_min_y error: {}\n\ - allowed_start_min_y_error={allowed_start_min_y_error}", - retval.regular_min_y, - start_min_y - retval.regular_min_y, - ), - Backtrace::capture(), - )); - } - Ok(Some(retval)) - } - fn extract_following_text_lines( - &mut self, - first_text_line: ParsedTextLine, - min_x: f32, - max_x: f32, - allowed_start_min_y_error: Option, - ) -> Result, ExtractInsnsError> { - let mut retval = Vec::new(); - let fonts = first_text_line.fonts; - let mut line = Some(first_text_line); - while let Some(cur_line) = line { - let start_min_y = cur_line.regular_min_y - fonts.regular()[0].line_height(); - retval.push(cur_line); - line = self.extract_text_line( - None, - start_min_y, - min_x, - max_x, - fonts, - 0, - false, - allowed_start_min_y_error, - )?; - } - return Ok(retval); - } - fn extract_insn_bit_fields( - &mut self, - mnemonic_lines: &[ParsedTextLine], - ) -> Result, ExtractInsnsError> { - let mut found_non_affix_line = false; - let [.., last_mnemonic_line] = mnemonic_lines else { - unreachable!(); - }; - let expected_non_affix_line_y = last_mnemonic_line.regular_min_y - - if mnemonic_lines.len() > 1 { - INSN_BIT_FIELDS_TOP_PAD_HEIGHT2 - } else { - INSN_BIT_FIELDS_TOP_PAD_HEIGHT - }; - let page = self.page().map_err(ExtractInsnsError::Other)?; - let _ = page.qt[&self.text_section].range( - self.text_section.min_x.get() - 5.0, - self.text_section.max_x.get() + 5.0, - expected_non_affix_line_y - 5.0, - expected_non_affix_line_y + 5.0, - |_x, _y, line| { - let PageItem::LineOrRect(LineOrRect::Line(line)) = line else { - return ControlFlow::Continue(()); - }; - if line.width() > line.height() { - found_non_affix_line = true; - return ControlFlow::Break(()); - } - ControlFlow::Continue(()) - }, - ); - if found_non_affix_line { - return self.extract_insn_bit_fields_box(expected_non_affix_line_y); - }; - let prefix_text = self.extract_text_line( - None, - last_mnemonic_line.regular_min_y - INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT, - self.text_section.min_x.get(), - self.text_section.max_x.get(), - TextLineFonts::InsnBitFieldsAffixTitleFonts, - 0, - true, - Some(2.0), - )?; - let Some(prefix_text) = prefix_text else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn prefix bit fields title".into(), - Backtrace::capture(), - )); - }; - let prefix_text_str = prefix_text.element.inner_text(); - if prefix_text_str != "Prefix:" { - return Err(ExtractInsnsError::InsnParseError( - format!("insn prefix bit fields title is not as expected: {prefix_text_str:?}"), - Backtrace::capture(), - )); - } - let prefix_bit_fields = self.extract_insn_bit_fields_box( - prefix_text.regular_min_y - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, - )?; - let Some(prefix_bit_fields) = prefix_bit_fields else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn prefix bit fields".into(), - Backtrace::capture(), - )); - }; - let suffix_text = self.extract_text_line( - None, - prefix_bit_fields.box_min_y - INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT, - self.text_section.min_x.get(), - self.text_section.max_x.get(), - TextLineFonts::InsnBitFieldsAffixTitleFonts, - 0, - true, - Some(2.0), - )?; - let Some(suffix_text) = suffix_text else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn suffix bit fields title".into(), - Backtrace::capture(), - )); - }; - let suffix_text_str = suffix_text.element.inner_text(); - if suffix_text_str != "Suffix:" { - return Err(ExtractInsnsError::InsnParseError( - format!("insn suffix bit fields title is not as expected: {suffix_text_str:?}"), - Backtrace::capture(), - )); - } - let suffix_bit_fields = self.extract_insn_bit_fields_box( - suffix_text.regular_min_y - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, - )?; - let Some(suffix_bit_fields) = suffix_bit_fields else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn suffix bit fields".into(), - Backtrace::capture(), - )); - }; - return Ok(Some(InsnBitFields { - prefix: Some(InsnBitFieldsPrefix { - box_min_x: prefix_bit_fields.box_min_x, - box_min_y: prefix_bit_fields.box_min_y, - box_max_x: prefix_bit_fields.box_max_x, - box_max_y: prefix_bit_fields.box_max_y, - prefix_text: prefix_text, - fields: prefix_bit_fields.fields, - suffix_text: suffix_text, - }), - box_min_x: suffix_bit_fields.box_min_x, - box_min_y: suffix_bit_fields.box_min_y, - box_max_x: suffix_bit_fields.box_max_x, - box_max_y: suffix_bit_fields.box_max_y, - fields: suffix_bit_fields.fields, - })); - } - fn extract_insn_bit_fields_box( - &mut self, - expected_box_max_y: f32, - ) -> Result, ExtractInsnsError> { - let mut h_lines = Vec::new(); - let mut v_lines = Vec::new(); - let page = self.page().map_err(ExtractInsnsError::Other)?; - let ControlFlow::::Continue(()) = page.qt[&self.text_section].range( - self.text_section.min_x.get() - 5.0, - self.text_section.max_x.get() + 5.0, - expected_box_max_y - INSN_BIT_FIELDS_BOX_HEIGHT - 5.0, - expected_box_max_y + 5.0, - |_x, _y, line| { - let PageItem::LineOrRect(LineOrRect::Line(line)) = *line else { - return ControlFlow::Continue(()); - }; - if line.width() > line.height() { - h_lines.push(line); - } else { - v_lines.push(line); - } - ControlFlow::Continue(()) - }, - ); - h_lines.sort_by_key(|line| line.min_y()); - v_lines.sort_by_key(|line| line.min_x()); - for i in (0..v_lines.len().saturating_sub(1)).rev() { - if f32::abs(v_lines[i].min_x().get() - v_lines[i + 1].min_x().get()) < 0.5 { - v_lines.remove(i + 1); // remove duplicates - } - } - if h_lines.is_empty() && v_lines.is_empty() { - return Ok(None); - } - let [bottom_line, top_line] = &*h_lines else { - return Err(ExtractInsnsError::InsnParseError( - format!( - "instruction bit fields box has wrong number of horizontal lines:\n{h_lines:?}" - ), - Backtrace::capture(), - )); - }; - let [leftmost_line, .., rightmost_line] = &*v_lines else { - return Err(ExtractInsnsError::InsnParseError( - format!("instruction bit fields box has too few vertical lines:\n{v_lines:?}"), - Backtrace::capture(), - )); - }; - let box_min_x = leftmost_line.min_x().get(); - let box_max_x = rightmost_line.min_x().get(); - let box_min_y = bottom_line.min_y().get(); - let box_max_y = top_line.max_y().get(); - let box_mid_y = (box_min_y + box_max_y) * 0.5; - println!("bottom_line={bottom_line:?}"); - println!("top_line={top_line:?}"); - println!("{v_lines:?}"); - let mut fields = Vec::new(); - for i in 0..v_lines.len() - 1 { - let left_line = v_lines[i]; - let right_line = v_lines[i + 1]; - let field_box_min_x = left_line.max_x().get(); - let field_box_max_x = right_line.min_x().get(); - let bit_field_name_start_min_y = box_mid_y + 3.288; - let bit_field_name = self.extract_text_line( - None, - bit_field_name_start_min_y, - field_box_min_x, - field_box_max_x, - TextLineFonts::InsnBitFieldNameFonts, - 0, - true, - Some(0.4), - )?; - let Some(bit_field_name) = bit_field_name else { - return Err(ExtractInsnsError::InsnParseError( - format!( - "instruction bit field name not found:\n\ - start_min_y={bit_field_name_start_min_y} \ - field_box_min_x={field_box_min_x} \ - field_box_max_x={field_box_max_x}" - ), - Backtrace::capture(), - )); - }; - let bit_field_number_start_min_y = box_min_y + 3.487; - let bit_number = self.extract_text_line( - None, - bit_field_number_start_min_y, - field_box_min_x, - field_box_max_x, - TextLineFonts::InsnBitFieldBitNumberFonts, - 0, - true, - None, - )?; - let Some(bit_number) = bit_number else { - return Err(ExtractInsnsError::InsnParseError( - format!( - "instruction bit field bit number not found:\n\ - start_min_y={bit_field_number_start_min_y} \ - field_box_min_x={field_box_min_x} \ - field_box_max_x={field_box_max_x}" - ), - Backtrace::capture(), - )); - }; - fields.push(InsnBitField { - box_min_x: field_box_min_x, - box_max_x: field_box_max_x, - name: bit_field_name, - bit_number: bit_number, - }); - } - return Ok(Some(InsnBitFields { - prefix: None, - box_min_x, - box_min_y, - box_max_x, - box_max_y, - fields, - })); - } - fn extract_insn_header_mnemonics_and_bit_fields( - &mut self, - start_min_y: f32, - header_start_char: Option, - ) -> Result, ExtractInsnsError> { - assert!( - header_start_char - .as_ref() - .is_none_or(|v| v.font == Font::InsnHeader) - ); - let Some(header_line) = self.extract_text_line( - header_start_char, - start_min_y, - self.text_section.min_x.get(), - self.text_section.max_x.get(), - TextLineFonts::InsnHeaderFonts, - 0, - true, - Some(6.0), - )? - else { - return Ok(None); - }; - println!("found header line:\n{header_line}"); - let header_lines = self.extract_following_text_lines( - header_line, - self.text_section.min_x.get(), - self.text_section.max_x.get(), - Some(1.5), - )?; - println!("insn header lines:"); - for header_line in &header_lines { - println!("{header_line}"); - } - let [.., last_header_line] = &*header_lines else { - unreachable!(); - }; - let Some(mnemonic_start_char) = self - .find_top_left_char_in_range( - self.text_section.min_x.get() - 5.0, - self.text_section.max_x.get() + 5.0, - last_header_line.regular_min_y - 50.0, - last_header_line.regular_min_y - 5.0, - false, - ) - .map_err(ExtractInsnsError::Other)? - else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn mnemonic text line".into(), - Backtrace::capture(), - )); - }; - let mnemonic_start_char_min_y = mnemonic_start_char.min_y.get(); - let Some(mnemonic_line) = self.extract_text_line( - Some(mnemonic_start_char), - mnemonic_start_char_min_y, - self.text_section.min_x.get(), - self.text_section.max_x.get(), - TextLineFonts::InsnMnemonicFonts, - 0, - true, - None, - )? - else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn mnemonic text line".into(), - Backtrace::capture(), - )); - }; - let mnemonic_line_first_char_min_x = mnemonic_line.chars[0].min_x.get(); - let mnemonic_lines = self.extract_following_text_lines( - mnemonic_line, - mnemonic_line_first_char_min_x, - self.text_section.max_x.get(), - None, - )?; - println!("insn mnemonic lines:"); - for mnemonic_line in &mnemonic_lines { - println!("{mnemonic_line}"); - } - let Some(insn_bit_fields) = self.extract_insn_bit_fields(&mnemonic_lines)? else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn bit fields".into(), - Backtrace::capture(), - )); - }; - println!("{insn_bit_fields}"); - return Ok(Some(InsnHeader { - header_lines, - mnemonic_lines, - bit_fields: insn_bit_fields, - })); - } - fn extract_insn_sp_regs_altered( - &mut self, - mut sp_regs_altered_text: ParsedTextLine, - ) -> Result { - sp_regs_altered_text.preceding_blank_lines = 0; - let fonts = TextLineFonts::InsnDescFonts; - let column_min_x = sp_regs_altered_text.chars[0].min_x.get(); - let Some(table_header_reg_char) = self - .find_top_left_char_in_range( - column_min_x - 1.0, - column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 1.0, - sp_regs_altered_text.regular_min_y - 30.0, - sp_regs_altered_text.regular_min_y - 5.0, - false, - ) - .map_err(ExtractInsnsError::Other)? - else { - return Err(ExtractInsnsError::InsnParseError( - "can't find special registers altered table's register-column's header".into(), - Backtrace::capture(), - )); - }; - const KNOWN_SPECIAL_TEXTS: &[&str] = &[ - "None", - "Dependent on the system service", - "See above.", - "See Table 5.1", - ]; - match &*table_header_reg_char.text { - "R" => {} - text if KNOWN_SPECIAL_TEXTS.iter().any(|i| text == &i[..1]) => { - let start_min_y = table_header_reg_char.min_y.get(); - let special_text = self.extract_text_line( - Some(table_header_reg_char), - start_min_y, - column_min_x, - self.text_section.max_x.get(), - fonts, - 0, - true, - None, - )?; - let special_text = match special_text { - Some(special_text) - if KNOWN_SPECIAL_TEXTS.contains(&&*special_text.element.text) => - { - special_text - } - _ => return Err(ExtractInsnsError::Other( - format!( - "can't find special-registers-altered special-text:\n{special_text:?}" - ) - .into(), - )), - }; - let final_regular_min_y = special_text.regular_min_y; - return Ok(InsnSpRegsAltered { - sp_regs_altered_text, - special_text: Some(special_text), - table_header_reg: None, - table_header_fields: None, - entries: vec![], - final_regular_min_y, - }); - } - text => { - return Err(ExtractInsnsError::InsnParseError( - format!( - "unknown special-registers-altered special-text start character: {text:?}" - ), - Backtrace::capture(), - )); - } - } - let Some(table_header_fields_char) = self - .find_top_left_char_in_range( - column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 10.0, - column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, - table_header_reg_char.min_y.get() - 5.0, - table_header_reg_char.min_y.get() + 5.0, - false, - ) - .map_err(ExtractInsnsError::Other)? - else { - return Err(ExtractInsnsError::Other( - "can't find special registers altered table's fields-column's header".into(), - )); - }; - if table_header_fields_char.text != "F" { - return Err(ExtractInsnsError::Other( - format!( - "can't find special registers altered table's fields-column's header:\n\ - table_header_fields_char={table_header_fields_char:?}" - ) - .into(), - )); - } - let columns_x_bounds = [ - ( - table_header_reg_char.min_x.get(), - table_header_fields_char.min_x.get() - 1.0, - ), - ( - table_header_fields_char.min_x.get(), - column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, - ), - ( - column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, - self.text_section.max_x.get(), - ), - ]; - let start_min_y = table_header_reg_char.min_y.get(); - let Some(table_header_reg) = self.extract_text_line( - Some(table_header_reg_char), - start_min_y, - columns_x_bounds[0].0, - columns_x_bounds[0].1, - fonts, - 0, - false, - None, - )? - else { - return Err(ExtractInsnsError::Other( - "can't find special registers altered table's register-column's header".into(), - )); - }; - let table_header_reg_text = table_header_reg.element.inner_text(); - if table_header_reg_text != "Register" { - return Err(ExtractInsnsError::Other( - format!( - "can't find special registers altered table's register-column's header:\n\ - table_header_reg_text={table_header_reg_text:?}" - ) - .into(), - )); - } - let start_min_y = table_header_fields_char.min_y.get(); - let Some(table_header_fields) = self.extract_text_line( - Some(table_header_fields_char), - start_min_y, - columns_x_bounds[1].0, - columns_x_bounds[1].1, - fonts, - 0, - false, - None, - )? - else { - return Err(ExtractInsnsError::Other( - "can't find special registers altered table's fields-column's header".into(), - )); - }; - let table_header_fields_text = table_header_fields.element.inner_text(); - if table_header_fields_text != "Field(s)" { - return Err(ExtractInsnsError::Other( - format!( - "can't find special registers altered table's fields-column's header:\n\ - table_header_fields_text={table_header_fields_text:?}" - ) - .into(), - )); - } - let mut regular_min_y = table_header_reg.regular_min_y; - let mut entries = Vec::new(); - let mut cur_reg = None; - let mut cur_fields = Vec::new(); - let mut cur_conds = Vec::new(); - loop { - let mut row = [None, None, None]; - let mut next_regular_min_y = None; - for (i, (min_x, max_x)) in columns_x_bounds.into_iter().enumerate() { - row[i] = self.extract_text_line( - None, - regular_min_y - fonts.regular()[0].line_height(), - min_x, - max_x, - fonts, - 0, - true, - Some(2.0), - )?; - if let Some(cell) = &row[i] - && next_regular_min_y.is_none() - { - next_regular_min_y = Some(cell.regular_min_y); - } - } - match next_regular_min_y { - Some(v) => regular_min_y = v, - None => break, - } - let [cur_reg_cell, cur_fields_cell, cur_conds_cell] = row; - if cur_reg_cell.is_none() { - if cur_reg.is_none() { - return Err(ExtractInsnsError::Other( - "can't find special registers altered table's first register".into(), - )); - } - cur_fields.extend(cur_fields_cell); - cur_conds.extend(cur_conds_cell); - continue; - } - if let Some(cur_reg) = cur_reg { - entries.push(InsnSpRegsAlteredEntry { - reg: cur_reg, - fields: cur_fields, - conds: cur_conds, - }); - cur_fields = Vec::new(); - cur_conds = Vec::new(); - } - cur_reg = cur_reg_cell; - cur_fields.extend(cur_fields_cell); - cur_conds.extend(cur_conds_cell); - } - let Some(cur_reg) = cur_reg else { - return Err(ExtractInsnsError::Other( - "can't find special registers altered table's first register".into(), - )); - }; - entries.push(InsnSpRegsAlteredEntry { - reg: cur_reg, - fields: cur_fields, - conds: cur_conds, - }); - return Ok(InsnSpRegsAltered { - sp_regs_altered_text: sp_regs_altered_text, - special_text: None, - table_header_reg: Some(table_header_reg), - table_header_fields: Some(table_header_fields), - entries, - final_regular_min_y: regular_min_y, - }); - } - fn extract_insn(&mut self, header_start_char: Char) -> Result { - assert_eq!(header_start_char.font, Font::InsnHeader); - println!("{header_start_char:?}"); - let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields( - header_start_char.min_y.get(), - Some(header_start_char), - )? - else { - return Err(ExtractInsnsError::PageParseError( - "can't find header text line".into(), - Backtrace::capture(), - )); - }; - let mut next_start_min_y = header.min_y() - 5.0; - let mut headers = vec![header]; - let mut code_lines: Vec = Vec::new(); - let mut desc_lines: Vec = Vec::new(); - let mut sp_regs_altered = None; - loop { - let search_min_y = next_start_min_y - 70.0; - let Some(next_char) = self - .find_top_left_char_in_range( - self.text_section.min_x.get() - 5.0, - self.text_section.max_x.get() + 5.0, - search_min_y.max(self.text_section.min_y.get()), - next_start_min_y, - false, - ) - .map_err(ExtractInsnsError::Other)? - else { - if search_min_y <= self.text_section.min_y.get() - && self - .pages - .get(self.text_section.next().page_num) - .map_err(ExtractInsnsError::Other)? - .is_some() - { - // go to next section - self.text_section = self.text_section.next(); - next_start_min_y = self.text_section.max_y.get(); - continue; - } else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn code or description text".into(), - Backtrace::capture(), - )); - } - }; - let next_section = match &next_char.font { - font if TextLineFonts::InsnCodeFonts.fonts().contains(font) => { - InsnParseSection::Code - } - font if TextLineFonts::InsnDescFonts.fonts().contains(font) => { - InsnParseSection::Desc - } - Font::InsnHeader => InsnParseSection::Header, - font => { - return Err(ExtractInsnsError::InsnParseError( - format!("can't find insn code or description text\nfont={font:?}"), - Backtrace::capture(), - )); - } - }; - match next_section { - InsnParseSection::Code => { - if !desc_lines.is_empty() { - break; - } - let start_min_y = next_char.min_y.get(); - let min_x = next_char.min_x.get(); - let Some(code_line) = self.extract_text_line( - Some(next_char), - start_min_y, - min_x, - self.text_section.max_x.get(), - TextLineFonts::InsnCodeFonts, - if code_lines.is_empty() { 0 } else { 1 }, - false, - None, - )? - else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn code text line".into(), - Backtrace::capture(), - )); - }; - let min_x = code_line.chars[0].min_x.get(); - let more_code_lines = self.extract_following_text_lines( - code_line, - min_x, - self.text_section.max_x.get(), - Some(0.05), - )?; - println!("more insn code lines:"); - for i in &more_code_lines { - println!("{i}"); - } - code_lines.extend(more_code_lines); - let Some(last) = code_lines.last() else { - unreachable!() - }; - next_start_min_y = last.regular_min_y - 5.0; - } - InsnParseSection::Header => { - if !(code_lines.is_empty() && desc_lines.is_empty()) { - break; - } - let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields( - next_char.min_y.get(), - Some(next_char), - )? - else { - return Err(ExtractInsnsError::InsnParseError( - "can't find header text line".into(), - Backtrace::capture(), - )); - }; - next_start_min_y = header.min_y() - 5.0; - headers.push(header); - } - InsnParseSection::Desc => { - let start_min_y = next_char.min_y.get(); - let min_x = next_char.min_x.get(); - let Some(desc_line) = self.extract_text_line( - Some(next_char), - start_min_y, - min_x, - self.text_section.max_x.get(), - TextLineFonts::InsnDescFonts, - if desc_lines.is_empty() { 0 } else { 1 }, - false, - Some(3.0), - )? - else { - return Err(ExtractInsnsError::InsnParseError( - "can't find insn desc text line".into(), - Backtrace::capture(), - )); - }; - match desc_line.get_header_text() { - None => { - let min_x = desc_line.chars[0].min_x.get(); - let more_desc_lines = self.extract_following_text_lines( - desc_line, - min_x, - self.text_section.max_x.get(), - Some(3.5), - )?; - println!("more insn desc lines:"); - for i in &more_desc_lines { - println!("{i}"); - } - desc_lines.extend(more_desc_lines); - next_start_min_y = desc_lines - .last() - .expect("known to be non-empty") - .regular_min_y - - 5.0; - } - Some(header_text) if header_text == "Special Registers Altered:" => { - let new_sp_regs_altered = - self.extract_insn_sp_regs_altered(desc_line)?; - next_start_min_y = new_sp_regs_altered.final_regular_min_y; - sp_regs_altered = Some(new_sp_regs_altered); - break; - } - Some(header_text) => { - return Err(ExtractInsnsError::Other( - format!("unhandled header text: {header_text:?}\n{desc_line}") - .into(), - )); - } - } - } - } - } - println!("insn code lines:"); - for i in &code_lines { - println!("{i}"); - } - println!("insn desc lines:"); - for i in &desc_lines { - println!("{i}"); - } - println!("sp_regs_altered:"); - println!("{sp_regs_altered:?}"); - // TODO: finish - return Ok(Insn { - headers, - code_lines, - desc_lines, - sp_regs_altered, - }); - } - fn extract_insns(&mut self) -> Result<(), ExtractInsnsError> { - loop { - let Some(header_start_char) = - RefCell::borrow(&*self.unprocessed_chars().map_err(ExtractInsnsError::Other)?) - .get(&Font::InsnHeader) - .and_then(|v| v.first().cloned()) - else { - return Ok(()); - }; - let insn = self.extract_insn(header_start_char)?; - self.insns.push(insn); - } - } -} - -#[derive(Debug)] -struct MyDevice<'a> { - page_num: u32, - qt: RefCell>>, - unprocessed_chars: - Rc>>>>>>, - unprocessed_non_text: Rc>>, - first_seen_fonts: RefCell<&'a mut BTreeMap>>, - error: RefCell>, -} - -impl<'a> MyDevice<'a> { - fn new(page_num: u32, first_seen_fonts: &'a mut BTreeMap>) -> Self { - Self { - page_num, - qt: Default::default(), - unprocessed_chars: Default::default(), - unprocessed_non_text: Default::default(), - first_seen_fonts: RefCell::new(first_seen_fonts), - error: RefCell::new(Ok(())), - } - } - fn path(&self, path: &mupdf_ffi::Path<'_>, ctm: fz_matrix) { - if self.error.borrow().is_err() { - return; - } - enum Walker { - Empty, - Moved { x: f32, y: f32 }, - Line(Line), - Rect { x1: f32, y1: f32, x2: f32, y2: f32 }, - NotRecognized, - } - fn new_line(p0_x: f32, p0_y: f32, p1_x: f32, p1_y: f32) -> Option { - Some(Line { - p0_x: NonNaNF32::new(p0_x)?, - p0_y: NonNaNF32::new(p0_y)?, - p1_x: NonNaNF32::new(p1_x)?, - p1_y: NonNaNF32::new(p1_y)?, - }) - } - impl<'ctx> mupdf_ffi::PathWalker<'ctx> for Walker { - fn move_to(&mut self, _ctx: mupdf_ffi::ContextRef<'ctx>, x: f32, y: f32) { - *self = match *self { - Walker::Empty | Walker::Moved { .. } => Walker::Moved { x, y }, - Walker::Line(_) | Walker::Rect { .. } | Walker::NotRecognized => { - Walker::NotRecognized - } - }; - } - fn line_to(&mut self, _ctx: mupdf_ffi::ContextRef<'ctx>, x: f32, y: f32) { - *self = match *self { - Walker::Empty => Walker::NotRecognized, - Walker::Moved { x: p0_x, y: p0_y } => new_line(p0_x, p0_y, x, y) - .map(Walker::Line) - .unwrap_or(Walker::NotRecognized), - Walker::Line(_) | Walker::Rect { .. } | Walker::NotRecognized => { - Walker::NotRecognized - } - }; - } - fn curve_to( - &mut self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - _cx1: f32, - _cy1: f32, - _cx2: f32, - _cy2: f32, - _ex: f32, - _ey: f32, - ) { - *self = Walker::NotRecognized; - } - fn close_path(&mut self, _ctx: mupdf_ffi::ContextRef<'ctx>) {} - fn rect_to( - &mut self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - x1: f32, - y1: f32, - x2: f32, - y2: f32, - ) { - *self = match *self { - Walker::Empty => Walker::Rect { x1, y1, x2, y2 }, - Walker::Moved { .. } - | Walker::Line(..) - | Walker::Rect { .. } - | Walker::NotRecognized => Walker::NotRecognized, - }; - } - } - let mut walker = Walker::Empty; - path.walk(&mut walker); - let component = match walker { - Walker::Empty | Walker::Moved { .. } | Walker::NotRecognized => return, - Walker::Line(Line { - p0_x, - p0_y, - p1_x, - p1_y, - }) => { - let mupdf_sys::fz_point { x: p0_x, y: p0_y } = - mupdf_ffi::transform_point_xy(p0_x.get(), p0_y.get(), ctm); - let mupdf_sys::fz_point { x: p1_x, y: p1_y } = - mupdf_ffi::transform_point_xy(p1_x.get(), p1_y.get(), ctm); - let Some(line) = new_line(p0_x, p0_y, p1_x, p1_y) else { - return; - }; - LineOrRect::Line(line) - } - Walker::Rect { x1, y1, x2, y2 } => { - let p1 = mupdf_ffi::transform_point_xy(x1, y1, ctm); - let p2 = mupdf_ffi::transform_point_xy(x2, y1, ctm); - let p3 = mupdf_ffi::transform_point_xy(x2, y2, ctm); - let p4 = mupdf_ffi::transform_point_xy(x1, y2, ctm); - let min_x = NonNaNF32::new(p1.x.min(p2.x).min(p3.x).min(p4.x)); - let max_x = NonNaNF32::new(p1.x.max(p2.x).max(p3.x).max(p4.x)); - let min_y = NonNaNF32::new(p1.y.min(p2.y).min(p3.y).min(p4.y)); - let max_y = NonNaNF32::new(p1.y.max(p2.y).max(p3.y).max(p4.y)); - let (Some(min_x), Some(max_x), Some(min_y), Some(max_y)) = - (min_x, max_x, min_y, max_y) - else { - return; - }; - LineOrRect::Rect(Rect { - min_x, - max_x, - min_y, - max_y, - }) - } - }; - if component.width() > 100.0 - && component.min_x().get() < COLUMN_SPLIT_X - 10.0 - && component.max_x().get() > COLUMN_SPLIT_X + 10.0 - { - println!("wide component: {component:?}"); - } else { - println!("component: {component:?}"); - } - let text_section = TextSection::for_position( - self.page_num, - (component.min_x().get() + component.max_x().get()) * 0.5, - (component.min_y().get() + component.max_y().get()) * 0.5, - ); - if let Some(text_section) = text_section { - self.qt - .borrow_mut() - .entry(text_section) - .or_default() - .insert( - component.min_x().get(), - component.min_y().get(), - PageItem::LineOrRect(component), - ); - } - } - fn text(&self, text: &mupdf_ffi::Text<'_>, ctm: fz_matrix) { - if self.error.borrow().is_err() { - return; - } - let mut first_seen_fonts = self.first_seen_fonts.borrow_mut(); - for span in text.spans() { - let tm = span.trm(); - const ROUND_FACTOR: f32 = 1000.0; - let font_size = (mupdf_ffi::matrix_expansion(tm) * ROUND_FACTOR).round() / ROUND_FACTOR; - let Some(font_size) = NonNaNF32::new(font_size) else { - continue; - }; - let font_name_with_tag = span.font().name(); - let font_name_with_tag = match font_name_with_tag { - "CGMSHV+DejaVuSansCondensed-Obli" => "CGMSHV+DejaVuSansCondensed-Oblique", - "YDJYQV+DejaVuSansCondensed-Bold" => "YDJYQV+DejaVuSansCondensed-BoldOblique", - "NHUPPK+DejaVuSansCondensed-Bold" => "NHUPPK+DejaVuSansCondensed-Bold", - _ if font_name_with_tag.len() == 31 => { - let _ = self.error.replace(Err(format!( - "probably truncated font name: {font_name_with_tag:?}" - ) - .into())); - return; - } - _ => font_name_with_tag, - }; - let mut flush_char = |char: Char| -> Result<(), ()> { - let Some(text_section) = TextSection::for_position( - self.page_num, - (char.min_x.get() + char.max_x.get()) * 0.5, - (char.min_y.get() + char.max_y.get()) * 0.5, - ) else { - if PAGE_BODY_MIN_Y <= char.min_y.get() && char.min_y.get() <= PAGE_BODY_MAX_Y { - if self.page_num != 1072 { - // page 1072 has characters in the margins - let _ = self.error.replace(Err(format!( - "char not in text section: {:?}\npage_num={}", - char.text, self.page_num, - ) - .into())); - return Err(()); - } - } - return Ok(()); - }; - let set = match first_seen_fonts.get_mut(font_name_with_tag) { - Some(v) => v, - None => first_seen_fonts - .entry(String::from(font_name_with_tag)) - .or_default(), - }; - if set.insert(font_size) { - println!( - "first seen font: {font_name_with_tag:?} {font_size}: page {} {char:?}", - self.page_num, - ); - } - self.qt - .borrow_mut() - .entry(text_section) - .or_default() - .insert( - char.min_x.get(), - char.min_y.get(), - PageItem::Char(char.clone()), - ); - self.unprocessed_chars - .borrow_mut() - .entry(text_section) - .or_default() - .borrow_mut() - .entry(char.font.clone()) - .or_default() - .insert(char); - Ok(()) - }; - let mut last_char = None; - for &fz_text_item { - x, - y, - adv, - gid, - ucs, - cid: _, - } in span.items() - { - let adv = if gid >= 0 { adv } else { 0.0 }; - let tm = fz_matrix { e: x, f: y, ..tm }; - let trm = mupdf_ffi::concat(tm, ctm); - let dir = match span.write_mode() { - WriteMode::Horizontal => fz_point { x: 1.0, y: 0.0 }, - WriteMode::Vertical => fz_point { x: 0.0, y: -1.0 }, - }; - let dir = mupdf_ffi::transform_vector(dir, trm); - let glyph_start; - let glyph_stop; - let mut glyph_ascender; - let glyph_descender; - match span.write_mode() { - WriteMode::Horizontal => { - glyph_start = fz_point { x: trm.e, y: trm.f }; - glyph_stop = fz_point { - x: trm.e + adv * dir.x, - y: trm.f + adv * dir.y, - }; - glyph_ascender = fz_point { - x: 0.0, - y: span.font().ascender(), - }; - glyph_descender = fz_point { - x: 0.0, - y: span.font().descender(), - }; - if glyph_ascender.y == glyph_descender.y { - glyph_ascender.y += 1.0; - } - } - WriteMode::Vertical => { - glyph_start = fz_point { - x: trm.e - adv * dir.x, - y: trm.f - adv * dir.y, - }; - glyph_stop = fz_point { x: trm.e, y: trm.f }; - glyph_ascender = fz_point { x: 1.0, y: 0.0 }; - glyph_descender = fz_point { x: 0.0, y: 0.0 }; - } - }; - let glyph_ascender = transform_vector(glyph_ascender, trm); - let glyph_descender = transform_vector(glyph_descender, trm); - let points = [ - add_points(glyph_start, glyph_descender), - add_points(glyph_start, glyph_ascender), - add_points(glyph_stop, glyph_descender), - add_points(glyph_stop, glyph_ascender), - ]; - let min = point_min_components( - point_min_components(point_min_components(points[0], points[1]), points[2]), - points[3], - ); - let max = point_max_components( - point_max_components(point_max_components(points[0], points[1]), points[2]), - points[3], - ); - let Some(ch) = u32::try_from(ucs).ok().and_then(|v| char::try_from(v).ok()) else { - continue; - }; - let text = String::from(ch); - if text.trim().is_empty() { - continue; - } - let font = Font::known_from_name_with_tag(font_name_with_tag, font_size) - .unwrap_or_else(|| Font::Other { - font_name: font_name_with_tag.into(), - size: font_size, - }); - let (Some(min_x), Some(min_y), Some(max_x), Some(max_y)) = ( - NonNaNF32::new(min.x), - NonNaNF32::new(min.y), - NonNaNF32::new(max.x), - NonNaNF32::new(max.y), - ) else { - let _ = self - .error - .replace(Err("char position shouldn't be NaN".into())); - return; - }; - if gid < 0 - && last_char - .as_ref() - .is_some_and(|last_char: &Char| last_char.font == font) - { - if let Some(Char { - font, - text: last_text, - min_x: last_min_x, - min_y: last_min_y, - max_x: last_max_x, - max_y: last_max_y, - }) = last_char.take() - { - last_char = Some(Char { - font, - text: last_text + &text, - min_x: last_min_x.min(min_x), - min_y: last_min_y.min(min_y), - max_x: last_max_x.max(max_x), - max_y: last_max_y.max(max_y), - }); - continue; - } - } - if let Some(last_char) = last_char.take() { - match flush_char(last_char) { - Ok(()) => {} - Err(()) => return, - } - } - last_char = Some(Char { - font, - text, - min_x, - min_y, - max_x, - max_y, - }); - } - if let Some(last_char) = last_char { - match flush_char(last_char) { - Ok(()) => {} - Err(()) => return, - } - } - } - } -} - -impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice<'_> { - fn fill_path( - &self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - path: &mupdf_ffi::Path<'ctx>, - _even_odd: bool, - ctm: fz_matrix, - ) { - self.path(path, ctm); - } - - fn stroke_path( - &self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - path: &mupdf_ffi::Path<'ctx>, - ctm: fz_matrix, - ) { - self.path(path, ctm); - } - - fn clip_path( - &self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - path: &mupdf_ffi::Path<'ctx>, - _even_odd: bool, - ctm: fz_matrix, - _scissor: mupdf_sys::fz_rect, - ) { - self.path(path, ctm); - } - - fn clip_stroke_path( - &self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - path: &mupdf_ffi::Path<'ctx>, - ctm: fz_matrix, - _scissor: mupdf_sys::fz_rect, - ) { - self.path(path, ctm); - } - - fn fill_text( - &self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - text: &mupdf_ffi::Text<'ctx>, - ctm: fz_matrix, - ) { - self.text(text, ctm); - } - - fn stroke_text( - &self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - text: &mupdf_ffi::Text<'ctx>, - ctm: fz_matrix, - ) { - self.text(text, ctm); - } - - fn clip_text( - &self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - text: &mupdf_ffi::Text<'ctx>, - ctm: fz_matrix, - _scissor: mupdf_sys::fz_rect, - ) { - self.text(text, ctm); - } - - fn clip_stroke_text( - &self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - text: &mupdf_ffi::Text<'ctx>, - ctm: fz_matrix, - _scissor: mupdf_sys::fz_rect, - ) { - self.text(text, ctm); - } - - fn ignore_text( - &self, - _ctx: mupdf_ffi::ContextRef<'ctx>, - text: &mupdf_ffi::Text<'ctx>, - ctm: fz_matrix, - ) { - self.text(text, ctm); - } -} - -#[derive(serde::Deserialize, Debug)] -enum MuPdfXml<'a> { - #[serde(rename = "page")] - Page(MuPdfXmlPage<'a>), -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlPage<'a> { - #[serde(rename = "@id")] - id: Cow<'a, str>, - #[serde(rename = "@width")] - width: f32, - #[serde(rename = "@height")] - height: f32, - block: Vec>, -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlBlock<'a> { - #[serde(rename = "@bbox")] - bbox: [f32; 4], - #[serde(rename = "@justify")] - justify: Cow<'a, str>, - line: Vec>, -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlLine<'a> { - #[serde(rename = "@bbox")] - bbox: [f32; 4], - #[serde(rename = "@wmode")] - wmode: u8, - #[serde(rename = "@dir")] - dir: [f32; 2], - #[serde(rename = "@text")] - text: Cow<'a, str>, - font: Vec>, -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlFont<'a> { - #[serde(rename = "@name")] - name: Cow<'a, str>, - #[serde(rename = "@size")] - size: f32, - char: Vec>, -} - -#[derive(serde::Deserialize, Debug)] -struct MuPdfXmlChar<'a> { - #[serde(rename = "@quad")] - quad: [f32; 8], - #[serde(rename = "@x")] - x: f32, - #[serde(rename = "@y")] - y: f32, - #[serde(rename = "@bidi")] - bidi: u16, - #[serde(rename = "@color")] - color: Cow<'a, str>, - #[serde(rename = "@alpha")] - alpha: Cow<'a, str>, - #[serde(rename = "@flags")] - flags: u32, - #[serde(rename = "@c")] - c: Cow<'a, str>, -} - -impl Page { - fn from_mupdf_page( - page_num: u32, - page: &mupdf_ffi::Page<'_>, - first_seen_fonts: &mut BTreeMap>, - dump_mupdf_page_xml: bool, - ) -> Result { - if dump_mupdf_page_xml { - println!("{}", page.to_xml()?); - } - let Some(pdf_page) = page.pdf_page() else { - return Err("page is not from a pdf".into()); - }; - let device = mupdf_ffi::Device::new( - page.ctx(), - Box::new(MyDevice::new(page_num, first_seen_fonts)), - )?; - page.run(&device, pdf_page.transform()?)?; - let MyDevice { - page_num: _, - qt, - unprocessed_chars, - unprocessed_non_text, - first_seen_fonts: _, - error, - } = device.get(); - error.replace(Ok(()))?; - for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() { - for chars in i.borrow_mut().values_mut() { - chars.sort_by_key(Char::top_down_left_to_right_sort_key); - println!("first char: {text_section:?}: {:?}", chars.first()); - } - } - let mut unknown_fonts = Vec::new(); - let mut unknown_font_errors = Vec::new(); - for i in RefCell::borrow(&unprocessed_chars).values() { - for (font, chars) in RefCell::borrow(i).iter() { - if font.known_font_group().is_none() { - let mut text = String::new(); - for char in chars { - text += &char.text; - } - unknown_fonts.push(format!("{font:?},")); - unknown_font_errors.push(format!( - "unknown font {font:?}\nlast char: {:?}\ntext: {text:?}", - chars.last() - )); - } - } - } - unknown_fonts.sort(); - if !unknown_fonts.is_empty() { - return Err(format!( - "\nunknown fonts:\n{}\n\n{}", - unknown_fonts.join("\n"), - unknown_font_errors.join("\n") - ) - .into()); - } - Ok(Self { - page_num, - qt: qt.take(), - unprocessed_chars: unprocessed_chars.clone(), - unprocessed_non_text: unprocessed_non_text.clone(), - }) - } -} - -fn main_inner() -> Result<(), Error> { - let mut args: Vec = std::env::args().collect(); - let dump_mupdf_page_xml = if args.get(1).is_some_and(|v| v == "--dump-mupdf-page-xml") { - args.remove(1); - true - } else { - false - }; - let page_numbers: Option>>> = if 2 < args.len() { - Some(if let Some((start, end)) = args[2].split_once(":") { - let start: NonZero = start.trim().parse()?; - let end: NonZero = end.trim().parse()?; - Box::new( - (start.get()..end.get()).map(|v| NonZero::new(v).expect("known to be non-zero")), - ) - } else { - Box::new( - Result::>, _>::from_iter( - args[2].split(",").map(|v| v.trim().parse()), - )? - .into_iter(), - ) - }) - } else { - None - }; - mupdf_ffi::Context::with(|ctx| { - let mut parser = Parser::new(); - let is_subset = page_numbers.is_some(); - let file_name = &args[1]; - parser.parse_pdf(ctx, file_name, page_numbers, dump_mupdf_page_xml)?; - let mut insns = xml_tree::Element::new( - "instructions".into(), - [( - "is-subset".into(), - if is_subset { - "True".into() - } else { - "False".into() - }, - )], - ); - insns.text = "\n".into(); - insns.tail = "\n".into(); - let mut comment = - xml_tree::Element::comment(format!(" Automatically generated from {file_name} ")); - comment.tail = "\n".into(); - insns.children.push(comment); - for insn in parser.insns { - insn.write_xml(&mut insns); - } - let mut output = Vec::new(); - insns.write(&mut output, true)?; - std::fs::write("powerisa-instructions.xml", output)?; - Ok(()) - }) -} - fn main() -> std::process::ExitCode { - match main_inner() { - Ok(()) => std::process::ExitCode::SUCCESS, - Err(e) => { - println!("Error: {e}"); - std::process::ExitCode::FAILURE - } - } + parse_powerisa_pdf::main() } diff --git a/src/mupdf_ffi.rs b/src/mupdf_ffi.rs index 19d7564..809ee44 100644 --- a/src/mupdf_ffi.rs +++ b/src/mupdf_ffi.rs @@ -765,6 +765,7 @@ pub(crate) enum WriteMode { } impl<'a, 'ctx> TextSpanRef<'a, 'ctx> { + #[allow(dead_code)] pub(crate) fn get(self) -> &'a UnsafeCell { self.ptr } @@ -802,6 +803,7 @@ pub(crate) struct FontRef<'a, 'ctx> { } impl<'a, 'ctx> FontRef<'a, 'ctx> { + #[allow(dead_code)] pub(crate) fn get(self) -> &'a UnsafeCell { self.ptr } @@ -810,9 +812,11 @@ impl<'a, 'ctx> FontRef<'a, 'ctx> { .to_str() .expect("font name isn't valid UTF-8") } + #[allow(dead_code)] pub(crate) fn is_bold(self) -> bool { unsafe { fz_font_is_bold(self.ctx.0.get(), self.ptr.get()) != 0 } } + #[allow(dead_code)] pub(crate) fn is_italic(self) -> bool { unsafe { fz_font_is_italic(self.ctx.0.get(), self.ptr.get()) != 0 } } @@ -824,6 +828,7 @@ impl<'a, 'ctx> FontRef<'a, 'ctx> { } } +#[allow(dead_code)] pub(crate) fn transform_point(point: fz_point, m: fz_matrix) -> fz_point { unsafe { fz_transform_point(point, m) } }