From b1d83b1d840fd5637893b5cd8f193d0e188fff2e Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Sun, 4 Jan 2026 18:31:56 -0800 Subject: [PATCH 01/11] switch font names to not include tag --- src/main.rs | 320 ++++++++++++++++++++++++++++------------------------ 1 file changed, 173 insertions(+), 147 deletions(-) diff --git a/src/main.rs b/src/main.rs index d9c54ec..d2e2c1b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -68,6 +68,22 @@ mod non_nan_float { } } +const fn str_eq(a: &str, b: &str) -> bool { + let a = a.as_bytes(); + let b = b.as_bytes(); + if a.len() != b.len() { + return false; + } + let mut i = 0; + while i < a.len() { + if a[i] != b[i] { + return false; + } + i += 1; + } + true +} + macro_rules! make_enum_font { ( enum $Font:ident { @@ -75,7 +91,7 @@ macro_rules! make_enum_font { $Other:ident $other_body:tt, $(#[group] $KnownFontGroup:ident { - $(#[name = $known_font_name:literal, size = $known_font_size:literal] + $(#[name_with_tag = $known_font_name_with_tag:literal, size = $known_font_size:literal] $KnownFont:ident,)* },)* } @@ -101,10 +117,24 @@ macro_rules! make_enum_font { } impl $Font { + const fn extract_font_name_from_font_name_with_tag(font_name_with_tag: &str) -> &str { + if let [b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'A'..=b'Z',b'+',_,..] = font_name_with_tag.as_bytes() { + font_name_with_tag.split_at(7).1 + } else { + panic!("invalid font name with id") + } + } + const fn new_known(font_name: &str, size: NonNaNF32) -> Option { + match size.get() { + $($($known_font_size if str_eq(font_name, const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) }) => Some(Self::$KnownFont),)*)* + _ => None, + } + } fn new(font_name: &str, size: NonNaNF32) -> Self { - match (font_name, size.get()) { - $($(($known_font_name, $known_font_size) => Self::$KnownFont,)*)* - _ => Self::Other { + if let Some(v) = Self::new_known(font_name, size) { + v + } else { + Self::Other { font_name: Box::from(font_name), size, } @@ -119,7 +149,7 @@ macro_rules! make_enum_font { const fn font_name(&self) -> &str { match self { Self::$Other { font_name, .. } => font_name, - $($(Self::$KnownFont => $known_font_name,)*)* + $($(Self::$KnownFont => const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) },)*)* } } const fn known_font_group(&self) -> Option { @@ -135,6 +165,17 @@ macro_rules! make_enum_font { } } } + + const _: () = { + $($( + let (known_font_name, known_font) = const { + let known_font_name = Font::extract_font_name_from_font_name_with_tag($known_font_name_with_tag); + (known_font_name, &Font::new_known(known_font_name, NonNaNF32::new($known_font_size).unwrap()).unwrap()) + }; + assert!(str_eq(known_font_name, known_font.font_name())); + assert!(matches!(known_font, Font::$KnownFont)); + )*)* + }; }; } @@ -147,352 +188,352 @@ make_enum_font! { }, #[group] InsnHeader { - #[name = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 9.963] + #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 9.963] InsnHeader, }, #[group] RtlFnHeader { - #[name = "APUYSQ+zcoN-Regular", size = 9.963] + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 9.963] RtlFnHeader, }, #[group] PageHeader { - #[name = "MJBFWM+DejaVuSansCondensed", size = 9.963] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 9.963] PageHeader, }, #[group] PageFooter { - #[name = "MJBFWM+DejaVuSansCondensed", size = 4.981] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.981] PageFooter, }, #[group] InsnDesc { - #[name = "MJBFWM+DejaVuSansCondensed", size = 8.966] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.966] InsnDesc0, - #[name = "FZTIYT+CMMI9", size = 8.966] + #[name_with_tag = "FZTIYT+CMMI9", size = 8.966] InsnDesc1, - #[name = "ONUAYC+CMSSI9", size = 8.966] + #[name_with_tag = "ONUAYC+CMSSI9", size = 8.966] InsnDesc2, - #[name = "TNGBFZ+CMSY9", size = 8.966] + #[name_with_tag = "TNGBFZ+CMSY9", size = 8.966] InsnDesc3, - #[name = "WHMZPU+CMEX9", size = 8.966] + #[name_with_tag = "WHMZPU+CMEX9", size = 8.966] InsnDesc4, - #[name = "ZJTMSG+CMSS9", size = 8.966] + #[name_with_tag = "ZJTMSG+CMSS9", size = 8.966] InsnDesc5, }, #[group] InsnDescMisc { - #[name = "MJBFWM+DejaVuSansCondensed", size = 2.377] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 2.377] InsnDescMisc0, - #[name = "MJBFWM+DejaVuSansCondensed", size = 2.561] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 2.561] InsnDescMisc1, - #[name = "MJBFWM+DejaVuSansCondensed", size = 4.492] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.492] InsnDescMisc2, - #[name = "MJBFWM+DejaVuSansCondensed", size = 4.641] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.641] InsnDescMisc3, - #[name = "MJBFWM+DejaVuSansCondensed", size = 4.772] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.772] InsnDescMisc4, - #[name = "MJBFWM+DejaVuSansCondensed", size = 4.864] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.864] InsnDescMisc5, - #[name = "MJBFWM+DejaVuSansCondensed", size = 4.925] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 4.925] InsnDescMisc6, - #[name = "MJBFWM+DejaVuSansCondensed", size = 5.097] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.097] InsnDescMisc7, - #[name = "MJBFWM+DejaVuSansCondensed", size = 5.123] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.123] InsnDescMisc8, - #[name = "MJBFWM+DejaVuSansCondensed", size = 5.131] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.131] InsnDescMisc9, - #[name = "MJBFWM+DejaVuSansCondensed", size = 5.516] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.516] InsnDescMisc10, - #[name = "MJBFWM+DejaVuSansCondensed", size = 5.604] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.604] InsnDescMisc11, - #[name = "MJBFWM+DejaVuSansCondensed", size = 5.634] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.634] InsnDescMisc12, - #[name = "MJBFWM+DejaVuSansCondensed", size = 5.906] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.906] InsnDescMisc13, - #[name = "MJBFWM+DejaVuSansCondensed", size = 6.033] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.033] InsnDescMisc14, - #[name = "MJBFWM+DejaVuSansCondensed", size = 6.068] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.068] InsnDescMisc15, - #[name = "MJBFWM+DejaVuSansCondensed", size = 6.213] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213] InsnDescMisc16, - #[name = "MJBFWM+DejaVuSansCondensed", size = 6.252] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252] InsnDescMisc17, - #[name = "MJBFWM+DejaVuSansCondensed", size = 6.962] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962] InsnDescMisc18, - #[name = "MJBFWM+DejaVuSansCondensed", size = 7.977] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977] InsnDescMisc19, }, #[group] InsnDescCode { - #[name = "APUYSQ+zcoN-Regular", size = 6.974] + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 6.974] InsnDescCode, }, #[group] InsnDescCodeMisc { - #[name = "APUYSQ+zcoN-Regular", size = 3.587] + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 3.587] InsnDescCodeMisc0, - #[name = "APUYSQ+zcoN-Regular", size = 4.483] + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 4.483] InsnDescCodeMisc1, }, #[group] InsnDescItalic { - #[name = "CGMSHV+DejaVuSansCondensed-Oblique", size = 8.966] + #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 8.966] InsnDescItalic, }, #[group] InsnDescBold { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.966] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.966] InsnDescBold, }, #[group] InsnDescBoldItalic { - #[name = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 8.966] + #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 8.966] InsnDescBoldItalic, }, #[group] InsnDescSmall { - #[name = "MJBFWM+DejaVuSansCondensed", size = 7.97] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.97] InsnDescSmall, }, #[group] InsnDescSmallItalic { - #[name = "CGMSHV+DejaVuSansCondensed-Oblique", size = 7.97] + #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 7.97] InsnDescSmallItalic, }, #[group] InsnDescSmallBold { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 7.97] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 7.97] InsnDescSmallBold, }, #[group] InsnDescSmallBoldItalic { - #[name = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 7.97] + #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 7.97] InsnDescSmallBoldItalic, }, #[group] InsnDescBoldMisc { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.21] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.21] InsnDescBoldMisc0, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.399] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.399] InsnDescBoldMisc1, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.763] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.763] InsnDescBoldMisc2, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.946] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.946] InsnDescBoldMisc3, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.949] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.949] InsnDescBoldMisc4, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.999] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 2.999] InsnDescBoldMisc5, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.065] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.065] InsnDescBoldMisc6, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.086] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.086] InsnDescBoldMisc7, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.183] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.183] InsnDescBoldMisc8, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.686] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.686] InsnDescBoldMisc9, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.744] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.744] InsnDescBoldMisc10, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.825] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.825] InsnDescBoldMisc11, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.842] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.842] InsnDescBoldMisc12, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.857] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.857] InsnDescBoldMisc13, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.979] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 3.979] InsnDescBoldMisc14, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.032] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.032] InsnDescBoldMisc15, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.112] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.112] InsnDescBoldMisc16, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.161] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.161] InsnDescBoldMisc17, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.206] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.206] InsnDescBoldMisc18, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.353] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.353] InsnDescBoldMisc19, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.378] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.378] InsnDescBoldMisc20, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.434] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.434] InsnDescBoldMisc21, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.595] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.595] InsnDescBoldMisc22, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.619] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.619] InsnDescBoldMisc23, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.647] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.647] InsnDescBoldMisc24, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.68] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.68] InsnDescBoldMisc25, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.693] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.693] InsnDescBoldMisc26, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.736] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.736] InsnDescBoldMisc27, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.781] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.781] InsnDescBoldMisc28, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.802] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.802] InsnDescBoldMisc29, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.995] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 4.995] InsnDescBoldMisc30, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.201] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.201] InsnDescBoldMisc31, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.258] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.258] InsnDescBoldMisc32, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.363] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.363] InsnDescBoldMisc33, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.442] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.442] InsnDescBoldMisc34, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.473] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.473] InsnDescBoldMisc35, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.485] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.485] InsnDescBoldMisc36, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.512] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.512] InsnDescBoldMisc37, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.543] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.543] InsnDescBoldMisc38, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.613] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.613] InsnDescBoldMisc39, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.744] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.744] InsnDescBoldMisc40, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.774] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.774] InsnDescBoldMisc41, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.809] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.809] InsnDescBoldMisc42, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.849] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.849] InsnDescBoldMisc43, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.911] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.911] InsnDescBoldMisc44, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.92] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.92] InsnDescBoldMisc45, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.962] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.962] InsnDescBoldMisc46, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.981] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.981] InsnDescBoldMisc47, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.146] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.146] InsnDescBoldMisc48, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.213] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.213] InsnDescBoldMisc49, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.221] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.221] InsnDescBoldMisc50, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.243] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.243] InsnDescBoldMisc51, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.55] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.55] InsnDescBoldMisc52, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.62] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.62] InsnDescBoldMisc53, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.699] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.699] InsnDescBoldMisc54, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.725] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.725] InsnDescBoldMisc55, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.751] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.751] InsnDescBoldMisc56, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.856] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.856] InsnDescBoldMisc57, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.029] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.029] InsnDescBoldMisc58, - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.406] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 8.406] InsnDescBoldMisc59, }, #[group] InsnDescSubscript { - #[name = "MJBFWM+DejaVuSansCondensed", size = 5.978] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 5.978] InsnDescSubscript, }, #[group] InsnDescBoldSubscript { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.978] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 5.978] InsnDescBoldSubscript, }, #[group] InsnDescItalicSubscript { - #[name = "CGMSHV+DejaVuSansCondensed-Oblique", size = 5.978] + #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 5.978] InsnDescItalicSubscript, }, #[group] InsnDescBoldItalicSubscript { - #[name = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 5.978] + #[name_with_tag = "YDJYQV+DejaVuSansCondensed-BoldOblique", size = 5.978] InsnDescBoldItalicSubscript, }, #[group] InsnExtMnemonic { - #[name = "APUYSQ+zcoN-Regular", size = 8.966] + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 8.966] InsnExtMnemonic, }, #[group] InsnCode { - #[name = "APUYSQ+zcoN-Regular", size = 7.97] + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 7.97] InsnCode0, - #[name = "RRFUNA+CMSY8", size = 7.97] + #[name_with_tag = "RRFUNA+CMSY8", size = 7.97] InsnCode1, - #[name = "HPXOZC+CMSS8", size = 7.97] + #[name_with_tag = "HPXOZC+CMSS8", size = 7.97] InsnCode2, }, #[group] InsnCodeSubscript { - #[name = "APUYSQ+zcoN-Regular", size = 5.978] + #[name_with_tag = "APUYSQ+zcoN-Regular", size = 5.978] InsnCodeSubscript0, - #[name = "DBQTKF+CMSY6", size = 5.978] + #[name_with_tag = "DBQTKF+CMSY6", size = 5.978] InsnCodeSubscript1, }, #[group] TitlePageBig { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 24.787] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 24.787] TitlePageBig, }, #[group] TitlePageVersion { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 9.963] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 9.963] TitlePageVersion, }, #[group] TitlePageTm { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.974] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 6.974] TitlePageTm, }, #[group] TitlePageRev { - #[name = "MJBFWM+DejaVuSansCondensed", size = 6.974] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.974] TitlePageRev, }, #[group] TitlePageBook { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 20.663] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 20.663] TitlePageBook, }, #[group] LegalPageItalic { - #[name = "CGMSHV+DejaVuSansCondensed-Oblique", size = 9.963] + #[name_with_tag = "CGMSHV+DejaVuSansCondensed-Oblique", size = 9.963] LegalPageItalic, }, #[group] ChangeSummaryPageBold { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 11.955] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 11.955] ChangeSummaryPageBold, }, #[group] ChapterTitle { - #[name = "NHUPPK+DejaVuSansCondensed-Bold", size = 17.215] + #[name_with_tag = "NHUPPK+DejaVuSansCondensed-Bold", size = 17.215] ChapterTitle, }, #[group] MathMisc { - #[name = "AAJMKT+CMMI6", size = 5.978] + #[name_with_tag = "AAJMKT+CMMI6", size = 5.978] MathMisc0, - #[name = "CUTMFD+CMSSI8", size = 5.978] + #[name_with_tag = "CUTMFD+CMSSI8", size = 5.978] MathMisc1, - #[name = "CUTMFD+CMSSI8", size = 7.97] + #[name_with_tag = "CUTMFD+CMSSI8", size = 7.97] MathMisc2, - #[name = "FZTIYT+CMMI9", size = 5.734] + #[name_with_tag = "FZTIYT+CMMI9", size = 5.734] MathMisc3, - #[name = "FZTIYT+CMMI9", size = 7.168] + #[name_with_tag = "FZTIYT+CMMI9", size = 7.168] MathMisc4, - #[name = "HONFQS+CMMI8", size = 7.97] + #[name_with_tag = "HONFQS+CMMI8", size = 7.97] MathMisc5, - #[name = "HPXOZC+CMSS8", size = 5.978] + #[name_with_tag = "HPXOZC+CMSS8", size = 5.978] MathMisc6, - #[name = "LLVRDD+CMSY10", size = 11.955] + #[name_with_tag = "LLVRDD+CMSY10", size = 11.955] MathMisc7, - #[name = "ZJTMSG+CMSS9", size = 7.168] + #[name_with_tag = "ZJTMSG+CMSS9", size = 7.168] MathMisc8, }, } @@ -503,21 +544,6 @@ impl Font { self.size() * const { 3.985 / Font::InsnCode0.size() } } const fn line_height_helper(&self) -> f32 { - const fn str_eq(a: &str, b: &str) -> bool { - let a = a.as_bytes(); - let b = b.as_bytes(); - if a.len() != b.len() { - return false; - } - let mut i = 0; - while i < a.len() { - if a[i] != b[i] { - return false; - } - i += 1; - } - true - } let font_name = self.font_name(); let mut i = 0; while i < KnownFontGroup::INSN_CODE_FONT_GROUPS.len() { From 45e8925d345663f2859a44ecacdccbe06a2d1f67 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Sun, 4 Jan 2026 18:33:41 -0800 Subject: [PATCH 02/11] use Display for errors rather than Debug --- src/main.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index d2e2c1b..63ddb9f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2235,7 +2235,7 @@ impl Page { } } -fn main() -> Result<(), Box> { +fn main_inner() -> Result<(), Box> { let args: Vec = std::env::args().collect(); let page_numbers: Option>>> = if 2 < args.len() { Some(if let Some((start, end)) = args[2].split_once(":") { @@ -2277,3 +2277,13 @@ fn main() -> Result<(), Box> { std::fs::write("powerisa-instructions.xml", output)?; Ok(()) } + +fn main() -> std::process::ExitCode { + match main_inner() { + Ok(()) => std::process::ExitCode::SUCCESS, + Err(e) => { + println!("Error: {e}"); + std::process::ExitCode::FAILURE + } + } +} From 442afe5f0650dc645aee0dd77d857215a4f3a7ef Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Sun, 4 Jan 2026 20:01:13 -0800 Subject: [PATCH 03/11] wip --- src/main.rs | 419 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 412 insertions(+), 7 deletions(-) diff --git a/src/main.rs b/src/main.rs index 63ddb9f..0b5aae3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,12 +5,15 @@ use crate::quad_tree::QuadTree; use indexmap::IndexSet; use non_nan_float::NonNaNF32; use std::{ + backtrace::Backtrace, borrow::{Borrow, Cow}, cell::RefCell, collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + convert::Infallible, error::Error, fmt, num::NonZero, + ops::ControlFlow, rc::Rc, sync::OnceLock, }; @@ -1902,6 +1905,45 @@ struct Parser { insns: Vec, } +enum ExtractInsnsError { + InsnParseError(String, std::backtrace::Backtrace), + PageParseError(String, std::backtrace::Backtrace), + Other(Box), +} + +impl fmt::Display for ExtractInsnsError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let backtrace = match self { + ExtractInsnsError::InsnParseError(msg, backtrace) => { + writeln!(f, "instruction parse error: {msg}")?; + backtrace + } + ExtractInsnsError::PageParseError(msg, backtrace) => { + writeln!(f, "page parse error: {msg}")?; + backtrace + } + ExtractInsnsError::Other(e) => return fmt::Display::fmt(&e, f), + }; + backtrace.fmt(f) + } +} + +#[derive(Clone, Debug)] +struct ErrorWithNote { + error: E, + note: String, +} + +impl fmt::Display for ErrorWithNote { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { error, note } = self; + fmt::Display::fmt(error, f)?; + write!(f, "\nnote: {note}") + } +} + +impl Error for ErrorWithNote {} + impl Parser { fn new() -> Self { Self { @@ -1971,14 +2013,14 @@ impl Parser { } } } - fn note_text_section( + fn note_text_section( &mut self, - f: impl FnOnce(&mut Self) -> Result<(), Box>, - ) -> Result<(), Box> { + f: impl FnOnce(&mut Self) -> Result<(), E>, + ) -> Result<(), ErrorWithNote> { let start_text_section = self.text_section; match f(self) { Ok(()) => Ok(()), - Err(e) => { + Err(error) => { let note = if self.text_section == start_text_section { format!("text_section={:?}", self.text_section) } else { @@ -1987,12 +2029,375 @@ impl Parser { self.text_section ) }; - Err(format!("{e}\nnote: {note}").into()) + Err(ErrorWithNote { error, note }) } } } - fn parse_text_section(&mut self) -> Result<(), Box> { - todo!() + fn parse_text_section(&mut self) -> Result<(), ErrorWithNote>> { + match self.note_text_section(Self::extract_insns) { + Ok(()) => Ok(()), + Err( + e @ ErrorWithNote { + error: + ExtractInsnsError::InsnParseError(_) | ExtractInsnsError::PageParseError(_), + .. + }, + ) => { + println!("{e}"); + Ok(()) + } + Err(ErrorWithNote { + error: ExtractInsnsError::Other(error), + note, + }) => Err(ErrorWithNote { error, note }), + } + } + fn find_top_left_char_in_range( + &mut self, + min_x: f32, + max_x: f32, + min_y: f32, + max_y: f32, + allow_processed: bool, + ) -> Result, Box> { + let mut retval = None; + let page = self.page()?; + let unprocessed_chars = self.unprocessed_chars()?; + let ControlFlow::::Continue(()) = + page.qt[&self.text_section].range(min_x, max_x, min_y, max_y, |x, y, ch| { + let PageItem::Char(ch) = ch else { + return ControlFlow::Continue(()); + }; + if !allow_processed && !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch) + { + return ControlFlow::Continue(()); + } + match &mut retval { + None => retval = Some(ch.clone()), + Some(retval) + if ch.min_x.get() - ch.min_y.get() + < retval.min_x.get() - retval.min_y.get() => + { + *retval = ch.clone(); + } + Some(_) => {} + } + ControlFlow::Continue(()) + }); + Ok(retval) + } + fn extract_text_line( + &mut self, + start_char: Option, + mut start_min_y: f32, + min_x: f32, + max_x: f32, + fonts: TextLineFonts, + preceding_blank_lines: u32, + mut skip_initial_spaces: bool, + allowed_start_min_y_error: Option, + ) -> Result, ExtractInsnsError> { + let mut chars: Vec = Vec::new(); + let mut chars_set: IndexSet = IndexSet::new(); + if let Some(start_char) = start_char.clone() { + chars.push(start_char.clone()); + chars_set.insert(start_char); + } + if let Some(start_char) = start_char + && start_char.text == "*" + && self.text_section.page_num == 168 + && fonts + .subscript() + .is_some_and(|v| v.contains(&start_char.font)) + { + start_min_y = start_char.max_y.get() - fonts.regular()[0].size(); + } + let page = self.page().map_err(ExtractInsnsError::Other)?; + let unprocessed_chars = self.unprocessed_chars().map_err(ExtractInsnsError::Other)?; + let ControlFlow::::Continue(()) = page.qt[&self.text_section].range( + min_x - fonts.regular()[0].size() * 0.5, + max_x, + start_min_y - fonts.regular()[0].size() * 0.4, + start_min_y + fonts.regular()[0].size() * 0.6, + |x, y, ch| { + let PageItem::Char(ch) = ch else { + return ControlFlow::Continue(()); + }; + if !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch) + || chars_set.contains(ch) + { + return ControlFlow::Continue(()); + } + chars_set.insert(ch.clone()); + chars.push(ch.clone()); + ControlFlow::Continue(()) + }, + ); + if chars.is_empty() { + return Ok(None); + } + chars.sort_by(|a, b| (a.min_x, &a.text).cmp(&(b.min_x, &b.text))); + let mut regular_min_y = chars[0].min_y.get(); + let mut regular_max_y = chars[0].max_y.get(); + for ch in &chars { + let Some(kind) = fonts.get_kind(ch.font.clone(), BaselinePos::Below) else { + continue; + }; + if kind.sub_super() == FontVariantSubSuper::NotSubSuper { + regular_min_y = ch.min_y.get(); + regular_max_y = ch.max_y.get(); + break; + } + } + let mut retval = ParsedTextLine { + element: xml_tree::Element::new("text-line".into(), []), + regular_min_y, + regular_max_y, + fonts, + chars, + preceding_blank_lines, + }; + let mut text_and_tag_stacks: Vec<(String, Vec<&str>)> = Vec::new(); + let mut last_max_x = min_x; + let mut last_kind = None; + let mut last_char: Option = None; + for ch in &retval.chars { + let baseline_pos = if (ch.max_y.get() + ch.min_y.get()) * 0.5 + > (retval.regular_max_y + retval.regular_min_y) * 0.5 + { + BaselinePos::Above + } else { + BaselinePos::Below + }; + let Some(kind) = fonts.get_kind(ch.font.clone(), baseline_pos) else { + println!( + "font kind is None:\n\ + regular_min_y={}\n\ + fonts={fonts:?}\n\ + ch={ch:?}\n\ + baseline_pos={baseline_pos:?}\n\ + chars[0]={:?}", + retval.regular_min_y, retval.chars[0], + ); + return Ok(None); + }; + let space_kind = match last_kind { + None => kind, + Some(last_kind) if last_kind != kind => TextLineFontKind::Regular, + _ => kind, + }; + let (space_fonts, _) = fonts + .get_fonts(space_kind) + .unwrap_or((fonts.regular(), None)); + let space_width = ch.min_x.get() - last_max_x; + let space_count_f = space_width / space_fonts[0].space_width(); + let mut space_count = space_count_f.round() as usize; + if space_count == 0 && space_count_f > 0.35 { + space_count = 1 + } + if space_count_f > 0.25 && f32::abs(space_count as f32 - space_count_f) > 0.15 { + println!("spaces: space_count_f={space_count_f} space_width={space_width}"); + } + if space_count > 0 && !skip_initial_spaces { + text_and_tag_stacks.push(( + " ".repeat(space_count), + space_kind.text_line_tags().collect(), + )); + } + skip_initial_spaces = false; + if ch.text == "\u{0338}" + && let Some(last_char) = last_char + && last_char.text == "=" + && f32::abs(ch.min_x.get() - last_char.min_x.get()) < 0.01 + && f32::abs(ch.min_y.get() - last_char.min_y.get()) < 0.01 + { + *text_and_tag_stacks + .last_mut() + .expect("known to be non-empty") = ("\u{2260}".into(), Vec::new()); + last_max_x = last_char.max_x.get(); + } else { + let char_text = match &*ch.text { + "\u{fb00}" => "ff", + "\u{fb01}" => "fi", + "\u{fb02}" => "fl", + "\u{fb03}" => "ffi", + "\u{fb04}" => "ffl", + v => v, + }; + text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect())); + last_max_x = ch.max_x.get(); + } + last_kind = Some(kind); + last_char = Some(ch.clone()); + } + ElementBodyBuilder::scope( + &mut ElementBodyBuilder::new(&mut retval.element), + |body_builder| { + for (text, tag_stack) in text_and_tag_stacks { + body_builder.set_tag_stack(tag_stack); + body_builder.write_text(text) + } + }, + ); + for ch in &retval.chars { + RefCell::borrow_mut(&*unprocessed_chars) + .get_mut(&ch.font) + .expect("known to exist") + .shift_remove(ch); + } + let allowed_start_min_y_error = allowed_start_min_y_error.unwrap_or(0.01); + if f32::abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error { + return Err(ExtractInsnsError::PageParseError( + format!( + "start_min_y={start_min_y} regular_min_y={}\n\ + start_min_y error: {}\n\ + allowed_start_min_y_error={allowed_start_min_y_error}", + retval.regular_min_y, + start_min_y - retval.regular_min_y, + ), + Backtrace::capture(), + )); + } + Ok(Some(retval)) + } + /*fn extract_insn(&mut self, header_start_char: Char) -> Result { + assert_eq!(header_start_char.font, Font::InsnHeader); + println!("{header_start_char:?}"); + let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields( + header_start_char.min_y.get(), + header_start_char, + )? else { + return Err(ExtractInsnsError::PageParseError("can't find header text line".into(), Backtrace::capture())); + }; + let next_start_min_y = header.min_y.get() - 5.0; + let mut headers = vec![header]; + let mut code_lines: Vec = Vec::new(); + let mut desc_lines: Vec = Vec::new(); + let mut sp_regs_altered = None; + loop { + let search_min_y = next_start_min_y - 70.0; + let Some(next_char) = self.find_top_left_char_in_range( + min_x=self.text_section.min_x.get() - 5.0, + max_x=self.text_section.max_x.get() + 5.0, + min_y=max(search_min_y, self.text_section.min_y), + max_y=next_start_min_y, + allow_processed=False, + )?; + if next_char is None: + if search_min_y <= self.text_section.min_y \ + and self.text_section.next is not None and \ + self.text_section.next.page_num in self.pages: + # go to next section + self.text_section = self.text_section.next + next_start_min_y = self.text_section.max_y + continue + else: + raise InsnParseError("can't find insn code or description text") + match next_char.font: + case font if font in TextLineFonts.INSN_CODE_FONTS.fonts: + next_section = _InsnParseSection.CODE + case font if font in TextLineFonts.INSN_DESC_FONTS.fonts: + next_section = _InsnParseSection.DESC + case Font.INSN_HEADER: + next_section = _InsnParseSection.HEADER + case font: + raise InsnParseError(f"can't find insn code or description text\nfont={font}") + match next_section: + case _InsnParseSection.CODE: + if len(desc_lines) != 0: + break + code_line = self.extract_text_line( + start_char=next_char, + start_min_y=next_char.min_y, + min_x=next_char.min_x, + max_x=self.text_section.max_x, + fonts=TextLineFonts.INSN_CODE_FONTS, + preceding_blank_lines=0 if len(code_lines) == 0 else 1, + ) + if code_line is None: + raise InsnParseError("can't find insn code text line") + more_code_lines = self.extract_following_text_lines( + first_text_line=code_line, + min_x=code_line.chars[0].min_x, + max_x=self.text_section.max_x, + allowed_start_min_y_error=0.05, + ) + print("more insn code lines:") + print("\n".join(map(str, more_code_lines))) + code_lines.extend(more_code_lines) + next_start_min_y = code_lines[-1].regular_min_y - 5 + case _InsnParseSection.HEADER: + if len(code_lines) != 0 or len(desc_lines) != 0: + break + header = self.extract_insn_header_mnemonics_and_bit_fields( + start_min_y=next_char.min_y, + header_start_char=next_char, + ) + if header is None: + raise InsnParseError("can't find header text line") + headers.append(header) + next_start_min_y = header.min_y - 5 + case _InsnParseSection.DESC: + desc_line = self.extract_text_line( + start_char=next_char, + start_min_y=next_char.min_y, + min_x=next_char.min_x, + max_x=self.text_section.max_x, + fonts=TextLineFonts.INSN_DESC_FONTS, + preceding_blank_lines=0 if len(desc_lines) == 0 else 1, + allowed_start_min_y_error=3, + ) + if desc_line is None: + raise InsnParseError("can't find insn desc text line") + match desc_line.get_header_text(): + case None: + more_desc_lines = self.extract_following_text_lines( + first_text_line=desc_line, + min_x=desc_line.chars[0].min_x, + max_x=self.text_section.max_x, + allowed_start_min_y_error=3.5, + ) + print("more insn desc lines:") + print("\n".join(map(str, more_desc_lines))) + desc_lines.extend(more_desc_lines) + next_start_min_y = desc_lines[-1].regular_min_y - 5 + case "Special Registers Altered:": + sp_regs_altered = self.extract_insn_sp_regs_altered( + sp_regs_altered_text=desc_line, + ) + next_start_min_y = sp_regs_altered.final_regular_min_y + break + case header_text: + raise AssertionError(f"unhandled header text: {header_text!r}\n{desc_line}") + case _: + assert_never(next_section) + } + print("insn code lines:") + print("\n".join(map(str, code_lines))) + print("insn desc lines:") + print("\n".join(map(str, desc_lines))) + print("sp_regs_altered:") + print(sp_regs_altered) + # TODO: finish + return Insn( + headers=tuple(headers), + code_lines=tuple(code_lines), + desc_lines=tuple(desc_lines), + sp_regs_altered=sp_regs_altered, + ) + }*/ + fn extract_insns(&mut self) -> Result<(), ExtractInsnsError> { + loop { + let Some(header_start_char) = + RefCell::borrow(&*self.unprocessed_chars().map_err(ExtractInsnsError::Other)?) + .get(&Font::InsnHeader) + .and_then(|v| v.first().cloned()) + else { + return Ok(()); + }; + let insn = self.extract_insn(header_start_char)?; + self.insns.push(insn); + } } } From e9830566c0f61a14225c28dbb9d765ddcc98cae6 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Mon, 5 Jan 2026 06:57:29 -0800 Subject: [PATCH 04/11] fix typo in parse_powerisa_pdf.py --- parse_powerisa_pdf/parse_powerisa_pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index a4afd09..409c6ac 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -1503,7 +1503,7 @@ class Parser: f"instruction bit fields box has wrong number of horizontal lines:\n{h_lines}") if len(v_lines) < 2: raise InsnParseError( - f"instruction bit fields box has too few vertical lines:\n{h_lines}") + f"instruction bit fields box has too few vertical lines:\n{v_lines}") bottom_line, top_line = h_lines box_min_x = v_lines[0].x0 box_max_x = v_lines[-1].x0 From 3d66c853f60f0a1483b5520ea1df6ea7ac8386ff Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Mon, 5 Jan 2026 09:30:06 -0800 Subject: [PATCH 05/11] wip --- src/main.rs | 870 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 844 insertions(+), 26 deletions(-) diff --git a/src/main.rs b/src/main.rs index 0b5aae3..5182add 100644 --- a/src/main.rs +++ b/src/main.rs @@ -44,6 +44,12 @@ mod non_nan_float { pub(crate) const fn get(self) -> f32 { self.0 } + pub(crate) const fn min(self, other: Self) -> Self { + Self(self.0.min(other.0)) + } + pub(crate) const fn max(self, other: Self) -> Self { + Self(self.0.max(other.0)) + } } impl std::hash::Hash for NonNaNF32 { @@ -1484,10 +1490,95 @@ enum PageItem { LineOrRect(LineOrRect), } -#[derive(Clone, Debug)] +#[derive(Copy, Clone, Debug)] enum LineOrRect { - Line(()), - Rect(()), + Line(Line), + Rect(Rect), +} + +impl LineOrRect { + fn width(self) -> f32 { + match self { + Self::Line(v) => v.width(), + Self::Rect(v) => v.width(), + } + } + fn height(self) -> f32 { + match self { + Self::Line(v) => v.height(), + Self::Rect(v) => v.height(), + } + } + fn min_x(self) -> NonNaNF32 { + match self { + Self::Line(v) => v.min_x(), + Self::Rect(v) => v.min_x, + } + } + fn max_x(self) -> NonNaNF32 { + match self { + Self::Line(v) => v.max_x(), + Self::Rect(v) => v.max_x, + } + } + fn min_y(self) -> NonNaNF32 { + match self { + Self::Line(v) => v.min_y(), + Self::Rect(v) => v.min_y, + } + } + fn max_y(self) -> NonNaNF32 { + match self { + Self::Line(v) => v.max_y(), + Self::Rect(v) => v.max_y, + } + } +} + +#[derive(Copy, Clone, Debug)] +struct Line { + p0_x: NonNaNF32, + p0_y: NonNaNF32, + p1_x: NonNaNF32, + p1_y: NonNaNF32, +} + +impl Line { + fn width(self) -> f32 { + f32::abs(self.p0_x.get() - self.p1_x.get()) + } + fn height(self) -> f32 { + f32::abs(self.p0_y.get() - self.p1_y.get()) + } + fn min_x(self) -> NonNaNF32 { + self.p0_x.min(self.p1_x) + } + fn max_x(self) -> NonNaNF32 { + self.p0_x.max(self.p1_x) + } + fn min_y(self) -> NonNaNF32 { + self.p0_y.min(self.p1_y) + } + fn max_y(self) -> NonNaNF32 { + self.p0_y.max(self.p1_y) + } +} + +#[derive(Copy, Clone, Debug)] +struct Rect { + min_x: NonNaNF32, + max_x: NonNaNF32, + min_y: NonNaNF32, + max_y: NonNaNF32, +} + +impl Rect { + fn width(self) -> f32 { + self.max_x.get() - self.min_x.get() + } + fn height(self) -> f32 { + self.max_y.get() - self.min_y.get() + } } #[derive(Debug)] @@ -2039,7 +2130,7 @@ impl Parser { Err( e @ ErrorWithNote { error: - ExtractInsnsError::InsnParseError(_) | ExtractInsnsError::PageParseError(_), + ExtractInsnsError::InsnParseError(..) | ExtractInsnsError::PageParseError(..), .. }, ) => { @@ -2064,7 +2155,7 @@ impl Parser { let page = self.page()?; let unprocessed_chars = self.unprocessed_chars()?; let ControlFlow::::Continue(()) = - page.qt[&self.text_section].range(min_x, max_x, min_y, max_y, |x, y, ch| { + page.qt[&self.text_section].range(min_x, max_x, min_y, max_y, |_x, _y, ch| { let PageItem::Char(ch) = ch else { return ControlFlow::Continue(()); }; @@ -2119,7 +2210,7 @@ impl Parser { max_x, start_min_y - fonts.regular()[0].size() * 0.4, start_min_y + fonts.regular()[0].size() * 0.6, - |x, y, ch| { + |_x, _y, ch| { let PageItem::Char(ch) = ch else { return ControlFlow::Continue(()); }; @@ -2260,6 +2351,597 @@ impl Parser { } Ok(Some(retval)) } + fn extract_following_text_lines( + &mut self, + first_text_line: ParsedTextLine, + min_x: f32, + max_x: f32, + allowed_start_min_y_error: Option, + ) -> Result, ExtractInsnsError> { + let mut retval = Vec::new(); + let fonts = first_text_line.fonts; + let mut line = Some(first_text_line); + while let Some(cur_line) = line { + let start_min_y = cur_line.regular_min_y - fonts.regular()[0].line_height(); + retval.push(cur_line); + line = self.extract_text_line( + None, + start_min_y, + min_x, + max_x, + fonts, + 0, + false, + allowed_start_min_y_error, + )?; + } + return Ok(retval); + } + fn extract_insn_bit_fields( + &mut self, + mnemonic_lines: &[ParsedTextLine], + ) -> Result, ExtractInsnsError> { + let mut found_non_affix_line = false; + let [.., last_mnemonic_line] = mnemonic_lines else { + unreachable!(); + }; + let expected_non_affix_line_y = last_mnemonic_line.regular_min_y + - if mnemonic_lines.len() > 1 { + INSN_BIT_FIELDS_TOP_PAD_HEIGHT2 + } else { + INSN_BIT_FIELDS_TOP_PAD_HEIGHT + }; + let page = self.page().map_err(ExtractInsnsError::Other)?; + let _ = page.qt[&self.text_section].range( + self.text_section.min_x.get() - 5.0, + self.text_section.max_x.get() + 5.0, + expected_non_affix_line_y - 5.0, + expected_non_affix_line_y + 5.0, + |_x, _y, line| { + let PageItem::LineOrRect(LineOrRect::Line(line)) = line else { + return ControlFlow::Continue(()); + }; + if line.width() > line.height() { + found_non_affix_line = true; + return ControlFlow::Break(()); + } + ControlFlow::Continue(()) + }, + ); + if found_non_affix_line { + return self.extract_insn_bit_fields_box(expected_non_affix_line_y); + }; + let prefix_text = self.extract_text_line( + None, + last_mnemonic_line.regular_min_y - INSN_BIT_FIELDS_PREFIX_TEXT_TOP_PAD_HEIGHT, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + TextLineFonts::InsnBitFieldsAffixTitleFonts, + 0, + true, + Some(2.0), + )?; + let Some(prefix_text) = prefix_text else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn prefix bit fields title".into(), + Backtrace::capture(), + )); + }; + let prefix_text_str = prefix_text.element.inner_text(); + if prefix_text_str != "Prefix:" { + return Err(ExtractInsnsError::InsnParseError( + format!("insn prefix bit fields title is not as expected: {prefix_text_str:?}"), + Backtrace::capture(), + )); + } + let prefix_bit_fields = self.extract_insn_bit_fields_box( + prefix_text.regular_min_y - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, + )?; + let Some(prefix_bit_fields) = prefix_bit_fields else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn prefix bit fields".into(), + Backtrace::capture(), + )); + }; + let suffix_text = self.extract_text_line( + None, + prefix_bit_fields.box_min_y - INSN_BIT_FIELDS_PREFIX_BOX_BOTTOM_TO_SUFFIX_TEXT_HEIGHT, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + TextLineFonts::InsnBitFieldsAffixTitleFonts, + 0, + true, + Some(2.0), + )?; + let Some(suffix_text) = suffix_text else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn suffix bit fields title".into(), + Backtrace::capture(), + )); + }; + let suffix_text_str = suffix_text.element.inner_text(); + if suffix_text_str != "Suffix:" { + return Err(ExtractInsnsError::InsnParseError( + format!("insn suffix bit fields title is not as expected: {suffix_text_str:?}"), + Backtrace::capture(), + )); + } + let suffix_bit_fields = self.extract_insn_bit_fields_box( + suffix_text.regular_min_y - INSN_BIT_FIELDS_AFFIX_TEXT_TO_BOX_TOP_HEIGHT, + )?; + let Some(suffix_bit_fields) = suffix_bit_fields else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn suffix bit fields".into(), + Backtrace::capture(), + )); + }; + return Ok(Some(InsnBitFields { + prefix: Some(InsnBitFieldsPrefix { + box_min_x: prefix_bit_fields.box_min_x, + box_min_y: prefix_bit_fields.box_min_y, + box_max_x: prefix_bit_fields.box_max_x, + box_max_y: prefix_bit_fields.box_max_y, + prefix_text: prefix_text, + fields: prefix_bit_fields.fields, + suffix_text: suffix_text, + }), + box_min_x: suffix_bit_fields.box_min_x, + box_min_y: suffix_bit_fields.box_min_y, + box_max_x: suffix_bit_fields.box_max_x, + box_max_y: suffix_bit_fields.box_max_y, + fields: suffix_bit_fields.fields, + })); + } + fn extract_insn_bit_fields_box( + &mut self, + expected_box_max_y: f32, + ) -> Result, ExtractInsnsError> { + let mut h_lines = Vec::new(); + let mut v_lines = Vec::new(); + let page = self.page().map_err(ExtractInsnsError::Other)?; + let ControlFlow::::Continue(()) = page.qt[&self.text_section].range( + self.text_section.min_x.get() - 5.0, + self.text_section.max_x.get() + 5.0, + expected_box_max_y - INSN_BIT_FIELDS_BOX_HEIGHT - 5.0, + expected_box_max_y + 5.0, + |_x, _y, line| { + let PageItem::LineOrRect(LineOrRect::Line(line)) = *line else { + return ControlFlow::Continue(()); + }; + if line.width() > line.height() { + h_lines.push(line); + } else { + v_lines.push(line); + } + ControlFlow::Continue(()) + }, + ); + h_lines.sort_by_key(|line| line.min_y()); + v_lines.sort_by_key(|line| line.min_x()); + for i in (0..v_lines.len().saturating_sub(1)).rev() { + if f32::abs(v_lines[i].min_x().get() - v_lines[i + 1].min_x().get()) < 0.5 { + v_lines.remove(i + 1); // remove duplicates + } + } + if h_lines.is_empty() && v_lines.is_empty() { + return Ok(None); + } + let [bottom_line, top_line] = &*h_lines else { + return Err(ExtractInsnsError::InsnParseError( + format!( + "instruction bit fields box has wrong number of horizontal lines:\n{h_lines:?}" + ), + Backtrace::capture(), + )); + }; + let [leftmost_line, .., rightmost_line] = &*v_lines else { + return Err(ExtractInsnsError::InsnParseError( + format!("instruction bit fields box has too few vertical lines:\n{v_lines:?}"), + Backtrace::capture(), + )); + }; + let box_min_x = leftmost_line.min_x().get(); + let box_max_x = rightmost_line.min_x().get(); + let box_min_y = bottom_line.min_y().get(); + let box_max_y = top_line.max_y().get(); + let box_mid_y = (box_min_y + box_max_y) * 0.5; + println!("bottom_line={bottom_line:?}"); + println!("top_line={top_line:?}"); + println!("{v_lines:?}"); + let mut fields = Vec::new(); + for i in 0..v_lines.len() - 1 { + let left_line = v_lines[i]; + let right_line = v_lines[i + 1]; + let field_box_min_x = left_line.max_x().get(); + let field_box_max_x = right_line.min_x().get(); + let bit_field_name_start_min_y = box_mid_y + 3.288; + let bit_field_name = self.extract_text_line( + None, + bit_field_name_start_min_y, + field_box_min_x, + field_box_max_x, + TextLineFonts::InsnBitFieldNameFonts, + 0, + true, + Some(0.4), + )?; + let Some(bit_field_name) = bit_field_name else { + return Err(ExtractInsnsError::InsnParseError( + format!( + "instruction bit field name not found:\n\ + start_min_y={bit_field_name_start_min_y} \ + field_box_min_x={field_box_min_x} \ + field_box_max_x={field_box_max_x}" + ), + Backtrace::capture(), + )); + }; + let bit_field_number_start_min_y = box_min_y + 3.487; + let bit_number = self.extract_text_line( + None, + bit_field_number_start_min_y, + field_box_min_x, + field_box_max_x, + TextLineFonts::InsnBitFieldBitNumberFonts, + 0, + true, + None, + )?; + let Some(bit_number) = bit_number else { + return Err(ExtractInsnsError::InsnParseError( + format!( + "instruction bit field bit number not found:\n\ + start_min_y={bit_field_number_start_min_y} \ + field_box_min_x={field_box_min_x} \ + field_box_max_x={field_box_max_x}" + ), + Backtrace::capture(), + )); + }; + fields.push(InsnBitField { + box_min_x: field_box_min_x, + box_max_x: field_box_max_x, + name: bit_field_name, + bit_number: bit_number, + }); + } + return Ok(Some(InsnBitFields { + prefix: None, + box_min_x, + box_min_y, + box_max_x, + box_max_y, + fields, + })); + } + fn extract_insn_header_mnemonics_and_bit_fields( + &mut self, + start_min_y: f32, + header_start_char: Option, + ) -> Result, ExtractInsnsError> { + assert!( + header_start_char + .as_ref() + .is_none_or(|v| v.font == Font::InsnHeader) + ); + let Some(header_line) = self.extract_text_line( + header_start_char, + start_min_y, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + TextLineFonts::InsnHeaderFonts, + 0, + true, + Some(6.0), + )? + else { + return Ok(None); + }; + println!("found header line:\n{header_line}"); + let header_lines = self.extract_following_text_lines( + header_line, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + Some(1.5), + )?; + println!("insn header lines:"); + for header_line in &header_lines { + println!("{header_line}"); + } + let [.., last_header_line] = &*header_lines else { + unreachable!(); + }; + let Some(mnemonic_start_char) = self + .find_top_left_char_in_range( + self.text_section.min_x.get() - 5.0, + self.text_section.max_x.get() + 5.0, + last_header_line.regular_min_y - 50.0, + last_header_line.regular_min_y - 5.0, + false, + ) + .map_err(ExtractInsnsError::Other)? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn mnemonic text line".into(), + Backtrace::capture(), + )); + }; + let mnemonic_start_char_min_y = mnemonic_start_char.min_y.get(); + let Some(mnemonic_line) = self.extract_text_line( + Some(mnemonic_start_char), + mnemonic_start_char_min_y, + self.text_section.min_x.get(), + self.text_section.max_x.get(), + TextLineFonts::InsnMnemonicFonts, + 0, + true, + None, + )? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn mnemonic text line".into(), + Backtrace::capture(), + )); + }; + let mnemonic_line_first_char_min_x = mnemonic_line.chars[0].min_x.get(); + let mnemonic_lines = self.extract_following_text_lines( + mnemonic_line, + mnemonic_line_first_char_min_x, + self.text_section.max_x.get(), + None, + )?; + println!("insn mnemonic lines:"); + for mnemonic_line in &mnemonic_lines { + println!("{mnemonic_line}"); + } + let Some(insn_bit_fields) = self.extract_insn_bit_fields(&mnemonic_lines)? else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn bit fields".into(), + Backtrace::capture(), + )); + }; + println!("{insn_bit_fields}"); + return Ok(Some(InsnHeader { + header_lines, + mnemonic_lines, + bit_fields: insn_bit_fields, + })); + } + fn extract_insn_sp_regs_altered( + &mut self, + mut sp_regs_altered_text: ParsedTextLine, + ) -> Result { + sp_regs_altered_text.preceding_blank_lines = 0; + let fonts = TextLineFonts::InsnDescFonts; + let column_min_x = sp_regs_altered_text.chars[0].min_x.get(); + let Some(table_header_reg_char) = self + .find_top_left_char_in_range( + column_min_x - 1.0, + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 1.0, + sp_regs_altered_text.regular_min_y - 30.0, + sp_regs_altered_text.regular_min_y - 5.0, + false, + ) + .map_err(ExtractInsnsError::Other)? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find special registers altered table's register-column's header".into(), + Backtrace::capture(), + )); + }; + const KNOWN_SPECIAL_TEXTS: &[&str] = &[ + "None", + "Dependent on the system service", + "See above.", + "See Table 5.1", + ]; + match &*table_header_reg_char.text { + "R" => {} + text if KNOWN_SPECIAL_TEXTS.iter().any(|i| text == &i[..1]) => { + let start_min_y = table_header_reg_char.min_y.get(); + let special_text = self.extract_text_line( + Some(table_header_reg_char), + start_min_y, + column_min_x, + self.text_section.max_x.get(), + fonts, + 0, + true, + None, + )?; + let special_text = match special_text { + Some(special_text) + if KNOWN_SPECIAL_TEXTS.contains(&&*special_text.element.text) => + { + special_text + } + _ => return Err(ExtractInsnsError::Other( + format!( + "can't find special-registers-altered special-text:\n{special_text:?}" + ) + .into(), + )), + }; + let final_regular_min_y = special_text.regular_min_y; + return Ok(InsnSpRegsAltered { + sp_regs_altered_text, + special_text: Some(special_text), + table_header_reg: None, + table_header_fields: None, + entries: vec![], + final_regular_min_y, + }); + } + text => { + return Err(ExtractInsnsError::InsnParseError( + format!( + "unknown special-registers-altered special-text start character: {text:?}" + ), + Backtrace::capture(), + )); + } + } + let Some(table_header_fields_char) = self + .find_top_left_char_in_range( + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_COLUMN_X - 10.0, + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, + table_header_reg_char.min_y.get() - 5.0, + table_header_reg_char.min_y.get() + 5.0, + false, + ) + .map_err(ExtractInsnsError::Other)? + else { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's fields-column's header".into(), + )); + }; + if table_header_fields_char.text != "F" { + return Err(ExtractInsnsError::Other( + format!( + "can't find special registers altered table's fields-column's header:\n\ + table_header_fields_char={table_header_fields_char:?}" + ) + .into(), + )); + } + let columns_x_bounds = [ + ( + table_header_reg_char.min_x.get(), + table_header_fields_char.min_x.get() - 1.0, + ), + ( + table_header_fields_char.min_x.get(), + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, + ), + ( + column_min_x + INSN_SP_REGS_ALTERED_FIELDS_CONDS_SPLIT_X, + self.text_section.max_x.get(), + ), + ]; + let start_min_y = table_header_reg_char.min_y.get(); + let Some(table_header_reg) = self.extract_text_line( + Some(table_header_reg_char), + start_min_y, + columns_x_bounds[0].0, + columns_x_bounds[0].1, + fonts, + 0, + false, + None, + )? + else { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's register-column's header".into(), + )); + }; + let table_header_reg_text = table_header_reg.element.inner_text(); + if table_header_reg_text != "Register" { + return Err(ExtractInsnsError::Other( + format!( + "can't find special registers altered table's register-column's header:\n\ + table_header_reg_text={table_header_reg_text:?}" + ) + .into(), + )); + } + let start_min_y = table_header_fields_char.min_y.get(); + let Some(table_header_fields) = self.extract_text_line( + Some(table_header_fields_char), + start_min_y, + columns_x_bounds[1].0, + columns_x_bounds[1].1, + fonts, + 0, + false, + None, + )? + else { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's fields-column's header".into(), + )); + }; + let table_header_fields_text = table_header_fields.element.inner_text(); + if table_header_reg_text != "Field(s)" { + return Err(ExtractInsnsError::Other( + format!( + "can't find special registers altered table's fields-column's header:\n\ + table_header_fields_text={table_header_fields_text:?}" + ) + .into(), + )); + } + let mut regular_min_y = table_header_reg.regular_min_y; + let mut entries = Vec::new(); + let mut cur_reg = None; + let mut cur_fields = Vec::new(); + let mut cur_conds = Vec::new(); + loop { + let mut row = [None, None, None]; + let mut next_regular_min_y = None; + for (i, (min_x, max_x)) in columns_x_bounds.into_iter().enumerate() { + row[i] = self.extract_text_line( + None, + regular_min_y - fonts.regular()[0].line_height(), + min_x, + max_x, + fonts, + 0, + true, + Some(2.0), + )?; + if let Some(cell) = &row[i] + && next_regular_min_y.is_none() + { + next_regular_min_y = Some(cell.regular_min_y); + } + } + match next_regular_min_y { + Some(v) => regular_min_y = v, + None => break, + } + let [cur_reg_cell, cur_fields_cell, cur_conds_cell] = row; + if cur_reg_cell.is_none() { + if cur_reg.is_none() { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's first register".into(), + )); + } + cur_fields.extend(cur_fields_cell); + cur_conds.extend(cur_conds_cell); + continue; + } + if let Some(cur_reg) = cur_reg { + entries.push(InsnSpRegsAlteredEntry { + reg: cur_reg, + fields: cur_fields, + conds: cur_conds, + }); + cur_fields = Vec::new(); + cur_conds = Vec::new(); + } + cur_reg = cur_reg_cell; + cur_fields.extend(cur_fields_cell); + cur_conds.extend(cur_conds_cell); + } + let Some(cur_reg) = cur_reg else { + return Err(ExtractInsnsError::Other( + "can't find special registers altered table's first register".into(), + )); + }; + entries.push(InsnSpRegsAlteredEntry { + reg: cur_reg, + fields: cur_fields, + conds: cur_conds, + }); + return Ok(InsnSpRegsAltered { + sp_regs_altered_text: sp_regs_altered_text, + special_text: None, + table_header_reg: Some(table_header_reg), + table_header_fields: Some(table_header_fields), + entries, + final_regular_min_y: regular_min_y, + }); + } /*fn extract_insn(&mut self, header_start_char: Char) -> Result { assert_eq!(header_start_char.font, Font::InsnHeader); println!("{header_start_char:?}"); @@ -2401,57 +3083,192 @@ impl Parser { } } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug)] struct MyDevice { + page_num: u32, qt: Rc>>>, unprocessed_non_text: Rc>>, } +impl MyDevice { + fn new(page_num: u32) -> Self { + Self { + page_num, + qt: Default::default(), + unprocessed_non_text: Default::default(), + } + } + fn path(&mut self, path: &mupdf::Path, cmt: mupdf::Matrix) { + enum Walker { + Empty, + Moved { x: f32, y: f32 }, + Line(Line), + Rect { x1: f32, y1: f32, x2: f32, y2: f32 }, + NotRecognized, + } + fn new_line(p0_x: f32, p0_y: f32, p1_x: f32, p1_y: f32) -> Option { + Some(Line { + p0_x: NonNaNF32::new(p0_x)?, + p0_y: NonNaNF32::new(p0_y)?, + p1_x: NonNaNF32::new(p1_x)?, + p1_y: NonNaNF32::new(p1_y)?, + }) + } + impl mupdf::PathWalker for Walker { + fn move_to(&mut self, x: f32, y: f32) { + *self = match *self { + Walker::Empty | Walker::Moved { .. } => Walker::Moved { x, y }, + Walker::Line(_) | Walker::Rect { .. } | Walker::NotRecognized => { + Walker::NotRecognized + } + }; + } + fn line_to(&mut self, x: f32, y: f32) { + *self = match *self { + Walker::Empty => Walker::NotRecognized, + Walker::Moved { x: p0_x, y: p0_y } => new_line(p0_x, p0_y, x, y) + .map(Walker::Line) + .unwrap_or(Walker::NotRecognized), + Walker::Line(_) | Walker::Rect { .. } | Walker::NotRecognized => { + Walker::NotRecognized + } + }; + } + fn curve_to(&mut self, _cx1: f32, _cy1: f32, _cx2: f32, _cy2: f32, _ex: f32, _ey: f32) { + *self = Walker::NotRecognized; + } + fn close(&mut self) {} + fn rect(&mut self, x1: f32, y1: f32, x2: f32, y2: f32) { + *self = match *self { + Walker::Empty => Walker::Rect { x1, y1, x2, y2 }, + Walker::Moved { .. } + | Walker::Line(..) + | Walker::Rect { .. } + | Walker::NotRecognized => Walker::NotRecognized, + }; + } + } + let mut walker = Walker::Empty; + let Ok(()) = path.walk(&mut walker) else { + return; + }; + let component = match walker { + Walker::Empty | Walker::Moved { .. } | Walker::NotRecognized => return, + Walker::Line(Line { + p0_x, + p0_y, + p1_x, + p1_y, + }) => { + let mupdf::Point { x: p0_x, y: p0_y } = mupdf::Point { + x: p0_x.get(), + y: p0_y.get(), + } + .transform(&cmt); + let mupdf::Point { x: p1_x, y: p1_y } = mupdf::Point { + x: p1_x.get(), + y: p1_y.get(), + } + .transform(&cmt); + let Some(line) = new_line(p0_x, p0_y, p1_x, p1_y) else { + return; + }; + LineOrRect::Line(line) + } + Walker::Rect { x1, y1, x2, y2 } => { + let p1 = mupdf::Point { x: x1, y: y1 }.transform(&cmt); + let p2 = mupdf::Point { x: x2, y: y1 }.transform(&cmt); + let p3 = mupdf::Point { x: x2, y: y2 }.transform(&cmt); + let p4 = mupdf::Point { x: x1, y: y2 }.transform(&cmt); + let min_x = NonNaNF32::new(p1.x.min(p2.x).min(p3.x).min(p4.x)); + let max_x = NonNaNF32::new(p1.x.max(p2.x).max(p3.x).max(p4.x)); + let min_y = NonNaNF32::new(p1.y.min(p2.y).min(p3.y).min(p4.y)); + let max_y = NonNaNF32::new(p1.y.max(p2.y).max(p3.y).max(p4.y)); + let (Some(min_x), Some(max_x), Some(min_y), Some(max_y)) = + (min_x, max_x, min_y, max_y) + else { + return; + }; + LineOrRect::Rect(Rect { + min_x, + max_x, + min_y, + max_y, + }) + } + }; + if component.width() > 100.0 + && component.min_x().get() < COLUMN_SPLIT_X - 10.0 + && component.max_x().get() > COLUMN_SPLIT_X + 10.0 + { + println!("wide component: {component:?}"); + } else { + println!("component: {component:?}"); + } + let text_section = TextSection::for_position( + self.page_num, + (component.min_x().get() + component.max_x().get()) * 0.5, + (component.min_y().get() + component.max_y().get()) * 0.5, + ); + if let Some(text_section) = text_section { + self.qt + .borrow_mut() + .entry(text_section) + .or_default() + .insert( + component.min_x().get(), + component.min_y().get(), + PageItem::LineOrRect(component), + ); + } + } +} + impl mupdf::NativeDevice for MyDevice { fn fill_path( &mut self, path: &mupdf::Path, - even_odd: bool, + _even_odd: bool, cmt: mupdf::Matrix, - color_space: &mupdf::Colorspace, - color: &[f32], - alpha: f32, - cp: mupdf::ColorParams, + _color_space: &mupdf::Colorspace, + _color: &[f32], + _alpha: f32, + _cp: mupdf::ColorParams, ) { - // TODO + self.path(path, cmt); } fn stroke_path( &mut self, path: &mupdf::Path, - stroke_state: &mupdf::StrokeState, + _stroke_state: &mupdf::StrokeState, cmt: mupdf::Matrix, - color_space: &mupdf::Colorspace, - color: &[f32], - alpha: f32, - cp: mupdf::ColorParams, + _color_space: &mupdf::Colorspace, + _color: &[f32], + _alpha: f32, + _cp: mupdf::ColorParams, ) { - // TODO + self.path(path, cmt); } fn clip_path( &mut self, path: &mupdf::Path, - even_odd: bool, + _even_odd: bool, cmt: mupdf::Matrix, - scissor: mupdf::Rect, + _scissor: mupdf::Rect, ) { - // TODO + self.path(path, cmt); } fn clip_stroke_path( &mut self, path: &mupdf::Path, - stroke_state: &mupdf::StrokeState, + _stroke_state: &mupdf::StrokeState, cmt: mupdf::Matrix, - scissor: mupdf::Rect, + _scissor: mupdf::Rect, ) { - // TODO + self.path(path, cmt); } } @@ -2528,12 +3345,13 @@ impl Page { page_num: u32, page: mupdf::Page, ) -> Result> { - let device = MyDevice::default(); + let device = MyDevice::new(page_num); page.run( &mupdf::Device::from_native(device.clone())?, &mupdf::Matrix::IDENTITY, )?; let MyDevice { + page_num, qt, unprocessed_non_text, } = device; From c58bc23904f659d2bb0e1c79e4563b837ea90a9a Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Mon, 5 Jan 2026 09:41:56 -0800 Subject: [PATCH 06/11] wip --- src/main.rs | 296 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 182 insertions(+), 114 deletions(-) diff --git a/src/main.rs b/src/main.rs index 5182add..2e9e391 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2942,132 +2942,200 @@ impl Parser { final_regular_min_y: regular_min_y, }); } - /*fn extract_insn(&mut self, header_start_char: Char) -> Result { + fn extract_insn(&mut self, header_start_char: Char) -> Result { assert_eq!(header_start_char.font, Font::InsnHeader); println!("{header_start_char:?}"); let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields( header_start_char.min_y.get(), - header_start_char, - )? else { - return Err(ExtractInsnsError::PageParseError("can't find header text line".into(), Backtrace::capture())); + Some(header_start_char), + )? + else { + return Err(ExtractInsnsError::PageParseError( + "can't find header text line".into(), + Backtrace::capture(), + )); }; - let next_start_min_y = header.min_y.get() - 5.0; + let mut next_start_min_y = header.min_y() - 5.0; let mut headers = vec![header]; let mut code_lines: Vec = Vec::new(); let mut desc_lines: Vec = Vec::new(); let mut sp_regs_altered = None; loop { let search_min_y = next_start_min_y - 70.0; - let Some(next_char) = self.find_top_left_char_in_range( - min_x=self.text_section.min_x.get() - 5.0, - max_x=self.text_section.max_x.get() + 5.0, - min_y=max(search_min_y, self.text_section.min_y), - max_y=next_start_min_y, - allow_processed=False, - )?; - if next_char is None: - if search_min_y <= self.text_section.min_y \ - and self.text_section.next is not None and \ - self.text_section.next.page_num in self.pages: - # go to next section - self.text_section = self.text_section.next - next_start_min_y = self.text_section.max_y - continue - else: - raise InsnParseError("can't find insn code or description text") - match next_char.font: - case font if font in TextLineFonts.INSN_CODE_FONTS.fonts: - next_section = _InsnParseSection.CODE - case font if font in TextLineFonts.INSN_DESC_FONTS.fonts: - next_section = _InsnParseSection.DESC - case Font.INSN_HEADER: - next_section = _InsnParseSection.HEADER - case font: - raise InsnParseError(f"can't find insn code or description text\nfont={font}") - match next_section: - case _InsnParseSection.CODE: - if len(desc_lines) != 0: - break - code_line = self.extract_text_line( - start_char=next_char, - start_min_y=next_char.min_y, - min_x=next_char.min_x, - max_x=self.text_section.max_x, - fonts=TextLineFonts.INSN_CODE_FONTS, - preceding_blank_lines=0 if len(code_lines) == 0 else 1, - ) - if code_line is None: - raise InsnParseError("can't find insn code text line") - more_code_lines = self.extract_following_text_lines( - first_text_line=code_line, - min_x=code_line.chars[0].min_x, - max_x=self.text_section.max_x, - allowed_start_min_y_error=0.05, - ) - print("more insn code lines:") - print("\n".join(map(str, more_code_lines))) - code_lines.extend(more_code_lines) - next_start_min_y = code_lines[-1].regular_min_y - 5 - case _InsnParseSection.HEADER: - if len(code_lines) != 0 or len(desc_lines) != 0: - break - header = self.extract_insn_header_mnemonics_and_bit_fields( - start_min_y=next_char.min_y, - header_start_char=next_char, - ) - if header is None: - raise InsnParseError("can't find header text line") - headers.append(header) - next_start_min_y = header.min_y - 5 - case _InsnParseSection.DESC: - desc_line = self.extract_text_line( - start_char=next_char, - start_min_y=next_char.min_y, - min_x=next_char.min_x, - max_x=self.text_section.max_x, - fonts=TextLineFonts.INSN_DESC_FONTS, - preceding_blank_lines=0 if len(desc_lines) == 0 else 1, - allowed_start_min_y_error=3, - ) - if desc_line is None: - raise InsnParseError("can't find insn desc text line") - match desc_line.get_header_text(): - case None: - more_desc_lines = self.extract_following_text_lines( - first_text_line=desc_line, - min_x=desc_line.chars[0].min_x, - max_x=self.text_section.max_x, - allowed_start_min_y_error=3.5, - ) - print("more insn desc lines:") - print("\n".join(map(str, more_desc_lines))) - desc_lines.extend(more_desc_lines) - next_start_min_y = desc_lines[-1].regular_min_y - 5 - case "Special Registers Altered:": - sp_regs_altered = self.extract_insn_sp_regs_altered( - sp_regs_altered_text=desc_line, - ) - next_start_min_y = sp_regs_altered.final_regular_min_y - break - case header_text: - raise AssertionError(f"unhandled header text: {header_text!r}\n{desc_line}") - case _: - assert_never(next_section) + let Some(next_char) = self + .find_top_left_char_in_range( + self.text_section.min_x.get() - 5.0, + self.text_section.max_x.get() + 5.0, + search_min_y.max(self.text_section.min_y.get()), + next_start_min_y, + false, + ) + .map_err(ExtractInsnsError::Other)? + else { + if search_min_y <= self.text_section.min_y.get() + && self + .pages + .get(self.text_section.next().page_num) + .map_err(ExtractInsnsError::Other)? + .is_some() + { + // go to next section + self.text_section = self.text_section.next(); + next_start_min_y = self.text_section.max_y.get(); + continue; + } else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn code or description text".into(), + Backtrace::capture(), + )); + } + }; + let next_section = match &next_char.font { + font if TextLineFonts::InsnCodeFonts.fonts().contains(font) => { + InsnParseSection::Code + } + font if TextLineFonts::InsnDescFonts.fonts().contains(font) => { + InsnParseSection::Desc + } + Font::InsnHeader => InsnParseSection::Header, + font => { + return Err(ExtractInsnsError::InsnParseError( + format!("can't find insn code or description text\nfont={font:?}"), + Backtrace::capture(), + )); + } + }; + match next_section { + InsnParseSection::Code => { + if !desc_lines.is_empty() { + break; + } + let start_min_y = next_char.min_y.get(); + let min_x = next_char.min_x.get(); + let Some(code_line) = self.extract_text_line( + Some(next_char), + start_min_y, + min_x, + self.text_section.max_x.get(), + TextLineFonts::InsnCodeFonts, + if code_lines.is_empty() { 0 } else { 1 }, + false, + None, + )? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn code text line".into(), + Backtrace::capture(), + )); + }; + let min_x = code_line.chars[0].min_x.get(); + let more_code_lines = self.extract_following_text_lines( + code_line, + min_x, + self.text_section.max_x.get(), + Some(0.05), + )?; + println!("more insn code lines:"); + for i in &more_code_lines { + println!("{i}"); + } + code_lines.extend(more_code_lines); + let Some(last) = code_lines.last() else { + unreachable!() + }; + next_start_min_y = last.regular_min_y - 5.0; + } + InsnParseSection::Header => { + if !(code_lines.is_empty() && desc_lines.is_empty()) { + break; + } + let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields( + next_char.min_y.get(), + Some(next_char), + )? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find header text line".into(), + Backtrace::capture(), + )); + }; + next_start_min_y = header.min_y() - 5.0; + headers.push(header); + } + InsnParseSection::Desc => { + let start_min_y = next_char.min_y.get(); + let min_x = next_char.min_x.get(); + let Some(desc_line) = self.extract_text_line( + Some(next_char), + start_min_y, + min_x, + self.text_section.max_x.get(), + TextLineFonts::InsnDescFonts, + if desc_lines.is_empty() { 0 } else { 1 }, + false, + Some(3.0), + )? + else { + return Err(ExtractInsnsError::InsnParseError( + "can't find insn desc text line".into(), + Backtrace::capture(), + )); + }; + match desc_line.get_header_text() { + None => { + let min_x = desc_line.chars[0].min_x.get(); + let more_desc_lines = self.extract_following_text_lines( + desc_line, + min_x, + self.text_section.max_x.get(), + Some(3.5), + )?; + println!("more insn desc lines:"); + for i in &more_desc_lines { + println!("{i}"); + } + desc_lines.extend(more_desc_lines); + next_start_min_y = desc_lines + .last() + .expect("known to be non-empty") + .regular_min_y + - 5.0; + } + Some(header_text) if header_text == "Special Registers Altered:" => { + let new_sp_regs_altered = + self.extract_insn_sp_regs_altered(desc_line)?; + next_start_min_y = new_sp_regs_altered.final_regular_min_y; + sp_regs_altered = Some(new_sp_regs_altered); + break; + } + Some(header_text) => { + return Err(ExtractInsnsError::Other( + format!("unhandled header text: {header_text:?}\n{desc_line}") + .into(), + )); + } + } + } + } } - print("insn code lines:") - print("\n".join(map(str, code_lines))) - print("insn desc lines:") - print("\n".join(map(str, desc_lines))) - print("sp_regs_altered:") - print(sp_regs_altered) - # TODO: finish - return Insn( - headers=tuple(headers), - code_lines=tuple(code_lines), - desc_lines=tuple(desc_lines), - sp_regs_altered=sp_regs_altered, - ) - }*/ + println!("insn code lines:"); + for i in &code_lines { + println!("{i}"); + } + println!("insn desc lines:"); + for i in &desc_lines { + println!("{i}"); + } + println!("sp_regs_altered:"); + println!("{sp_regs_altered:?}"); + // TODO: finish + return Ok(Insn { + headers, + code_lines, + desc_lines, + sp_regs_altered, + }); + } fn extract_insns(&mut self) -> Result<(), ExtractInsnsError> { loop { let Some(header_start_char) = From fcf1c63cb7700abd3ff6fd593f7807809c39893b Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Mon, 5 Jan 2026 11:27:52 -0800 Subject: [PATCH 07/11] wip --- Cargo.lock | 1 + Cargo.toml | 1 + parse_powerisa_pdf/parse_powerisa_pdf.py | 18 ++++-- src/main.rs | 77 +++++++++++++++++------- 4 files changed, 68 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0281106..e329500 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -225,6 +225,7 @@ dependencies = [ "indexmap", "libm", "mupdf", + "mupdf-sys", "quick-xml", "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 224dad3..3de7338 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,5 +14,6 @@ rust-version = "1.89.0" indexmap = "2.12.1" libm = "0.2.15" mupdf = { version = "0.5.0", default-features = false } +mupdf-sys = { version = "0.5.0", default-features = false } quick-xml = { version = "0.38.4", features = ["serialize"] } serde = { version = "1.0.228", features = ["derive"] } diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index 409c6ac..3c2afe5 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -765,7 +765,7 @@ class Page: unprocessed_non_text: SetById[LTLine | LTRect] @staticmethod - def from_lt_page(page_num: int, page: LTPage) -> Page: + def from_lt_page(page_num: int, page: LTPage, first_seen_fonts: defaultdict[str, set[float]]) -> Page: qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree) unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char])) unprocessed_non_text: SetById[LTLine | LTRect] = SetById() @@ -804,20 +804,25 @@ class Page: raise AssertionError( f"char not in text section: {element}\npage_num={page_num}") continue + font_size = round(element.size, 3) char = Char( text=element.get_text(), - font=Font(font_name=element.fontname, size=round(element.size, 3)), + font=Font(font_name=element.fontname, size=font_size), adv=element.adv, min_x=element.x0, min_y=element.y0, max_x=element.x1, max_y=element.y1, ) + if font_size not in first_seen_fonts[element.fontname]: + first_seen_fonts[element.fontname].add(font_size) + print(f"first seen font: {element.fontname!r} {font_size}: page {page_num} {char!r}") qt[text_section].insert(char.min_x, char.min_y, char) unprocessed_chars[text_section][char.font].add(char) - for i in unprocessed_chars.values(): - for j in i.values(): - j.sort(key=Char.top_down_left_to_right_sort_key) + for text_section, i in unprocessed_chars.items(): + for chars in i.values(): + chars.sort(key=Char.top_down_left_to_right_sort_key) + print(f"first char: {text_section!r}: {next(iter(chars), None)!r}") unknown_fonts=[] unknown_font_errors=[] for i in unprocessed_chars.values(): @@ -1181,13 +1186,14 @@ class Parser: def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: if page_numbers is not None: page_numbers = sorted(i - 1 for i in page_numbers) + first_seen_fonts = defaultdict(set) for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): if page_numbers is not None: page_num = page_numbers[i] + 1 else: page_num = i + 1 print(f"page {page_num}") - yield Page.from_lt_page(page_num=page_num, page=page) + yield Page.from_lt_page(page_num=page_num, page=page, first_seen_fonts=first_seen_fonts) def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None): self.pages = Pages(pages_gen=Parser.__pages_gen( diff --git a/src/main.rs b/src/main.rs index 2e9e391..e84a5f9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,8 @@ // See Notices.txt for copyright information use crate::quad_tree::QuadTree; -use indexmap::IndexSet; +use indexmap::{IndexMap, IndexSet}; +use mupdf_sys::FZ_STEXT_BOLD; use non_nan_float::NonNaNF32; use std::{ backtrace::Backtrace, @@ -135,7 +136,9 @@ macro_rules! make_enum_font { } const fn new_known(font_name: &str, size: NonNaNF32) -> Option { match size.get() { - $($($known_font_size if str_eq(font_name, const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) }) => Some(Self::$KnownFont),)*)* + $($($known_font_size if str_eq(font_name, const { + Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) + }) => Some(Self::$KnownFont),)*)* _ => None, } } @@ -266,12 +269,16 @@ make_enum_font! { InsnDescMisc15, #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213] InsnDescMisc16, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.238] InsnDescMisc17, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252] InsnDescMisc18, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962] InsnDescMisc19, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977] + InsnDescMisc20, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.506] + InsnDescMisc21, }, #[group] InsnDescCode { @@ -2064,24 +2071,23 @@ impl Parser { file: &str, page_numbers: Option>>, ) -> Result>>>, Box> { - let page_numbers = page_numbers.map(|page_numbers| { - let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1)); + let page_indexes = page_numbers.map(|page_numbers| { + let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1)); retval.sort(); retval }); let document = mupdf::Document::open(file)?; let pages: Vec = document.pages().and_then(|pages| pages.collect())?; - Ok(Box::new(pages.into_iter().enumerate().map( - move |(i, page)| { - let page_num = match &page_numbers { - Some(page_numbers) => page_numbers[i] + 1, - None => i as u32 + 1, - }; - println!("page {page_num}"); - Ok(Page::from_mupdf_page(page_num, page) - .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?) - }, - ))) + let page_indexes = page_indexes.unwrap_or_else(|| (0..pages.len()).collect()); + let mut first_seen_fonts = BTreeMap::new(); + Ok(Box::new(page_indexes.into_iter().map(move |page_index| { + let page_num = page_index as u32 + 1; + println!("page {page_num}"); + Ok( + Page::from_mupdf_page(page_num, &pages[page_index], &mut first_seen_fonts) + .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?, + ) + }))) } fn parse_pdf>>( &mut self, @@ -3411,7 +3417,8 @@ struct MuPdfXmlChar<'a> { impl Page { fn from_mupdf_page( page_num: u32, - page: mupdf::Page, + page: &mupdf::Page, + first_seen_fonts: &mut BTreeMap>, ) -> Result> { let device = MyDevice::new(page_num); page.run( @@ -3439,8 +3446,21 @@ impl Page { const ROUND_FACTOR: f32 = 1000.0; let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR; let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?; - let font = Font::new(&xml_font.name, font_size); for xml_char in xml_font.char { + if xml_char.c.trim().is_empty() { + continue; + } + let font_name = match &*xml_font.name { + "DejaVuSansCondensed-Obli" => { + if (xml_char.flags & FZ_STEXT_BOLD) != 0 { + "DejaVuSansCondensed-BoldOblique" + } else { + "DejaVuSansCondensed-Oblique" + } + } + font_name => font_name, + }; + let font = Font::new(font_name, font_size); let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad; let min_x = x0.min(x1).min(x2).min(x3); let max_x = x0.max(x1).max(x2).max(x3); @@ -3469,6 +3489,16 @@ impl Page { max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?, max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?, }; + let set = match first_seen_fonts.get_mut(font_name) { + Some(v) => v, + None => first_seen_fonts.entry(String::from(font_name)).or_default(), + }; + if set.insert(font_size) { + println!( + "first seen font: {font_name:?} {font_size}: page {page_num} {char:?} {:x}", + xml_char.flags, + ); + } qt.entry(text_section).or_default().insert( min_x, min_y, @@ -3486,9 +3516,10 @@ impl Page { } } } - for i in unprocessed_chars.borrow_mut().values_mut() { - for j in i.borrow_mut().values_mut() { - j.sort_by_key(Char::top_down_left_to_right_sort_key); + for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() { + for chars in i.borrow_mut().values_mut() { + chars.sort_by_key(Char::top_down_left_to_right_sort_key); + println!("first char: {text_section:?}: {:?}", chars.first()); } } let mut unknown_fonts = Vec::new(); From f9a24f4c488d3e05f53676626bef70509cc8fb42 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Mon, 5 Jan 2026 14:17:49 -0800 Subject: [PATCH 08/11] switching to using mupdf-sys directly since mupdf hides all the necessary functionality --- Cargo.lock | 96 -------- Cargo.toml | 1 - src/main.rs | 138 ++++++++---- src/mupdf_ffi.rs | 569 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 666 insertions(+), 138 deletions(-) create mode 100644 src/mupdf_ffi.rs diff --git a/Cargo.lock b/Cargo.lock index e329500..4de1b68 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -154,19 +154,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" -[[package]] -name = "mupdf" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a6499267155b9ae03ff8e53c456d0bfff988b2647d62ff1df038f39ebe93a0c" -dependencies = [ - "bitflags", - "mupdf-sys", - "num_enum", - "once_cell", - "zerocopy", -] - [[package]] name = "mupdf-sys" version = "0.5.0" @@ -190,41 +177,12 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "num_enum" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" -dependencies = [ - "num_enum_derive", - "rustversion", -] - -[[package]] -name = "num_enum_derive" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" -dependencies = [ - "proc-macro-crate", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "once_cell" -version = "1.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" - [[package]] name = "parse_powerisa_pdf" version = "0.1.0" dependencies = [ "indexmap", "libm", - "mupdf", "mupdf-sys", "quick-xml", "serde", @@ -236,15 +194,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "proc-macro-crate" -version = "3.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" -dependencies = [ - "toml_edit", -] - [[package]] name = "proc-macro2" version = "1.0.104" @@ -308,12 +257,6 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" -[[package]] -name = "rustversion" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" - [[package]] name = "serde" version = "1.0.228" @@ -361,36 +304,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "toml_datetime" -version = "0.7.5+spec-1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" -dependencies = [ - "serde_core", -] - -[[package]] -name = "toml_edit" -version = "0.23.10+spec-1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" -dependencies = [ - "indexmap", - "toml_datetime", - "toml_parser", - "winnow", -] - -[[package]] -name = "toml_parser" -version = "1.0.6+spec-1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44" -dependencies = [ - "winnow", -] - [[package]] name = "unicode-ident" version = "1.0.22" @@ -403,15 +316,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "winnow" -version = "0.7.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" -dependencies = [ - "memchr", -] - [[package]] name = "zerocopy" version = "0.8.31" diff --git a/Cargo.toml b/Cargo.toml index 3de7338..09de0ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,6 @@ rust-version = "1.89.0" [dependencies] indexmap = "2.12.1" libm = "0.2.15" -mupdf = { version = "0.5.0", default-features = false } mupdf-sys = { version = "0.5.0", default-features = false } quick-xml = { version = "0.38.4", features = ["serialize"] } serde = { version = "1.0.228", features = ["derive"] } diff --git a/src/main.rs b/src/main.rs index e84a5f9..d2668ee 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,7 @@ use crate::quad_tree::QuadTree; use indexmap::{IndexMap, IndexSet}; -use mupdf_sys::FZ_STEXT_BOLD; +use mupdf_sys::{FZ_STEXT_BOLD, fz_matrix}; use non_nan_float::NonNaNF32; use std::{ backtrace::Backtrace, @@ -19,6 +19,7 @@ use std::{ sync::OnceLock, }; +mod mupdf_ffi; mod quad_tree; mod xml_tree; @@ -1597,13 +1598,13 @@ struct Page { unprocessed_non_text: Rc>>, } -struct Pages { - pages_gen: Option>>>>, +struct Pages<'ctx> { + pages_gen: Option>> + 'ctx>>, pages: BTreeMap>, max_page_num: u32, } -impl fmt::Debug for Pages { +impl<'ctx> fmt::Debug for Pages<'ctx> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let Self { pages_gen, @@ -1621,8 +1622,10 @@ impl fmt::Debug for Pages { } } -impl Pages { - fn new(pages_gen: Option>>>>) -> Self { +impl<'ctx> Pages<'ctx> { + fn new( + pages_gen: Option>> + 'ctx>>, + ) -> Self { Self { pages_gen, pages: BTreeMap::new(), @@ -1997,8 +2000,8 @@ impl Insn { } #[derive(Debug)] -struct Parser { - pages: Pages, +struct Parser<'ctx> { + pages: Pages<'ctx>, text_section: TextSection, insns: Vec, } @@ -2042,7 +2045,7 @@ impl fmt::Display for ErrorWithNote { impl Error for ErrorWithNote {} -impl Parser { +impl<'ctx> Parser<'ctx> { fn new() -> Self { Self { pages: Pages::new(None), @@ -2068,33 +2071,40 @@ impl Parser { .clone()) } fn pages_gen( + ctx: impl Into>, file: &str, page_numbers: Option>>, - ) -> Result>>>, Box> { + ) -> Result>> + 'ctx>, Box> { + let ctx = ctx.into(); let page_indexes = page_numbers.map(|page_numbers| { let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1)); retval.sort(); retval }); - let document = mupdf::Document::open(file)?; - let pages: Vec = document.pages().and_then(|pages| pages.collect())?; - let page_indexes = page_indexes.unwrap_or_else(|| (0..pages.len()).collect()); + let document = mupdf_ffi::Document::open(ctx, &std::ffi::CString::new(file)?)?; + let page_count = document.page_count()?; + let page_indexes = page_indexes.unwrap_or_else(|| (0..page_count).collect()); let mut first_seen_fonts = BTreeMap::new(); Ok(Box::new(page_indexes.into_iter().map(move |page_index| { let page_num = page_index as u32 + 1; println!("page {page_num}"); + let page = document + .load_page(page_index) + .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?; Ok( - Page::from_mupdf_page(page_num, &pages[page_index], &mut first_seen_fonts) + Page::from_mupdf_page(page_num, &page, &mut first_seen_fonts) .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?, ) }))) } fn parse_pdf>>( &mut self, + ctx: impl Into>, file: &str, page_numbers: Option, ) -> Result<(), Box> { self.pages = Pages::new(Some(Self::pages_gen( + ctx, file, page_numbers.map(|v| v.into_iter().collect()), )?)); @@ -3160,7 +3170,7 @@ impl Parser { #[derive(Clone, Debug)] struct MyDevice { page_num: u32, - qt: Rc>>>, + qt: BTreeMap>, unprocessed_non_text: Rc>>, } @@ -3172,7 +3182,7 @@ impl MyDevice { unprocessed_non_text: Default::default(), } } - fn path(&mut self, path: &mupdf::Path, cmt: mupdf::Matrix) { + fn path(&mut self, path: &mupdf_ffi::Path<'_>, cmt: fz_matrix) { enum Walker { Empty, Moved { x: f32, y: f32 }, @@ -3296,54 +3306,98 @@ impl MyDevice { ); } } + fn text(&mut self, text: &mupdf_ffi::Text<'_>, cmt: fz_matrix) { + todo!() + } } -impl mupdf::NativeDevice for MyDevice { +impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice { fn fill_path( &mut self, - path: &mupdf::Path, - _even_odd: bool, - cmt: mupdf::Matrix, - _color_space: &mupdf::Colorspace, - _color: &[f32], - _alpha: f32, - _cp: mupdf::ColorParams, + ctx: mupdf_ffi::ContextRef<'ctx>, + path: &mupdf_ffi::Path<'ctx>, + even_odd: bool, + cmt: fz_matrix, ) { self.path(path, cmt); } fn stroke_path( &mut self, - path: &mupdf::Path, - _stroke_state: &mupdf::StrokeState, - cmt: mupdf::Matrix, - _color_space: &mupdf::Colorspace, - _color: &[f32], - _alpha: f32, - _cp: mupdf::ColorParams, + ctx: mupdf_ffi::ContextRef<'ctx>, + path: &mupdf_ffi::Path<'ctx>, + cmt: fz_matrix, ) { self.path(path, cmt); } fn clip_path( &mut self, - path: &mupdf::Path, - _even_odd: bool, - cmt: mupdf::Matrix, - _scissor: mupdf::Rect, + ctx: mupdf_ffi::ContextRef<'ctx>, + path: &mupdf_ffi::Path<'ctx>, + even_odd: bool, + cmt: fz_matrix, + scissor: mupdf_sys::fz_rect, ) { self.path(path, cmt); } fn clip_stroke_path( &mut self, - path: &mupdf::Path, - _stroke_state: &mupdf::StrokeState, - cmt: mupdf::Matrix, - _scissor: mupdf::Rect, + ctx: mupdf_ffi::ContextRef<'ctx>, + path: &mupdf_ffi::Path<'ctx>, + cmt: fz_matrix, + scissor: mupdf_sys::fz_rect, ) { self.path(path, cmt); } + + fn fill_text( + &mut self, + ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + cmt: fz_matrix, + ) { + self.text(text, cmt); + } + + fn stroke_text( + &mut self, + ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + cmt: fz_matrix, + ) { + self.text(text, cmt); + } + + fn clip_text( + &mut self, + ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + cmt: fz_matrix, + scissor: mupdf_sys::fz_rect, + ) { + self.text(text, cmt); + } + + fn clip_stroke_text( + &mut self, + ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + cmt: fz_matrix, + scissor: mupdf_sys::fz_rect, + ) { + self.text(text, cmt); + } + + fn ignore_text( + &mut self, + ctx: mupdf_ffi::ContextRef<'ctx>, + text: &mupdf_ffi::Text<'ctx>, + cmt: fz_matrix, + ) { + self.text(text, cmt); + } } #[derive(serde::Deserialize, Debug)] @@ -3417,12 +3471,12 @@ struct MuPdfXmlChar<'a> { impl Page { fn from_mupdf_page( page_num: u32, - page: &mupdf::Page, + page: &mupdf_ffi::Page<'_>, first_seen_fonts: &mut BTreeMap>, ) -> Result> { let device = MyDevice::new(page_num); page.run( - &mupdf::Device::from_native(device.clone())?, + &mupdf_ffi::Device::new(page.ctx(), Box::new(device))?, &mupdf::Matrix::IDENTITY, )?; let MyDevice { @@ -3438,6 +3492,8 @@ impl Page { RefCell>>>>>, > = Rc::default(); // we convert to xml and parse that becuase the mupdf rust crate doesn't include all the API surface we need. + let json = page.stext_page_as_json_from_page(1.0)?; + todo!("{json}"); let xml = page.to_xml()?; let MuPdfXml::Page(xml_page) = quick_xml::de::from_str(&xml)?; for xml_block in xml_page.block { diff --git a/src/mupdf_ffi.rs b/src/mupdf_ffi.rs new file mode 100644 index 0000000..1a0e669 --- /dev/null +++ b/src/mupdf_ffi.rs @@ -0,0 +1,569 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +// See Notices.txt for copyright information + +use mupdf_sys::{ + fz_clone_context, fz_color_params, fz_colorspace, fz_context, fz_device, fz_document, + fz_drop_context, fz_drop_device, fz_drop_document, fz_drop_page, fz_drop_path, fz_drop_text, + fz_error_type_FZ_ERROR_GENERIC, fz_matrix, fz_page, fz_path, fz_rect, fz_stroke_state, fz_text, + mupdf_document_page_count, mupdf_drop_error, mupdf_error_t, mupdf_load_page, + mupdf_new_base_context, mupdf_new_derived_device, mupdf_open_document, mupdf_run_page, +}; +use std::{ + cell::Cell, + ffi::{CStr, CString, c_int}, + fmt, + marker::PhantomData, + mem::ManuallyDrop, + ptr::{self, NonNull}, + sync::{Mutex, OnceLock}, +}; + +#[derive(Debug)] +struct MuPdfError { + type_: c_int, + message: CString, +} + +impl MuPdfError { + fn new_generic(message: impl ToString) -> Self { + Self { + type_: fz_error_type_FZ_ERROR_GENERIC as _, + message: message.try_into().expect("nul byte in message"), + } + } +} + +impl fmt::Display for MuPdfError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "MuPDF error: type: {}, message: {}", + self.type_, self.message + ) + } +} + +impl std::error::Error for MuPdfError {} + +struct OwnedMuPdfError(NonNull); + +impl Drop for OwnedMuPdfError { + fn drop(&mut self) { + unsafe { + mupdf_drop_error(self.0.as_ptr()); + } + } +} + +unsafe fn mupdf_try(f: impl FnOnce(&mut *mut mupdf_error_t) -> R) -> Result { + let mut err = <*mut mupdf_error_t>::null_mut(); + let retval = f(&mut err); + let Some(err) = NonNull::new(err).map(OwnedMuPdfError) else { + return Ok(retval); + }; + unsafe { + Err(MuPdfError { + type_: (*err.0).type_, + message: CString::from(CStr::from_ptr((*err.0).message)), + }) + } +} + +pub(crate) struct Context(NonNull); + +impl Context { + fn new() -> Self { + struct BaseContext(NonNull); + unsafe impl Send for BaseContext {} + static CTX: OnceLock> = OnceLock::new(); + let base = CTX + .get_or_init(|| { + let ctx = unsafe { mupdf_new_base_context() }; + let Some(ctx) = NonNull::new(ctx).map(BaseContext) else { + panic!("failed to allocate a MuPDF context"); + }; + Mutex::new(ctx) + }) + .lock() + .expect("not poisoned"); + let ctx = unsafe { fz_clone_context(base.0.as_ptr()) }; + let Some(ctx) = NonNull::new(ctx).map(Self) else { + drop(base); + panic!("failed to clone a MuPDF context"); + }; + ctx + } + pub(crate) fn with(f: impl FnOnce(&Self) -> R) -> R { + thread_local! { + static CTX: Context = Context::new(); + } + CTX.with(f) + } + pub(crate) fn as_ref(&self) -> ContextRef<'_> { + ContextRef(self.0, PhantomData) + } +} + +impl Drop for Context { + fn drop(&mut self) { + unsafe { + fz_drop_context(self.0.as_ptr()); + } + } +} + +#[derive(Clone, Copy)] +pub(crate) struct ContextRef<'ctx>(NonNull, PhantomData<&'ctx Context>); + +impl<'ctx> From<&'ctx Context> for ContextRef<'ctx> { + fn from(value: &'ctx Context) -> Self { + value.as_ref() + } +} + +pub(crate) struct Document<'ctx> { + ptr: *mut fz_document, + ctx: ContextRef<'ctx>, +} + +impl<'ctx> Document<'ctx> { + pub(crate) fn open( + ctx: impl Into>, + file_name: &CStr, + ) -> Result, MuPdfError> { + let ctx = ctx.into(); + unsafe { + mupdf_try(|errptr| mupdf_open_document(ctx.0.as_ptr(), file_name.as_ptr(), errptr)) + .map(|ptr| Document { ptr, ctx }) + } + } + pub(crate) fn page_count(&self) -> Result { + unsafe { + mupdf_try(|errptr| mupdf_document_page_count(self.ctx.0.as_ptr(), self.ptr, errptr))? + .try_into() + .map_err(MuPdfError::new_generic) + } + } + pub(crate) fn load_page(&self, page: usize) -> Result, MuPdfError> { + let page = page.try_into().map_err(MuPdfError::new_generic)?; + unsafe { + mupdf_try(|errptr| mupdf_load_page(self.ctx.0.as_ptr(), self.ptr, page, errptr)) + .map(|ptr| Page { ptr, ctx: self.ctx }) + } + } +} + +impl<'ctx> Drop for Document<'ctx> { + fn drop(&mut self) { + unsafe { + fz_drop_document(self.ctx.0.as_ptr(), self.ptr); + } + } +} + +pub(crate) struct Page<'ctx> { + ptr: *mut fz_page, + ctx: ContextRef<'ctx>, +} + +impl<'ctx> Page<'ctx> { + pub(crate) fn ctx(&self) -> ContextRef<'ctx> { + self.ctx + } + pub(crate) fn run( + &self, + device: &Device<'ctx, T>, + ctm: fz_matrix, + ) -> Result<(), MuPdfError> { + unsafe { + mupdf_try(|errptr| { + mupdf_run_page( + self.ctx.0.as_ptr(), + self.ptr, + device.dev, + ctm, + ptr::null_mut(), + errptr, + ) + }) + } + } +} + +impl<'ctx> Drop for Page<'ctx> { + fn drop(&mut self) { + unsafe { + fz_drop_page(self.ctx.0.as_ptr(), self.ptr); + } + } +} + +pub(crate) struct Device<'ctx, T: 'ctx> { + dev: *mut fz_device, + ctx: ContextRef<'ctx>, + _phantom: PhantomData>>, +} + +pub(crate) trait DeviceCallbacks<'ctx> { + fn fill_path(&self, ctx: ContextRef<'ctx>, path: &Path<'ctx>, even_odd: bool, cmt: fz_matrix); + fn stroke_path(&self, ctx: ContextRef<'ctx>, path: &Path<'ctx>, cmt: fz_matrix); + fn clip_path( + &self, + ctx: ContextRef<'ctx>, + path: &Path<'ctx>, + even_odd: bool, + cmt: fz_matrix, + scissor: fz_rect, + ); + fn clip_stroke_path( + &self, + ctx: ContextRef<'ctx>, + path: &Path<'ctx>, + cmt: fz_matrix, + scissor: fz_rect, + ); + fn fill_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, cmt: fz_matrix); + fn stroke_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, cmt: fz_matrix); + fn clip_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, cmt: fz_matrix, scissor: fz_rect); + fn clip_stroke_text( + &self, + ctx: ContextRef<'ctx>, + text: &Text<'ctx>, + cmt: fz_matrix, + scissor: fz_rect, + ); + fn ignore_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, cmt: fz_matrix); +} + +impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { + pub(crate) fn new(ctx: impl Into>, value: Box) -> Result { + let ctx = ctx.into(); + unsafe { + let dev_ptr = mupdf_try(|errptr| { + mupdf_new_derived_device::>( + ctx.0.as_ptr(), + c"parse_powerisa_pdf::mupdf_ffi::Device", + errptr, + ) + })?; + let retval = Device { + dev: dev_ptr.cast(), + ctx, + _phantom: PhantomData, + }; + (&raw mut (*dev_ptr).value).write(value); + let fz_device { + refs, + hints, + flags, + close_device, + drop_device, + fill_path, + stroke_path, + clip_path, + clip_stroke_path, + fill_text, + stroke_text, + clip_text, + clip_stroke_text, + ignore_text, + fill_shade, + fill_image, + fill_image_mask, + clip_image_mask, + pop_clip, + begin_mask, + end_mask, + begin_group, + end_group, + begin_tile, + end_tile, + render_flags, + set_default_colorspaces, + begin_layer, + end_layer, + begin_structure, + end_structure, + begin_metatext, + end_metatext, + d1_rect, + container_len, + container_cap, + container, + } = &mut (*dev_ptr).base; + *drop_device = Some(Self::drop_device_fn); + *fill_path = Some(Self::fill_path_fn); + *stroke_path = Some(Self::stroke_path_fn); + *clip_path = Some(Self::clip_path_fn); + *clip_stroke_path = Some(Self::clip_stroke_path_fn); + *fill_text = Some(Self::fill_text_fn); + *stroke_text = Some(Self::stroke_text_fn); + *clip_text = Some(Self::clip_text_fn); + *clip_stroke_text = Some(Self::clip_stroke_text_fn); + *ignore_text = Some(Self::ignore_text_fn); + Ok(retval) + } + } + pub(crate) fn get(&self) -> &T { + unsafe { &(*self.ptr.cast::>()).value } + } + unsafe extern "C" fn drop_device_fn(_ctx: *mut fz_context, dev: *mut fz_device) { + unsafe { + (&raw mut (*dev.cast::>()).value).drop_in_place(); + } + } + unsafe extern "C" fn fill_path_fn( + ctx: *mut fz_context, + dev: *mut fz_device, + path: *const fz_path, + even_odd: c_int, + cmt: fz_matrix, + color_space: *mut fz_colorspace, + color: *const f32, + alpha: f32, + color_params: fz_color_params, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut (*dev.cast::>()).value }; + this.fill_path( + ctx, + &ManuallyDrop::new(Path { + ptr: path.cast_mut(), + ctx, + }), + even_odd != 0, + cmt, + ); + } + unsafe extern "C" fn stroke_path_fn( + ctx: *mut fz_context, + dev: *mut fz_device, + path: *const fz_path, + stroke_state: *const fz_stroke_state, + cmt: fz_matrix, + color_space: *mut fz_colorspace, + color: *const f32, + alpha: f32, + color_params: fz_color_params, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut (*dev.cast::>()).value }; + this.stroke_path( + ctx, + &ManuallyDrop::new(Path { + ptr: path.cast_mut(), + ctx, + }), + cmt, + ); + } + unsafe extern "C" fn clip_path_fn( + ctx: *mut fz_context, + dev: *mut fz_device, + path: *const fz_path, + even_odd: ::std::os::raw::c_int, + cmt: fz_matrix, + scissor: fz_rect, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut (*dev.cast::>()).value }; + this.clip_path( + ctx, + &ManuallyDrop::new(Path { + ptr: path.cast_mut(), + ctx, + }), + even_odd != 0, + cmt, + scissor, + ); + } + unsafe extern "C" fn clip_stroke_path_fn( + ctx: *mut fz_context, + dev: *mut fz_device, + path: *const fz_path, + stroke_state: *const fz_stroke_state, + cmt: fz_matrix, + scissor: fz_rect, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut (*dev.cast::>()).value }; + this.clip_stroke_path( + ctx, + &ManuallyDrop::new(Path { + ptr: path.cast_mut(), + ctx, + }), + cmt, + scissor, + ); + } + unsafe extern "C" fn fill_text_fn( + ctx: *mut fz_context, + dev: *mut fz_device, + text: *const fz_text, + cmt: fz_matrix, + color_space: *mut fz_colorspace, + color: *const f32, + alpha: f32, + color_params: fz_color_params, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut (*dev.cast::>()).value }; + this.fill_text( + ctx, + &ManuallyDrop::new(Text { + ptr: text.cast_mut(), + ctx, + }), + cmt, + ); + } + unsafe extern "C" fn stroke_text_fn( + ctx: *mut fz_context, + dev: *mut fz_device, + text: *const fz_text, + stroke_state: *const fz_stroke_state, + cmt: fz_matrix, + color_space: *mut fz_colorspace, + color: *const f32, + alpha: f32, + color_params: fz_color_params, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut (*dev.cast::>()).value }; + this.stroke_text( + ctx, + &ManuallyDrop::new(Text { + ptr: text.cast_mut(), + ctx, + }), + cmt, + ); + } + unsafe extern "C" fn clip_text_fn( + ctx: *mut fz_context, + dev: *mut fz_device, + text: *const fz_text, + cmt: fz_matrix, + scissor: fz_rect, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut (*dev.cast::>()).value }; + this.clip_text( + ctx, + &ManuallyDrop::new(Text { + ptr: text.cast_mut(), + ctx, + }), + cmt, + scissor, + ); + } + unsafe extern "C" fn clip_stroke_text_fn( + ctx: *mut fz_context, + dev: *mut fz_device, + text: *const fz_text, + stroke_state: *const fz_stroke_state, + cmt: fz_matrix, + scissor: fz_rect, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut (*dev.cast::>()).value }; + this.clip_stroke_text( + ctx, + &ManuallyDrop::new(Text { + ptr: text.cast_mut(), + ctx, + }), + cmt, + scissor, + ); + } + unsafe extern "C" fn ignore_text_fn( + ctx: *mut fz_context, + dev: *mut fz_device, + text: *const fz_text, + cmt: fz_matrix, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut (*dev.cast::>()).value }; + this.ignore_text( + ctx, + &ManuallyDrop::new(Text { + ptr: text.cast_mut(), + ctx, + }), + cmt, + ); + } +} + +impl<'ctx, T> Drop for Device<'ctx, T> { + fn drop(&mut self) { + unsafe { + // FIXME: fz_close_device may throw exceptions + // fz_close_device(self.ctx.0.as_ptr(), self.dev); + fz_drop_device(self.ctx.0.as_ptr(), self.dev); + } + } +} + +#[repr(C)] +struct DeviceStruct { + base: fz_device, + value: Box, +} + +pub(crate) struct Path<'ctx> { + ptr: *mut fz_path, + ctx: ContextRef<'ctx>, +} + +impl<'ctx> Drop for Path<'ctx> { + fn drop(&mut self) { + unsafe { + fz_drop_path(self.ctx.0.as_ptr(), self.ptr); + } + } +} + +pub(crate) struct Text<'ctx> { + ptr: *mut fz_text, + ctx: ContextRef<'ctx>, +} + +impl<'ctx> Drop for Text<'ctx> { + fn drop(&mut self) { + unsafe { + fz_drop_text(self.ctx.0.as_ptr(), self.ptr); + } + } +} From 103f986bc06ed6c2ba12d475aff3d9be8a10ebef Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Mon, 5 Jan 2026 17:53:57 -0800 Subject: [PATCH 09/11] wip --- src/main.rs | 140 +++++++++++++++++++++++------------------ src/mupdf_ffi.rs | 160 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 236 insertions(+), 64 deletions(-) diff --git a/src/main.rs b/src/main.rs index d2668ee..b29bf07 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,12 +2,12 @@ // See Notices.txt for copyright information use crate::quad_tree::QuadTree; -use indexmap::{IndexMap, IndexSet}; +use indexmap::IndexSet; use mupdf_sys::{FZ_STEXT_BOLD, fz_matrix}; use non_nan_float::NonNaNF32; use std::{ backtrace::Backtrace, - borrow::{Borrow, Cow}, + borrow::Cow, cell::RefCell, collections::{BTreeMap, BTreeSet, HashMap, HashSet}, convert::Infallible, @@ -681,7 +681,7 @@ impl ParsedTextLine { } } fn write_xml_lines( - lines: impl IntoIterator>, + lines: impl IntoIterator>, parent: &mut xml_tree::Element, trailing_nl: bool, preceding_nl: bool, @@ -691,7 +691,7 @@ impl ParsedTextLine { } let mut first = true; for line in lines { - let line = line.borrow(); + let line = std::borrow::Borrow::borrow(&line); if first { first = false; } else { @@ -1182,8 +1182,8 @@ impl<'a> ElementBodyBuilder<'a> { } self.shrink_stack(new_len); } - fn write_text(&mut self, text: impl Borrow) { - let text = text.borrow(); + fn write_text(&mut self, text: impl std::borrow::Borrow) { + let text = std::borrow::Borrow::borrow(&text); let insert_point = self.insert_point(); if let Some(child) = insert_point.children.last_mut() { child.tail += text; @@ -1328,14 +1328,14 @@ impl fmt::Display for InsnBitFields { impl InsnBitFields { fn write_xml_fields( - fields: impl IntoIterator>, + fields: impl IntoIterator>, parent: &mut xml_tree::Element, ) { let fields_elm = parent.sub_element("fields".into(), []); fields_elm.text = "\n".into(); fields_elm.tail = "\n".into(); for field in fields { - field.borrow().write_xml(fields_elm); + std::borrow::Borrow::borrow(&field).write_xml(fields_elm); } } fn write_xml(&self, parent: &mut xml_tree::Element) { @@ -3170,7 +3170,9 @@ impl<'ctx> Parser<'ctx> { #[derive(Clone, Debug)] struct MyDevice { page_num: u32, - qt: BTreeMap>, + qt: Rc>>>, + unprocessed_chars: + Rc>>>>>>, unprocessed_non_text: Rc>>, } @@ -3179,10 +3181,11 @@ impl MyDevice { Self { page_num, qt: Default::default(), + unprocessed_chars: Default::default(), unprocessed_non_text: Default::default(), } } - fn path(&mut self, path: &mupdf_ffi::Path<'_>, cmt: fz_matrix) { + fn path(&self, path: &mupdf_ffi::Path<'_>, cmt: fz_matrix) { enum Walker { Empty, Moved { x: f32, y: f32 }, @@ -3198,8 +3201,8 @@ impl MyDevice { p1_y: NonNaNF32::new(p1_y)?, }) } - impl mupdf::PathWalker for Walker { - fn move_to(&mut self, x: f32, y: f32) { + impl<'ctx> mupdf_ffi::PathWalker<'ctx> for Walker { + fn move_to(&mut self, _ctx: mupdf_ffi::ContextRef<'ctx>, x: f32, y: f32) { *self = match *self { Walker::Empty | Walker::Moved { .. } => Walker::Moved { x, y }, Walker::Line(_) | Walker::Rect { .. } | Walker::NotRecognized => { @@ -3207,7 +3210,7 @@ impl MyDevice { } }; } - fn line_to(&mut self, x: f32, y: f32) { + fn line_to(&mut self, _ctx: mupdf_ffi::ContextRef<'ctx>, x: f32, y: f32) { *self = match *self { Walker::Empty => Walker::NotRecognized, Walker::Moved { x: p0_x, y: p0_y } => new_line(p0_x, p0_y, x, y) @@ -3218,11 +3221,27 @@ impl MyDevice { } }; } - fn curve_to(&mut self, _cx1: f32, _cy1: f32, _cx2: f32, _cy2: f32, _ex: f32, _ey: f32) { + fn curve_to( + &mut self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + _cx1: f32, + _cy1: f32, + _cx2: f32, + _cy2: f32, + _ex: f32, + _ey: f32, + ) { *self = Walker::NotRecognized; } - fn close(&mut self) {} - fn rect(&mut self, x1: f32, y1: f32, x2: f32, y2: f32) { + fn close_path(&mut self, _ctx: mupdf_ffi::ContextRef<'ctx>) {} + fn rect_to( + &mut self, + _ctx: mupdf_ffi::ContextRef<'ctx>, + x1: f32, + y1: f32, + x2: f32, + y2: f32, + ) { *self = match *self { Walker::Empty => Walker::Rect { x1, y1, x2, y2 }, Walker::Moved { .. } @@ -3233,9 +3252,7 @@ impl MyDevice { } } let mut walker = Walker::Empty; - let Ok(()) = path.walk(&mut walker) else { - return; - }; + path.walk(&mut walker); let component = match walker { Walker::Empty | Walker::Moved { .. } | Walker::NotRecognized => return, Walker::Line(Line { @@ -3244,26 +3261,20 @@ impl MyDevice { p1_x, p1_y, }) => { - let mupdf::Point { x: p0_x, y: p0_y } = mupdf::Point { - x: p0_x.get(), - y: p0_y.get(), - } - .transform(&cmt); - let mupdf::Point { x: p1_x, y: p1_y } = mupdf::Point { - x: p1_x.get(), - y: p1_y.get(), - } - .transform(&cmt); + let mupdf_sys::fz_point { x: p0_x, y: p0_y } = + mupdf_ffi::transform_point_xy(p0_x.get(), p0_y.get(), cmt); + let mupdf_sys::fz_point { x: p1_x, y: p1_y } = + mupdf_ffi::transform_point_xy(p1_x.get(), p1_y.get(), cmt); let Some(line) = new_line(p0_x, p0_y, p1_x, p1_y) else { return; }; LineOrRect::Line(line) } Walker::Rect { x1, y1, x2, y2 } => { - let p1 = mupdf::Point { x: x1, y: y1 }.transform(&cmt); - let p2 = mupdf::Point { x: x2, y: y1 }.transform(&cmt); - let p3 = mupdf::Point { x: x2, y: y2 }.transform(&cmt); - let p4 = mupdf::Point { x: x1, y: y2 }.transform(&cmt); + let p1 = mupdf_ffi::transform_point_xy(x1, y1, cmt); + let p2 = mupdf_ffi::transform_point_xy(x2, y1, cmt); + let p3 = mupdf_ffi::transform_point_xy(x2, y2, cmt); + let p4 = mupdf_ffi::transform_point_xy(x1, y2, cmt); let min_x = NonNaNF32::new(p1.x.min(p2.x).min(p3.x).min(p4.x)); let max_x = NonNaNF32::new(p1.x.max(p2.x).max(p3.x).max(p4.x)); let min_y = NonNaNF32::new(p1.y.min(p2.y).min(p3.y).min(p4.y)); @@ -3306,25 +3317,25 @@ impl MyDevice { ); } } - fn text(&mut self, text: &mupdf_ffi::Text<'_>, cmt: fz_matrix) { + fn text(&self, text: &mupdf_ffi::Text<'_>, cmt: fz_matrix) { todo!() } } impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice { fn fill_path( - &mut self, - ctx: mupdf_ffi::ContextRef<'ctx>, + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, path: &mupdf_ffi::Path<'ctx>, - even_odd: bool, + _even_odd: bool, cmt: fz_matrix, ) { self.path(path, cmt); } fn stroke_path( - &mut self, - ctx: mupdf_ffi::ContextRef<'ctx>, + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, path: &mupdf_ffi::Path<'ctx>, cmt: fz_matrix, ) { @@ -3332,29 +3343,29 @@ impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice { } fn clip_path( - &mut self, - ctx: mupdf_ffi::ContextRef<'ctx>, + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, path: &mupdf_ffi::Path<'ctx>, - even_odd: bool, + _even_odd: bool, cmt: fz_matrix, - scissor: mupdf_sys::fz_rect, + _scissor: mupdf_sys::fz_rect, ) { self.path(path, cmt); } fn clip_stroke_path( - &mut self, - ctx: mupdf_ffi::ContextRef<'ctx>, + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, path: &mupdf_ffi::Path<'ctx>, cmt: fz_matrix, - scissor: mupdf_sys::fz_rect, + _scissor: mupdf_sys::fz_rect, ) { self.path(path, cmt); } fn fill_text( - &mut self, - ctx: mupdf_ffi::ContextRef<'ctx>, + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, cmt: fz_matrix, ) { @@ -3362,8 +3373,8 @@ impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice { } fn stroke_text( - &mut self, - ctx: mupdf_ffi::ContextRef<'ctx>, + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, cmt: fz_matrix, ) { @@ -3371,28 +3382,28 @@ impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice { } fn clip_text( - &mut self, - ctx: mupdf_ffi::ContextRef<'ctx>, + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, cmt: fz_matrix, - scissor: mupdf_sys::fz_rect, + _scissor: mupdf_sys::fz_rect, ) { self.text(text, cmt); } fn clip_stroke_text( - &mut self, - ctx: mupdf_ffi::ContextRef<'ctx>, + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, cmt: fz_matrix, - scissor: mupdf_sys::fz_rect, + _scissor: mupdf_sys::fz_rect, ) { self.text(text, cmt); } fn ignore_text( - &mut self, - ctx: mupdf_ffi::ContextRef<'ctx>, + &self, + _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, cmt: fz_matrix, ) { @@ -3477,20 +3488,25 @@ impl Page { let device = MyDevice::new(page_num); page.run( &mupdf_ffi::Device::new(page.ctx(), Box::new(device))?, - &mupdf::Matrix::IDENTITY, + fz_matrix { + a: 1.0, + b: 0.0, + c: 0.0, + d: 1.0, + e: 0.0, + f: 0.0, + }, )?; let MyDevice { page_num, qt, + unprocessed_chars, unprocessed_non_text, } = device; let mut qt = Rc::try_unwrap(qt) .ok() .expect("already dropped all other references") .into_inner(); - let unprocessed_chars: Rc< - RefCell>>>>>, - > = Rc::default(); // we convert to xml and parse that becuase the mupdf rust crate doesn't include all the API surface we need. let json = page.stext_page_as_json_from_page(1.0)?; todo!("{json}"); diff --git a/src/mupdf_ffi.rs b/src/mupdf_ffi.rs index 1a0e669..3eb24fb 100644 --- a/src/mupdf_ffi.rs +++ b/src/mupdf_ffi.rs @@ -4,13 +4,14 @@ use mupdf_sys::{ fz_clone_context, fz_color_params, fz_colorspace, fz_context, fz_device, fz_document, fz_drop_context, fz_drop_device, fz_drop_document, fz_drop_page, fz_drop_path, fz_drop_text, - fz_error_type_FZ_ERROR_GENERIC, fz_matrix, fz_page, fz_path, fz_rect, fz_stroke_state, fz_text, + fz_error_type_FZ_ERROR_GENERIC, fz_matrix, fz_page, fz_path, fz_path_walker, fz_point, fz_rect, + fz_stroke_state, fz_text, fz_transform_point, fz_transform_point_xy, fz_walk_path, mupdf_document_page_count, mupdf_drop_error, mupdf_error_t, mupdf_load_page, mupdf_new_base_context, mupdf_new_derived_device, mupdf_open_document, mupdf_run_page, }; use std::{ cell::Cell, - ffi::{CStr, CString, c_int}, + ffi::{CStr, CString, c_int, c_void}, fmt, marker::PhantomData, mem::ManuallyDrop, @@ -542,11 +543,158 @@ struct DeviceStruct { value: Box, } +pub(crate) trait PathWalker<'ctx> { + fn move_to(&mut self, ctx: ContextRef<'ctx>, x: f32, y: f32); + fn line_to(&mut self, ctx: ContextRef<'ctx>, x: f32, y: f32); + fn curve_to( + &mut self, + ctx: ContextRef<'ctx>, + x1: f32, + y1: f32, + x2: f32, + y2: f32, + x3: f32, + y3: f32, + ); + fn close_path(&mut self, ctx: ContextRef<'ctx>); + fn rect_to(&mut self, ctx: ContextRef<'ctx>, x1: f32, y1: f32, x2: f32, y2: f32) { + self.move_to(ctx, x1, y1); + self.move_to(ctx, x2, y1); + self.move_to(ctx, x2, y2); + self.move_to(ctx, x1, y2); + self.close_path(ctx); + } +} + +impl<'ctx, T: ?Sized + PathWalker<'ctx>> PathWalker<'ctx> for &'_ mut T { + fn move_to(&mut self, ctx: ContextRef<'ctx>, x: f32, y: f32) { + T::move_to(self, ctx, x, y); + } + + fn line_to(&mut self, ctx: ContextRef<'ctx>, x: f32, y: f32) { + T::line_to(self, ctx, x, y); + } + + fn curve_to( + &mut self, + ctx: ContextRef<'ctx>, + x1: f32, + y1: f32, + x2: f32, + y2: f32, + x3: f32, + y3: f32, + ) { + T::curve_to(self, ctx, x1, y1, x2, y2, x3, y3); + } + + fn close_path(&mut self, ctx: ContextRef<'ctx>) { + T::close_path(self, ctx); + } + + fn rect_to(&mut self, ctx: ContextRef<'ctx>, x1: f32, y1: f32, x2: f32, y2: f32) { + T::rect_to(self, ctx, x1, y1, x2, y2); + } +} + pub(crate) struct Path<'ctx> { ptr: *mut fz_path, ctx: ContextRef<'ctx>, } +impl<'ctx> Path<'ctx> { + pub(crate) fn walk>(&self, mut walker: W) { + unsafe { + fz_walk_path( + self.ctx.0.as_ptr(), + self.ptr, + const { + &fz_path_walker { + moveto: Some(Self::move_to_fn::), + lineto: Some(Self::line_to_fn::), + curveto: Some(Self::curve_to_fn::), + closepath: Some(Self::close_path_fn::), + quadto: None, + curvetov: None, + curvetoy: None, + rectto: Some(Self::rect_to_fn::), + } + }, + (&raw mut walker).cast(), + ); + } + } + unsafe extern "C" fn move_to_fn>( + ctx: *mut fz_context, + arg: *mut c_void, + x: f32, + y: f32, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut *arg.cast::() }; + this.move_to(ctx, x, y); + } + unsafe extern "C" fn line_to_fn>( + ctx: *mut fz_context, + arg: *mut c_void, + x: f32, + y: f32, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut *arg.cast::() }; + this.line_to(ctx, x, y); + } + unsafe extern "C" fn curve_to_fn>( + ctx: *mut fz_context, + arg: *mut c_void, + x1: f32, + y1: f32, + x2: f32, + y2: f32, + x3: f32, + y3: f32, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut *arg.cast::() }; + this.curve_to(ctx, x1, y1, x2, y2, x3, y3); + } + unsafe extern "C" fn close_path_fn>( + ctx: *mut fz_context, + arg: *mut c_void, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut *arg.cast::() }; + this.close_path(ctx); + } + unsafe extern "C" fn rect_to_fn>( + ctx: *mut fz_context, + arg: *mut c_void, + x1: f32, + y1: f32, + x2: f32, + y2: f32, + ) { + let Some(ctx) = NonNull::new(ctx) else { + return; + }; + let ctx = ContextRef(ctx, PhantomData); + let this = unsafe { &mut *arg.cast::() }; + this.rect_to(ctx, x1, y1, x2, y2); + } +} + impl<'ctx> Drop for Path<'ctx> { fn drop(&mut self) { unsafe { @@ -567,3 +715,11 @@ impl<'ctx> Drop for Text<'ctx> { } } } + +pub(crate) fn transform_point(point: fz_point, m: fz_matrix) -> fz_point { + unsafe { fz_transform_point(point, m) } +} + +pub(crate) fn transform_point_xy(x: f32, y: f32, m: fz_matrix) -> fz_point { + unsafe { fz_transform_point_xy(x, y, m) } +} From a677cd8a33403fac0317d46f29de120ceefe3965 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Mon, 5 Jan 2026 18:33:44 -0800 Subject: [PATCH 10/11] wip --- src/main.rs | 88 +++++++++++++++++-------- src/mupdf_ffi.rs | 166 ++++++++++++++++++++++++++++++----------------- 2 files changed, 169 insertions(+), 85 deletions(-) diff --git a/src/main.rs b/src/main.rs index b29bf07..b56937c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3318,7 +3318,43 @@ impl MyDevice { } } fn text(&self, text: &mupdf_ffi::Text<'_>, cmt: fz_matrix) { - todo!() + for span in text.spans() { + let mupdf_sys::fz_text_span { trm, .. } = span.get(); + let mupdf_sys::fz_font { + refs, + name, + buffer, + flags, + ft_face, + shaper_data, + t3matrix, + t3resources, + t3procs, + t3lists, + t3widths, + t3flags, + t3doc, + t3run, + t3freeres, + bbox, + ascender, + descender, + glyph_count, + bbox_table, + use_glyph_bbox, + width_count, + width_default, + width_table, + advance_cache, + encoding_cache, + has_digest, + digest, + subfont, + } = *span.font().get(); + for item in span.items() { + todo!() + } + } } } @@ -3487,7 +3523,7 @@ impl Page { ) -> Result> { let device = MyDevice::new(page_num); page.run( - &mupdf_ffi::Device::new(page.ctx(), Box::new(device))?, + &mupdf_ffi::Device::new(page.ctx(), Box::new(device.clone()))?, fz_matrix { a: 1.0, b: 0.0, @@ -3508,9 +3544,7 @@ impl Page { .expect("already dropped all other references") .into_inner(); // we convert to xml and parse that becuase the mupdf rust crate doesn't include all the API surface we need. - let json = page.stext_page_as_json_from_page(1.0)?; - todo!("{json}"); - let xml = page.to_xml()?; + let xml: String = todo!("page.to_xml()?"); let MuPdfXml::Page(xml_page) = quick_xml::de::from_str(&xml)?; for xml_block in xml_page.block { for xml_line in xml_block.line { @@ -3649,27 +3683,29 @@ fn main_inner() -> Result<(), Box> { } else { None }; - let mut parser = Parser::new(); - let is_subset = page_numbers.is_some(); - let file_name = &args[1]; - parser.parse_pdf(file_name, page_numbers)?; - let mut insns = xml_tree::Element::new( - "instructions".into(), - [("is-subset".into(), is_subset.to_string())], - ); - insns.text = "\n".into(); - insns.tail = "\n".into(); - let mut comment = - xml_tree::Element::comment(format!(" Automatically generated from {file_name} ")); - comment.tail = "\n".into(); - insns.children.push(comment); - for insn in parser.insns { - insn.write_xml(&mut insns); - } - let mut output = Vec::new(); - insns.write(&mut output, true)?; - std::fs::write("powerisa-instructions.xml", output)?; - Ok(()) + mupdf_ffi::Context::with(|ctx| { + let mut parser = Parser::new(); + let is_subset = page_numbers.is_some(); + let file_name = &args[1]; + parser.parse_pdf(ctx, file_name, page_numbers)?; + let mut insns = xml_tree::Element::new( + "instructions".into(), + [("is-subset".into(), is_subset.to_string())], + ); + insns.text = "\n".into(); + insns.tail = "\n".into(); + let mut comment = + xml_tree::Element::comment(format!(" Automatically generated from {file_name} ")); + comment.tail = "\n".into(); + insns.children.push(comment); + for insn in parser.insns { + insn.write_xml(&mut insns); + } + let mut output = Vec::new(); + insns.write(&mut output, true)?; + std::fs::write("powerisa-instructions.xml", output)?; + Ok(()) + }) } fn main() -> std::process::ExitCode { diff --git a/src/mupdf_ffi.rs b/src/mupdf_ffi.rs index 3eb24fb..2e8f325 100644 --- a/src/mupdf_ffi.rs +++ b/src/mupdf_ffi.rs @@ -4,14 +4,15 @@ use mupdf_sys::{ fz_clone_context, fz_color_params, fz_colorspace, fz_context, fz_device, fz_document, fz_drop_context, fz_drop_device, fz_drop_document, fz_drop_page, fz_drop_path, fz_drop_text, - fz_error_type_FZ_ERROR_GENERIC, fz_matrix, fz_page, fz_path, fz_path_walker, fz_point, fz_rect, - fz_stroke_state, fz_text, fz_transform_point, fz_transform_point_xy, fz_walk_path, - mupdf_document_page_count, mupdf_drop_error, mupdf_error_t, mupdf_load_page, - mupdf_new_base_context, mupdf_new_derived_device, mupdf_open_document, mupdf_run_page, + fz_error_type_FZ_ERROR_GENERIC, fz_font, fz_matrix, fz_page, fz_path, fz_path_walker, fz_point, + fz_rect, fz_stroke_state, fz_text, fz_text_item, fz_text_span, fz_transform_point, + fz_transform_point_xy, fz_walk_path, mupdf_document_page_count, mupdf_drop_error, + mupdf_error_t, mupdf_load_page, mupdf_new_base_context, mupdf_new_derived_device, + mupdf_open_document, mupdf_run_page, }; use std::{ cell::Cell, - ffi::{CStr, CString, c_int, c_void}, + ffi::{CStr, c_int, c_void}, fmt, marker::PhantomData, mem::ManuallyDrop, @@ -20,16 +21,16 @@ use std::{ }; #[derive(Debug)] -struct MuPdfError { +pub(crate) struct MuPdfError { type_: c_int, - message: CString, + message: String, } impl MuPdfError { fn new_generic(message: impl ToString) -> Self { Self { type_: fz_error_type_FZ_ERROR_GENERIC as _, - message: message.try_into().expect("nul byte in message"), + message: message.to_string(), } } } @@ -57,15 +58,17 @@ impl Drop for OwnedMuPdfError { } unsafe fn mupdf_try(f: impl FnOnce(&mut *mut mupdf_error_t) -> R) -> Result { - let mut err = <*mut mupdf_error_t>::null_mut(); + let mut err = ptr::null_mut(); let retval = f(&mut err); let Some(err) = NonNull::new(err).map(OwnedMuPdfError) else { return Ok(retval); }; unsafe { Err(MuPdfError { - type_: (*err.0).type_, - message: CString::from(CStr::from_ptr((*err.0).message)), + type_: (*err.0.as_ptr()).type_, + message: CStr::from_ptr((*err.0.as_ptr()).message) + .to_string_lossy() + .into_owned(), }) } } @@ -254,10 +257,6 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { }; (&raw mut (*dev_ptr).value).write(value); let fz_device { - refs, - hints, - flags, - close_device, drop_device, fill_path, stroke_path, @@ -268,29 +267,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { clip_text, clip_stroke_text, ignore_text, - fill_shade, - fill_image, - fill_image_mask, - clip_image_mask, - pop_clip, - begin_mask, - end_mask, - begin_group, - end_group, - begin_tile, - end_tile, - render_flags, - set_default_colorspaces, - begin_layer, - end_layer, - begin_structure, - end_structure, - begin_metatext, - end_metatext, - d1_rect, - container_len, - container_cap, - container, + .. } = &mut (*dev_ptr).base; *drop_device = Some(Self::drop_device_fn); *fill_path = Some(Self::fill_path_fn); @@ -306,7 +283,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { } } pub(crate) fn get(&self) -> &T { - unsafe { &(*self.ptr.cast::>()).value } + unsafe { &(*self.dev.cast::>()).value } } unsafe extern "C" fn drop_device_fn(_ctx: *mut fz_context, dev: *mut fz_device) { unsafe { @@ -319,10 +296,10 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { path: *const fz_path, even_odd: c_int, cmt: fz_matrix, - color_space: *mut fz_colorspace, - color: *const f32, - alpha: f32, - color_params: fz_color_params, + _color_space: *mut fz_colorspace, + _color: *const f32, + _alpha: f32, + _color_params: fz_color_params, ) { let Some(ctx) = NonNull::new(ctx) else { return; @@ -343,12 +320,12 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ctx: *mut fz_context, dev: *mut fz_device, path: *const fz_path, - stroke_state: *const fz_stroke_state, + _stroke_state: *const fz_stroke_state, cmt: fz_matrix, - color_space: *mut fz_colorspace, - color: *const f32, - alpha: f32, - color_params: fz_color_params, + _color_space: *mut fz_colorspace, + _color: *const f32, + _alpha: f32, + _color_params: fz_color_params, ) { let Some(ctx) = NonNull::new(ctx) else { return; @@ -392,7 +369,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ctx: *mut fz_context, dev: *mut fz_device, path: *const fz_path, - stroke_state: *const fz_stroke_state, + _stroke_state: *const fz_stroke_state, cmt: fz_matrix, scissor: fz_rect, ) { @@ -416,10 +393,10 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { dev: *mut fz_device, text: *const fz_text, cmt: fz_matrix, - color_space: *mut fz_colorspace, - color: *const f32, - alpha: f32, - color_params: fz_color_params, + _color_space: *mut fz_colorspace, + _color: *const f32, + _alpha: f32, + _color_params: fz_color_params, ) { let Some(ctx) = NonNull::new(ctx) else { return; @@ -439,12 +416,12 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ctx: *mut fz_context, dev: *mut fz_device, text: *const fz_text, - stroke_state: *const fz_stroke_state, + _stroke_state: *const fz_stroke_state, cmt: fz_matrix, - color_space: *mut fz_colorspace, - color: *const f32, - alpha: f32, - color_params: fz_color_params, + _color_space: *mut fz_colorspace, + _color: *const f32, + _alpha: f32, + _color_params: fz_color_params, ) { let Some(ctx) = NonNull::new(ctx) else { return; @@ -486,7 +463,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ctx: *mut fz_context, dev: *mut fz_device, text: *const fz_text, - stroke_state: *const fz_stroke_state, + _stroke_state: *const fz_stroke_state, cmt: fz_matrix, scissor: fz_rect, ) { @@ -716,6 +693,77 @@ impl<'ctx> Drop for Text<'ctx> { } } +impl<'ctx> Text<'ctx> { + pub(crate) fn spans<'a>(&'a self) -> TextSpanIter<'a, 'ctx> { + TextSpanIter { + ptr: unsafe { NonNull::new((*self.ptr).head) }, + ctx: self.ctx, + _phantom: PhantomData, + } + } +} + +#[derive(Clone)] +pub(crate) struct TextSpanIter<'a, 'ctx> { + ptr: Option>, + ctx: ContextRef<'ctx>, + _phantom: PhantomData<&'a Text<'ctx>>, +} + +impl<'a, 'ctx> Iterator for TextSpanIter<'a, 'ctx> { + type Item = TextSpanRef<'a, 'ctx>; + + fn next(&mut self) -> Option { + let ptr = self.ptr?; + self.ptr = NonNull::new(unsafe { ptr.as_ref().next }); + Some(TextSpanRef { + ptr: unsafe { &*ptr.as_ptr() }, + ctx: self.ctx, + _phantom: PhantomData, + }) + } +} + +#[derive(Copy, Clone)] +pub(crate) struct TextSpanRef<'a, 'ctx> { + ptr: &'a fz_text_span, + ctx: ContextRef<'ctx>, + _phantom: PhantomData<&'a Text<'ctx>>, +} + +impl<'a, 'ctx> TextSpanRef<'a, 'ctx> { + pub(crate) fn get(self) -> &'a fz_text_span { + self.ptr + } + pub(crate) fn font(self) -> FontRef<'a, 'ctx> { + FontRef { + ptr: unsafe { &*self.ptr.font }, + ctx: self.ctx, + _phantom: PhantomData, + } + } + pub(crate) fn items(self) -> &'a [fz_text_item] { + let len = self.ptr.len as usize; + if len == 0 { + return &[]; + } + unsafe { std::slice::from_raw_parts(self.ptr.items, len) } + } +} + +#[derive(Clone, Copy)] +pub(crate) struct FontRef<'a, 'ctx> { + ptr: &'a fz_font, + ctx: ContextRef<'ctx>, + _phantom: PhantomData<&'a Text<'ctx>>, +} + +impl<'a, 'ctx> FontRef<'a, 'ctx> { + pub(crate) fn get(self) -> &'a fz_font { + self.ptr + } +} + pub(crate) fn transform_point(point: fz_point, m: fz_matrix) -> fz_point { unsafe { fz_transform_point(point, m) } } From 040afcc435e69fa234caaef2719c1bc188c2999c Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 6 Jan 2026 08:00:38 -0800 Subject: [PATCH 11/11] extracts pdf items using mupdf-sys directly --- src/main.rs | 384 ++++++++++++++++++++++++++++------------------- src/mupdf_ffi.rs | 263 ++++++++++++++++++-------------- 2 files changed, 373 insertions(+), 274 deletions(-) diff --git a/src/main.rs b/src/main.rs index b56937c..a6a36e6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,14 @@ // SPDX-License-Identifier: LGPL-3.0-or-later // See Notices.txt for copyright information -use crate::quad_tree::QuadTree; +use crate::{ + mupdf_ffi::{ + WriteMode, add_points, point_max_components, point_min_components, transform_vector, + }, + quad_tree::QuadTree, +}; use indexmap::IndexSet; -use mupdf_sys::{FZ_STEXT_BOLD, fz_matrix}; +use mupdf_sys::{fz_matrix, fz_point, fz_text_item}; use non_nan_float::NonNaNF32; use std::{ backtrace::Backtrace, @@ -135,6 +140,12 @@ macro_rules! make_enum_font { panic!("invalid font name with id") } } + const fn known_from_name_with_tag(font_name_with_tag: &str, size: NonNaNF32) -> Option { + match size.get() { + $($($known_font_size if str_eq(font_name_with_tag, $known_font_name_with_tag) => Some(Self::$KnownFont),)*)* + _ => None, + } + } const fn new_known(font_name: &str, size: NonNaNF32) -> Option { match size.get() { $($($known_font_size if str_eq(font_name, const { @@ -3167,25 +3178,32 @@ impl<'ctx> Parser<'ctx> { } } -#[derive(Clone, Debug)] -struct MyDevice { +#[derive(Debug)] +struct MyDevice<'a> { page_num: u32, - qt: Rc>>>, + qt: RefCell>>, unprocessed_chars: Rc>>>>>>, unprocessed_non_text: Rc>>, + first_seen_fonts: RefCell<&'a mut BTreeMap>>, + error: RefCell>>, } -impl MyDevice { - fn new(page_num: u32) -> Self { +impl<'a> MyDevice<'a> { + fn new(page_num: u32, first_seen_fonts: &'a mut BTreeMap>) -> Self { Self { page_num, qt: Default::default(), unprocessed_chars: Default::default(), unprocessed_non_text: Default::default(), + first_seen_fonts: RefCell::new(first_seen_fonts), + error: RefCell::new(Ok(())), } } - fn path(&self, path: &mupdf_ffi::Path<'_>, cmt: fz_matrix) { + fn path(&self, path: &mupdf_ffi::Path<'_>, ctm: fz_matrix) { + if self.error.borrow().is_err() { + return; + } enum Walker { Empty, Moved { x: f32, y: f32 }, @@ -3262,19 +3280,19 @@ impl MyDevice { p1_y, }) => { let mupdf_sys::fz_point { x: p0_x, y: p0_y } = - mupdf_ffi::transform_point_xy(p0_x.get(), p0_y.get(), cmt); + mupdf_ffi::transform_point_xy(p0_x.get(), p0_y.get(), ctm); let mupdf_sys::fz_point { x: p1_x, y: p1_y } = - mupdf_ffi::transform_point_xy(p1_x.get(), p1_y.get(), cmt); + mupdf_ffi::transform_point_xy(p1_x.get(), p1_y.get(), ctm); let Some(line) = new_line(p0_x, p0_y, p1_x, p1_y) else { return; }; LineOrRect::Line(line) } Walker::Rect { x1, y1, x2, y2 } => { - let p1 = mupdf_ffi::transform_point_xy(x1, y1, cmt); - let p2 = mupdf_ffi::transform_point_xy(x2, y1, cmt); - let p3 = mupdf_ffi::transform_point_xy(x2, y2, cmt); - let p4 = mupdf_ffi::transform_point_xy(x1, y2, cmt); + let p1 = mupdf_ffi::transform_point_xy(x1, y1, ctm); + let p2 = mupdf_ffi::transform_point_xy(x2, y1, ctm); + let p3 = mupdf_ffi::transform_point_xy(x2, y2, ctm); + let p4 = mupdf_ffi::transform_point_xy(x1, y2, ctm); let min_x = NonNaNF32::new(p1.x.min(p2.x).min(p3.x).min(p4.x)); let max_x = NonNaNF32::new(p1.x.max(p2.x).max(p3.x).max(p4.x)); let min_y = NonNaNF32::new(p1.y.min(p2.y).min(p3.y).min(p4.y)); @@ -3317,65 +3335,192 @@ impl MyDevice { ); } } - fn text(&self, text: &mupdf_ffi::Text<'_>, cmt: fz_matrix) { + fn text(&self, text: &mupdf_ffi::Text<'_>, ctm: fz_matrix) { + if self.error.borrow().is_err() { + return; + } + let mut first_seen_fonts = self.first_seen_fonts.borrow_mut(); for span in text.spans() { - let mupdf_sys::fz_text_span { trm, .. } = span.get(); - let mupdf_sys::fz_font { - refs, - name, - buffer, - flags, - ft_face, - shaper_data, - t3matrix, - t3resources, - t3procs, - t3lists, - t3widths, - t3flags, - t3doc, - t3run, - t3freeres, - bbox, - ascender, - descender, - glyph_count, - bbox_table, - use_glyph_bbox, - width_count, - width_default, - width_table, - advance_cache, - encoding_cache, - has_digest, - digest, - subfont, - } = *span.font().get(); - for item in span.items() { - todo!() + let tm = span.trm(); + const ROUND_FACTOR: f32 = 1000.0; + let font_size = (mupdf_ffi::matrix_expansion(tm) * ROUND_FACTOR).round() / ROUND_FACTOR; + let Some(font_size) = NonNaNF32::new(font_size) else { + continue; + }; + let font_name_with_tag = span.font().name(); + let font_name_with_tag = match font_name_with_tag { + "CGMSHV+DejaVuSansCondensed-Obli" => "CGMSHV+DejaVuSansCondensed-Oblique", + "YDJYQV+DejaVuSansCondensed-Bold" => "YDJYQV+DejaVuSansCondensed-BoldOblique", + "NHUPPK+DejaVuSansCondensed-Bold" => "NHUPPK+DejaVuSansCondensed-Bold", + _ if font_name_with_tag.len() == 31 => { + let _ = self.error.replace(Err(format!( + "probably truncated font name: {font_name_with_tag:?}" + ) + .into())); + return; + } + _ => font_name_with_tag, + }; + for &fz_text_item { + x, + y, + adv, + gid, + ucs, + cid: _, + } in span.items() + { + let adv = if gid >= 0 { adv } else { 0.0 }; + let tm = fz_matrix { e: x, f: y, ..tm }; + let trm = mupdf_ffi::concat(tm, ctm); + let dir = match span.write_mode() { + WriteMode::Horizontal => fz_point { x: 1.0, y: 0.0 }, + WriteMode::Vertical => fz_point { x: 0.0, y: -1.0 }, + }; + let dir = mupdf_ffi::transform_vector(dir, trm); + let glyph_start; + let glyph_stop; + let glyph_ascender; + let glyph_descender; + match span.write_mode() { + WriteMode::Horizontal => { + glyph_start = fz_point { x: trm.e, y: trm.f }; + glyph_stop = fz_point { + x: trm.e + adv * dir.x, + y: trm.f + adv * dir.y, + }; + glyph_ascender = fz_point { + x: 0.0, + y: span.font().ascender(), + }; + glyph_descender = fz_point { + x: 0.0, + y: span.font().descender(), + }; + } + WriteMode::Vertical => { + glyph_start = fz_point { + x: trm.e - adv * dir.x, + y: trm.f - adv * dir.y, + }; + glyph_stop = fz_point { x: trm.e, y: trm.f }; + glyph_ascender = fz_point { x: 1.0, y: 0.0 }; + glyph_descender = fz_point { x: 0.0, y: 0.0 }; + } + }; + let glyph_ascender = transform_vector(glyph_ascender, trm); + let glyph_descender = transform_vector(glyph_descender, trm); + let points = [ + add_points(glyph_start, glyph_descender), + add_points(glyph_start, glyph_ascender), + add_points(glyph_stop, glyph_descender), + add_points(glyph_stop, glyph_ascender), + ]; + let min = point_min_components( + point_min_components(point_min_components(points[0], points[1]), points[2]), + points[3], + ); + let max = point_max_components( + point_max_components(point_max_components(points[0], points[1]), points[2]), + points[3], + ); + let Some(ch) = u32::try_from(ucs).ok().and_then(|v| char::try_from(v).ok()) else { + continue; + }; + let text = String::from(ch); + if text.trim().is_empty() { + continue; + } + let font = Font::known_from_name_with_tag(font_name_with_tag, font_size) + .unwrap_or_else(|| Font::Other { + font_name: font_name_with_tag.into(), + size: font_size, + }); + let Some(text_section) = TextSection::for_position( + self.page_num, + (min.x + max.x) * 0.5, + (min.y + max.y) * 0.5, + ) else { + if PAGE_BODY_MIN_Y <= min.y && min.y <= PAGE_BODY_MAX_Y { + if self.page_num != 1072 { + // page 1072 has characters in the margins + let _ = self.error.replace(Err(format!( + "char not in text section: {text:?}\npage_num={}", + self.page_num, + ) + .into())); + return; + } + } + continue; + }; + let (Some(min_x), Some(min_y), Some(max_x), Some(max_y)) = ( + NonNaNF32::new(min.x), + NonNaNF32::new(min.y), + NonNaNF32::new(max.x), + NonNaNF32::new(max.y), + ) else { + let _ = self + .error + .replace(Err("char position shouldn't be NaN".into())); + return; + }; + let char = Char { + font, + text, + min_x, + min_y, + max_x, + max_y, + }; + let set = match first_seen_fonts.get_mut(font_name_with_tag) { + Some(v) => v, + None => first_seen_fonts + .entry(String::from(font_name_with_tag)) + .or_default(), + }; + if set.insert(font_size) { + println!( + "first seen font: {font_name_with_tag:?} {font_size}: page {} {char:?}", + self.page_num, + ); + } + self.qt + .borrow_mut() + .entry(text_section) + .or_default() + .insert(min_x.get(), min_y.get(), PageItem::Char(char.clone())); + self.unprocessed_chars + .borrow_mut() + .entry(text_section) + .or_default() + .borrow_mut() + .entry(char.font.clone()) + .or_default() + .insert(char); } } } } -impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice { +impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice<'_> { fn fill_path( &self, _ctx: mupdf_ffi::ContextRef<'ctx>, path: &mupdf_ffi::Path<'ctx>, _even_odd: bool, - cmt: fz_matrix, + ctm: fz_matrix, ) { - self.path(path, cmt); + self.path(path, ctm); } fn stroke_path( &self, _ctx: mupdf_ffi::ContextRef<'ctx>, path: &mupdf_ffi::Path<'ctx>, - cmt: fz_matrix, + ctm: fz_matrix, ) { - self.path(path, cmt); + self.path(path, ctm); } fn clip_path( @@ -3383,67 +3528,67 @@ impl<'ctx> mupdf_ffi::DeviceCallbacks<'ctx> for MyDevice { _ctx: mupdf_ffi::ContextRef<'ctx>, path: &mupdf_ffi::Path<'ctx>, _even_odd: bool, - cmt: fz_matrix, + ctm: fz_matrix, _scissor: mupdf_sys::fz_rect, ) { - self.path(path, cmt); + self.path(path, ctm); } fn clip_stroke_path( &self, _ctx: mupdf_ffi::ContextRef<'ctx>, path: &mupdf_ffi::Path<'ctx>, - cmt: fz_matrix, + ctm: fz_matrix, _scissor: mupdf_sys::fz_rect, ) { - self.path(path, cmt); + self.path(path, ctm); } fn fill_text( &self, _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, - cmt: fz_matrix, + ctm: fz_matrix, ) { - self.text(text, cmt); + self.text(text, ctm); } fn stroke_text( &self, _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, - cmt: fz_matrix, + ctm: fz_matrix, ) { - self.text(text, cmt); + self.text(text, ctm); } fn clip_text( &self, _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, - cmt: fz_matrix, + ctm: fz_matrix, _scissor: mupdf_sys::fz_rect, ) { - self.text(text, cmt); + self.text(text, ctm); } fn clip_stroke_text( &self, _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, - cmt: fz_matrix, + ctm: fz_matrix, _scissor: mupdf_sys::fz_rect, ) { - self.text(text, cmt); + self.text(text, ctm); } fn ignore_text( &self, _ctx: mupdf_ffi::ContextRef<'ctx>, text: &mupdf_ffi::Text<'ctx>, - cmt: fz_matrix, + ctm: fz_matrix, ) { - self.text(text, cmt); + self.text(text, ctm); } } @@ -3521,9 +3666,12 @@ impl Page { page: &mupdf_ffi::Page<'_>, first_seen_fonts: &mut BTreeMap>, ) -> Result> { - let device = MyDevice::new(page_num); + let device = mupdf_ffi::Device::new( + page.ctx(), + Box::new(MyDevice::new(page_num, first_seen_fonts)), + )?; page.run( - &mupdf_ffi::Device::new(page.ctx(), Box::new(device.clone()))?, + &device, fz_matrix { a: 1.0, b: 0.0, @@ -3534,94 +3682,14 @@ impl Page { }, )?; let MyDevice { - page_num, + page_num: _, qt, unprocessed_chars, unprocessed_non_text, - } = device; - let mut qt = Rc::try_unwrap(qt) - .ok() - .expect("already dropped all other references") - .into_inner(); - // we convert to xml and parse that becuase the mupdf rust crate doesn't include all the API surface we need. - let xml: String = todo!("page.to_xml()?"); - let MuPdfXml::Page(xml_page) = quick_xml::de::from_str(&xml)?; - for xml_block in xml_page.block { - for xml_line in xml_block.line { - for xml_font in xml_line.font { - const ROUND_FACTOR: f32 = 1000.0; - let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR; - let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?; - for xml_char in xml_font.char { - if xml_char.c.trim().is_empty() { - continue; - } - let font_name = match &*xml_font.name { - "DejaVuSansCondensed-Obli" => { - if (xml_char.flags & FZ_STEXT_BOLD) != 0 { - "DejaVuSansCondensed-BoldOblique" - } else { - "DejaVuSansCondensed-Oblique" - } - } - font_name => font_name, - }; - let font = Font::new(font_name, font_size); - let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad; - let min_x = x0.min(x1).min(x2).min(x3); - let max_x = x0.max(x1).max(x2).max(x3); - let min_y = y0.min(y1).min(y2).min(y3); - let max_y = y0.max(y1).max(y2).max(y3); - let Some(text_section) = TextSection::for_position( - page_num, - (min_x + max_x) * 0.5, - (min_y + max_y) * 0.5, - ) else { - if PAGE_BODY_MIN_Y <= min_y && min_y <= PAGE_BODY_MAX_Y { - if page_num != 1072 { - // page 1072 has characters in the margins - return Err( - format!("char not in text section: {xml_char:?}\npage_num={page_num}").into(), - ); - } - } - continue; - }; - let char = Char { - font: font.clone(), - text: xml_char.c.into_owned(), - min_x: NonNaNF32::new(min_x).ok_or("char position shouldn't be NaN")?, - min_y: NonNaNF32::new(min_y).ok_or("char position shouldn't be NaN")?, - max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?, - max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?, - }; - let set = match first_seen_fonts.get_mut(font_name) { - Some(v) => v, - None => first_seen_fonts.entry(String::from(font_name)).or_default(), - }; - if set.insert(font_size) { - println!( - "first seen font: {font_name:?} {font_size}: page {page_num} {char:?} {:x}", - xml_char.flags, - ); - } - qt.entry(text_section).or_default().insert( - min_x, - min_y, - PageItem::Char(char.clone()), - ); - unprocessed_chars - .borrow_mut() - .entry(text_section) - .or_default() - .borrow_mut() - .entry(char.font.clone()) - .or_default() - .insert(char); - } - } - } - } + first_seen_fonts: _, + error, + } = device.get(); + error.replace(Ok(()))?; for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() { for chars in i.borrow_mut().values_mut() { chars.sort_by_key(Char::top_down_left_to_right_sort_key); @@ -3656,9 +3724,9 @@ impl Page { } Ok(Self { page_num, - qt, - unprocessed_chars, - unprocessed_non_text, + qt: qt.take(), + unprocessed_chars: unprocessed_chars.clone(), + unprocessed_non_text: unprocessed_non_text.clone(), }) } } diff --git a/src/mupdf_ffi.rs b/src/mupdf_ffi.rs index 2e8f325..942bcfc 100644 --- a/src/mupdf_ffi.rs +++ b/src/mupdf_ffi.rs @@ -2,16 +2,17 @@ // See Notices.txt for copyright information use mupdf_sys::{ - fz_clone_context, fz_color_params, fz_colorspace, fz_context, fz_device, fz_document, - fz_drop_context, fz_drop_device, fz_drop_document, fz_drop_page, fz_drop_path, fz_drop_text, - fz_error_type_FZ_ERROR_GENERIC, fz_font, fz_matrix, fz_page, fz_path, fz_path_walker, fz_point, - fz_rect, fz_stroke_state, fz_text, fz_text_item, fz_text_span, fz_transform_point, - fz_transform_point_xy, fz_walk_path, mupdf_document_page_count, mupdf_drop_error, - mupdf_error_t, mupdf_load_page, mupdf_new_base_context, mupdf_new_derived_device, - mupdf_open_document, mupdf_run_page, + fz_clone_context, fz_color_params, fz_colorspace, fz_concat, fz_context, fz_device, + fz_document, fz_drop_context, fz_drop_device, fz_drop_document, fz_drop_page, fz_drop_path, + fz_drop_text, fz_error_type_FZ_ERROR_GENERIC, fz_font, fz_font_ascender, fz_font_descender, + fz_font_is_bold, fz_font_is_italic, fz_font_name, fz_matrix, fz_matrix_expansion, fz_page, + fz_path, fz_path_walker, fz_point, fz_rect, fz_stroke_state, fz_text, fz_text_item, + fz_text_span, fz_transform_point, fz_transform_point_xy, fz_transform_vector, fz_walk_path, + mupdf_document_page_count, mupdf_drop_error, mupdf_error_t, mupdf_load_page, + mupdf_new_base_context, mupdf_new_derived_device, mupdf_open_document, mupdf_run_page, }; use std::{ - cell::Cell, + cell::{Cell, UnsafeCell}, ffi::{CStr, c_int, c_void}, fmt, marker::PhantomData, @@ -104,7 +105,7 @@ impl Context { CTX.with(f) } pub(crate) fn as_ref(&self) -> ContextRef<'_> { - ContextRef(self.0, PhantomData) + unsafe { ContextRef::from_ptr(self.0.as_ptr()) } } } @@ -117,7 +118,13 @@ impl Drop for Context { } #[derive(Clone, Copy)] -pub(crate) struct ContextRef<'ctx>(NonNull, PhantomData<&'ctx Context>); +pub(crate) struct ContextRef<'ctx>(&'ctx UnsafeCell); + +impl<'ctx> ContextRef<'ctx> { + unsafe fn from_ptr(ptr: *mut fz_context) -> Self { + Self(unsafe { &*ptr.cast() }) + } +} impl<'ctx> From<&'ctx Context> for ContextRef<'ctx> { fn from(value: &'ctx Context) -> Self { @@ -137,13 +144,13 @@ impl<'ctx> Document<'ctx> { ) -> Result, MuPdfError> { let ctx = ctx.into(); unsafe { - mupdf_try(|errptr| mupdf_open_document(ctx.0.as_ptr(), file_name.as_ptr(), errptr)) + mupdf_try(|errptr| mupdf_open_document(ctx.0.get(), file_name.as_ptr(), errptr)) .map(|ptr| Document { ptr, ctx }) } } pub(crate) fn page_count(&self) -> Result { unsafe { - mupdf_try(|errptr| mupdf_document_page_count(self.ctx.0.as_ptr(), self.ptr, errptr))? + mupdf_try(|errptr| mupdf_document_page_count(self.ctx.0.get(), self.ptr, errptr))? .try_into() .map_err(MuPdfError::new_generic) } @@ -151,7 +158,7 @@ impl<'ctx> Document<'ctx> { pub(crate) fn load_page(&self, page: usize) -> Result, MuPdfError> { let page = page.try_into().map_err(MuPdfError::new_generic)?; unsafe { - mupdf_try(|errptr| mupdf_load_page(self.ctx.0.as_ptr(), self.ptr, page, errptr)) + mupdf_try(|errptr| mupdf_load_page(self.ctx.0.get(), self.ptr, page, errptr)) .map(|ptr| Page { ptr, ctx: self.ctx }) } } @@ -160,7 +167,7 @@ impl<'ctx> Document<'ctx> { impl<'ctx> Drop for Document<'ctx> { fn drop(&mut self) { unsafe { - fz_drop_document(self.ctx.0.as_ptr(), self.ptr); + fz_drop_document(self.ctx.0.get(), self.ptr); } } } @@ -182,7 +189,7 @@ impl<'ctx> Page<'ctx> { unsafe { mupdf_try(|errptr| { mupdf_run_page( - self.ctx.0.as_ptr(), + self.ctx.0.get(), self.ptr, device.dev, ctm, @@ -197,7 +204,7 @@ impl<'ctx> Page<'ctx> { impl<'ctx> Drop for Page<'ctx> { fn drop(&mut self) { unsafe { - fz_drop_page(self.ctx.0.as_ptr(), self.ptr); + fz_drop_page(self.ctx.0.get(), self.ptr); } } } @@ -209,34 +216,34 @@ pub(crate) struct Device<'ctx, T: 'ctx> { } pub(crate) trait DeviceCallbacks<'ctx> { - fn fill_path(&self, ctx: ContextRef<'ctx>, path: &Path<'ctx>, even_odd: bool, cmt: fz_matrix); - fn stroke_path(&self, ctx: ContextRef<'ctx>, path: &Path<'ctx>, cmt: fz_matrix); + fn fill_path(&self, ctx: ContextRef<'ctx>, path: &Path<'ctx>, even_odd: bool, ctm: fz_matrix); + fn stroke_path(&self, ctx: ContextRef<'ctx>, path: &Path<'ctx>, ctm: fz_matrix); fn clip_path( &self, ctx: ContextRef<'ctx>, path: &Path<'ctx>, even_odd: bool, - cmt: fz_matrix, + ctm: fz_matrix, scissor: fz_rect, ); fn clip_stroke_path( &self, ctx: ContextRef<'ctx>, path: &Path<'ctx>, - cmt: fz_matrix, + ctm: fz_matrix, scissor: fz_rect, ); - fn fill_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, cmt: fz_matrix); - fn stroke_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, cmt: fz_matrix); - fn clip_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, cmt: fz_matrix, scissor: fz_rect); + fn fill_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, ctm: fz_matrix); + fn stroke_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, ctm: fz_matrix); + fn clip_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, ctm: fz_matrix, scissor: fz_rect); fn clip_stroke_text( &self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, - cmt: fz_matrix, + ctm: fz_matrix, scissor: fz_rect, ); - fn ignore_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, cmt: fz_matrix); + fn ignore_text(&self, ctx: ContextRef<'ctx>, text: &Text<'ctx>, ctm: fz_matrix); } impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { @@ -245,7 +252,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { unsafe { let dev_ptr = mupdf_try(|errptr| { mupdf_new_derived_device::>( - ctx.0.as_ptr(), + ctx.0.get(), c"parse_powerisa_pdf::mupdf_ffi::Device", errptr, ) @@ -295,16 +302,13 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { dev: *mut fz_device, path: *const fz_path, even_odd: c_int, - cmt: fz_matrix, + ctm: fz_matrix, _color_space: *mut fz_colorspace, _color: *const f32, _alpha: f32, _color_params: fz_color_params, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut (*dev.cast::>()).value }; this.fill_path( ctx, @@ -313,7 +317,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ctx, }), even_odd != 0, - cmt, + ctm, ); } unsafe extern "C" fn stroke_path_fn( @@ -321,16 +325,13 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { dev: *mut fz_device, path: *const fz_path, _stroke_state: *const fz_stroke_state, - cmt: fz_matrix, + ctm: fz_matrix, _color_space: *mut fz_colorspace, _color: *const f32, _alpha: f32, _color_params: fz_color_params, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut (*dev.cast::>()).value }; this.stroke_path( ctx, @@ -338,7 +339,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ptr: path.cast_mut(), ctx, }), - cmt, + ctm, ); } unsafe extern "C" fn clip_path_fn( @@ -346,13 +347,10 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { dev: *mut fz_device, path: *const fz_path, even_odd: ::std::os::raw::c_int, - cmt: fz_matrix, + ctm: fz_matrix, scissor: fz_rect, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut (*dev.cast::>()).value }; this.clip_path( ctx, @@ -361,7 +359,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ctx, }), even_odd != 0, - cmt, + ctm, scissor, ); } @@ -370,13 +368,10 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { dev: *mut fz_device, path: *const fz_path, _stroke_state: *const fz_stroke_state, - cmt: fz_matrix, + ctm: fz_matrix, scissor: fz_rect, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut (*dev.cast::>()).value }; this.clip_stroke_path( ctx, @@ -384,7 +379,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ptr: path.cast_mut(), ctx, }), - cmt, + ctm, scissor, ); } @@ -392,16 +387,13 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ctx: *mut fz_context, dev: *mut fz_device, text: *const fz_text, - cmt: fz_matrix, + ctm: fz_matrix, _color_space: *mut fz_colorspace, _color: *const f32, _alpha: f32, _color_params: fz_color_params, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut (*dev.cast::>()).value }; this.fill_text( ctx, @@ -409,7 +401,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ptr: text.cast_mut(), ctx, }), - cmt, + ctm, ); } unsafe extern "C" fn stroke_text_fn( @@ -417,16 +409,13 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { dev: *mut fz_device, text: *const fz_text, _stroke_state: *const fz_stroke_state, - cmt: fz_matrix, + ctm: fz_matrix, _color_space: *mut fz_colorspace, _color: *const f32, _alpha: f32, _color_params: fz_color_params, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut (*dev.cast::>()).value }; this.stroke_text( ctx, @@ -434,20 +423,17 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ptr: text.cast_mut(), ctx, }), - cmt, + ctm, ); } unsafe extern "C" fn clip_text_fn( ctx: *mut fz_context, dev: *mut fz_device, text: *const fz_text, - cmt: fz_matrix, + ctm: fz_matrix, scissor: fz_rect, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut (*dev.cast::>()).value }; this.clip_text( ctx, @@ -455,7 +441,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ptr: text.cast_mut(), ctx, }), - cmt, + ctm, scissor, ); } @@ -464,13 +450,10 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { dev: *mut fz_device, text: *const fz_text, _stroke_state: *const fz_stroke_state, - cmt: fz_matrix, + ctm: fz_matrix, scissor: fz_rect, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut (*dev.cast::>()).value }; this.clip_stroke_text( ctx, @@ -478,7 +461,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ptr: text.cast_mut(), ctx, }), - cmt, + ctm, scissor, ); } @@ -486,12 +469,9 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ctx: *mut fz_context, dev: *mut fz_device, text: *const fz_text, - cmt: fz_matrix, + ctm: fz_matrix, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut (*dev.cast::>()).value }; this.ignore_text( ctx, @@ -499,7 +479,7 @@ impl<'ctx, T: DeviceCallbacks<'ctx>> Device<'ctx, T> { ptr: text.cast_mut(), ctx, }), - cmt, + ctm, ); } } @@ -508,8 +488,8 @@ impl<'ctx, T> Drop for Device<'ctx, T> { fn drop(&mut self) { unsafe { // FIXME: fz_close_device may throw exceptions - // fz_close_device(self.ctx.0.as_ptr(), self.dev); - fz_drop_device(self.ctx.0.as_ptr(), self.dev); + // fz_close_device(self.ctx.0.get(), self.dev); + fz_drop_device(self.ctx.0.get(), self.dev); } } } @@ -583,7 +563,7 @@ impl<'ctx> Path<'ctx> { pub(crate) fn walk>(&self, mut walker: W) { unsafe { fz_walk_path( - self.ctx.0.as_ptr(), + self.ctx.0.get(), self.ptr, const { &fz_path_walker { @@ -607,10 +587,7 @@ impl<'ctx> Path<'ctx> { x: f32, y: f32, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut *arg.cast::() }; this.move_to(ctx, x, y); } @@ -620,10 +597,7 @@ impl<'ctx> Path<'ctx> { x: f32, y: f32, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut *arg.cast::() }; this.line_to(ctx, x, y); } @@ -637,10 +611,7 @@ impl<'ctx> Path<'ctx> { x3: f32, y3: f32, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut *arg.cast::() }; this.curve_to(ctx, x1, y1, x2, y2, x3, y3); } @@ -648,10 +619,7 @@ impl<'ctx> Path<'ctx> { ctx: *mut fz_context, arg: *mut c_void, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut *arg.cast::() }; this.close_path(ctx); } @@ -663,10 +631,7 @@ impl<'ctx> Path<'ctx> { x2: f32, y2: f32, ) { - let Some(ctx) = NonNull::new(ctx) else { - return; - }; - let ctx = ContextRef(ctx, PhantomData); + let ctx = unsafe { ContextRef::from_ptr(ctx) }; let this = unsafe { &mut *arg.cast::() }; this.rect_to(ctx, x1, y1, x2, y2); } @@ -675,7 +640,7 @@ impl<'ctx> Path<'ctx> { impl<'ctx> Drop for Path<'ctx> { fn drop(&mut self) { unsafe { - fz_drop_path(self.ctx.0.as_ptr(), self.ptr); + fz_drop_path(self.ctx.0.get(), self.ptr); } } } @@ -688,7 +653,7 @@ pub(crate) struct Text<'ctx> { impl<'ctx> Drop for Text<'ctx> { fn drop(&mut self) { unsafe { - fz_drop_text(self.ctx.0.as_ptr(), self.ptr); + fz_drop_text(self.ctx.0.get(), self.ptr); } } } @@ -696,7 +661,7 @@ impl<'ctx> Drop for Text<'ctx> { impl<'ctx> Text<'ctx> { pub(crate) fn spans<'a>(&'a self) -> TextSpanIter<'a, 'ctx> { TextSpanIter { - ptr: unsafe { NonNull::new((*self.ptr).head) }, + ptr: unsafe { NonNull::new((*self.ptr).head).map(|ptr| &*ptr.as_ptr().cast()) }, ctx: self.ctx, _phantom: PhantomData, } @@ -705,7 +670,7 @@ impl<'ctx> Text<'ctx> { #[derive(Clone)] pub(crate) struct TextSpanIter<'a, 'ctx> { - ptr: Option>, + ptr: Option<&'a UnsafeCell>, ctx: ContextRef<'ctx>, _phantom: PhantomData<&'a Text<'ctx>>, } @@ -715,9 +680,9 @@ impl<'a, 'ctx> Iterator for TextSpanIter<'a, 'ctx> { fn next(&mut self) -> Option { let ptr = self.ptr?; - self.ptr = NonNull::new(unsafe { ptr.as_ref().next }); + self.ptr = unsafe { NonNull::new((*ptr.get()).next).map(|ptr| &*ptr.as_ptr().cast()) }; Some(TextSpanRef { - ptr: unsafe { &*ptr.as_ptr() }, + ptr, ctx: self.ctx, _phantom: PhantomData, }) @@ -726,42 +691,75 @@ impl<'a, 'ctx> Iterator for TextSpanIter<'a, 'ctx> { #[derive(Copy, Clone)] pub(crate) struct TextSpanRef<'a, 'ctx> { - ptr: &'a fz_text_span, + ptr: &'a UnsafeCell, ctx: ContextRef<'ctx>, _phantom: PhantomData<&'a Text<'ctx>>, } +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)] +pub(crate) enum WriteMode { + Horizontal, + Vertical, +} + impl<'a, 'ctx> TextSpanRef<'a, 'ctx> { - pub(crate) fn get(self) -> &'a fz_text_span { + pub(crate) fn get(self) -> &'a UnsafeCell { self.ptr } pub(crate) fn font(self) -> FontRef<'a, 'ctx> { FontRef { - ptr: unsafe { &*self.ptr.font }, + ptr: unsafe { &*(*self.ptr.get()).font.cast::>() }, ctx: self.ctx, _phantom: PhantomData, } } + pub(crate) fn trm(self) -> fz_matrix { + unsafe { (*self.ptr.get()).trm } + } + pub(crate) fn write_mode(self) -> WriteMode { + if unsafe { (*self.ptr.get()).wmode() != 0 } { + WriteMode::Vertical + } else { + WriteMode::Horizontal + } + } pub(crate) fn items(self) -> &'a [fz_text_item] { - let len = self.ptr.len as usize; + let len = unsafe { (*self.ptr.get()).len } as usize; if len == 0 { return &[]; } - unsafe { std::slice::from_raw_parts(self.ptr.items, len) } + unsafe { std::slice::from_raw_parts((*self.ptr.get()).items, len) } } } #[derive(Clone, Copy)] pub(crate) struct FontRef<'a, 'ctx> { - ptr: &'a fz_font, + ptr: &'a UnsafeCell, ctx: ContextRef<'ctx>, _phantom: PhantomData<&'a Text<'ctx>>, } impl<'a, 'ctx> FontRef<'a, 'ctx> { - pub(crate) fn get(self) -> &'a fz_font { + pub(crate) fn get(self) -> &'a UnsafeCell { self.ptr } + pub(crate) fn name(self) -> &'a str { + unsafe { CStr::from_ptr(fz_font_name(self.ctx.0.get(), self.ptr.get())) } + .to_str() + .expect("font name isn't valid UTF-8") + } + pub(crate) fn is_bold(self) -> bool { + unsafe { fz_font_is_bold(self.ctx.0.get(), self.ptr.get()) != 0 } + } + pub(crate) fn is_italic(self) -> bool { + unsafe { fz_font_is_italic(self.ctx.0.get(), self.ptr.get()) != 0 } + } + pub(crate) fn ascender(self) -> f32 { + unsafe { fz_font_ascender(self.ctx.0.get(), self.ptr.get()) } + } + pub(crate) fn descender(self) -> f32 { + unsafe { fz_font_descender(self.ctx.0.get(), self.ptr.get()) } + } } pub(crate) fn transform_point(point: fz_point, m: fz_matrix) -> fz_point { @@ -771,3 +769,36 @@ pub(crate) fn transform_point(point: fz_point, m: fz_matrix) -> fz_point { pub(crate) fn transform_point_xy(x: f32, y: f32, m: fz_matrix) -> fz_point { unsafe { fz_transform_point_xy(x, y, m) } } + +pub(crate) fn transform_vector(vector: fz_point, m: fz_matrix) -> fz_point { + unsafe { fz_transform_vector(vector, m) } +} + +pub(crate) fn matrix_expansion(m: fz_matrix) -> f32 { + unsafe { fz_matrix_expansion(m) } +} + +pub(crate) fn concat(left: fz_matrix, right: fz_matrix) -> fz_matrix { + unsafe { fz_concat(left, right) } +} + +pub(crate) fn add_points(a: fz_point, b: fz_point) -> fz_point { + fz_point { + x: a.x + b.x, + y: a.y + b.y, + } +} + +pub(crate) fn point_min_components(a: fz_point, b: fz_point) -> fz_point { + fz_point { + x: a.x.min(b.x), + y: a.y.min(b.y), + } +} + +pub(crate) fn point_max_components(a: fz_point, b: fz_point) -> fz_point { + fz_point { + x: a.x.max(b.x), + y: a.y.max(b.y), + } +}