From fcf1c63cb7700abd3ff6fd593f7807809c39893b Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Mon, 5 Jan 2026 11:27:52 -0800 Subject: [PATCH] wip --- Cargo.lock | 1 + Cargo.toml | 1 + parse_powerisa_pdf/parse_powerisa_pdf.py | 18 ++++-- src/main.rs | 77 +++++++++++++++++------- 4 files changed, 68 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0281106..e329500 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -225,6 +225,7 @@ dependencies = [ "indexmap", "libm", "mupdf", + "mupdf-sys", "quick-xml", "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 224dad3..3de7338 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,5 +14,6 @@ rust-version = "1.89.0" indexmap = "2.12.1" libm = "0.2.15" mupdf = { version = "0.5.0", default-features = false } +mupdf-sys = { version = "0.5.0", default-features = false } quick-xml = { version = "0.38.4", features = ["serialize"] } serde = { version = "1.0.228", features = ["derive"] } diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index 409c6ac..3c2afe5 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -765,7 +765,7 @@ class Page: unprocessed_non_text: SetById[LTLine | LTRect] @staticmethod - def from_lt_page(page_num: int, page: LTPage) -> Page: + def from_lt_page(page_num: int, page: LTPage, first_seen_fonts: defaultdict[str, set[float]]) -> Page: qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree) unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char])) unprocessed_non_text: SetById[LTLine | LTRect] = SetById() @@ -804,20 +804,25 @@ class Page: raise AssertionError( f"char not in text section: {element}\npage_num={page_num}") continue + font_size = round(element.size, 3) char = Char( text=element.get_text(), - font=Font(font_name=element.fontname, size=round(element.size, 3)), + font=Font(font_name=element.fontname, size=font_size), adv=element.adv, min_x=element.x0, min_y=element.y0, max_x=element.x1, max_y=element.y1, ) + if font_size not in first_seen_fonts[element.fontname]: + first_seen_fonts[element.fontname].add(font_size) + print(f"first seen font: {element.fontname!r} {font_size}: page {page_num} {char!r}") qt[text_section].insert(char.min_x, char.min_y, char) unprocessed_chars[text_section][char.font].add(char) - for i in unprocessed_chars.values(): - for j in i.values(): - j.sort(key=Char.top_down_left_to_right_sort_key) + for text_section, i in unprocessed_chars.items(): + for chars in i.values(): + chars.sort(key=Char.top_down_left_to_right_sort_key) + print(f"first char: {text_section!r}: {next(iter(chars), None)!r}") unknown_fonts=[] unknown_font_errors=[] for i in unprocessed_chars.values(): @@ -1181,13 +1186,14 @@ class Parser: def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: if page_numbers is not None: page_numbers = sorted(i - 1 for i in page_numbers) + first_seen_fonts = defaultdict(set) for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): if page_numbers is not None: page_num = page_numbers[i] + 1 else: page_num = i + 1 print(f"page {page_num}") - yield Page.from_lt_page(page_num=page_num, page=page) + yield Page.from_lt_page(page_num=page_num, page=page, first_seen_fonts=first_seen_fonts) def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None): self.pages = Pages(pages_gen=Parser.__pages_gen( diff --git a/src/main.rs b/src/main.rs index 2e9e391..e84a5f9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,8 @@ // See Notices.txt for copyright information use crate::quad_tree::QuadTree; -use indexmap::IndexSet; +use indexmap::{IndexMap, IndexSet}; +use mupdf_sys::FZ_STEXT_BOLD; use non_nan_float::NonNaNF32; use std::{ backtrace::Backtrace, @@ -135,7 +136,9 @@ macro_rules! make_enum_font { } const fn new_known(font_name: &str, size: NonNaNF32) -> Option { match size.get() { - $($($known_font_size if str_eq(font_name, const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) }) => Some(Self::$KnownFont),)*)* + $($($known_font_size if str_eq(font_name, const { + Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) + }) => Some(Self::$KnownFont),)*)* _ => None, } } @@ -266,12 +269,16 @@ make_enum_font! { InsnDescMisc15, #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213] InsnDescMisc16, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.238] InsnDescMisc17, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252] InsnDescMisc18, - #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977] + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962] InsnDescMisc19, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977] + InsnDescMisc20, + #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.506] + InsnDescMisc21, }, #[group] InsnDescCode { @@ -2064,24 +2071,23 @@ impl Parser { file: &str, page_numbers: Option>>, ) -> Result>>>, Box> { - let page_numbers = page_numbers.map(|page_numbers| { - let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1)); + let page_indexes = page_numbers.map(|page_numbers| { + let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1)); retval.sort(); retval }); let document = mupdf::Document::open(file)?; let pages: Vec = document.pages().and_then(|pages| pages.collect())?; - Ok(Box::new(pages.into_iter().enumerate().map( - move |(i, page)| { - let page_num = match &page_numbers { - Some(page_numbers) => page_numbers[i] + 1, - None => i as u32 + 1, - }; - println!("page {page_num}"); - Ok(Page::from_mupdf_page(page_num, page) - .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?) - }, - ))) + let page_indexes = page_indexes.unwrap_or_else(|| (0..pages.len()).collect()); + let mut first_seen_fonts = BTreeMap::new(); + Ok(Box::new(page_indexes.into_iter().map(move |page_index| { + let page_num = page_index as u32 + 1; + println!("page {page_num}"); + Ok( + Page::from_mupdf_page(page_num, &pages[page_index], &mut first_seen_fonts) + .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?, + ) + }))) } fn parse_pdf>>( &mut self, @@ -3411,7 +3417,8 @@ struct MuPdfXmlChar<'a> { impl Page { fn from_mupdf_page( page_num: u32, - page: mupdf::Page, + page: &mupdf::Page, + first_seen_fonts: &mut BTreeMap>, ) -> Result> { let device = MyDevice::new(page_num); page.run( @@ -3439,8 +3446,21 @@ impl Page { const ROUND_FACTOR: f32 = 1000.0; let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR; let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?; - let font = Font::new(&xml_font.name, font_size); for xml_char in xml_font.char { + if xml_char.c.trim().is_empty() { + continue; + } + let font_name = match &*xml_font.name { + "DejaVuSansCondensed-Obli" => { + if (xml_char.flags & FZ_STEXT_BOLD) != 0 { + "DejaVuSansCondensed-BoldOblique" + } else { + "DejaVuSansCondensed-Oblique" + } + } + font_name => font_name, + }; + let font = Font::new(font_name, font_size); let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad; let min_x = x0.min(x1).min(x2).min(x3); let max_x = x0.max(x1).max(x2).max(x3); @@ -3469,6 +3489,16 @@ impl Page { max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?, max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?, }; + let set = match first_seen_fonts.get_mut(font_name) { + Some(v) => v, + None => first_seen_fonts.entry(String::from(font_name)).or_default(), + }; + if set.insert(font_size) { + println!( + "first seen font: {font_name:?} {font_size}: page {page_num} {char:?} {:x}", + xml_char.flags, + ); + } qt.entry(text_section).or_default().insert( min_x, min_y, @@ -3486,9 +3516,10 @@ impl Page { } } } - for i in unprocessed_chars.borrow_mut().values_mut() { - for j in i.borrow_mut().values_mut() { - j.sort_by_key(Char::top_down_left_to_right_sort_key); + for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() { + for chars in i.borrow_mut().values_mut() { + chars.sort_by_key(Char::top_down_left_to_right_sort_key); + println!("first char: {text_section:?}: {:?}", chars.first()); } } let mut unknown_fonts = Vec::new();