From 73c45323c84545aa02c41cffe9d128cbcd7e54d9 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 6 Jan 2026 13:36:04 -0800 Subject: [PATCH] seems to work --- src/main.rs | 248 +++++++++++++++++++++++++++++++---------------- src/mupdf_ffi.rs | 78 +++++++++++++-- 2 files changed, 234 insertions(+), 92 deletions(-) diff --git a/src/main.rs b/src/main.rs index a6a36e6..5b0feca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,8 @@ use crate::{ mupdf_ffi::{ - WriteMode, add_points, point_max_components, point_min_components, transform_vector, + MuPdfError, WriteMode, add_points, point_max_components, point_min_components, + transform_vector, }, quad_tree::QuadTree, }; @@ -16,7 +17,6 @@ use std::{ cell::RefCell, collections::{BTreeMap, BTreeSet, HashMap, HashSet}, convert::Infallible, - error::Error, fmt, num::NonZero, ops::ControlFlow, @@ -1610,7 +1610,7 @@ struct Page { } struct Pages<'ctx> { - pages_gen: Option>> + 'ctx>>, + pages_gen: Option> + 'ctx>>, pages: BTreeMap>, max_page_num: u32, } @@ -1634,9 +1634,7 @@ impl<'ctx> fmt::Debug for Pages<'ctx> { } impl<'ctx> Pages<'ctx> { - fn new( - pages_gen: Option>> + 'ctx>>, - ) -> Self { + fn new(pages_gen: Option> + 'ctx>>) -> Self { Self { pages_gen, pages: BTreeMap::new(), @@ -1646,13 +1644,13 @@ impl<'ctx> Pages<'ctx> { fn close(&mut self) { self.pages_gen = None; } - fn is_past_end(&mut self, page_num: u32) -> Result> { + fn is_past_end(&mut self, page_num: u32) -> Result { while self.pages_gen.is_some() && page_num > self.max_page_num { self.fill_page()?; } Ok(page_num > self.max_page_num) } - fn fill_page(&mut self) -> Result> { + fn fill_page(&mut self) -> Result { let Some(pages_gen) = &mut self.pages_gen else { return Ok(false); }; @@ -1673,7 +1671,7 @@ impl<'ctx> Pages<'ctx> { self.max_page_num = page_num; Ok(true) } - fn get(&mut self, page_num: u32) -> Result>, Box> { + fn get(&mut self, page_num: u32) -> Result>, Error> { loop { if let Some(page) = self.pages.get(&page_num) { return Ok(Some(page.clone())); @@ -2017,10 +2015,37 @@ struct Parser<'ctx> { insns: Vec, } +#[derive(Debug)] +struct Error(String, Backtrace); + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0)?; + f.write_str("\n")?; + fmt::Display::fmt(&self.1, f) + } +} + +trait IntoError: fmt::Display {} + +impl From for Error { + fn from(value: T) -> Self { + Error(value.to_string(), Backtrace::capture()) + } +} + +impl IntoError for &'_ str {} +impl IntoError for String {} +impl IntoError for MuPdfError {} +impl IntoError for std::ffi::NulError {} +impl IntoError for std::num::ParseIntError {} +impl IntoError for std::io::Error {} +impl IntoError for ErrorWithNote {} + enum ExtractInsnsError { - InsnParseError(String, std::backtrace::Backtrace), - PageParseError(String, std::backtrace::Backtrace), - Other(Box), + InsnParseError(String, Backtrace), + PageParseError(String, Backtrace), + Other(Error), } impl fmt::Display for ExtractInsnsError { @@ -2054,7 +2079,7 @@ impl fmt::Display for ErrorWithNote { } } -impl Error for ErrorWithNote {} +impl std::error::Error for ErrorWithNote {} impl<'ctx> Parser<'ctx> { fn new() -> Self { @@ -2064,15 +2089,13 @@ impl<'ctx> Parser<'ctx> { insns: Vec::new(), } } - fn page(&mut self) -> Result, Box> { + fn page(&mut self) -> Result, Error> { Ok(self .pages .get(self.text_section.page_num)? .ok_or("page_num is out of range")?) } - fn unprocessed_chars( - &mut self, - ) -> Result>>>, Box> { + fn unprocessed_chars(&mut self) -> Result>>>, Error> { Ok(self .page()? .unprocessed_chars @@ -2085,7 +2108,8 @@ impl<'ctx> Parser<'ctx> { ctx: impl Into>, file: &str, page_numbers: Option>>, - ) -> Result>> + 'ctx>, Box> { + dump_mupdf_page_xml: bool, + ) -> Result> + 'ctx>, Error> { let ctx = ctx.into(); let page_indexes = page_numbers.map(|page_numbers| { let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1)); @@ -2103,7 +2127,7 @@ impl<'ctx> Parser<'ctx> { .load_page(page_index) .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?; Ok( - Page::from_mupdf_page(page_num, &page, &mut first_seen_fonts) + Page::from_mupdf_page(page_num, &page, &mut first_seen_fonts, dump_mupdf_page_xml) .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?, ) }))) @@ -2113,11 +2137,13 @@ impl<'ctx> Parser<'ctx> { ctx: impl Into>, file: &str, page_numbers: Option, - ) -> Result<(), Box> { + dump_mupdf_page_xml: bool, + ) -> Result<(), Error> { self.pages = Pages::new(Some(Self::pages_gen( ctx, file, page_numbers.map(|v| v.into_iter().collect()), + dump_mupdf_page_xml, )?)); self.text_section = TextSection::first(); loop { @@ -2151,7 +2177,7 @@ impl<'ctx> Parser<'ctx> { } } } - fn parse_text_section(&mut self) -> Result<(), ErrorWithNote>> { + fn parse_text_section(&mut self) -> Result<(), ErrorWithNote> { match self.note_text_section(Self::extract_insns) { Ok(()) => Ok(()), Err( @@ -2177,7 +2203,7 @@ impl<'ctx> Parser<'ctx> { min_y: f32, max_y: f32, allow_processed: bool, - ) -> Result, Box> { + ) -> Result, Error> { let mut retval = None; let page = self.page()?; let unprocessed_chars = self.unprocessed_chars()?; @@ -2342,6 +2368,9 @@ impl<'ctx> Parser<'ctx> { "\u{fb04}" => "ffl", v => v, }; + if char_text.chars().skip(1).next().is_some() { + dbg!(&ch); + } text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect())); last_max_x = ch.max_x.get(); } @@ -2888,7 +2917,7 @@ impl<'ctx> Parser<'ctx> { )); }; let table_header_fields_text = table_header_fields.element.inner_text(); - if table_header_reg_text != "Field(s)" { + if table_header_fields_text != "Field(s)" { return Err(ExtractInsnsError::Other( format!( "can't find special registers altered table's fields-column's header:\n\ @@ -3186,7 +3215,7 @@ struct MyDevice<'a> { Rc>>>>>>, unprocessed_non_text: Rc>>, first_seen_fonts: RefCell<&'a mut BTreeMap>>, - error: RefCell>>, + error: RefCell>, } impl<'a> MyDevice<'a> { @@ -3361,6 +3390,57 @@ impl<'a> MyDevice<'a> { } _ => font_name_with_tag, }; + let mut flush_char = |char: Char| -> Result<(), ()> { + let Some(text_section) = TextSection::for_position( + self.page_num, + (char.min_x.get() + char.max_x.get()) * 0.5, + (char.min_y.get() + char.max_y.get()) * 0.5, + ) else { + if PAGE_BODY_MIN_Y <= char.min_y.get() && char.min_y.get() <= PAGE_BODY_MAX_Y { + if self.page_num != 1072 { + // page 1072 has characters in the margins + let _ = self.error.replace(Err(format!( + "char not in text section: {:?}\npage_num={}", + char.text, self.page_num, + ) + .into())); + return Err(()); + } + } + return Ok(()); + }; + let set = match first_seen_fonts.get_mut(font_name_with_tag) { + Some(v) => v, + None => first_seen_fonts + .entry(String::from(font_name_with_tag)) + .or_default(), + }; + if set.insert(font_size) { + println!( + "first seen font: {font_name_with_tag:?} {font_size}: page {} {char:?}", + self.page_num, + ); + } + self.qt + .borrow_mut() + .entry(text_section) + .or_default() + .insert( + char.min_x.get(), + char.min_y.get(), + PageItem::Char(char.clone()), + ); + self.unprocessed_chars + .borrow_mut() + .entry(text_section) + .or_default() + .borrow_mut() + .entry(char.font.clone()) + .or_default() + .insert(char); + Ok(()) + }; + let mut last_char = None; for &fz_text_item { x, y, @@ -3380,7 +3460,7 @@ impl<'a> MyDevice<'a> { let dir = mupdf_ffi::transform_vector(dir, trm); let glyph_start; let glyph_stop; - let glyph_ascender; + let mut glyph_ascender; let glyph_descender; match span.write_mode() { WriteMode::Horizontal => { @@ -3397,6 +3477,9 @@ impl<'a> MyDevice<'a> { x: 0.0, y: span.font().descender(), }; + if glyph_ascender.y == glyph_descender.y { + glyph_ascender.y += 1.0; + } } WriteMode::Vertical => { glyph_start = fz_point { @@ -3436,24 +3519,6 @@ impl<'a> MyDevice<'a> { font_name: font_name_with_tag.into(), size: font_size, }); - let Some(text_section) = TextSection::for_position( - self.page_num, - (min.x + max.x) * 0.5, - (min.y + max.y) * 0.5, - ) else { - if PAGE_BODY_MIN_Y <= min.y && min.y <= PAGE_BODY_MAX_Y { - if self.page_num != 1072 { - // page 1072 has characters in the margins - let _ = self.error.replace(Err(format!( - "char not in text section: {text:?}\npage_num={}", - self.page_num, - ) - .into())); - return; - } - } - continue; - }; let (Some(min_x), Some(min_y), Some(max_x), Some(max_y)) = ( NonNaNF32::new(min.x), NonNaNF32::new(min.y), @@ -3465,39 +3530,51 @@ impl<'a> MyDevice<'a> { .replace(Err("char position shouldn't be NaN".into())); return; }; - let char = Char { + if gid < 0 + && last_char + .as_ref() + .is_some_and(|last_char: &Char| last_char.font == font) + { + if let Some(Char { + font, + text: last_text, + min_x: last_min_x, + min_y: last_min_y, + max_x: last_max_x, + max_y: last_max_y, + }) = last_char.take() + { + last_char = Some(Char { + font, + text: last_text + &text, + min_x: last_min_x.min(min_x), + min_y: last_min_y.min(min_y), + max_x: last_max_x.max(max_x), + max_y: last_max_y.max(max_y), + }); + continue; + } + } + if let Some(last_char) = last_char.take() { + match flush_char(last_char) { + Ok(()) => {} + Err(()) => return, + } + } + last_char = Some(Char { font, text, min_x, min_y, max_x, max_y, - }; - let set = match first_seen_fonts.get_mut(font_name_with_tag) { - Some(v) => v, - None => first_seen_fonts - .entry(String::from(font_name_with_tag)) - .or_default(), - }; - if set.insert(font_size) { - println!( - "first seen font: {font_name_with_tag:?} {font_size}: page {} {char:?}", - self.page_num, - ); + }); + } + if let Some(last_char) = last_char { + match flush_char(last_char) { + Ok(()) => {} + Err(()) => return, } - self.qt - .borrow_mut() - .entry(text_section) - .or_default() - .insert(min_x.get(), min_y.get(), PageItem::Char(char.clone())); - self.unprocessed_chars - .borrow_mut() - .entry(text_section) - .or_default() - .borrow_mut() - .entry(char.font.clone()) - .or_default() - .insert(char); } } } @@ -3665,22 +3742,19 @@ impl Page { page_num: u32, page: &mupdf_ffi::Page<'_>, first_seen_fonts: &mut BTreeMap>, - ) -> Result> { + dump_mupdf_page_xml: bool, + ) -> Result { + if dump_mupdf_page_xml { + println!("{}", page.to_xml()?); + } + let Some(pdf_page) = page.pdf_page() else { + return Err("page is not from a pdf".into()); + }; let device = mupdf_ffi::Device::new( page.ctx(), Box::new(MyDevice::new(page_num, first_seen_fonts)), )?; - page.run( - &device, - fz_matrix { - a: 1.0, - b: 0.0, - c: 0.0, - d: 1.0, - e: 0.0, - f: 0.0, - }, - )?; + page.run(&device, pdf_page.transform()?)?; let MyDevice { page_num: _, qt, @@ -3731,8 +3805,14 @@ impl Page { } } -fn main_inner() -> Result<(), Box> { - let args: Vec = std::env::args().collect(); +fn main_inner() -> Result<(), Error> { + let mut args: Vec = std::env::args().collect(); + let dump_mupdf_page_xml = if args.get(1).is_some_and(|v| v == "--dump-mupdf-page-xml") { + args.remove(1); + true + } else { + false + }; let page_numbers: Option>>> = if 2 < args.len() { Some(if let Some((start, end)) = args[2].split_once(":") { let start: NonZero = start.trim().parse()?; @@ -3755,7 +3835,7 @@ fn main_inner() -> Result<(), Box> { let mut parser = Parser::new(); let is_subset = page_numbers.is_some(); let file_name = &args[1]; - parser.parse_pdf(ctx, file_name, page_numbers)?; + parser.parse_pdf(ctx, file_name, page_numbers, dump_mupdf_page_xml)?; let mut insns = xml_tree::Element::new( "instructions".into(), [("is-subset".into(), is_subset.to_string())], diff --git a/src/mupdf_ffi.rs b/src/mupdf_ffi.rs index 942bcfc..19d7564 100644 --- a/src/mupdf_ffi.rs +++ b/src/mupdf_ffi.rs @@ -2,14 +2,16 @@ // See Notices.txt for copyright information use mupdf_sys::{ - fz_clone_context, fz_color_params, fz_colorspace, fz_concat, fz_context, fz_device, - fz_document, fz_drop_context, fz_drop_device, fz_drop_document, fz_drop_page, fz_drop_path, - fz_drop_text, fz_error_type_FZ_ERROR_GENERIC, fz_font, fz_font_ascender, fz_font_descender, - fz_font_is_bold, fz_font_is_italic, fz_font_name, fz_matrix, fz_matrix_expansion, fz_page, - fz_path, fz_path_walker, fz_point, fz_rect, fz_stroke_state, fz_text, fz_text_item, - fz_text_span, fz_transform_point, fz_transform_point_xy, fz_transform_vector, fz_walk_path, - mupdf_document_page_count, mupdf_drop_error, mupdf_error_t, mupdf_load_page, - mupdf_new_base_context, mupdf_new_derived_device, mupdf_open_document, mupdf_run_page, + fz_buffer, fz_buffer_storage, fz_clone_context, fz_color_params, fz_colorspace, fz_concat, + fz_context, fz_device, fz_document, fz_drop_buffer, fz_drop_context, fz_drop_device, + fz_drop_document, fz_drop_page, fz_drop_path, fz_drop_text, fz_error_type_FZ_ERROR_GENERIC, + fz_font, fz_font_ascender, fz_font_descender, fz_font_is_bold, fz_font_is_italic, fz_font_name, + fz_matrix, fz_matrix_expansion, fz_page, fz_path, fz_path_walker, fz_point, fz_rect, + fz_stroke_state, fz_text, fz_text_item, fz_text_span, fz_transform_point, + fz_transform_point_xy, fz_transform_vector, fz_walk_path, mupdf_document_page_count, + mupdf_drop_error, mupdf_error_t, mupdf_load_page, mupdf_new_base_context, + mupdf_new_derived_device, mupdf_open_document, mupdf_page_to_xml, mupdf_pdf_page_transform, + mupdf_run_page, pdf_page, pdf_page_from_fz_page, }; use std::{ cell::{Cell, UnsafeCell}, @@ -172,6 +174,33 @@ impl<'ctx> Drop for Document<'ctx> { } } +struct Buffer<'ctx> { + ptr: *mut fz_buffer, + ctx: ContextRef<'ctx>, +} + +impl<'ctx> Buffer<'ctx> { + fn storage(&mut self) -> &mut [u8] { + unsafe { + let mut ptr = ptr::null_mut(); + let len = fz_buffer_storage(self.ctx.0.get(), self.ptr, &raw mut ptr); + if len == 0 { + &mut [] + } else { + std::slice::from_raw_parts_mut(ptr, len) + } + } + } +} + +impl<'ctx> Drop for Buffer<'ctx> { + fn drop(&mut self) { + unsafe { + fz_drop_buffer(self.ctx.0.get(), self.ptr); + } + } +} + pub(crate) struct Page<'ctx> { ptr: *mut fz_page, ctx: ContextRef<'ctx>, @@ -199,6 +228,25 @@ impl<'ctx> Page<'ctx> { }) } } + pub(crate) fn to_xml(&self) -> Result { + unsafe { + let mut buffer = + mupdf_try(|errptr| mupdf_page_to_xml(self.ctx.0.get(), self.ptr, errptr)) + .map(|ptr| Buffer { ptr, ctx: self.ctx })?; + Ok(str::from_utf8(buffer.storage()) + .map_err(MuPdfError::new_generic)? + .into()) + } + } + pub(crate) fn pdf_page<'a>(&'a self) -> Option> { + unsafe { + let ptr = pdf_page_from_fz_page(self.ctx.0.get(), self.ptr); + NonNull::new(ptr).map(|ptr| PdfPageRef { + ptr: &*ptr.as_ptr().cast(), + ctx: self.ctx, + }) + } + } } impl<'ctx> Drop for Page<'ctx> { @@ -209,6 +257,20 @@ impl<'ctx> Drop for Page<'ctx> { } } +#[derive(Clone, Copy)] +pub(crate) struct PdfPageRef<'a, 'ctx> { + ptr: &'a UnsafeCell, + ctx: ContextRef<'ctx>, +} + +impl<'a, 'ctx> PdfPageRef<'a, 'ctx> { + pub(crate) fn transform(self) -> Result { + unsafe { + mupdf_try(|errptr| mupdf_pdf_page_transform(self.ctx.0.get(), self.ptr.get(), errptr)) + } + } +} + pub(crate) struct Device<'ctx, T: 'ctx> { dev: *mut fz_device, ctx: ContextRef<'ctx>,