seems to work

This commit is contained in:
Jacob Lifshay 2026-01-06 13:36:04 -08:00
parent 040afcc435
commit 73c45323c8
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ
2 changed files with 234 additions and 92 deletions

View file

@ -3,7 +3,8 @@
use crate::{
mupdf_ffi::{
WriteMode, add_points, point_max_components, point_min_components, transform_vector,
MuPdfError, WriteMode, add_points, point_max_components, point_min_components,
transform_vector,
},
quad_tree::QuadTree,
};
@ -16,7 +17,6 @@ use std::{
cell::RefCell,
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
convert::Infallible,
error::Error,
fmt,
num::NonZero,
ops::ControlFlow,
@ -1610,7 +1610,7 @@ struct Page {
}
struct Pages<'ctx> {
pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>> + 'ctx>>,
pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Error>> + 'ctx>>,
pages: BTreeMap<u32, Rc<Page>>,
max_page_num: u32,
}
@ -1634,9 +1634,7 @@ impl<'ctx> fmt::Debug for Pages<'ctx> {
}
impl<'ctx> Pages<'ctx> {
fn new(
pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>> + 'ctx>>,
) -> Self {
fn new(pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Error>> + 'ctx>>) -> Self {
Self {
pages_gen,
pages: BTreeMap::new(),
@ -1646,13 +1644,13 @@ impl<'ctx> Pages<'ctx> {
/// Discards the page generator, leaving `pages_gen` as `None`.
///
/// Pages already stored in `self.pages` are left untouched.
fn close(&mut self) {
    drop(self.pages_gen.take());
}
fn is_past_end(&mut self, page_num: u32) -> Result<bool, Box<dyn Error>> {
/// Reports whether `page_num` is greater than `max_page_num`.
///
/// Before answering, keeps calling `fill_page` for as long as a page generator is
/// still present and `page_num` is above `max_page_num`.
///
/// # Errors
///
/// Returns the first error produced by `fill_page`.
fn is_past_end(&mut self, page_num: u32) -> Result<bool, Error> {
    loop {
        let generator_gone = self.pages_gen.is_none();
        if generator_gone || page_num <= self.max_page_num {
            return Ok(page_num > self.max_page_num);
        }
        self.fill_page()?;
    }
}
fn fill_page(&mut self) -> Result<bool, Box<dyn Error>> {
fn fill_page(&mut self) -> Result<bool, Error> {
let Some(pages_gen) = &mut self.pages_gen else {
return Ok(false);
};
@ -1673,7 +1671,7 @@ impl<'ctx> Pages<'ctx> {
self.max_page_num = page_num;
Ok(true)
}
fn get(&mut self, page_num: u32) -> Result<Option<Rc<Page>>, Box<dyn Error>> {
fn get(&mut self, page_num: u32) -> Result<Option<Rc<Page>>, Error> {
loop {
if let Some(page) = self.pages.get(&page_num) {
return Ok(Some(page.clone()));
@ -2017,10 +2015,37 @@ struct Parser<'ctx> {
insns: Vec<Insn>,
}
/// Error value made of a human-readable message and a [`Backtrace`].
#[derive(Debug)]
struct Error(String, Backtrace);

impl fmt::Display for Error {
    /// Writes the message, a newline, and then the backtrace.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(message, backtrace) = self;
        writeln!(f, "{message}")?;
        // The backtrace is given the caller's formatter as-is, so any formatting flags
        // the caller selected are still visible to it.
        fmt::Display::fmt(backtrace, f)
    }
}
trait IntoError: fmt::Display {}
impl<T: IntoError> From<T> for Error {
fn from(value: T) -> Self {
Error(value.to_string(), Backtrace::capture())
}
}
impl IntoError for &'_ str {}
impl IntoError for String {}
impl IntoError for MuPdfError {}
impl IntoError for std::ffi::NulError {}
impl IntoError for std::num::ParseIntError {}
impl IntoError for std::io::Error {}
impl<T: fmt::Display> IntoError for ErrorWithNote<T> {}
enum ExtractInsnsError {
InsnParseError(String, std::backtrace::Backtrace),
PageParseError(String, std::backtrace::Backtrace),
Other(Box<dyn Error>),
InsnParseError(String, Backtrace),
PageParseError(String, Backtrace),
Other(Error),
}
impl fmt::Display for ExtractInsnsError {
@ -2054,7 +2079,7 @@ impl<E: fmt::Display> fmt::Display for ErrorWithNote<E> {
}
}
impl<E: fmt::Display + fmt::Debug> Error for ErrorWithNote<E> {}
impl<E: fmt::Display + fmt::Debug> std::error::Error for ErrorWithNote<E> {}
impl<'ctx> Parser<'ctx> {
fn new() -> Self {
@ -2064,15 +2089,13 @@ impl<'ctx> Parser<'ctx> {
insns: Vec::new(),
}
}
fn page(&mut self) -> Result<Rc<Page>, Box<dyn Error>> {
/// Looks up the page whose number is `self.text_section.page_num`.
///
/// # Errors
///
/// Propagates any error from `Pages::get`, and fails with
/// "page_num is out of range" when no page is returned for that number.
fn page(&mut self) -> Result<Rc<Page>, Error> {
    let page_num = self.text_section.page_num;
    match self.pages.get(page_num)? {
        Some(page) => Ok(page),
        None => Err("page_num is out of range".into()),
    }
}
fn unprocessed_chars(
&mut self,
) -> Result<Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>, Box<dyn Error>> {
fn unprocessed_chars(&mut self) -> Result<Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>, Error> {
Ok(self
.page()?
.unprocessed_chars
@ -2085,7 +2108,8 @@ impl<'ctx> Parser<'ctx> {
ctx: impl Into<mupdf_ffi::ContextRef<'ctx>>,
file: &str,
page_numbers: Option<Vec<NonZero<u32>>>,
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>> + 'ctx>, Box<dyn Error>> {
dump_mupdf_page_xml: bool,
) -> Result<Box<dyn Iterator<Item = Result<Page, Error>> + 'ctx>, Error> {
let ctx = ctx.into();
let page_indexes = page_numbers.map(|page_numbers| {
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1));
@ -2103,7 +2127,7 @@ impl<'ctx> Parser<'ctx> {
.load_page(page_index)
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?;
Ok(
Page::from_mupdf_page(page_num, &page, &mut first_seen_fonts)
Page::from_mupdf_page(page_num, &page, &mut first_seen_fonts, dump_mupdf_page_xml)
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?,
)
})))
@ -2113,11 +2137,13 @@ impl<'ctx> Parser<'ctx> {
ctx: impl Into<mupdf_ffi::ContextRef<'ctx>>,
file: &str,
page_numbers: Option<I>,
) -> Result<(), Box<dyn Error>> {
dump_mupdf_page_xml: bool,
) -> Result<(), Error> {
self.pages = Pages::new(Some(Self::pages_gen(
ctx,
file,
page_numbers.map(|v| v.into_iter().collect()),
dump_mupdf_page_xml,
)?));
self.text_section = TextSection::first();
loop {
@ -2151,7 +2177,7 @@ impl<'ctx> Parser<'ctx> {
}
}
}
fn parse_text_section(&mut self) -> Result<(), ErrorWithNote<Box<dyn Error>>> {
fn parse_text_section(&mut self) -> Result<(), ErrorWithNote<Error>> {
match self.note_text_section(Self::extract_insns) {
Ok(()) => Ok(()),
Err(
@ -2177,7 +2203,7 @@ impl<'ctx> Parser<'ctx> {
min_y: f32,
max_y: f32,
allow_processed: bool,
) -> Result<Option<Char>, Box<dyn Error>> {
) -> Result<Option<Char>, Error> {
let mut retval = None;
let page = self.page()?;
let unprocessed_chars = self.unprocessed_chars()?;
@ -2342,6 +2368,9 @@ impl<'ctx> Parser<'ctx> {
"\u{fb04}" => "ffl",
v => v,
};
if char_text.chars().skip(1).next().is_some() {
dbg!(&ch);
}
text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect()));
last_max_x = ch.max_x.get();
}
@ -2888,7 +2917,7 @@ impl<'ctx> Parser<'ctx> {
));
};
let table_header_fields_text = table_header_fields.element.inner_text();
if table_header_reg_text != "Field(s)" {
if table_header_fields_text != "Field(s)" {
return Err(ExtractInsnsError::Other(
format!(
"can't find special registers altered table's fields-column's header:\n\
@ -3186,7 +3215,7 @@ struct MyDevice<'a> {
Rc<RefCell<BTreeMap<TextSection, Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>>>>,
unprocessed_non_text: Rc<RefCell<IndexSet<LineOrRect>>>,
first_seen_fonts: RefCell<&'a mut BTreeMap<String, BTreeSet<NonNaNF32>>>,
error: RefCell<Result<(), Box<dyn Error>>>,
error: RefCell<Result<(), Error>>,
}
impl<'a> MyDevice<'a> {
@ -3361,6 +3390,57 @@ impl<'a> MyDevice<'a> {
}
_ => font_name_with_tag,
};
let mut flush_char = |char: Char| -> Result<(), ()> {
let Some(text_section) = TextSection::for_position(
self.page_num,
(char.min_x.get() + char.max_x.get()) * 0.5,
(char.min_y.get() + char.max_y.get()) * 0.5,
) else {
if PAGE_BODY_MIN_Y <= char.min_y.get() && char.min_y.get() <= PAGE_BODY_MAX_Y {
if self.page_num != 1072 {
// page 1072 has characters in the margins
let _ = self.error.replace(Err(format!(
"char not in text section: {:?}\npage_num={}",
char.text, self.page_num,
)
.into()));
return Err(());
}
}
return Ok(());
};
let set = match first_seen_fonts.get_mut(font_name_with_tag) {
Some(v) => v,
None => first_seen_fonts
.entry(String::from(font_name_with_tag))
.or_default(),
};
if set.insert(font_size) {
println!(
"first seen font: {font_name_with_tag:?} {font_size}: page {} {char:?}",
self.page_num,
);
}
self.qt
.borrow_mut()
.entry(text_section)
.or_default()
.insert(
char.min_x.get(),
char.min_y.get(),
PageItem::Char(char.clone()),
);
self.unprocessed_chars
.borrow_mut()
.entry(text_section)
.or_default()
.borrow_mut()
.entry(char.font.clone())
.or_default()
.insert(char);
Ok(())
};
let mut last_char = None;
for &fz_text_item {
x,
y,
@ -3380,7 +3460,7 @@ impl<'a> MyDevice<'a> {
let dir = mupdf_ffi::transform_vector(dir, trm);
let glyph_start;
let glyph_stop;
let glyph_ascender;
let mut glyph_ascender;
let glyph_descender;
match span.write_mode() {
WriteMode::Horizontal => {
@ -3397,6 +3477,9 @@ impl<'a> MyDevice<'a> {
x: 0.0,
y: span.font().descender(),
};
if glyph_ascender.y == glyph_descender.y {
glyph_ascender.y += 1.0;
}
}
WriteMode::Vertical => {
glyph_start = fz_point {
@ -3436,24 +3519,6 @@ impl<'a> MyDevice<'a> {
font_name: font_name_with_tag.into(),
size: font_size,
});
let Some(text_section) = TextSection::for_position(
self.page_num,
(min.x + max.x) * 0.5,
(min.y + max.y) * 0.5,
) else {
if PAGE_BODY_MIN_Y <= min.y && min.y <= PAGE_BODY_MAX_Y {
if self.page_num != 1072 {
// page 1072 has characters in the margins
let _ = self.error.replace(Err(format!(
"char not in text section: {text:?}\npage_num={}",
self.page_num,
)
.into()));
return;
}
}
continue;
};
let (Some(min_x), Some(min_y), Some(max_x), Some(max_y)) = (
NonNaNF32::new(min.x),
NonNaNF32::new(min.y),
@ -3465,39 +3530,51 @@ impl<'a> MyDevice<'a> {
.replace(Err("char position shouldn't be NaN".into()));
return;
};
let char = Char {
if gid < 0
&& last_char
.as_ref()
.is_some_and(|last_char: &Char| last_char.font == font)
{
if let Some(Char {
font,
text: last_text,
min_x: last_min_x,
min_y: last_min_y,
max_x: last_max_x,
max_y: last_max_y,
}) = last_char.take()
{
last_char = Some(Char {
font,
text: last_text + &text,
min_x: last_min_x.min(min_x),
min_y: last_min_y.min(min_y),
max_x: last_max_x.max(max_x),
max_y: last_max_y.max(max_y),
});
continue;
}
}
if let Some(last_char) = last_char.take() {
match flush_char(last_char) {
Ok(()) => {}
Err(()) => return,
}
}
last_char = Some(Char {
font,
text,
min_x,
min_y,
max_x,
max_y,
};
let set = match first_seen_fonts.get_mut(font_name_with_tag) {
Some(v) => v,
None => first_seen_fonts
.entry(String::from(font_name_with_tag))
.or_default(),
};
if set.insert(font_size) {
println!(
"first seen font: {font_name_with_tag:?} {font_size}: page {} {char:?}",
self.page_num,
);
});
}
if let Some(last_char) = last_char {
match flush_char(last_char) {
Ok(()) => {}
Err(()) => return,
}
self.qt
.borrow_mut()
.entry(text_section)
.or_default()
.insert(min_x.get(), min_y.get(), PageItem::Char(char.clone()));
self.unprocessed_chars
.borrow_mut()
.entry(text_section)
.or_default()
.borrow_mut()
.entry(char.font.clone())
.or_default()
.insert(char);
}
}
}
@ -3665,22 +3742,19 @@ impl Page {
page_num: u32,
page: &mupdf_ffi::Page<'_>,
first_seen_fonts: &mut BTreeMap<String, BTreeSet<NonNaNF32>>,
) -> Result<Self, Box<dyn std::error::Error>> {
dump_mupdf_page_xml: bool,
) -> Result<Self, Error> {
if dump_mupdf_page_xml {
println!("{}", page.to_xml()?);
}
let Some(pdf_page) = page.pdf_page() else {
return Err("page is not from a pdf".into());
};
let device = mupdf_ffi::Device::new(
page.ctx(),
Box::new(MyDevice::new(page_num, first_seen_fonts)),
)?;
page.run(
&device,
fz_matrix {
a: 1.0,
b: 0.0,
c: 0.0,
d: 1.0,
e: 0.0,
f: 0.0,
},
)?;
page.run(&device, pdf_page.transform()?)?;
let MyDevice {
page_num: _,
qt,
@ -3731,8 +3805,14 @@ impl Page {
}
}
fn main_inner() -> Result<(), Box<dyn std::error::Error>> {
let args: Vec<String> = std::env::args().collect();
fn main_inner() -> Result<(), Error> {
let mut args: Vec<String> = std::env::args().collect();
let dump_mupdf_page_xml = if args.get(1).is_some_and(|v| v == "--dump-mupdf-page-xml") {
args.remove(1);
true
} else {
false
};
let page_numbers: Option<Box<dyn Iterator<Item = NonZero<u32>>>> = if 2 < args.len() {
Some(if let Some((start, end)) = args[2].split_once(":") {
let start: NonZero<u32> = start.trim().parse()?;
@ -3755,7 +3835,7 @@ fn main_inner() -> Result<(), Box<dyn std::error::Error>> {
let mut parser = Parser::new();
let is_subset = page_numbers.is_some();
let file_name = &args[1];
parser.parse_pdf(ctx, file_name, page_numbers)?;
parser.parse_pdf(ctx, file_name, page_numbers, dump_mupdf_page_xml)?;
let mut insns = xml_tree::Element::new(
"instructions".into(),
[("is-subset".into(), is_subset.to_string())],

View file

@ -2,14 +2,16 @@
// See Notices.txt for copyright information
use mupdf_sys::{
fz_clone_context, fz_color_params, fz_colorspace, fz_concat, fz_context, fz_device,
fz_document, fz_drop_context, fz_drop_device, fz_drop_document, fz_drop_page, fz_drop_path,
fz_drop_text, fz_error_type_FZ_ERROR_GENERIC, fz_font, fz_font_ascender, fz_font_descender,
fz_font_is_bold, fz_font_is_italic, fz_font_name, fz_matrix, fz_matrix_expansion, fz_page,
fz_path, fz_path_walker, fz_point, fz_rect, fz_stroke_state, fz_text, fz_text_item,
fz_text_span, fz_transform_point, fz_transform_point_xy, fz_transform_vector, fz_walk_path,
mupdf_document_page_count, mupdf_drop_error, mupdf_error_t, mupdf_load_page,
mupdf_new_base_context, mupdf_new_derived_device, mupdf_open_document, mupdf_run_page,
fz_buffer, fz_buffer_storage, fz_clone_context, fz_color_params, fz_colorspace, fz_concat,
fz_context, fz_device, fz_document, fz_drop_buffer, fz_drop_context, fz_drop_device,
fz_drop_document, fz_drop_page, fz_drop_path, fz_drop_text, fz_error_type_FZ_ERROR_GENERIC,
fz_font, fz_font_ascender, fz_font_descender, fz_font_is_bold, fz_font_is_italic, fz_font_name,
fz_matrix, fz_matrix_expansion, fz_page, fz_path, fz_path_walker, fz_point, fz_rect,
fz_stroke_state, fz_text, fz_text_item, fz_text_span, fz_transform_point,
fz_transform_point_xy, fz_transform_vector, fz_walk_path, mupdf_document_page_count,
mupdf_drop_error, mupdf_error_t, mupdf_load_page, mupdf_new_base_context,
mupdf_new_derived_device, mupdf_open_document, mupdf_page_to_xml, mupdf_pdf_page_transform,
mupdf_run_page, pdf_page, pdf_page_from_fz_page,
};
use std::{
cell::{Cell, UnsafeCell},
@ -172,6 +174,33 @@ impl<'ctx> Drop for Document<'ctx> {
}
}
/// Wrapper around a raw `fz_buffer` pointer that calls `fz_drop_buffer` on it when dropped.
struct Buffer<'ctx> {
    ptr: *mut fz_buffer,
    ctx: ContextRef<'ctx>,
}

impl<'ctx> Buffer<'ctx> {
    /// Returns the bytes reported by `fz_buffer_storage`, or an empty slice when the
    /// reported length is zero.
    fn storage(&mut self) -> &mut [u8] {
        let mut data = ptr::null_mut();
        // SAFETY: NOTE(review): assumes `self.ptr` is a live buffer belonging to
        // `self.ctx`; nothing in this type enforces that, so confirm at construction sites.
        let size = unsafe { fz_buffer_storage(self.ctx.0.get(), self.ptr, &raw mut data) };
        if size == 0 {
            return &mut [];
        }
        // SAFETY: NOTE(review): assumes `data` points at `size` readable and writable bytes
        // that stay valid for as long as `self` is mutably borrowed — confirm against the
        // MuPDF documentation for `fz_buffer_storage`.
        unsafe { std::slice::from_raw_parts_mut(data, size) }
    }
}

impl<'ctx> Drop for Buffer<'ctx> {
    /// Hands the buffer pointer back to MuPDF through `fz_drop_buffer`.
    fn drop(&mut self) {
        let Self { ptr, ctx } = self;
        // SAFETY: NOTE(review): assumes this wrapper holds a reference that it is
        // responsible for dropping exactly once — confirm at construction sites.
        unsafe {
            fz_drop_buffer(ctx.0.get(), *ptr);
        }
    }
}
pub(crate) struct Page<'ctx> {
ptr: *mut fz_page,
ctx: ContextRef<'ctx>,
@ -199,6 +228,25 @@ impl<'ctx> Page<'ctx> {
})
}
}
/// Produces this page's XML via `mupdf_page_to_xml` and returns it as an owned `String`.
///
/// # Errors
///
/// Returns the error reported through `mupdf_try`, or an error built with
/// `MuPdfError::new_generic` when the returned bytes are not valid UTF-8.
pub(crate) fn to_xml(&self) -> Result<String, MuPdfError> {
    unsafe {
        let ptr = mupdf_try(|errptr| mupdf_page_to_xml(self.ctx.0.get(), self.ptr, errptr))?;
        // Wrap the pointer right away so the buffer is dropped on every path below.
        let mut buffer = Buffer { ptr, ctx: self.ctx };
        match str::from_utf8(buffer.storage()) {
            Ok(xml) => Ok(xml.into()),
            Err(e) => Err(MuPdfError::new_generic(e)),
        }
    }
}
/// Returns a `PdfPageRef` borrowing from this page, or `None` when
/// `pdf_page_from_fz_page` yields a null pointer.
pub(crate) fn pdf_page<'a>(&'a self) -> Option<PdfPageRef<'a, 'ctx>> {
    unsafe {
        let raw = NonNull::new(pdf_page_from_fz_page(self.ctx.0.get(), self.ptr))?;
        Some(PdfPageRef {
            // The reference is tied to `'a`, i.e. to the borrow of `self`.
            ptr: &*raw.as_ptr().cast(),
            ctx: self.ctx,
        })
    }
}
}
impl<'ctx> Drop for Page<'ctx> {
@ -209,6 +257,20 @@ impl<'ctx> Drop for Page<'ctx> {
}
}
/// Copyable, borrowed handle to a `pdf_page` together with the context it is used with.
#[derive(Clone, Copy)]
pub(crate) struct PdfPageRef<'a, 'ctx> {
    ptr: &'a UnsafeCell<pdf_page>,
    ctx: ContextRef<'ctx>,
}

impl<'a, 'ctx> PdfPageRef<'a, 'ctx> {
    /// Returns the matrix reported by `mupdf_pdf_page_transform` for this page.
    ///
    /// # Errors
    ///
    /// Returns the error reported through `mupdf_try`.
    pub(crate) fn transform(self) -> Result<fz_matrix, MuPdfError> {
        let Self { ptr, ctx } = self;
        unsafe { mupdf_try(|errptr| mupdf_pdf_page_transform(ctx.0.get(), ptr.get(), errptr)) }
    }
}
pub(crate) struct Device<'ctx, T: 'ctx> {
dev: *mut fz_device,
ctx: ContextRef<'ctx>,