seems to work
This commit is contained in:
parent
040afcc435
commit
73c45323c8
2 changed files with 234 additions and 92 deletions
248
src/main.rs
248
src/main.rs
|
|
@ -3,7 +3,8 @@
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
mupdf_ffi::{
|
mupdf_ffi::{
|
||||||
WriteMode, add_points, point_max_components, point_min_components, transform_vector,
|
MuPdfError, WriteMode, add_points, point_max_components, point_min_components,
|
||||||
|
transform_vector,
|
||||||
},
|
},
|
||||||
quad_tree::QuadTree,
|
quad_tree::QuadTree,
|
||||||
};
|
};
|
||||||
|
|
@ -16,7 +17,6 @@ use std::{
|
||||||
cell::RefCell,
|
cell::RefCell,
|
||||||
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
||||||
convert::Infallible,
|
convert::Infallible,
|
||||||
error::Error,
|
|
||||||
fmt,
|
fmt,
|
||||||
num::NonZero,
|
num::NonZero,
|
||||||
ops::ControlFlow,
|
ops::ControlFlow,
|
||||||
|
|
@ -1610,7 +1610,7 @@ struct Page {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Pages<'ctx> {
|
struct Pages<'ctx> {
|
||||||
pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>> + 'ctx>>,
|
pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Error>> + 'ctx>>,
|
||||||
pages: BTreeMap<u32, Rc<Page>>,
|
pages: BTreeMap<u32, Rc<Page>>,
|
||||||
max_page_num: u32,
|
max_page_num: u32,
|
||||||
}
|
}
|
||||||
|
|
@ -1634,9 +1634,7 @@ impl<'ctx> fmt::Debug for Pages<'ctx> {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'ctx> Pages<'ctx> {
|
impl<'ctx> Pages<'ctx> {
|
||||||
fn new(
|
fn new(pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Error>> + 'ctx>>) -> Self {
|
||||||
pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>> + 'ctx>>,
|
|
||||||
) -> Self {
|
|
||||||
Self {
|
Self {
|
||||||
pages_gen,
|
pages_gen,
|
||||||
pages: BTreeMap::new(),
|
pages: BTreeMap::new(),
|
||||||
|
|
@ -1646,13 +1644,13 @@ impl<'ctx> Pages<'ctx> {
|
||||||
fn close(&mut self) {
|
fn close(&mut self) {
|
||||||
self.pages_gen = None;
|
self.pages_gen = None;
|
||||||
}
|
}
|
||||||
fn is_past_end(&mut self, page_num: u32) -> Result<bool, Box<dyn Error>> {
|
fn is_past_end(&mut self, page_num: u32) -> Result<bool, Error> {
|
||||||
while self.pages_gen.is_some() && page_num > self.max_page_num {
|
while self.pages_gen.is_some() && page_num > self.max_page_num {
|
||||||
self.fill_page()?;
|
self.fill_page()?;
|
||||||
}
|
}
|
||||||
Ok(page_num > self.max_page_num)
|
Ok(page_num > self.max_page_num)
|
||||||
}
|
}
|
||||||
fn fill_page(&mut self) -> Result<bool, Box<dyn Error>> {
|
fn fill_page(&mut self) -> Result<bool, Error> {
|
||||||
let Some(pages_gen) = &mut self.pages_gen else {
|
let Some(pages_gen) = &mut self.pages_gen else {
|
||||||
return Ok(false);
|
return Ok(false);
|
||||||
};
|
};
|
||||||
|
|
@ -1673,7 +1671,7 @@ impl<'ctx> Pages<'ctx> {
|
||||||
self.max_page_num = page_num;
|
self.max_page_num = page_num;
|
||||||
Ok(true)
|
Ok(true)
|
||||||
}
|
}
|
||||||
fn get(&mut self, page_num: u32) -> Result<Option<Rc<Page>>, Box<dyn Error>> {
|
fn get(&mut self, page_num: u32) -> Result<Option<Rc<Page>>, Error> {
|
||||||
loop {
|
loop {
|
||||||
if let Some(page) = self.pages.get(&page_num) {
|
if let Some(page) = self.pages.get(&page_num) {
|
||||||
return Ok(Some(page.clone()));
|
return Ok(Some(page.clone()));
|
||||||
|
|
@ -2017,10 +2015,37 @@ struct Parser<'ctx> {
|
||||||
insns: Vec<Insn>,
|
insns: Vec<Insn>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct Error(String, Backtrace);
|
||||||
|
|
||||||
|
impl fmt::Display for Error {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
f.write_str(&self.0)?;
|
||||||
|
f.write_str("\n")?;
|
||||||
|
fmt::Display::fmt(&self.1, f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
trait IntoError: fmt::Display {}
|
||||||
|
|
||||||
|
impl<T: IntoError> From<T> for Error {
|
||||||
|
fn from(value: T) -> Self {
|
||||||
|
Error(value.to_string(), Backtrace::capture())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IntoError for &'_ str {}
|
||||||
|
impl IntoError for String {}
|
||||||
|
impl IntoError for MuPdfError {}
|
||||||
|
impl IntoError for std::ffi::NulError {}
|
||||||
|
impl IntoError for std::num::ParseIntError {}
|
||||||
|
impl IntoError for std::io::Error {}
|
||||||
|
impl<T: fmt::Display> IntoError for ErrorWithNote<T> {}
|
||||||
|
|
||||||
enum ExtractInsnsError {
|
enum ExtractInsnsError {
|
||||||
InsnParseError(String, std::backtrace::Backtrace),
|
InsnParseError(String, Backtrace),
|
||||||
PageParseError(String, std::backtrace::Backtrace),
|
PageParseError(String, Backtrace),
|
||||||
Other(Box<dyn Error>),
|
Other(Error),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Display for ExtractInsnsError {
|
impl fmt::Display for ExtractInsnsError {
|
||||||
|
|
@ -2054,7 +2079,7 @@ impl<E: fmt::Display> fmt::Display for ErrorWithNote<E> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<E: fmt::Display + fmt::Debug> Error for ErrorWithNote<E> {}
|
impl<E: fmt::Display + fmt::Debug> std::error::Error for ErrorWithNote<E> {}
|
||||||
|
|
||||||
impl<'ctx> Parser<'ctx> {
|
impl<'ctx> Parser<'ctx> {
|
||||||
fn new() -> Self {
|
fn new() -> Self {
|
||||||
|
|
@ -2064,15 +2089,13 @@ impl<'ctx> Parser<'ctx> {
|
||||||
insns: Vec::new(),
|
insns: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn page(&mut self) -> Result<Rc<Page>, Box<dyn Error>> {
|
fn page(&mut self) -> Result<Rc<Page>, Error> {
|
||||||
Ok(self
|
Ok(self
|
||||||
.pages
|
.pages
|
||||||
.get(self.text_section.page_num)?
|
.get(self.text_section.page_num)?
|
||||||
.ok_or("page_num is out of range")?)
|
.ok_or("page_num is out of range")?)
|
||||||
}
|
}
|
||||||
fn unprocessed_chars(
|
fn unprocessed_chars(&mut self) -> Result<Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>, Error> {
|
||||||
&mut self,
|
|
||||||
) -> Result<Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>, Box<dyn Error>> {
|
|
||||||
Ok(self
|
Ok(self
|
||||||
.page()?
|
.page()?
|
||||||
.unprocessed_chars
|
.unprocessed_chars
|
||||||
|
|
@ -2085,7 +2108,8 @@ impl<'ctx> Parser<'ctx> {
|
||||||
ctx: impl Into<mupdf_ffi::ContextRef<'ctx>>,
|
ctx: impl Into<mupdf_ffi::ContextRef<'ctx>>,
|
||||||
file: &str,
|
file: &str,
|
||||||
page_numbers: Option<Vec<NonZero<u32>>>,
|
page_numbers: Option<Vec<NonZero<u32>>>,
|
||||||
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>> + 'ctx>, Box<dyn Error>> {
|
dump_mupdf_page_xml: bool,
|
||||||
|
) -> Result<Box<dyn Iterator<Item = Result<Page, Error>> + 'ctx>, Error> {
|
||||||
let ctx = ctx.into();
|
let ctx = ctx.into();
|
||||||
let page_indexes = page_numbers.map(|page_numbers| {
|
let page_indexes = page_numbers.map(|page_numbers| {
|
||||||
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1));
|
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1));
|
||||||
|
|
@ -2103,7 +2127,7 @@ impl<'ctx> Parser<'ctx> {
|
||||||
.load_page(page_index)
|
.load_page(page_index)
|
||||||
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?;
|
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?;
|
||||||
Ok(
|
Ok(
|
||||||
Page::from_mupdf_page(page_num, &page, &mut first_seen_fonts)
|
Page::from_mupdf_page(page_num, &page, &mut first_seen_fonts, dump_mupdf_page_xml)
|
||||||
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?,
|
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?,
|
||||||
)
|
)
|
||||||
})))
|
})))
|
||||||
|
|
@ -2113,11 +2137,13 @@ impl<'ctx> Parser<'ctx> {
|
||||||
ctx: impl Into<mupdf_ffi::ContextRef<'ctx>>,
|
ctx: impl Into<mupdf_ffi::ContextRef<'ctx>>,
|
||||||
file: &str,
|
file: &str,
|
||||||
page_numbers: Option<I>,
|
page_numbers: Option<I>,
|
||||||
) -> Result<(), Box<dyn Error>> {
|
dump_mupdf_page_xml: bool,
|
||||||
|
) -> Result<(), Error> {
|
||||||
self.pages = Pages::new(Some(Self::pages_gen(
|
self.pages = Pages::new(Some(Self::pages_gen(
|
||||||
ctx,
|
ctx,
|
||||||
file,
|
file,
|
||||||
page_numbers.map(|v| v.into_iter().collect()),
|
page_numbers.map(|v| v.into_iter().collect()),
|
||||||
|
dump_mupdf_page_xml,
|
||||||
)?));
|
)?));
|
||||||
self.text_section = TextSection::first();
|
self.text_section = TextSection::first();
|
||||||
loop {
|
loop {
|
||||||
|
|
@ -2151,7 +2177,7 @@ impl<'ctx> Parser<'ctx> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn parse_text_section(&mut self) -> Result<(), ErrorWithNote<Box<dyn Error>>> {
|
fn parse_text_section(&mut self) -> Result<(), ErrorWithNote<Error>> {
|
||||||
match self.note_text_section(Self::extract_insns) {
|
match self.note_text_section(Self::extract_insns) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(
|
Err(
|
||||||
|
|
@ -2177,7 +2203,7 @@ impl<'ctx> Parser<'ctx> {
|
||||||
min_y: f32,
|
min_y: f32,
|
||||||
max_y: f32,
|
max_y: f32,
|
||||||
allow_processed: bool,
|
allow_processed: bool,
|
||||||
) -> Result<Option<Char>, Box<dyn Error>> {
|
) -> Result<Option<Char>, Error> {
|
||||||
let mut retval = None;
|
let mut retval = None;
|
||||||
let page = self.page()?;
|
let page = self.page()?;
|
||||||
let unprocessed_chars = self.unprocessed_chars()?;
|
let unprocessed_chars = self.unprocessed_chars()?;
|
||||||
|
|
@ -2342,6 +2368,9 @@ impl<'ctx> Parser<'ctx> {
|
||||||
"\u{fb04}" => "ffl",
|
"\u{fb04}" => "ffl",
|
||||||
v => v,
|
v => v,
|
||||||
};
|
};
|
||||||
|
if char_text.chars().skip(1).next().is_some() {
|
||||||
|
dbg!(&ch);
|
||||||
|
}
|
||||||
text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect()));
|
text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect()));
|
||||||
last_max_x = ch.max_x.get();
|
last_max_x = ch.max_x.get();
|
||||||
}
|
}
|
||||||
|
|
@ -2888,7 +2917,7 @@ impl<'ctx> Parser<'ctx> {
|
||||||
));
|
));
|
||||||
};
|
};
|
||||||
let table_header_fields_text = table_header_fields.element.inner_text();
|
let table_header_fields_text = table_header_fields.element.inner_text();
|
||||||
if table_header_reg_text != "Field(s)" {
|
if table_header_fields_text != "Field(s)" {
|
||||||
return Err(ExtractInsnsError::Other(
|
return Err(ExtractInsnsError::Other(
|
||||||
format!(
|
format!(
|
||||||
"can't find special registers altered table's fields-column's header:\n\
|
"can't find special registers altered table's fields-column's header:\n\
|
||||||
|
|
@ -3186,7 +3215,7 @@ struct MyDevice<'a> {
|
||||||
Rc<RefCell<BTreeMap<TextSection, Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>>>>,
|
Rc<RefCell<BTreeMap<TextSection, Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>>>>,
|
||||||
unprocessed_non_text: Rc<RefCell<IndexSet<LineOrRect>>>,
|
unprocessed_non_text: Rc<RefCell<IndexSet<LineOrRect>>>,
|
||||||
first_seen_fonts: RefCell<&'a mut BTreeMap<String, BTreeSet<NonNaNF32>>>,
|
first_seen_fonts: RefCell<&'a mut BTreeMap<String, BTreeSet<NonNaNF32>>>,
|
||||||
error: RefCell<Result<(), Box<dyn Error>>>,
|
error: RefCell<Result<(), Error>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> MyDevice<'a> {
|
impl<'a> MyDevice<'a> {
|
||||||
|
|
@ -3361,6 +3390,57 @@ impl<'a> MyDevice<'a> {
|
||||||
}
|
}
|
||||||
_ => font_name_with_tag,
|
_ => font_name_with_tag,
|
||||||
};
|
};
|
||||||
|
let mut flush_char = |char: Char| -> Result<(), ()> {
|
||||||
|
let Some(text_section) = TextSection::for_position(
|
||||||
|
self.page_num,
|
||||||
|
(char.min_x.get() + char.max_x.get()) * 0.5,
|
||||||
|
(char.min_y.get() + char.max_y.get()) * 0.5,
|
||||||
|
) else {
|
||||||
|
if PAGE_BODY_MIN_Y <= char.min_y.get() && char.min_y.get() <= PAGE_BODY_MAX_Y {
|
||||||
|
if self.page_num != 1072 {
|
||||||
|
// page 1072 has characters in the margins
|
||||||
|
let _ = self.error.replace(Err(format!(
|
||||||
|
"char not in text section: {:?}\npage_num={}",
|
||||||
|
char.text, self.page_num,
|
||||||
|
)
|
||||||
|
.into()));
|
||||||
|
return Err(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Ok(());
|
||||||
|
};
|
||||||
|
let set = match first_seen_fonts.get_mut(font_name_with_tag) {
|
||||||
|
Some(v) => v,
|
||||||
|
None => first_seen_fonts
|
||||||
|
.entry(String::from(font_name_with_tag))
|
||||||
|
.or_default(),
|
||||||
|
};
|
||||||
|
if set.insert(font_size) {
|
||||||
|
println!(
|
||||||
|
"first seen font: {font_name_with_tag:?} {font_size}: page {} {char:?}",
|
||||||
|
self.page_num,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
self.qt
|
||||||
|
.borrow_mut()
|
||||||
|
.entry(text_section)
|
||||||
|
.or_default()
|
||||||
|
.insert(
|
||||||
|
char.min_x.get(),
|
||||||
|
char.min_y.get(),
|
||||||
|
PageItem::Char(char.clone()),
|
||||||
|
);
|
||||||
|
self.unprocessed_chars
|
||||||
|
.borrow_mut()
|
||||||
|
.entry(text_section)
|
||||||
|
.or_default()
|
||||||
|
.borrow_mut()
|
||||||
|
.entry(char.font.clone())
|
||||||
|
.or_default()
|
||||||
|
.insert(char);
|
||||||
|
Ok(())
|
||||||
|
};
|
||||||
|
let mut last_char = None;
|
||||||
for &fz_text_item {
|
for &fz_text_item {
|
||||||
x,
|
x,
|
||||||
y,
|
y,
|
||||||
|
|
@ -3380,7 +3460,7 @@ impl<'a> MyDevice<'a> {
|
||||||
let dir = mupdf_ffi::transform_vector(dir, trm);
|
let dir = mupdf_ffi::transform_vector(dir, trm);
|
||||||
let glyph_start;
|
let glyph_start;
|
||||||
let glyph_stop;
|
let glyph_stop;
|
||||||
let glyph_ascender;
|
let mut glyph_ascender;
|
||||||
let glyph_descender;
|
let glyph_descender;
|
||||||
match span.write_mode() {
|
match span.write_mode() {
|
||||||
WriteMode::Horizontal => {
|
WriteMode::Horizontal => {
|
||||||
|
|
@ -3397,6 +3477,9 @@ impl<'a> MyDevice<'a> {
|
||||||
x: 0.0,
|
x: 0.0,
|
||||||
y: span.font().descender(),
|
y: span.font().descender(),
|
||||||
};
|
};
|
||||||
|
if glyph_ascender.y == glyph_descender.y {
|
||||||
|
glyph_ascender.y += 1.0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
WriteMode::Vertical => {
|
WriteMode::Vertical => {
|
||||||
glyph_start = fz_point {
|
glyph_start = fz_point {
|
||||||
|
|
@ -3436,24 +3519,6 @@ impl<'a> MyDevice<'a> {
|
||||||
font_name: font_name_with_tag.into(),
|
font_name: font_name_with_tag.into(),
|
||||||
size: font_size,
|
size: font_size,
|
||||||
});
|
});
|
||||||
let Some(text_section) = TextSection::for_position(
|
|
||||||
self.page_num,
|
|
||||||
(min.x + max.x) * 0.5,
|
|
||||||
(min.y + max.y) * 0.5,
|
|
||||||
) else {
|
|
||||||
if PAGE_BODY_MIN_Y <= min.y && min.y <= PAGE_BODY_MAX_Y {
|
|
||||||
if self.page_num != 1072 {
|
|
||||||
// page 1072 has characters in the margins
|
|
||||||
let _ = self.error.replace(Err(format!(
|
|
||||||
"char not in text section: {text:?}\npage_num={}",
|
|
||||||
self.page_num,
|
|
||||||
)
|
|
||||||
.into()));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
let (Some(min_x), Some(min_y), Some(max_x), Some(max_y)) = (
|
let (Some(min_x), Some(min_y), Some(max_x), Some(max_y)) = (
|
||||||
NonNaNF32::new(min.x),
|
NonNaNF32::new(min.x),
|
||||||
NonNaNF32::new(min.y),
|
NonNaNF32::new(min.y),
|
||||||
|
|
@ -3465,39 +3530,51 @@ impl<'a> MyDevice<'a> {
|
||||||
.replace(Err("char position shouldn't be NaN".into()));
|
.replace(Err("char position shouldn't be NaN".into()));
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
let char = Char {
|
if gid < 0
|
||||||
|
&& last_char
|
||||||
|
.as_ref()
|
||||||
|
.is_some_and(|last_char: &Char| last_char.font == font)
|
||||||
|
{
|
||||||
|
if let Some(Char {
|
||||||
|
font,
|
||||||
|
text: last_text,
|
||||||
|
min_x: last_min_x,
|
||||||
|
min_y: last_min_y,
|
||||||
|
max_x: last_max_x,
|
||||||
|
max_y: last_max_y,
|
||||||
|
}) = last_char.take()
|
||||||
|
{
|
||||||
|
last_char = Some(Char {
|
||||||
|
font,
|
||||||
|
text: last_text + &text,
|
||||||
|
min_x: last_min_x.min(min_x),
|
||||||
|
min_y: last_min_y.min(min_y),
|
||||||
|
max_x: last_max_x.max(max_x),
|
||||||
|
max_y: last_max_y.max(max_y),
|
||||||
|
});
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(last_char) = last_char.take() {
|
||||||
|
match flush_char(last_char) {
|
||||||
|
Ok(()) => {}
|
||||||
|
Err(()) => return,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
last_char = Some(Char {
|
||||||
font,
|
font,
|
||||||
text,
|
text,
|
||||||
min_x,
|
min_x,
|
||||||
min_y,
|
min_y,
|
||||||
max_x,
|
max_x,
|
||||||
max_y,
|
max_y,
|
||||||
};
|
});
|
||||||
let set = match first_seen_fonts.get_mut(font_name_with_tag) {
|
}
|
||||||
Some(v) => v,
|
if let Some(last_char) = last_char {
|
||||||
None => first_seen_fonts
|
match flush_char(last_char) {
|
||||||
.entry(String::from(font_name_with_tag))
|
Ok(()) => {}
|
||||||
.or_default(),
|
Err(()) => return,
|
||||||
};
|
|
||||||
if set.insert(font_size) {
|
|
||||||
println!(
|
|
||||||
"first seen font: {font_name_with_tag:?} {font_size}: page {} {char:?}",
|
|
||||||
self.page_num,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
self.qt
|
|
||||||
.borrow_mut()
|
|
||||||
.entry(text_section)
|
|
||||||
.or_default()
|
|
||||||
.insert(min_x.get(), min_y.get(), PageItem::Char(char.clone()));
|
|
||||||
self.unprocessed_chars
|
|
||||||
.borrow_mut()
|
|
||||||
.entry(text_section)
|
|
||||||
.or_default()
|
|
||||||
.borrow_mut()
|
|
||||||
.entry(char.font.clone())
|
|
||||||
.or_default()
|
|
||||||
.insert(char);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -3665,22 +3742,19 @@ impl Page {
|
||||||
page_num: u32,
|
page_num: u32,
|
||||||
page: &mupdf_ffi::Page<'_>,
|
page: &mupdf_ffi::Page<'_>,
|
||||||
first_seen_fonts: &mut BTreeMap<String, BTreeSet<NonNaNF32>>,
|
first_seen_fonts: &mut BTreeMap<String, BTreeSet<NonNaNF32>>,
|
||||||
) -> Result<Self, Box<dyn std::error::Error>> {
|
dump_mupdf_page_xml: bool,
|
||||||
|
) -> Result<Self, Error> {
|
||||||
|
if dump_mupdf_page_xml {
|
||||||
|
println!("{}", page.to_xml()?);
|
||||||
|
}
|
||||||
|
let Some(pdf_page) = page.pdf_page() else {
|
||||||
|
return Err("page is not from a pdf".into());
|
||||||
|
};
|
||||||
let device = mupdf_ffi::Device::new(
|
let device = mupdf_ffi::Device::new(
|
||||||
page.ctx(),
|
page.ctx(),
|
||||||
Box::new(MyDevice::new(page_num, first_seen_fonts)),
|
Box::new(MyDevice::new(page_num, first_seen_fonts)),
|
||||||
)?;
|
)?;
|
||||||
page.run(
|
page.run(&device, pdf_page.transform()?)?;
|
||||||
&device,
|
|
||||||
fz_matrix {
|
|
||||||
a: 1.0,
|
|
||||||
b: 0.0,
|
|
||||||
c: 0.0,
|
|
||||||
d: 1.0,
|
|
||||||
e: 0.0,
|
|
||||||
f: 0.0,
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
let MyDevice {
|
let MyDevice {
|
||||||
page_num: _,
|
page_num: _,
|
||||||
qt,
|
qt,
|
||||||
|
|
@ -3731,8 +3805,14 @@ impl Page {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main_inner() -> Result<(), Box<dyn std::error::Error>> {
|
fn main_inner() -> Result<(), Error> {
|
||||||
let args: Vec<String> = std::env::args().collect();
|
let mut args: Vec<String> = std::env::args().collect();
|
||||||
|
let dump_mupdf_page_xml = if args.get(1).is_some_and(|v| v == "--dump-mupdf-page-xml") {
|
||||||
|
args.remove(1);
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
};
|
||||||
let page_numbers: Option<Box<dyn Iterator<Item = NonZero<u32>>>> = if 2 < args.len() {
|
let page_numbers: Option<Box<dyn Iterator<Item = NonZero<u32>>>> = if 2 < args.len() {
|
||||||
Some(if let Some((start, end)) = args[2].split_once(":") {
|
Some(if let Some((start, end)) = args[2].split_once(":") {
|
||||||
let start: NonZero<u32> = start.trim().parse()?;
|
let start: NonZero<u32> = start.trim().parse()?;
|
||||||
|
|
@ -3755,7 +3835,7 @@ fn main_inner() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
let mut parser = Parser::new();
|
let mut parser = Parser::new();
|
||||||
let is_subset = page_numbers.is_some();
|
let is_subset = page_numbers.is_some();
|
||||||
let file_name = &args[1];
|
let file_name = &args[1];
|
||||||
parser.parse_pdf(ctx, file_name, page_numbers)?;
|
parser.parse_pdf(ctx, file_name, page_numbers, dump_mupdf_page_xml)?;
|
||||||
let mut insns = xml_tree::Element::new(
|
let mut insns = xml_tree::Element::new(
|
||||||
"instructions".into(),
|
"instructions".into(),
|
||||||
[("is-subset".into(), is_subset.to_string())],
|
[("is-subset".into(), is_subset.to_string())],
|
||||||
|
|
|
||||||
|
|
@ -2,14 +2,16 @@
|
||||||
// See Notices.txt for copyright information
|
// See Notices.txt for copyright information
|
||||||
|
|
||||||
use mupdf_sys::{
|
use mupdf_sys::{
|
||||||
fz_clone_context, fz_color_params, fz_colorspace, fz_concat, fz_context, fz_device,
|
fz_buffer, fz_buffer_storage, fz_clone_context, fz_color_params, fz_colorspace, fz_concat,
|
||||||
fz_document, fz_drop_context, fz_drop_device, fz_drop_document, fz_drop_page, fz_drop_path,
|
fz_context, fz_device, fz_document, fz_drop_buffer, fz_drop_context, fz_drop_device,
|
||||||
fz_drop_text, fz_error_type_FZ_ERROR_GENERIC, fz_font, fz_font_ascender, fz_font_descender,
|
fz_drop_document, fz_drop_page, fz_drop_path, fz_drop_text, fz_error_type_FZ_ERROR_GENERIC,
|
||||||
fz_font_is_bold, fz_font_is_italic, fz_font_name, fz_matrix, fz_matrix_expansion, fz_page,
|
fz_font, fz_font_ascender, fz_font_descender, fz_font_is_bold, fz_font_is_italic, fz_font_name,
|
||||||
fz_path, fz_path_walker, fz_point, fz_rect, fz_stroke_state, fz_text, fz_text_item,
|
fz_matrix, fz_matrix_expansion, fz_page, fz_path, fz_path_walker, fz_point, fz_rect,
|
||||||
fz_text_span, fz_transform_point, fz_transform_point_xy, fz_transform_vector, fz_walk_path,
|
fz_stroke_state, fz_text, fz_text_item, fz_text_span, fz_transform_point,
|
||||||
mupdf_document_page_count, mupdf_drop_error, mupdf_error_t, mupdf_load_page,
|
fz_transform_point_xy, fz_transform_vector, fz_walk_path, mupdf_document_page_count,
|
||||||
mupdf_new_base_context, mupdf_new_derived_device, mupdf_open_document, mupdf_run_page,
|
mupdf_drop_error, mupdf_error_t, mupdf_load_page, mupdf_new_base_context,
|
||||||
|
mupdf_new_derived_device, mupdf_open_document, mupdf_page_to_xml, mupdf_pdf_page_transform,
|
||||||
|
mupdf_run_page, pdf_page, pdf_page_from_fz_page,
|
||||||
};
|
};
|
||||||
use std::{
|
use std::{
|
||||||
cell::{Cell, UnsafeCell},
|
cell::{Cell, UnsafeCell},
|
||||||
|
|
@ -172,6 +174,33 @@ impl<'ctx> Drop for Document<'ctx> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct Buffer<'ctx> {
|
||||||
|
ptr: *mut fz_buffer,
|
||||||
|
ctx: ContextRef<'ctx>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'ctx> Buffer<'ctx> {
|
||||||
|
fn storage(&mut self) -> &mut [u8] {
|
||||||
|
unsafe {
|
||||||
|
let mut ptr = ptr::null_mut();
|
||||||
|
let len = fz_buffer_storage(self.ctx.0.get(), self.ptr, &raw mut ptr);
|
||||||
|
if len == 0 {
|
||||||
|
&mut []
|
||||||
|
} else {
|
||||||
|
std::slice::from_raw_parts_mut(ptr, len)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'ctx> Drop for Buffer<'ctx> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
unsafe {
|
||||||
|
fz_drop_buffer(self.ctx.0.get(), self.ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) struct Page<'ctx> {
|
pub(crate) struct Page<'ctx> {
|
||||||
ptr: *mut fz_page,
|
ptr: *mut fz_page,
|
||||||
ctx: ContextRef<'ctx>,
|
ctx: ContextRef<'ctx>,
|
||||||
|
|
@ -199,6 +228,25 @@ impl<'ctx> Page<'ctx> {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
pub(crate) fn to_xml(&self) -> Result<String, MuPdfError> {
|
||||||
|
unsafe {
|
||||||
|
let mut buffer =
|
||||||
|
mupdf_try(|errptr| mupdf_page_to_xml(self.ctx.0.get(), self.ptr, errptr))
|
||||||
|
.map(|ptr| Buffer { ptr, ctx: self.ctx })?;
|
||||||
|
Ok(str::from_utf8(buffer.storage())
|
||||||
|
.map_err(MuPdfError::new_generic)?
|
||||||
|
.into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub(crate) fn pdf_page<'a>(&'a self) -> Option<PdfPageRef<'a, 'ctx>> {
|
||||||
|
unsafe {
|
||||||
|
let ptr = pdf_page_from_fz_page(self.ctx.0.get(), self.ptr);
|
||||||
|
NonNull::new(ptr).map(|ptr| PdfPageRef {
|
||||||
|
ptr: &*ptr.as_ptr().cast(),
|
||||||
|
ctx: self.ctx,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'ctx> Drop for Page<'ctx> {
|
impl<'ctx> Drop for Page<'ctx> {
|
||||||
|
|
@ -209,6 +257,20 @@ impl<'ctx> Drop for Page<'ctx> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub(crate) struct PdfPageRef<'a, 'ctx> {
|
||||||
|
ptr: &'a UnsafeCell<pdf_page>,
|
||||||
|
ctx: ContextRef<'ctx>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, 'ctx> PdfPageRef<'a, 'ctx> {
|
||||||
|
pub(crate) fn transform(self) -> Result<fz_matrix, MuPdfError> {
|
||||||
|
unsafe {
|
||||||
|
mupdf_try(|errptr| mupdf_pdf_page_transform(self.ctx.0.get(), self.ptr.get(), errptr))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) struct Device<'ctx, T: 'ctx> {
|
pub(crate) struct Device<'ctx, T: 'ctx> {
|
||||||
dev: *mut fz_device,
|
dev: *mut fz_device,
|
||||||
ctx: ContextRef<'ctx>,
|
ctx: ContextRef<'ctx>,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue