This commit is contained in:
Jacob Lifshay 2026-01-05 11:27:52 -08:00
parent c58bc23904
commit fcf1c63cb7
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ
4 changed files with 68 additions and 29 deletions

View file

@ -2,7 +2,8 @@
// See Notices.txt for copyright information
use crate::quad_tree::QuadTree;
use indexmap::IndexSet;
use indexmap::{IndexMap, IndexSet};
use mupdf_sys::FZ_STEXT_BOLD;
use non_nan_float::NonNaNF32;
use std::{
backtrace::Backtrace,
@ -135,7 +136,9 @@ macro_rules! make_enum_font {
}
/// Looks up the known-font variant that corresponds to `font_name` at `size`.
///
/// The macro expands one match arm per known `(size, name_with_tag)` pair. Each arm uses
/// `$known_font_size` as the pattern matched against `size.get()`, and a guard that compares
/// `font_name` (via `str_eq`) with the name produced by
/// `Self::extract_font_name_from_font_name_with_tag` for that pair's tagged name.
///
/// Returns `Some(Self::$KnownFont)` for the first arm whose size pattern and name guard both
/// match, and `None` when no arm matches.
const fn new_known(font_name: &str, size: NonNaNF32) -> Option<Self> {
match size.get() {
// NOTE(review): the arm appears twice below -- once on a single line and once wrapped
// across three lines. The two forms are token-for-token the same arm; in this diff view
// they look like the before/after versions of one re-wrapped line -- confirm only one
// of them is present in the real file.
$($($known_font_size if str_eq(font_name, const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) }) => Some(Self::$KnownFont),)*)*
// The inline `const { .. }` block makes the tag-stripping call a compile-time constant
// expression rather than a call evaluated each time the guard runs.
$($($known_font_size if str_eq(font_name, const {
Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag)
}) => Some(Self::$KnownFont),)*)*
// No known (size, name) pair matched.
_ => None,
}
}
@ -266,12 +269,16 @@ make_enum_font! {
InsnDescMisc15,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213]
InsnDescMisc16,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252]
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.238]
InsnDescMisc17,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962]
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252]
InsnDescMisc18,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977]
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962]
InsnDescMisc19,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977]
InsnDescMisc20,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.506]
InsnDescMisc21,
},
#[group]
InsnDescCode {
@ -2064,24 +2071,23 @@ impl Parser {
file: &str,
page_numbers: Option<Vec<NonZero<u32>>>,
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> {
let page_numbers = page_numbers.map(|page_numbers| {
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1));
let page_indexes = page_numbers.map(|page_numbers| {
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1));
retval.sort();
retval
});
let document = mupdf::Document::open(file)?;
let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?;
Ok(Box::new(pages.into_iter().enumerate().map(
move |(i, page)| {
let page_num = match &page_numbers {
Some(page_numbers) => page_numbers[i] + 1,
None => i as u32 + 1,
};
println!("page {page_num}");
Ok(Page::from_mupdf_page(page_num, page)
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?)
},
)))
let page_indexes = page_indexes.unwrap_or_else(|| (0..pages.len()).collect());
let mut first_seen_fonts = BTreeMap::new();
Ok(Box::new(page_indexes.into_iter().map(move |page_index| {
let page_num = page_index as u32 + 1;
println!("page {page_num}");
Ok(
Page::from_mupdf_page(page_num, &pages[page_index], &mut first_seen_fonts)
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?,
)
})))
}
fn parse_pdf<I: Iterator<Item = NonZero<u32>>>(
&mut self,
@ -3411,7 +3417,8 @@ struct MuPdfXmlChar<'a> {
impl Page {
fn from_mupdf_page(
page_num: u32,
page: mupdf::Page,
page: &mupdf::Page,
first_seen_fonts: &mut BTreeMap<String, BTreeSet<NonNaNF32>>,
) -> Result<Self, Box<dyn std::error::Error>> {
let device = MyDevice::new(page_num);
page.run(
@ -3439,8 +3446,21 @@ impl Page {
const ROUND_FACTOR: f32 = 1000.0;
let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR;
let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?;
let font = Font::new(&xml_font.name, font_size);
for xml_char in xml_font.char {
if xml_char.c.trim().is_empty() {
continue;
}
let font_name = match &*xml_font.name {
"DejaVuSansCondensed-Obli" => {
if (xml_char.flags & FZ_STEXT_BOLD) != 0 {
"DejaVuSansCondensed-BoldOblique"
} else {
"DejaVuSansCondensed-Oblique"
}
}
font_name => font_name,
};
let font = Font::new(font_name, font_size);
let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad;
let min_x = x0.min(x1).min(x2).min(x3);
let max_x = x0.max(x1).max(x2).max(x3);
@ -3469,6 +3489,16 @@ impl Page {
max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?,
max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?,
};
let set = match first_seen_fonts.get_mut(font_name) {
Some(v) => v,
None => first_seen_fonts.entry(String::from(font_name)).or_default(),
};
if set.insert(font_size) {
println!(
"first seen font: {font_name:?} {font_size}: page {page_num} {char:?} {:x}",
xml_char.flags,
);
}
qt.entry(text_section).or_default().insert(
min_x,
min_y,
@ -3486,9 +3516,10 @@ impl Page {
}
}
}
for i in unprocessed_chars.borrow_mut().values_mut() {
for j in i.borrow_mut().values_mut() {
j.sort_by_key(Char::top_down_left_to_right_sort_key);
for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() {
for chars in i.borrow_mut().values_mut() {
chars.sort_by_key(Char::top_down_left_to_right_sort_key);
println!("first char: {text_section:?}: {:?}", chars.first());
}
}
let mut unknown_fonts = Vec::new();