This commit is contained in:
Jacob Lifshay 2026-01-05 11:27:52 -08:00
parent c58bc23904
commit fcf1c63cb7
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ
4 changed files with 68 additions and 29 deletions

1
Cargo.lock generated
View file

@ -225,6 +225,7 @@ dependencies = [
"indexmap", "indexmap",
"libm", "libm",
"mupdf", "mupdf",
"mupdf-sys",
"quick-xml", "quick-xml",
"serde", "serde",
] ]

View file

@ -14,5 +14,6 @@ rust-version = "1.89.0"
indexmap = "2.12.1" indexmap = "2.12.1"
libm = "0.2.15" libm = "0.2.15"
mupdf = { version = "0.5.0", default-features = false } mupdf = { version = "0.5.0", default-features = false }
mupdf-sys = { version = "0.5.0", default-features = false }
quick-xml = { version = "0.38.4", features = ["serialize"] } quick-xml = { version = "0.38.4", features = ["serialize"] }
serde = { version = "1.0.228", features = ["derive"] } serde = { version = "1.0.228", features = ["derive"] }

View file

@ -765,7 +765,7 @@ class Page:
unprocessed_non_text: SetById[LTLine | LTRect] unprocessed_non_text: SetById[LTLine | LTRect]
@staticmethod @staticmethod
def from_lt_page(page_num: int, page: LTPage) -> Page: def from_lt_page(page_num: int, page: LTPage, first_seen_fonts: defaultdict[str, set[float]]) -> Page:
qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree) qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree)
unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char])) unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char]))
unprocessed_non_text: SetById[LTLine | LTRect] = SetById() unprocessed_non_text: SetById[LTLine | LTRect] = SetById()
@ -804,20 +804,25 @@ class Page:
raise AssertionError( raise AssertionError(
f"char not in text section: {element}\npage_num={page_num}") f"char not in text section: {element}\npage_num={page_num}")
continue continue
font_size = round(element.size, 3)
char = Char( char = Char(
text=element.get_text(), text=element.get_text(),
font=Font(font_name=element.fontname, size=round(element.size, 3)), font=Font(font_name=element.fontname, size=font_size),
adv=element.adv, adv=element.adv,
min_x=element.x0, min_x=element.x0,
min_y=element.y0, min_y=element.y0,
max_x=element.x1, max_x=element.x1,
max_y=element.y1, max_y=element.y1,
) )
if font_size not in first_seen_fonts[element.fontname]:
first_seen_fonts[element.fontname].add(font_size)
print(f"first seen font: {element.fontname!r} {font_size}: page {page_num} {char!r}")
qt[text_section].insert(char.min_x, char.min_y, char) qt[text_section].insert(char.min_x, char.min_y, char)
unprocessed_chars[text_section][char.font].add(char) unprocessed_chars[text_section][char.font].add(char)
for i in unprocessed_chars.values(): for text_section, i in unprocessed_chars.items():
for j in i.values(): for chars in i.values():
j.sort(key=Char.top_down_left_to_right_sort_key) chars.sort(key=Char.top_down_left_to_right_sort_key)
print(f"first char: {text_section!r}: {next(iter(chars), None)!r}")
unknown_fonts=[] unknown_fonts=[]
unknown_font_errors=[] unknown_font_errors=[]
for i in unprocessed_chars.values(): for i in unprocessed_chars.values():
@ -1181,13 +1186,14 @@ class Parser:
def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]: def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
if page_numbers is not None: if page_numbers is not None:
page_numbers = sorted(i - 1 for i in page_numbers) page_numbers = sorted(i - 1 for i in page_numbers)
first_seen_fonts = defaultdict(set)
for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)): for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
if page_numbers is not None: if page_numbers is not None:
page_num = page_numbers[i] + 1 page_num = page_numbers[i] + 1
else: else:
page_num = i + 1 page_num = i + 1
print(f"page {page_num}") print(f"page {page_num}")
yield Page.from_lt_page(page_num=page_num, page=page) yield Page.from_lt_page(page_num=page_num, page=page, first_seen_fonts=first_seen_fonts)
def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None): def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None):
self.pages = Pages(pages_gen=Parser.__pages_gen( self.pages = Pages(pages_gen=Parser.__pages_gen(

View file

@ -2,7 +2,8 @@
// See Notices.txt for copyright information // See Notices.txt for copyright information
use crate::quad_tree::QuadTree; use crate::quad_tree::QuadTree;
use indexmap::IndexSet; use indexmap::{IndexMap, IndexSet};
use mupdf_sys::FZ_STEXT_BOLD;
use non_nan_float::NonNaNF32; use non_nan_float::NonNaNF32;
use std::{ use std::{
backtrace::Backtrace, backtrace::Backtrace,
@ -135,7 +136,9 @@ macro_rules! make_enum_font {
} }
const fn new_known(font_name: &str, size: NonNaNF32) -> Option<Self> { const fn new_known(font_name: &str, size: NonNaNF32) -> Option<Self> {
match size.get() { match size.get() {
$($($known_font_size if str_eq(font_name, const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) }) => Some(Self::$KnownFont),)*)* $($($known_font_size if str_eq(font_name, const {
Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag)
}) => Some(Self::$KnownFont),)*)*
_ => None, _ => None,
} }
} }
@ -266,12 +269,16 @@ make_enum_font! {
InsnDescMisc15, InsnDescMisc15,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213] #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213]
InsnDescMisc16, InsnDescMisc16,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252] #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.238]
InsnDescMisc17, InsnDescMisc17,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962] #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252]
InsnDescMisc18, InsnDescMisc18,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977] #[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962]
InsnDescMisc19, InsnDescMisc19,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977]
InsnDescMisc20,
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.506]
InsnDescMisc21,
}, },
#[group] #[group]
InsnDescCode { InsnDescCode {
@ -2064,24 +2071,23 @@ impl Parser {
file: &str, file: &str,
page_numbers: Option<Vec<NonZero<u32>>>, page_numbers: Option<Vec<NonZero<u32>>>,
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> { ) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> {
let page_numbers = page_numbers.map(|page_numbers| { let page_indexes = page_numbers.map(|page_numbers| {
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1)); let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1));
retval.sort(); retval.sort();
retval retval
}); });
let document = mupdf::Document::open(file)?; let document = mupdf::Document::open(file)?;
let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?; let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?;
Ok(Box::new(pages.into_iter().enumerate().map( let page_indexes = page_indexes.unwrap_or_else(|| (0..pages.len()).collect());
move |(i, page)| { let mut first_seen_fonts = BTreeMap::new();
let page_num = match &page_numbers { Ok(Box::new(page_indexes.into_iter().map(move |page_index| {
Some(page_numbers) => page_numbers[i] + 1, let page_num = page_index as u32 + 1;
None => i as u32 + 1, println!("page {page_num}");
}; Ok(
println!("page {page_num}"); Page::from_mupdf_page(page_num, &pages[page_index], &mut first_seen_fonts)
Ok(Page::from_mupdf_page(page_num, page) .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?,
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?) )
}, })))
)))
} }
fn parse_pdf<I: Iterator<Item = NonZero<u32>>>( fn parse_pdf<I: Iterator<Item = NonZero<u32>>>(
&mut self, &mut self,
@ -3411,7 +3417,8 @@ struct MuPdfXmlChar<'a> {
impl Page { impl Page {
fn from_mupdf_page( fn from_mupdf_page(
page_num: u32, page_num: u32,
page: mupdf::Page, page: &mupdf::Page,
first_seen_fonts: &mut BTreeMap<String, BTreeSet<NonNaNF32>>,
) -> Result<Self, Box<dyn std::error::Error>> { ) -> Result<Self, Box<dyn std::error::Error>> {
let device = MyDevice::new(page_num); let device = MyDevice::new(page_num);
page.run( page.run(
@ -3439,8 +3446,21 @@ impl Page {
const ROUND_FACTOR: f32 = 1000.0; const ROUND_FACTOR: f32 = 1000.0;
let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR; let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR;
let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?; let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?;
let font = Font::new(&xml_font.name, font_size);
for xml_char in xml_font.char { for xml_char in xml_font.char {
if xml_char.c.trim().is_empty() {
continue;
}
let font_name = match &*xml_font.name {
"DejaVuSansCondensed-Obli" => {
if (xml_char.flags & FZ_STEXT_BOLD) != 0 {
"DejaVuSansCondensed-BoldOblique"
} else {
"DejaVuSansCondensed-Oblique"
}
}
font_name => font_name,
};
let font = Font::new(font_name, font_size);
let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad; let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad;
let min_x = x0.min(x1).min(x2).min(x3); let min_x = x0.min(x1).min(x2).min(x3);
let max_x = x0.max(x1).max(x2).max(x3); let max_x = x0.max(x1).max(x2).max(x3);
@ -3469,6 +3489,16 @@ impl Page {
max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?, max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?,
max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?, max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?,
}; };
let set = match first_seen_fonts.get_mut(font_name) {
Some(v) => v,
None => first_seen_fonts.entry(String::from(font_name)).or_default(),
};
if set.insert(font_size) {
println!(
"first seen font: {font_name:?} {font_size}: page {page_num} {char:?} {:x}",
xml_char.flags,
);
}
qt.entry(text_section).or_default().insert( qt.entry(text_section).or_default().insert(
min_x, min_x,
min_y, min_y,
@ -3486,9 +3516,10 @@ impl Page {
} }
} }
} }
for i in unprocessed_chars.borrow_mut().values_mut() { for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() {
for j in i.borrow_mut().values_mut() { for chars in i.borrow_mut().values_mut() {
j.sort_by_key(Char::top_down_left_to_right_sort_key); chars.sort_by_key(Char::top_down_left_to_right_sort_key);
println!("first char: {text_section:?}: {:?}", chars.first());
} }
} }
let mut unknown_fonts = Vec::new(); let mut unknown_fonts = Vec::new();