wip
This commit is contained in:
parent
c58bc23904
commit
fcf1c63cb7
4 changed files with 68 additions and 29 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -225,6 +225,7 @@ dependencies = [
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"libm",
|
"libm",
|
||||||
"mupdf",
|
"mupdf",
|
||||||
|
"mupdf-sys",
|
||||||
"quick-xml",
|
"quick-xml",
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -14,5 +14,6 @@ rust-version = "1.89.0"
|
||||||
indexmap = "2.12.1"
|
indexmap = "2.12.1"
|
||||||
libm = "0.2.15"
|
libm = "0.2.15"
|
||||||
mupdf = { version = "0.5.0", default-features = false }
|
mupdf = { version = "0.5.0", default-features = false }
|
||||||
|
mupdf-sys = { version = "0.5.0", default-features = false }
|
||||||
quick-xml = { version = "0.38.4", features = ["serialize"] }
|
quick-xml = { version = "0.38.4", features = ["serialize"] }
|
||||||
serde = { version = "1.0.228", features = ["derive"] }
|
serde = { version = "1.0.228", features = ["derive"] }
|
||||||
|
|
|
||||||
|
|
@ -765,7 +765,7 @@ class Page:
|
||||||
unprocessed_non_text: SetById[LTLine | LTRect]
|
unprocessed_non_text: SetById[LTLine | LTRect]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_lt_page(page_num: int, page: LTPage) -> Page:
|
def from_lt_page(page_num: int, page: LTPage, first_seen_fonts: defaultdict[str, set[float]]) -> Page:
|
||||||
qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree)
|
qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree)
|
||||||
unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char]))
|
unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char]))
|
||||||
unprocessed_non_text: SetById[LTLine | LTRect] = SetById()
|
unprocessed_non_text: SetById[LTLine | LTRect] = SetById()
|
||||||
|
|
@ -804,20 +804,25 @@ class Page:
|
||||||
raise AssertionError(
|
raise AssertionError(
|
||||||
f"char not in text section: {element}\npage_num={page_num}")
|
f"char not in text section: {element}\npage_num={page_num}")
|
||||||
continue
|
continue
|
||||||
|
font_size = round(element.size, 3)
|
||||||
char = Char(
|
char = Char(
|
||||||
text=element.get_text(),
|
text=element.get_text(),
|
||||||
font=Font(font_name=element.fontname, size=round(element.size, 3)),
|
font=Font(font_name=element.fontname, size=font_size),
|
||||||
adv=element.adv,
|
adv=element.adv,
|
||||||
min_x=element.x0,
|
min_x=element.x0,
|
||||||
min_y=element.y0,
|
min_y=element.y0,
|
||||||
max_x=element.x1,
|
max_x=element.x1,
|
||||||
max_y=element.y1,
|
max_y=element.y1,
|
||||||
)
|
)
|
||||||
|
if font_size not in first_seen_fonts[element.fontname]:
|
||||||
|
first_seen_fonts[element.fontname].add(font_size)
|
||||||
|
print(f"first seen font: {element.fontname!r} {font_size}: page {page_num} {char!r}")
|
||||||
qt[text_section].insert(char.min_x, char.min_y, char)
|
qt[text_section].insert(char.min_x, char.min_y, char)
|
||||||
unprocessed_chars[text_section][char.font].add(char)
|
unprocessed_chars[text_section][char.font].add(char)
|
||||||
for i in unprocessed_chars.values():
|
for text_section, i in unprocessed_chars.items():
|
||||||
for j in i.values():
|
for chars in i.values():
|
||||||
j.sort(key=Char.top_down_left_to_right_sort_key)
|
chars.sort(key=Char.top_down_left_to_right_sort_key)
|
||||||
|
print(f"first char: {text_section!r}: {next(iter(chars), None)!r}")
|
||||||
unknown_fonts=[]
|
unknown_fonts=[]
|
||||||
unknown_font_errors=[]
|
unknown_font_errors=[]
|
||||||
for i in unprocessed_chars.values():
|
for i in unprocessed_chars.values():
|
||||||
|
|
@ -1181,13 +1186,14 @@ class Parser:
|
||||||
def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
|
def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
|
||||||
if page_numbers is not None:
|
if page_numbers is not None:
|
||||||
page_numbers = sorted(i - 1 for i in page_numbers)
|
page_numbers = sorted(i - 1 for i in page_numbers)
|
||||||
|
first_seen_fonts = defaultdict(set)
|
||||||
for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
|
for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
|
||||||
if page_numbers is not None:
|
if page_numbers is not None:
|
||||||
page_num = page_numbers[i] + 1
|
page_num = page_numbers[i] + 1
|
||||||
else:
|
else:
|
||||||
page_num = i + 1
|
page_num = i + 1
|
||||||
print(f"page {page_num}")
|
print(f"page {page_num}")
|
||||||
yield Page.from_lt_page(page_num=page_num, page=page)
|
yield Page.from_lt_page(page_num=page_num, page=page, first_seen_fonts=first_seen_fonts)
|
||||||
|
|
||||||
def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None):
|
def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None):
|
||||||
self.pages = Pages(pages_gen=Parser.__pages_gen(
|
self.pages = Pages(pages_gen=Parser.__pages_gen(
|
||||||
|
|
|
||||||
77
src/main.rs
77
src/main.rs
|
|
@ -2,7 +2,8 @@
|
||||||
// See Notices.txt for copyright information
|
// See Notices.txt for copyright information
|
||||||
|
|
||||||
use crate::quad_tree::QuadTree;
|
use crate::quad_tree::QuadTree;
|
||||||
use indexmap::IndexSet;
|
use indexmap::{IndexMap, IndexSet};
|
||||||
|
use mupdf_sys::FZ_STEXT_BOLD;
|
||||||
use non_nan_float::NonNaNF32;
|
use non_nan_float::NonNaNF32;
|
||||||
use std::{
|
use std::{
|
||||||
backtrace::Backtrace,
|
backtrace::Backtrace,
|
||||||
|
|
@ -135,7 +136,9 @@ macro_rules! make_enum_font {
|
||||||
}
|
}
|
||||||
const fn new_known(font_name: &str, size: NonNaNF32) -> Option<Self> {
|
const fn new_known(font_name: &str, size: NonNaNF32) -> Option<Self> {
|
||||||
match size.get() {
|
match size.get() {
|
||||||
$($($known_font_size if str_eq(font_name, const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) }) => Some(Self::$KnownFont),)*)*
|
$($($known_font_size if str_eq(font_name, const {
|
||||||
|
Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag)
|
||||||
|
}) => Some(Self::$KnownFont),)*)*
|
||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -266,12 +269,16 @@ make_enum_font! {
|
||||||
InsnDescMisc15,
|
InsnDescMisc15,
|
||||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213]
|
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213]
|
||||||
InsnDescMisc16,
|
InsnDescMisc16,
|
||||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252]
|
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.238]
|
||||||
InsnDescMisc17,
|
InsnDescMisc17,
|
||||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962]
|
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252]
|
||||||
InsnDescMisc18,
|
InsnDescMisc18,
|
||||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977]
|
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962]
|
||||||
InsnDescMisc19,
|
InsnDescMisc19,
|
||||||
|
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977]
|
||||||
|
InsnDescMisc20,
|
||||||
|
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.506]
|
||||||
|
InsnDescMisc21,
|
||||||
},
|
},
|
||||||
#[group]
|
#[group]
|
||||||
InsnDescCode {
|
InsnDescCode {
|
||||||
|
|
@ -2064,24 +2071,23 @@ impl Parser {
|
||||||
file: &str,
|
file: &str,
|
||||||
page_numbers: Option<Vec<NonZero<u32>>>,
|
page_numbers: Option<Vec<NonZero<u32>>>,
|
||||||
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> {
|
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> {
|
||||||
let page_numbers = page_numbers.map(|page_numbers| {
|
let page_indexes = page_numbers.map(|page_numbers| {
|
||||||
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1));
|
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1));
|
||||||
retval.sort();
|
retval.sort();
|
||||||
retval
|
retval
|
||||||
});
|
});
|
||||||
let document = mupdf::Document::open(file)?;
|
let document = mupdf::Document::open(file)?;
|
||||||
let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?;
|
let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?;
|
||||||
Ok(Box::new(pages.into_iter().enumerate().map(
|
let page_indexes = page_indexes.unwrap_or_else(|| (0..pages.len()).collect());
|
||||||
move |(i, page)| {
|
let mut first_seen_fonts = BTreeMap::new();
|
||||||
let page_num = match &page_numbers {
|
Ok(Box::new(page_indexes.into_iter().map(move |page_index| {
|
||||||
Some(page_numbers) => page_numbers[i] + 1,
|
let page_num = page_index as u32 + 1;
|
||||||
None => i as u32 + 1,
|
println!("page {page_num}");
|
||||||
};
|
Ok(
|
||||||
println!("page {page_num}");
|
Page::from_mupdf_page(page_num, &pages[page_index], &mut first_seen_fonts)
|
||||||
Ok(Page::from_mupdf_page(page_num, page)
|
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?,
|
||||||
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?)
|
)
|
||||||
},
|
})))
|
||||||
)))
|
|
||||||
}
|
}
|
||||||
fn parse_pdf<I: Iterator<Item = NonZero<u32>>>(
|
fn parse_pdf<I: Iterator<Item = NonZero<u32>>>(
|
||||||
&mut self,
|
&mut self,
|
||||||
|
|
@ -3411,7 +3417,8 @@ struct MuPdfXmlChar<'a> {
|
||||||
impl Page {
|
impl Page {
|
||||||
fn from_mupdf_page(
|
fn from_mupdf_page(
|
||||||
page_num: u32,
|
page_num: u32,
|
||||||
page: mupdf::Page,
|
page: &mupdf::Page,
|
||||||
|
first_seen_fonts: &mut BTreeMap<String, BTreeSet<NonNaNF32>>,
|
||||||
) -> Result<Self, Box<dyn std::error::Error>> {
|
) -> Result<Self, Box<dyn std::error::Error>> {
|
||||||
let device = MyDevice::new(page_num);
|
let device = MyDevice::new(page_num);
|
||||||
page.run(
|
page.run(
|
||||||
|
|
@ -3439,8 +3446,21 @@ impl Page {
|
||||||
const ROUND_FACTOR: f32 = 1000.0;
|
const ROUND_FACTOR: f32 = 1000.0;
|
||||||
let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR;
|
let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR;
|
||||||
let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?;
|
let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?;
|
||||||
let font = Font::new(&xml_font.name, font_size);
|
|
||||||
for xml_char in xml_font.char {
|
for xml_char in xml_font.char {
|
||||||
|
if xml_char.c.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let font_name = match &*xml_font.name {
|
||||||
|
"DejaVuSansCondensed-Obli" => {
|
||||||
|
if (xml_char.flags & FZ_STEXT_BOLD) != 0 {
|
||||||
|
"DejaVuSansCondensed-BoldOblique"
|
||||||
|
} else {
|
||||||
|
"DejaVuSansCondensed-Oblique"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
font_name => font_name,
|
||||||
|
};
|
||||||
|
let font = Font::new(font_name, font_size);
|
||||||
let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad;
|
let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad;
|
||||||
let min_x = x0.min(x1).min(x2).min(x3);
|
let min_x = x0.min(x1).min(x2).min(x3);
|
||||||
let max_x = x0.max(x1).max(x2).max(x3);
|
let max_x = x0.max(x1).max(x2).max(x3);
|
||||||
|
|
@ -3469,6 +3489,16 @@ impl Page {
|
||||||
max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?,
|
max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?,
|
||||||
max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?,
|
max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?,
|
||||||
};
|
};
|
||||||
|
let set = match first_seen_fonts.get_mut(font_name) {
|
||||||
|
Some(v) => v,
|
||||||
|
None => first_seen_fonts.entry(String::from(font_name)).or_default(),
|
||||||
|
};
|
||||||
|
if set.insert(font_size) {
|
||||||
|
println!(
|
||||||
|
"first seen font: {font_name:?} {font_size}: page {page_num} {char:?} {:x}",
|
||||||
|
xml_char.flags,
|
||||||
|
);
|
||||||
|
}
|
||||||
qt.entry(text_section).or_default().insert(
|
qt.entry(text_section).or_default().insert(
|
||||||
min_x,
|
min_x,
|
||||||
min_y,
|
min_y,
|
||||||
|
|
@ -3486,9 +3516,10 @@ impl Page {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for i in unprocessed_chars.borrow_mut().values_mut() {
|
for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() {
|
||||||
for j in i.borrow_mut().values_mut() {
|
for chars in i.borrow_mut().values_mut() {
|
||||||
j.sort_by_key(Char::top_down_left_to_right_sort_key);
|
chars.sort_by_key(Char::top_down_left_to_right_sort_key);
|
||||||
|
println!("first char: {text_section:?}: {:?}", chars.first());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let mut unknown_fonts = Vec::new();
|
let mut unknown_fonts = Vec::new();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue