wip
This commit is contained in:
parent
c58bc23904
commit
fcf1c63cb7
4 changed files with 68 additions and 29 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -225,6 +225,7 @@ dependencies = [
|
|||
"indexmap",
|
||||
"libm",
|
||||
"mupdf",
|
||||
"mupdf-sys",
|
||||
"quick-xml",
|
||||
"serde",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -14,5 +14,6 @@ rust-version = "1.89.0"
|
|||
indexmap = "2.12.1"
|
||||
libm = "0.2.15"
|
||||
mupdf = { version = "0.5.0", default-features = false }
|
||||
mupdf-sys = { version = "0.5.0", default-features = false }
|
||||
quick-xml = { version = "0.38.4", features = ["serialize"] }
|
||||
serde = { version = "1.0.228", features = ["derive"] }
|
||||
|
|
|
|||
|
|
@ -765,7 +765,7 @@ class Page:
|
|||
unprocessed_non_text: SetById[LTLine | LTRect]
|
||||
|
||||
@staticmethod
|
||||
def from_lt_page(page_num: int, page: LTPage) -> Page:
|
||||
def from_lt_page(page_num: int, page: LTPage, first_seen_fonts: defaultdict[str, set[float]]) -> Page:
|
||||
qt: defaultdict[TextSection, QuadTree[Char | LTLine | LTRect]] = defaultdict(QuadTree)
|
||||
unprocessed_chars = defaultdict(lambda: defaultdict(SetById[Char]))
|
||||
unprocessed_non_text: SetById[LTLine | LTRect] = SetById()
|
||||
|
|
@ -804,20 +804,25 @@ class Page:
|
|||
raise AssertionError(
|
||||
f"char not in text section: {element}\npage_num={page_num}")
|
||||
continue
|
||||
font_size = round(element.size, 3)
|
||||
char = Char(
|
||||
text=element.get_text(),
|
||||
font=Font(font_name=element.fontname, size=round(element.size, 3)),
|
||||
font=Font(font_name=element.fontname, size=font_size),
|
||||
adv=element.adv,
|
||||
min_x=element.x0,
|
||||
min_y=element.y0,
|
||||
max_x=element.x1,
|
||||
max_y=element.y1,
|
||||
)
|
||||
if font_size not in first_seen_fonts[element.fontname]:
|
||||
first_seen_fonts[element.fontname].add(font_size)
|
||||
print(f"first seen font: {element.fontname!r} {font_size}: page {page_num} {char!r}")
|
||||
qt[text_section].insert(char.min_x, char.min_y, char)
|
||||
unprocessed_chars[text_section][char.font].add(char)
|
||||
for i in unprocessed_chars.values():
|
||||
for j in i.values():
|
||||
j.sort(key=Char.top_down_left_to_right_sort_key)
|
||||
for text_section, i in unprocessed_chars.items():
|
||||
for chars in i.values():
|
||||
chars.sort(key=Char.top_down_left_to_right_sort_key)
|
||||
print(f"first char: {text_section!r}: {next(iter(chars), None)!r}")
|
||||
unknown_fonts=[]
|
||||
unknown_font_errors=[]
|
||||
for i in unprocessed_chars.values():
|
||||
|
|
@ -1181,13 +1186,14 @@ class Parser:
|
|||
def __pages_gen(file: Path, page_numbers: Iterable[int] | None) -> Generator[Page, None, None]:
|
||||
if page_numbers is not None:
|
||||
page_numbers = sorted(i - 1 for i in page_numbers)
|
||||
first_seen_fonts = defaultdict(set)
|
||||
for i, page in enumerate(extract_pages(file, page_numbers=page_numbers)):
|
||||
if page_numbers is not None:
|
||||
page_num = page_numbers[i] + 1
|
||||
else:
|
||||
page_num = i + 1
|
||||
print(f"page {page_num}")
|
||||
yield Page.from_lt_page(page_num=page_num, page=page)
|
||||
yield Page.from_lt_page(page_num=page_num, page=page, first_seen_fonts=first_seen_fonts)
|
||||
|
||||
def parse_pdf(self, file: Path, page_numbers: Iterable[int] | None = None):
|
||||
self.pages = Pages(pages_gen=Parser.__pages_gen(
|
||||
|
|
|
|||
77
src/main.rs
77
src/main.rs
|
|
@ -2,7 +2,8 @@
|
|||
// See Notices.txt for copyright information
|
||||
|
||||
use crate::quad_tree::QuadTree;
|
||||
use indexmap::IndexSet;
|
||||
use indexmap::{IndexMap, IndexSet};
|
||||
use mupdf_sys::FZ_STEXT_BOLD;
|
||||
use non_nan_float::NonNaNF32;
|
||||
use std::{
|
||||
backtrace::Backtrace,
|
||||
|
|
@ -135,7 +136,9 @@ macro_rules! make_enum_font {
|
|||
}
|
||||
const fn new_known(font_name: &str, size: NonNaNF32) -> Option<Self> {
|
||||
match size.get() {
|
||||
$($($known_font_size if str_eq(font_name, const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) }) => Some(Self::$KnownFont),)*)*
|
||||
$($($known_font_size if str_eq(font_name, const {
|
||||
Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag)
|
||||
}) => Some(Self::$KnownFont),)*)*
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
|
@ -266,12 +269,16 @@ make_enum_font! {
|
|||
InsnDescMisc15,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213]
|
||||
InsnDescMisc16,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252]
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.238]
|
||||
InsnDescMisc17,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962]
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252]
|
||||
InsnDescMisc18,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977]
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962]
|
||||
InsnDescMisc19,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977]
|
||||
InsnDescMisc20,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.506]
|
||||
InsnDescMisc21,
|
||||
},
|
||||
#[group]
|
||||
InsnDescCode {
|
||||
|
|
@ -2064,24 +2071,23 @@ impl Parser {
|
|||
file: &str,
|
||||
page_numbers: Option<Vec<NonZero<u32>>>,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> {
|
||||
let page_numbers = page_numbers.map(|page_numbers| {
|
||||
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1));
|
||||
let page_indexes = page_numbers.map(|page_numbers| {
|
||||
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1));
|
||||
retval.sort();
|
||||
retval
|
||||
});
|
||||
let document = mupdf::Document::open(file)?;
|
||||
let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?;
|
||||
Ok(Box::new(pages.into_iter().enumerate().map(
|
||||
move |(i, page)| {
|
||||
let page_num = match &page_numbers {
|
||||
Some(page_numbers) => page_numbers[i] + 1,
|
||||
None => i as u32 + 1,
|
||||
};
|
||||
println!("page {page_num}");
|
||||
Ok(Page::from_mupdf_page(page_num, page)
|
||||
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?)
|
||||
},
|
||||
)))
|
||||
let page_indexes = page_indexes.unwrap_or_else(|| (0..pages.len()).collect());
|
||||
let mut first_seen_fonts = BTreeMap::new();
|
||||
Ok(Box::new(page_indexes.into_iter().map(move |page_index| {
|
||||
let page_num = page_index as u32 + 1;
|
||||
println!("page {page_num}");
|
||||
Ok(
|
||||
Page::from_mupdf_page(page_num, &pages[page_index], &mut first_seen_fonts)
|
||||
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?,
|
||||
)
|
||||
})))
|
||||
}
|
||||
fn parse_pdf<I: Iterator<Item = NonZero<u32>>>(
|
||||
&mut self,
|
||||
|
|
@ -3411,7 +3417,8 @@ struct MuPdfXmlChar<'a> {
|
|||
impl Page {
|
||||
fn from_mupdf_page(
|
||||
page_num: u32,
|
||||
page: mupdf::Page,
|
||||
page: &mupdf::Page,
|
||||
first_seen_fonts: &mut BTreeMap<String, BTreeSet<NonNaNF32>>,
|
||||
) -> Result<Self, Box<dyn std::error::Error>> {
|
||||
let device = MyDevice::new(page_num);
|
||||
page.run(
|
||||
|
|
@ -3439,8 +3446,21 @@ impl Page {
|
|||
const ROUND_FACTOR: f32 = 1000.0;
|
||||
let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR;
|
||||
let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?;
|
||||
let font = Font::new(&xml_font.name, font_size);
|
||||
for xml_char in xml_font.char {
|
||||
if xml_char.c.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
let font_name = match &*xml_font.name {
|
||||
"DejaVuSansCondensed-Obli" => {
|
||||
if (xml_char.flags & FZ_STEXT_BOLD) != 0 {
|
||||
"DejaVuSansCondensed-BoldOblique"
|
||||
} else {
|
||||
"DejaVuSansCondensed-Oblique"
|
||||
}
|
||||
}
|
||||
font_name => font_name,
|
||||
};
|
||||
let font = Font::new(font_name, font_size);
|
||||
let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad;
|
||||
let min_x = x0.min(x1).min(x2).min(x3);
|
||||
let max_x = x0.max(x1).max(x2).max(x3);
|
||||
|
|
@ -3469,6 +3489,16 @@ impl Page {
|
|||
max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?,
|
||||
max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?,
|
||||
};
|
||||
let set = match first_seen_fonts.get_mut(font_name) {
|
||||
Some(v) => v,
|
||||
None => first_seen_fonts.entry(String::from(font_name)).or_default(),
|
||||
};
|
||||
if set.insert(font_size) {
|
||||
println!(
|
||||
"first seen font: {font_name:?} {font_size}: page {page_num} {char:?} {:x}",
|
||||
xml_char.flags,
|
||||
);
|
||||
}
|
||||
qt.entry(text_section).or_default().insert(
|
||||
min_x,
|
||||
min_y,
|
||||
|
|
@ -3486,9 +3516,10 @@ impl Page {
|
|||
}
|
||||
}
|
||||
}
|
||||
for i in unprocessed_chars.borrow_mut().values_mut() {
|
||||
for j in i.borrow_mut().values_mut() {
|
||||
j.sort_by_key(Char::top_down_left_to_right_sort_key);
|
||||
for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() {
|
||||
for chars in i.borrow_mut().values_mut() {
|
||||
chars.sort_by_key(Char::top_down_left_to_right_sort_key);
|
||||
println!("first char: {text_section:?}: {:?}", chars.first());
|
||||
}
|
||||
}
|
||||
let mut unknown_fonts = Vec::new();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue