wip
This commit is contained in:
parent
c58bc23904
commit
fcf1c63cb7
4 changed files with 68 additions and 29 deletions
77
src/main.rs
77
src/main.rs
|
|
@ -2,7 +2,8 @@
|
|||
// See Notices.txt for copyright information
|
||||
|
||||
use crate::quad_tree::QuadTree;
|
||||
use indexmap::IndexSet;
|
||||
use indexmap::{IndexMap, IndexSet};
|
||||
use mupdf_sys::FZ_STEXT_BOLD;
|
||||
use non_nan_float::NonNaNF32;
|
||||
use std::{
|
||||
backtrace::Backtrace,
|
||||
|
|
@ -135,7 +136,9 @@ macro_rules! make_enum_font {
|
|||
}
|
||||
const fn new_known(font_name: &str, size: NonNaNF32) -> Option<Self> {
|
||||
match size.get() {
|
||||
$($($known_font_size if str_eq(font_name, const { Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag) }) => Some(Self::$KnownFont),)*)*
|
||||
$($($known_font_size if str_eq(font_name, const {
|
||||
Self::extract_font_name_from_font_name_with_tag($known_font_name_with_tag)
|
||||
}) => Some(Self::$KnownFont),)*)*
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
|
@ -266,12 +269,16 @@ make_enum_font! {
|
|||
InsnDescMisc15,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.213]
|
||||
InsnDescMisc16,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252]
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.238]
|
||||
InsnDescMisc17,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962]
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.252]
|
||||
InsnDescMisc18,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977]
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 6.962]
|
||||
InsnDescMisc19,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 7.977]
|
||||
InsnDescMisc20,
|
||||
#[name_with_tag = "MJBFWM+DejaVuSansCondensed", size = 8.506]
|
||||
InsnDescMisc21,
|
||||
},
|
||||
#[group]
|
||||
InsnDescCode {
|
||||
|
|
@ -2064,24 +2071,23 @@ impl Parser {
|
|||
file: &str,
|
||||
page_numbers: Option<Vec<NonZero<u32>>>,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> {
|
||||
let page_numbers = page_numbers.map(|page_numbers| {
|
||||
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1));
|
||||
let page_indexes = page_numbers.map(|page_numbers| {
|
||||
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() as usize - 1));
|
||||
retval.sort();
|
||||
retval
|
||||
});
|
||||
let document = mupdf::Document::open(file)?;
|
||||
let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?;
|
||||
Ok(Box::new(pages.into_iter().enumerate().map(
|
||||
move |(i, page)| {
|
||||
let page_num = match &page_numbers {
|
||||
Some(page_numbers) => page_numbers[i] + 1,
|
||||
None => i as u32 + 1,
|
||||
};
|
||||
println!("page {page_num}");
|
||||
Ok(Page::from_mupdf_page(page_num, page)
|
||||
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?)
|
||||
},
|
||||
)))
|
||||
let page_indexes = page_indexes.unwrap_or_else(|| (0..pages.len()).collect());
|
||||
let mut first_seen_fonts = BTreeMap::new();
|
||||
Ok(Box::new(page_indexes.into_iter().map(move |page_index| {
|
||||
let page_num = page_index as u32 + 1;
|
||||
println!("page {page_num}");
|
||||
Ok(
|
||||
Page::from_mupdf_page(page_num, &pages[page_index], &mut first_seen_fonts)
|
||||
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?,
|
||||
)
|
||||
})))
|
||||
}
|
||||
fn parse_pdf<I: Iterator<Item = NonZero<u32>>>(
|
||||
&mut self,
|
||||
|
|
@ -3411,7 +3417,8 @@ struct MuPdfXmlChar<'a> {
|
|||
impl Page {
|
||||
fn from_mupdf_page(
|
||||
page_num: u32,
|
||||
page: mupdf::Page,
|
||||
page: &mupdf::Page,
|
||||
first_seen_fonts: &mut BTreeMap<String, BTreeSet<NonNaNF32>>,
|
||||
) -> Result<Self, Box<dyn std::error::Error>> {
|
||||
let device = MyDevice::new(page_num);
|
||||
page.run(
|
||||
|
|
@ -3439,8 +3446,21 @@ impl Page {
|
|||
const ROUND_FACTOR: f32 = 1000.0;
|
||||
let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR;
|
||||
let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?;
|
||||
let font = Font::new(&xml_font.name, font_size);
|
||||
for xml_char in xml_font.char {
|
||||
if xml_char.c.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
let font_name = match &*xml_font.name {
|
||||
"DejaVuSansCondensed-Obli" => {
|
||||
if (xml_char.flags & FZ_STEXT_BOLD) != 0 {
|
||||
"DejaVuSansCondensed-BoldOblique"
|
||||
} else {
|
||||
"DejaVuSansCondensed-Oblique"
|
||||
}
|
||||
}
|
||||
font_name => font_name,
|
||||
};
|
||||
let font = Font::new(font_name, font_size);
|
||||
let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad;
|
||||
let min_x = x0.min(x1).min(x2).min(x3);
|
||||
let max_x = x0.max(x1).max(x2).max(x3);
|
||||
|
|
@ -3469,6 +3489,16 @@ impl Page {
|
|||
max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?,
|
||||
max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?,
|
||||
};
|
||||
let set = match first_seen_fonts.get_mut(font_name) {
|
||||
Some(v) => v,
|
||||
None => first_seen_fonts.entry(String::from(font_name)).or_default(),
|
||||
};
|
||||
if set.insert(font_size) {
|
||||
println!(
|
||||
"first seen font: {font_name:?} {font_size}: page {page_num} {char:?} {:x}",
|
||||
xml_char.flags,
|
||||
);
|
||||
}
|
||||
qt.entry(text_section).or_default().insert(
|
||||
min_x,
|
||||
min_y,
|
||||
|
|
@ -3486,9 +3516,10 @@ impl Page {
|
|||
}
|
||||
}
|
||||
}
|
||||
for i in unprocessed_chars.borrow_mut().values_mut() {
|
||||
for j in i.borrow_mut().values_mut() {
|
||||
j.sort_by_key(Char::top_down_left_to_right_sort_key);
|
||||
for (text_section, i) in unprocessed_chars.borrow_mut().iter_mut() {
|
||||
for chars in i.borrow_mut().values_mut() {
|
||||
chars.sort_by_key(Char::top_down_left_to_right_sort_key);
|
||||
println!("first char: {text_section:?}: {:?}", chars.first());
|
||||
}
|
||||
}
|
||||
let mut unknown_fonts = Vec::new();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue