port more code
This commit is contained in:
parent
7ecdbc0239
commit
9e090a66a3
1 changed files with 277 additions and 1 deletions
278
src/main.rs
278
src/main.rs
|
|
@ -1,10 +1,11 @@
|
|||
// SPDX-License-Identifier: LGPL-3.0-or-later
|
||||
// See Notices.txt for copyright information
|
||||
|
||||
use crate::quad_tree::QuadTree;
|
||||
use non_nan_float::NonNaNF32;
|
||||
use std::{
|
||||
borrow::Borrow,
|
||||
collections::{HashMap, HashSet},
|
||||
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
||||
fmt,
|
||||
sync::OnceLock,
|
||||
};
|
||||
|
|
@ -1435,4 +1436,279 @@ enum InsnParseSection {
|
|||
Desc,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
enum PageItem {
|
||||
Char(Char),
|
||||
LineOrRect(LineOrRect),
|
||||
}
|
||||
|
||||
/// A non-text page item: a line or a rectangle.
///
/// Both variants currently carry a unit payload.
///
/// The comparison traits are derived so that values can actually be stored in
/// ordered collections such as the `BTreeSet<LineOrRect>` held by `Page`;
/// without `Ord` nothing could ever be inserted into that set. The derived
/// ordering sorts `Line` before `Rect` (variant declaration order) and then
/// compares payloads.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum LineOrRect {
    /// A line.
    Line(()),
    /// A rectangle.
    Rect(()),
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Page {
|
||||
page_num: u32,
|
||||
qt: BTreeMap<TextSection, QuadTree<PageItem>>,
|
||||
unprocessed_chars: BTreeMap<TextSection, BTreeMap<Font, BTreeSet<Char>>>,
|
||||
unprocessed_non_text: BTreeSet<LineOrRect>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
struct TextSection {
|
||||
page_num: u32,
|
||||
min_x: NonNaNF32,
|
||||
min_y: NonNaNF32,
|
||||
max_x: NonNaNF32,
|
||||
max_y: NonNaNF32,
|
||||
}
|
||||
|
||||
/// Hard-coded tables of pages whose layout is not the default, keyed by page
/// number.
///
/// The map-valued tables associate a page number with an `f32`; the set-valued
/// tables only record membership. `TextSection::page_sections_helper` reads
/// the `f32` as the y coordinate at which the page is split.
struct TextSectionPagesData {
    /// Pages with columns followed by a full-width section.
    columns_then_full_page: BTreeMap<u32, f32>,
    /// Pages with a full-width section followed by columns.
    full_page_then_columns: BTreeMap<u32, f32>,
    /// Pages with one title line, then columns, then a full-width section.
    one_title_line_then_columns_then_full_page: BTreeMap<u32, f32>,
    /// Pages with two title lines, then columns, then a full-width section.
    two_title_lines_then_columns_then_full_page: BTreeMap<u32, f32>,
    /// Pages with two successive groups of columns.
    columns_then_columns: BTreeMap<u32, f32>,
    /// Pages with one title line followed by two successive groups of columns.
    one_title_line_then_columns_then_columns: BTreeMap<u32, f32>,
    /// Pages with one title line followed by columns.
    one_title_line_then_columns: BTreeSet<u32>,
    /// Pages with two title lines followed by columns.
    two_title_lines_then_columns: BTreeSet<u32>,
    /// Pages that consist of a single full-width section.
    full_page: BTreeSet<u32>,
}

impl TextSectionPagesData {
    /// Returns the process-wide tables, building them on the first call.
    fn get() -> &'static Self {
        static DATA: OnceLock<TextSectionPagesData> = OnceLock::new();
        DATA.get_or_init(Self::build)
    }

    /// Builds the tables from their literal contents.
    fn build() -> Self {
        Self {
            columns_then_full_page: BTreeMap::from([
                (129, 438.992),
                (241, 512.419),
                (242, 408.077),
                (243, 488.509),
                (244, 437.518),
                (245, 444.522),
                (247, 352.082),
                (248, 356.723),
                (249, 365.944),
                (251, 334.553),
                (264, 184.67),
                (296, 267.29),
                (297, 200.043),
                (298, 440.64),
                (299, 197.356),
                (300, 160.076),
                (301, 364.924),
                (303, 330.055),
                (305, 344.867),
                (306, 335.403),
                (307, 336.897),
                (308, 365.233),
                (309, 364.735),
            ]),
            full_page_then_columns: BTreeMap::from([
                (246, 689.039),
                (250, 615.315),
                (266, 678.088),
            ]),
            one_title_line_then_columns_then_full_page: BTreeMap::from([(128, 301.55)]),
            two_title_lines_then_columns_then_full_page: BTreeMap::from([(304, 242.732)]),
            columns_then_columns: BTreeMap::from([(79, 621.66), (126, 519.89)]),
            one_title_line_then_columns_then_columns: BTreeMap::from([
                (130, 550.43),
                (162, 599.247),
                (194, 622.161),
                (196, 682.933),
                (204, 613.195),
                (215, 633.12),
            ]),
            one_title_line_then_columns: BTreeSet::from([
                103, 104, 105, 121, 139, 143, 146, 151, 153, 158, 207, 213, 218,
            ]),
            two_title_lines_then_columns: BTreeSet::from([198, 206]),
            // The contiguous run 274..=285 plus the scattered individual pages.
            full_page: (274_u32..286)
                .chain([
                    118, 157, 252, 254, 255, 257, 259, 260, 265, 268, 270, 271, 272,
                ])
                .collect(),
        }
    }
}
|
||||
|
||||
impl TextSection {
|
||||
fn first() -> TextSection {
|
||||
Self::page_sections(1)[0]
|
||||
}
|
||||
|
||||
fn next(self) -> TextSection {
|
||||
let page_sections = Self::page_sections(self.page_num);
|
||||
let Some(index) = page_sections.iter().position(|v| *v == self) else {
|
||||
panic!("not a known TextSection: {self:?}");
|
||||
};
|
||||
if let Some(&retval) = page_sections.get(index + 1) {
|
||||
return retval;
|
||||
}
|
||||
for page_num in self.page_num + 1..self.page_num + 100000 {
|
||||
let page_sections = Self::page_sections(page_num);
|
||||
if let Some(&retval) = page_sections.get(0) {
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
panic!("can't find next TextSection after {self:?}")
|
||||
}
|
||||
|
||||
fn new(page_num: u32, min_x: f32, min_y: f32, max_x: f32, max_y: f32) -> Self {
|
||||
Self {
|
||||
page_num,
|
||||
min_x: NonNaNF32::new(min_x).expect("invalid min_x"),
|
||||
min_y: NonNaNF32::new(min_y).expect("invalid min_y"),
|
||||
max_x: NonNaNF32::new(max_x).expect("invalid max_x"),
|
||||
max_y: NonNaNF32::new(max_y).expect("invalid max_y"),
|
||||
}
|
||||
}
|
||||
|
||||
fn left_column(page_num: u32, min_y: f32, max_y: f32) -> TextSection {
|
||||
Self::new(page_num, PAGE_BODY_MIN_X, min_y, COLUMN_SPLIT_X, max_y)
|
||||
}
|
||||
|
||||
fn right_column(page_num: u32, min_y: f32, max_y: f32) -> TextSection {
|
||||
Self::new(page_num, COLUMN_SPLIT_X, min_y, PAGE_BODY_MAX_X, max_y)
|
||||
}
|
||||
|
||||
fn columns(page_num: u32, min_y: f32, max_y: f32) -> [TextSection; 2] {
|
||||
[
|
||||
Self::left_column(page_num, min_y, max_y),
|
||||
Self::right_column(page_num, min_y, max_y),
|
||||
]
|
||||
}
|
||||
|
||||
fn full_page(page_num: u32, min_y: f32, max_y: f32) -> TextSection {
|
||||
Self::new(page_num, PAGE_BODY_MIN_X, min_y, PAGE_BODY_MAX_X, max_y)
|
||||
}
|
||||
|
||||
fn page_sections_helper(page_num: u32) -> Box<[TextSection]> {
|
||||
let TextSectionPagesData {
|
||||
columns_then_full_page,
|
||||
full_page_then_columns,
|
||||
one_title_line_then_columns_then_full_page,
|
||||
two_title_lines_then_columns_then_full_page,
|
||||
columns_then_columns,
|
||||
one_title_line_then_columns_then_columns,
|
||||
one_title_line_then_columns,
|
||||
two_title_lines_then_columns,
|
||||
full_page,
|
||||
} = TextSectionPagesData::get();
|
||||
if let Some(split_y) = columns_then_columns.get(&page_num) {
|
||||
return Box::from_iter(
|
||||
Self::columns(page_num, *split_y, PAGE_BODY_MAX_Y)
|
||||
.into_iter()
|
||||
.chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)),
|
||||
);
|
||||
}
|
||||
if one_title_line_then_columns.contains(&page_num) {
|
||||
return Box::from_iter(
|
||||
[Self::full_page(
|
||||
page_num,
|
||||
ONE_TITLE_LINE_SPLIT_Y,
|
||||
PAGE_BODY_MAX_Y,
|
||||
)]
|
||||
.into_iter()
|
||||
.chain(Self::columns(
|
||||
page_num,
|
||||
PAGE_BODY_MIN_Y,
|
||||
ONE_TITLE_LINE_SPLIT_Y,
|
||||
)),
|
||||
);
|
||||
}
|
||||
if full_page.contains(&page_num) {
|
||||
return Box::new([Self::full_page(page_num, PAGE_BODY_MIN_Y, PAGE_BODY_MAX_Y)]);
|
||||
}
|
||||
if let Some(split_y) = one_title_line_then_columns_then_columns.get(&page_num) {
|
||||
return Box::from_iter(
|
||||
[Self::full_page(
|
||||
page_num,
|
||||
ONE_TITLE_LINE_SPLIT_Y,
|
||||
PAGE_BODY_MAX_Y,
|
||||
)]
|
||||
.into_iter()
|
||||
.chain(Self::columns(page_num, *split_y, ONE_TITLE_LINE_SPLIT_Y))
|
||||
.chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)),
|
||||
);
|
||||
}
|
||||
if two_title_lines_then_columns.contains(&page_num) {
|
||||
return Box::from_iter(
|
||||
[Self::full_page(
|
||||
page_num,
|
||||
TWO_TITLE_LINES_SPLIT_Y,
|
||||
PAGE_BODY_MAX_Y,
|
||||
)]
|
||||
.into_iter()
|
||||
.chain(Self::columns(
|
||||
page_num,
|
||||
PAGE_BODY_MIN_Y,
|
||||
TWO_TITLE_LINES_SPLIT_Y,
|
||||
)),
|
||||
);
|
||||
}
|
||||
if let Some(split_y) = columns_then_full_page.get(&page_num) {
|
||||
return Box::from_iter(
|
||||
Self::columns(page_num, *split_y, PAGE_BODY_MAX_Y)
|
||||
.into_iter()
|
||||
.chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]),
|
||||
);
|
||||
}
|
||||
if let Some(split_y) = full_page_then_columns.get(&page_num) {
|
||||
return Box::from_iter(
|
||||
[Self::full_page(page_num, *split_y, PAGE_BODY_MAX_Y)]
|
||||
.into_iter()
|
||||
.chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)),
|
||||
);
|
||||
}
|
||||
if let Some(split_y) = one_title_line_then_columns_then_full_page.get(&page_num) {
|
||||
return Box::from_iter(
|
||||
[Self::full_page(
|
||||
page_num,
|
||||
ONE_TITLE_LINE_SPLIT_Y,
|
||||
PAGE_BODY_MAX_Y,
|
||||
)]
|
||||
.into_iter()
|
||||
.chain(Self::columns(page_num, *split_y, ONE_TITLE_LINE_SPLIT_Y))
|
||||
.chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]),
|
||||
);
|
||||
}
|
||||
if let Some(split_y) = two_title_lines_then_columns_then_full_page.get(&page_num) {
|
||||
return Box::from_iter(
|
||||
[Self::full_page(
|
||||
page_num,
|
||||
TWO_TITLE_LINES_SPLIT_Y,
|
||||
PAGE_BODY_MAX_Y,
|
||||
)]
|
||||
.into_iter()
|
||||
.chain(Self::columns(page_num, *split_y, TWO_TITLE_LINES_SPLIT_Y))
|
||||
.chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]),
|
||||
);
|
||||
}
|
||||
if page_num == 263 {
|
||||
return Box::from_iter(
|
||||
[Self::full_page(page_num, 699.997, PAGE_BODY_MAX_Y)]
|
||||
.into_iter()
|
||||
.chain(Self::columns(page_num, 366.396, 699.997))
|
||||
.chain(Self::columns(page_num, 207.0, 366.396))
|
||||
.chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, 207.0)]),
|
||||
);
|
||||
}
|
||||
// TODO: checked up to page 309 (page named 273)
|
||||
Box::new(Self::columns(page_num, PAGE_BODY_MIN_Y, PAGE_BODY_MAX_Y))
|
||||
}
|
||||
fn page_sections(page_num: u32) -> &'static [TextSection] {
|
||||
static CACHE: [OnceLock<Box<[TextSection]>>; 2000] = [const { OnceLock::new() }; _];
|
||||
CACHE
|
||||
.get(page_num as usize)
|
||||
.expect("page_num out of range")
|
||||
.get_or_init(|| Self::page_sections_helper(page_num))
|
||||
}
|
||||
}
|
||||
|
||||
// Program entry point; the body is currently empty.
fn main() {}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue