From 9e090a66a329ec58a13022ed7b60ba94d7cd7958 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Fri, 2 Jan 2026 04:50:51 -0800 Subject: [PATCH] port more code --- src/main.rs | 278 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 277 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index e7fc231..6fb1cd4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,11 @@ // SPDX-License-Identifier: LGPL-3.0-or-later // See Notices.txt for copyright information +use crate::quad_tree::QuadTree; use non_nan_float::NonNaNF32; use std::{ borrow::Borrow, - collections::{HashMap, HashSet}, + collections::{BTreeMap, BTreeSet, HashMap, HashSet}, fmt, sync::OnceLock, }; @@ -1435,4 +1436,279 @@ enum InsnParseSection { Desc, } +#[derive(Clone, Debug)] +enum PageItem { + Char(Char), + LineOrRect(LineOrRect), +} + +#[derive(Clone, Debug)] +enum LineOrRect { + Line(()), + Rect(()), +} + +#[derive(Debug)] +struct Page { + page_num: u32, + qt: BTreeMap>, + unprocessed_chars: BTreeMap>>, + unprocessed_non_text: BTreeSet, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct TextSection { + page_num: u32, + min_x: NonNaNF32, + min_y: NonNaNF32, + max_x: NonNaNF32, + max_y: NonNaNF32, +} + +struct TextSectionPagesData { + columns_then_full_page: BTreeMap, + full_page_then_columns: BTreeMap, + one_title_line_then_columns_then_full_page: BTreeMap, + two_title_lines_then_columns_then_full_page: BTreeMap, + columns_then_columns: BTreeMap, + one_title_line_then_columns_then_columns: BTreeMap, + one_title_line_then_columns: BTreeSet, + two_title_lines_then_columns: BTreeSet, + full_page: BTreeSet, +} + +impl TextSectionPagesData { + fn get() -> &'static Self { + static DATA: OnceLock = OnceLock::new(); + DATA.get_or_init(|| Self { + columns_then_full_page: FromIterator::from_iter([ + (129, 438.992), + (241, 512.419), + (242, 408.077), + (243, 488.509), + (244, 437.518), + (245, 444.522), + (247, 352.082), + (248, 356.723), + (249, 365.944), + (251, 334.553), + (264, 184.67), + (296, 267.29), + (297, 200.043), + (298, 440.64), + (299, 197.356), + (300, 160.076), + (301, 364.924), + (303, 330.055), + (305, 344.867), + (306, 335.403), + (307, 336.897), + (308, 365.233), + (309, 364.735), + ]), + full_page_then_columns: FromIterator::from_iter([ + (246, 689.039), + (250, 615.315), + (266, 678.088), + ]), + one_title_line_then_columns_then_full_page: FromIterator::from_iter([(128, 301.55)]), + two_title_lines_then_columns_then_full_page: FromIterator::from_iter([(304, 242.732)]), + columns_then_columns: FromIterator::from_iter([(79, 621.66), (126, 519.89)]), + one_title_line_then_columns_then_columns: FromIterator::from_iter([ + (130, 550.43), + (162, 599.247), + (194, 622.161), + (196, 682.933), + (204, 613.195), + (215, 633.12), + ]), + one_title_line_then_columns: FromIterator::from_iter([ + 103, 104, 105, 121, 139, 143, 146, 151, 153, 158, 207, 213, 218, + ]), + two_title_lines_then_columns: FromIterator::from_iter([198, 206]), + full_page: FromIterator::from_iter( + [ + 118, 157, 252, 254, 255, 257, 259, 260, 265, 268, 270, 271, 272, + ] + .into_iter() + .chain(274..286), + ), + }) + } +} + +impl TextSection { + fn first() -> TextSection { + Self::page_sections(1)[0] + } + + fn next(self) -> TextSection { + let page_sections = Self::page_sections(self.page_num); + let Some(index) = page_sections.iter().position(|v| *v == self) else { + panic!("not a known TextSection: {self:?}"); + }; + if let Some(&retval) = page_sections.get(index + 1) { + return retval; + } + for page_num in self.page_num + 1..self.page_num + 100000 { + let page_sections = Self::page_sections(page_num); + if let Some(&retval) = page_sections.get(0) { + return retval; + } + } + panic!("can't find next TextSection after {self:?}") + } + + fn new(page_num: u32, min_x: f32, min_y: f32, max_x: f32, max_y: f32) -> Self { + Self { + page_num, + min_x: NonNaNF32::new(min_x).expect("invalid min_x"), + min_y: NonNaNF32::new(min_y).expect("invalid min_y"), + max_x: NonNaNF32::new(max_x).expect("invalid max_x"), + max_y: NonNaNF32::new(max_y).expect("invalid max_y"), + } + } + + fn left_column(page_num: u32, min_y: f32, max_y: f32) -> TextSection { + Self::new(page_num, PAGE_BODY_MIN_X, min_y, COLUMN_SPLIT_X, max_y) + } + + fn right_column(page_num: u32, min_y: f32, max_y: f32) -> TextSection { + Self::new(page_num, COLUMN_SPLIT_X, min_y, PAGE_BODY_MAX_X, max_y) + } + + fn columns(page_num: u32, min_y: f32, max_y: f32) -> [TextSection; 2] { + [ + Self::left_column(page_num, min_y, max_y), + Self::right_column(page_num, min_y, max_y), + ] + } + + fn full_page(page_num: u32, min_y: f32, max_y: f32) -> TextSection { + Self::new(page_num, PAGE_BODY_MIN_X, min_y, PAGE_BODY_MAX_X, max_y) + } + + fn page_sections_helper(page_num: u32) -> Box<[TextSection]> { + let TextSectionPagesData { + columns_then_full_page, + full_page_then_columns, + one_title_line_then_columns_then_full_page, + two_title_lines_then_columns_then_full_page, + columns_then_columns, + one_title_line_then_columns_then_columns, + one_title_line_then_columns, + two_title_lines_then_columns, + full_page, + } = TextSectionPagesData::get(); + if let Some(split_y) = columns_then_columns.get(&page_num) { + return Box::from_iter( + Self::columns(page_num, *split_y, PAGE_BODY_MAX_Y) + .into_iter() + .chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)), + ); + } + if one_title_line_then_columns.contains(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + ONE_TITLE_LINE_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns( + page_num, + PAGE_BODY_MIN_Y, + ONE_TITLE_LINE_SPLIT_Y, + )), + ); + } + if full_page.contains(&page_num) { + return Box::new([Self::full_page(page_num, PAGE_BODY_MIN_Y, PAGE_BODY_MAX_Y)]); + } + if let Some(split_y) = one_title_line_then_columns_then_columns.get(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + ONE_TITLE_LINE_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns(page_num, *split_y, ONE_TITLE_LINE_SPLIT_Y)) + .chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)), + ); + } + if two_title_lines_then_columns.contains(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + TWO_TITLE_LINES_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns( + page_num, + PAGE_BODY_MIN_Y, + TWO_TITLE_LINES_SPLIT_Y, + )), + ); + } + if let Some(split_y) = columns_then_full_page.get(&page_num) { + return Box::from_iter( + Self::columns(page_num, *split_y, PAGE_BODY_MAX_Y) + .into_iter() + .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]), + ); + } + if let Some(split_y) = full_page_then_columns.get(&page_num) { + return Box::from_iter( + [Self::full_page(page_num, *split_y, PAGE_BODY_MAX_Y)] + .into_iter() + .chain(Self::columns(page_num, PAGE_BODY_MIN_Y, *split_y)), + ); + } + if let Some(split_y) = one_title_line_then_columns_then_full_page.get(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + ONE_TITLE_LINE_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns(page_num, *split_y, ONE_TITLE_LINE_SPLIT_Y)) + .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]), + ); + } + if let Some(split_y) = two_title_lines_then_columns_then_full_page.get(&page_num) { + return Box::from_iter( + [Self::full_page( + page_num, + TWO_TITLE_LINES_SPLIT_Y, + PAGE_BODY_MAX_Y, + )] + .into_iter() + .chain(Self::columns(page_num, *split_y, TWO_TITLE_LINES_SPLIT_Y)) + .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, *split_y)]), + ); + } + if page_num == 263 { + return Box::from_iter( + [Self::full_page(page_num, 699.997, PAGE_BODY_MAX_Y)] + .into_iter() + .chain(Self::columns(page_num, 366.396, 699.997)) + .chain(Self::columns(page_num, 207.0, 366.396)) + .chain([Self::full_page(page_num, PAGE_BODY_MIN_Y, 207.0)]), + ); + } + // TODO: checked up to page 309 (page named 273) + Box::new(Self::columns(page_num, PAGE_BODY_MIN_Y, PAGE_BODY_MAX_Y)) + } + fn page_sections(page_num: u32) -> &'static [TextSection] { + static CACHE: [OnceLock>; 2000] = [const { OnceLock::new() }; _]; + CACHE + .get(page_num as usize) + .expect("page_num out of range") + .get_or_init(|| Self::page_sections_helper(page_num)) + } +} + fn main() {}