diff --git a/src/main.rs b/src/main.rs index 63ddb9f..0b5aae3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,12 +5,15 @@ use crate::quad_tree::QuadTree; use indexmap::IndexSet; use non_nan_float::NonNaNF32; use std::{ + backtrace::Backtrace, borrow::{Borrow, Cow}, cell::RefCell, collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + convert::Infallible, error::Error, fmt, num::NonZero, + ops::ControlFlow, rc::Rc, sync::OnceLock, }; @@ -1902,6 +1905,45 @@ struct Parser { insns: Vec, } +enum ExtractInsnsError { + InsnParseError(String, std::backtrace::Backtrace), + PageParseError(String, std::backtrace::Backtrace), + Other(Box), +} + +impl fmt::Display for ExtractInsnsError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let backtrace = match self { + ExtractInsnsError::InsnParseError(msg, backtrace) => { + writeln!(f, "instruction parse error: {msg}")?; + backtrace + } + ExtractInsnsError::PageParseError(msg, backtrace) => { + writeln!(f, "page parse error: {msg}")?; + backtrace + } + ExtractInsnsError::Other(e) => return fmt::Display::fmt(&e, f), + }; + backtrace.fmt(f) + } +} + +#[derive(Clone, Debug)] +struct ErrorWithNote { + error: E, + note: String, +} + +impl fmt::Display for ErrorWithNote { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { error, note } = self; + fmt::Display::fmt(error, f)?; + write!(f, "\nnote: {note}") + } +} + +impl Error for ErrorWithNote {} + impl Parser { fn new() -> Self { Self { @@ -1971,14 +2013,14 @@ impl Parser { } } } - fn note_text_section( + fn note_text_section( &mut self, - f: impl FnOnce(&mut Self) -> Result<(), Box>, - ) -> Result<(), Box> { + f: impl FnOnce(&mut Self) -> Result<(), E>, + ) -> Result<(), ErrorWithNote> { let start_text_section = self.text_section; match f(self) { Ok(()) => Ok(()), - Err(e) => { + Err(error) => { let note = if self.text_section == start_text_section { format!("text_section={:?}", self.text_section) } else { @@ -1987,12 +2029,375 @@ impl Parser { self.text_section ) }; - Err(format!("{e}\nnote: {note}").into()) + Err(ErrorWithNote { error, note }) } } } - fn parse_text_section(&mut self) -> Result<(), Box> { - todo!() + fn parse_text_section(&mut self) -> Result<(), ErrorWithNote>> { + match self.note_text_section(Self::extract_insns) { + Ok(()) => Ok(()), + Err( + e @ ErrorWithNote { + error: + ExtractInsnsError::InsnParseError(_) | ExtractInsnsError::PageParseError(_), + .. + }, + ) => { + println!("{e}"); + Ok(()) + } + Err(ErrorWithNote { + error: ExtractInsnsError::Other(error), + note, + }) => Err(ErrorWithNote { error, note }), + } + } + fn find_top_left_char_in_range( + &mut self, + min_x: f32, + max_x: f32, + min_y: f32, + max_y: f32, + allow_processed: bool, + ) -> Result, Box> { + let mut retval = None; + let page = self.page()?; + let unprocessed_chars = self.unprocessed_chars()?; + let ControlFlow::::Continue(()) = + page.qt[&self.text_section].range(min_x, max_x, min_y, max_y, |x, y, ch| { + let PageItem::Char(ch) = ch else { + return ControlFlow::Continue(()); + }; + if !allow_processed && !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch) + { + return ControlFlow::Continue(()); + } + match &mut retval { + None => retval = Some(ch.clone()), + Some(retval) + if ch.min_x.get() - ch.min_y.get() + < retval.min_x.get() - retval.min_y.get() => + { + *retval = ch.clone(); + } + Some(_) => {} + } + ControlFlow::Continue(()) + }); + Ok(retval) + } + fn extract_text_line( + &mut self, + start_char: Option, + mut start_min_y: f32, + min_x: f32, + max_x: f32, + fonts: TextLineFonts, + preceding_blank_lines: u32, + mut skip_initial_spaces: bool, + allowed_start_min_y_error: Option, + ) -> Result, ExtractInsnsError> { + let mut chars: Vec = Vec::new(); + let mut chars_set: IndexSet = IndexSet::new(); + if let Some(start_char) = start_char.clone() { + chars.push(start_char.clone()); + chars_set.insert(start_char); + } + if let Some(start_char) = start_char + && start_char.text == "*" + && self.text_section.page_num == 168 + && fonts + .subscript() + .is_some_and(|v| v.contains(&start_char.font)) + { + start_min_y = start_char.max_y.get() - fonts.regular()[0].size(); + } + let page = self.page().map_err(ExtractInsnsError::Other)?; + let unprocessed_chars = self.unprocessed_chars().map_err(ExtractInsnsError::Other)?; + let ControlFlow::::Continue(()) = page.qt[&self.text_section].range( + min_x - fonts.regular()[0].size() * 0.5, + max_x, + start_min_y - fonts.regular()[0].size() * 0.4, + start_min_y + fonts.regular()[0].size() * 0.6, + |x, y, ch| { + let PageItem::Char(ch) = ch else { + return ControlFlow::Continue(()); + }; + if !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch) + || chars_set.contains(ch) + { + return ControlFlow::Continue(()); + } + chars_set.insert(ch.clone()); + chars.push(ch.clone()); + ControlFlow::Continue(()) + }, + ); + if chars.is_empty() { + return Ok(None); + } + chars.sort_by(|a, b| (a.min_x, &a.text).cmp(&(b.min_x, &b.text))); + let mut regular_min_y = chars[0].min_y.get(); + let mut regular_max_y = chars[0].max_y.get(); + for ch in &chars { + let Some(kind) = fonts.get_kind(ch.font.clone(), BaselinePos::Below) else { + continue; + }; + if kind.sub_super() == FontVariantSubSuper::NotSubSuper { + regular_min_y = ch.min_y.get(); + regular_max_y = ch.max_y.get(); + break; + } + } + let mut retval = ParsedTextLine { + element: xml_tree::Element::new("text-line".into(), []), + regular_min_y, + regular_max_y, + fonts, + chars, + preceding_blank_lines, + }; + let mut text_and_tag_stacks: Vec<(String, Vec<&str>)> = Vec::new(); + let mut last_max_x = min_x; + let mut last_kind = None; + let mut last_char: Option = None; + for ch in &retval.chars { + let baseline_pos = if (ch.max_y.get() + ch.min_y.get()) * 0.5 + > (retval.regular_max_y + retval.regular_min_y) * 0.5 + { + BaselinePos::Above + } else { + BaselinePos::Below + }; + let Some(kind) = fonts.get_kind(ch.font.clone(), baseline_pos) else { + println!( + "font kind is None:\n\ + regular_min_y={}\n\ + fonts={fonts:?}\n\ + ch={ch:?}\n\ + baseline_pos={baseline_pos:?}\n\ + chars[0]={:?}", + retval.regular_min_y, retval.chars[0], + ); + return Ok(None); + }; + let space_kind = match last_kind { + None => kind, + Some(last_kind) if last_kind != kind => TextLineFontKind::Regular, + _ => kind, + }; + let (space_fonts, _) = fonts + .get_fonts(space_kind) + .unwrap_or((fonts.regular(), None)); + let space_width = ch.min_x.get() - last_max_x; + let space_count_f = space_width / space_fonts[0].space_width(); + let mut space_count = space_count_f.round() as usize; + if space_count == 0 && space_count_f > 0.35 { + space_count = 1 + } + if space_count_f > 0.25 && f32::abs(space_count as f32 - space_count_f) > 0.15 { + println!("spaces: space_count_f={space_count_f} space_width={space_width}"); + } + if space_count > 0 && !skip_initial_spaces { + text_and_tag_stacks.push(( + " ".repeat(space_count), + space_kind.text_line_tags().collect(), + )); + } + skip_initial_spaces = false; + if ch.text == "\u{0338}" + && let Some(last_char) = last_char + && last_char.text == "=" + && f32::abs(ch.min_x.get() - last_char.min_x.get()) < 0.01 + && f32::abs(ch.min_y.get() - last_char.min_y.get()) < 0.01 + { + *text_and_tag_stacks + .last_mut() + .expect("known to be non-empty") = ("\u{2260}".into(), Vec::new()); + last_max_x = last_char.max_x.get(); + } else { + let char_text = match &*ch.text { + "\u{fb00}" => "ff", + "\u{fb01}" => "fi", + "\u{fb02}" => "fl", + "\u{fb03}" => "ffi", + "\u{fb04}" => "ffl", + v => v, + }; + text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect())); + last_max_x = ch.max_x.get(); + } + last_kind = Some(kind); + last_char = Some(ch.clone()); + } + ElementBodyBuilder::scope( + &mut ElementBodyBuilder::new(&mut retval.element), + |body_builder| { + for (text, tag_stack) in text_and_tag_stacks { + body_builder.set_tag_stack(tag_stack); + body_builder.write_text(text) + } + }, + ); + for ch in &retval.chars { + RefCell::borrow_mut(&*unprocessed_chars) + .get_mut(&ch.font) + .expect("known to exist") + .shift_remove(ch); + } + let allowed_start_min_y_error = allowed_start_min_y_error.unwrap_or(0.01); + if f32::abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error { + return Err(ExtractInsnsError::PageParseError( + format!( + "start_min_y={start_min_y} regular_min_y={}\n\ + start_min_y error: {}\n\ + allowed_start_min_y_error={allowed_start_min_y_error}", + retval.regular_min_y, + start_min_y - retval.regular_min_y, + ), + Backtrace::capture(), + )); + } + Ok(Some(retval)) + } + /*fn extract_insn(&mut self, header_start_char: Char) -> Result { + assert_eq!(header_start_char.font, Font::InsnHeader); + println!("{header_start_char:?}"); + let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields( + header_start_char.min_y.get(), + header_start_char, + )? else { + return Err(ExtractInsnsError::PageParseError("can't find header text line".into(), Backtrace::capture())); + }; + let next_start_min_y = header.min_y.get() - 5.0; + let mut headers = vec![header]; + let mut code_lines: Vec = Vec::new(); + let mut desc_lines: Vec = Vec::new(); + let mut sp_regs_altered = None; + loop { + let search_min_y = next_start_min_y - 70.0; + let Some(next_char) = self.find_top_left_char_in_range( + min_x=self.text_section.min_x.get() - 5.0, + max_x=self.text_section.max_x.get() + 5.0, + min_y=max(search_min_y, self.text_section.min_y), + max_y=next_start_min_y, + allow_processed=False, + )?; + if next_char is None: + if search_min_y <= self.text_section.min_y \ + and self.text_section.next is not None and \ + self.text_section.next.page_num in self.pages: + # go to next section + self.text_section = self.text_section.next + next_start_min_y = self.text_section.max_y + continue + else: + raise InsnParseError("can't find insn code or description text") + match next_char.font: + case font if font in TextLineFonts.INSN_CODE_FONTS.fonts: + next_section = _InsnParseSection.CODE + case font if font in TextLineFonts.INSN_DESC_FONTS.fonts: + next_section = _InsnParseSection.DESC + case Font.INSN_HEADER: + next_section = _InsnParseSection.HEADER + case font: + raise InsnParseError(f"can't find insn code or description text\nfont={font}") + match next_section: + case _InsnParseSection.CODE: + if len(desc_lines) != 0: + break + code_line = self.extract_text_line( + start_char=next_char, + start_min_y=next_char.min_y, + min_x=next_char.min_x, + max_x=self.text_section.max_x, + fonts=TextLineFonts.INSN_CODE_FONTS, + preceding_blank_lines=0 if len(code_lines) == 0 else 1, + ) + if code_line is None: + raise InsnParseError("can't find insn code text line") + more_code_lines = self.extract_following_text_lines( + first_text_line=code_line, + min_x=code_line.chars[0].min_x, + max_x=self.text_section.max_x, + allowed_start_min_y_error=0.05, + ) + print("more insn code lines:") + print("\n".join(map(str, more_code_lines))) + code_lines.extend(more_code_lines) + next_start_min_y = code_lines[-1].regular_min_y - 5 + case _InsnParseSection.HEADER: + if len(code_lines) != 0 or len(desc_lines) != 0: + break + header = self.extract_insn_header_mnemonics_and_bit_fields( + start_min_y=next_char.min_y, + header_start_char=next_char, + ) + if header is None: + raise InsnParseError("can't find header text line") + headers.append(header) + next_start_min_y = header.min_y - 5 + case _InsnParseSection.DESC: + desc_line = self.extract_text_line( + start_char=next_char, + start_min_y=next_char.min_y, + min_x=next_char.min_x, + max_x=self.text_section.max_x, + fonts=TextLineFonts.INSN_DESC_FONTS, + preceding_blank_lines=0 if len(desc_lines) == 0 else 1, + allowed_start_min_y_error=3, + ) + if desc_line is None: + raise InsnParseError("can't find insn desc text line") + match desc_line.get_header_text(): + case None: + more_desc_lines = self.extract_following_text_lines( + first_text_line=desc_line, + min_x=desc_line.chars[0].min_x, + max_x=self.text_section.max_x, + allowed_start_min_y_error=3.5, + ) + print("more insn desc lines:") + print("\n".join(map(str, more_desc_lines))) + desc_lines.extend(more_desc_lines) + next_start_min_y = desc_lines[-1].regular_min_y - 5 + case "Special Registers Altered:": + sp_regs_altered = self.extract_insn_sp_regs_altered( + sp_regs_altered_text=desc_line, + ) + next_start_min_y = sp_regs_altered.final_regular_min_y + break + case header_text: + raise AssertionError(f"unhandled header text: {header_text!r}\n{desc_line}") + case _: + assert_never(next_section) + } + print("insn code lines:") + print("\n".join(map(str, code_lines))) + print("insn desc lines:") + print("\n".join(map(str, desc_lines))) + print("sp_regs_altered:") + print(sp_regs_altered) + # TODO: finish + return Insn( + headers=tuple(headers), + code_lines=tuple(code_lines), + desc_lines=tuple(desc_lines), + sp_regs_altered=sp_regs_altered, + ) + }*/ + fn extract_insns(&mut self) -> Result<(), ExtractInsnsError> { + loop { + let Some(header_start_char) = + RefCell::borrow(&*self.unprocessed_chars().map_err(ExtractInsnsError::Other)?) + .get(&Font::InsnHeader) + .and_then(|v| v.first().cloned()) + else { + return Ok(()); + }; + let insn = self.extract_insn(header_start_char)?; + self.insns.push(insn); + } } }