This commit is contained in:
Jacob Lifshay 2026-01-04 20:01:13 -08:00
parent 45e8925d34
commit 442afe5f06
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ

View file

@ -5,12 +5,15 @@ use crate::quad_tree::QuadTree;
use indexmap::IndexSet;
use non_nan_float::NonNaNF32;
use std::{
backtrace::Backtrace,
borrow::{Borrow, Cow},
cell::RefCell,
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
convert::Infallible,
error::Error,
fmt,
num::NonZero,
ops::ControlFlow,
rc::Rc,
sync::OnceLock,
};
@ -1902,6 +1905,45 @@ struct Parser {
insns: Vec<Insn>,
}
/// Error produced while extracting instructions from a text section.
///
/// The two parse-error variants carry a human-readable message together with
/// the backtrace captured where the error was created; `Other` wraps any
/// other error unchanged.
#[derive(Debug)]
enum ExtractInsnsError {
    /// An instruction could not be parsed: `(message, backtrace)`.
    InsnParseError(String, std::backtrace::Backtrace),
    /// A page could not be parsed: `(message, backtrace)`.
    PageParseError(String, std::backtrace::Backtrace),
    /// Any other error, passed through as-is.
    Other(Box<dyn Error>),
}

impl fmt::Display for ExtractInsnsError {
    /// Writes `"<kind> parse error: <message>"` on its own line followed by
    /// the backtrace for the parse-error variants; for `Other`, forwards to
    /// the wrapped error's `Display` output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let backtrace = match self {
            Self::InsnParseError(msg, backtrace) => {
                writeln!(f, "instruction parse error: {msg}")?;
                backtrace
            }
            Self::PageParseError(msg, backtrace) => {
                writeln!(f, "page parse error: {msg}")?;
                backtrace
            }
            Self::Other(e) => return fmt::Display::fmt(e, f),
        };
        // `Backtrace` implements both `Display` and `Debug`; name the trait
        // explicitly so the call neither depends on which traits happen to be
        // imported nor becomes ambiguous when both are.
        fmt::Display::fmt(backtrace, f)
    }
}
/// An error value paired with an explanatory note.
///
/// Displays as the wrapped error's own text, then a line break, then
/// `note: <note>`.
#[derive(Clone, Debug)]
struct ErrorWithNote<E> {
    /// The underlying error.
    error: E,
    /// Extra context appended after the error when displayed.
    note: String,
}

impl<E> fmt::Display for ErrorWithNote<E>
where
    E: fmt::Display,
{
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The wrapped error is formatted with the caller's formatter (so any
        // width/fill flags apply to it); the note is appended verbatim.
        fmt::Display::fmt(&self.error, f)?;
        f.write_str("\nnote: ")?;
        f.write_str(&self.note)
    }
}

impl<E> Error for ErrorWithNote<E> where E: fmt::Display + fmt::Debug {}
impl Parser {
fn new() -> Self {
Self {
@ -1971,14 +2013,14 @@ impl Parser {
}
}
}
fn note_text_section(
fn note_text_section<E>(
&mut self,
f: impl FnOnce(&mut Self) -> Result<(), Box<dyn Error>>,
) -> Result<(), Box<dyn Error>> {
f: impl FnOnce(&mut Self) -> Result<(), E>,
) -> Result<(), ErrorWithNote<E>> {
let start_text_section = self.text_section;
match f(self) {
Ok(()) => Ok(()),
Err(e) => {
Err(error) => {
let note = if self.text_section == start_text_section {
format!("text_section={:?}", self.text_section)
} else {
@ -1987,12 +2029,375 @@ impl Parser {
self.text_section
)
};
Err(format!("{e}\nnote: {note}").into())
Err(ErrorWithNote { error, note })
}
}
}
fn parse_text_section(&mut self) -> Result<(), Box<dyn Error>> {
todo!()
fn parse_text_section(&mut self) -> Result<(), ErrorWithNote<Box<dyn Error>>> {
match self.note_text_section(Self::extract_insns) {
Ok(()) => Ok(()),
Err(
e @ ErrorWithNote {
error:
ExtractInsnsError::InsnParseError(_) | ExtractInsnsError::PageParseError(_),
..
},
) => {
println!("{e}");
Ok(())
}
Err(ErrorWithNote {
error: ExtractInsnsError::Other(error),
note,
}) => Err(ErrorWithNote { error, note }),
}
}
/// Finds the "top-left-most" character of the current text section inside
/// the rectangle `[min_x, max_x] x [min_y, max_y]`.
///
/// Every `PageItem::Char` the page's quad tree reports for that range is a
/// candidate; unless `allow_processed` is set, characters that are no longer
/// in the unprocessed set for their font are ignored. Among the candidates
/// the one with the smallest `min_x - min_y` wins, with the earliest visited
/// candidate kept on ties.
/// (Presumably y grows upward, making a small `min_x - min_y` "top-left" —
/// confirm against the page coordinate system.)
///
/// Returns `Ok(None)` when no candidate exists; errors from `page()` /
/// `unprocessed_chars()` are propagated.
fn find_top_left_char_in_range(
    &mut self,
    min_x: f32,
    max_x: f32,
    min_y: f32,
    max_y: f32,
    allow_processed: bool,
) -> Result<Option<Char>, Box<dyn Error>> {
    let page = self.page()?;
    let unprocessed_chars = self.unprocessed_chars()?;
    let mut best: Option<Char> = None;
    // The break type is `Infallible`, so the visitor can only ever continue
    // and this pattern is irrefutable.
    let ControlFlow::<Infallible>::Continue(()) =
        page.qt[&self.text_section].range(min_x, max_x, min_y, max_y, |_x, _y, item| {
            if let PageItem::Char(ch) = item {
                let eligible = allow_processed
                    || RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch);
                if eligible {
                    let is_better = match &best {
                        None => true,
                        Some(current) => {
                            ch.min_x.get() - ch.min_y.get()
                                < current.min_x.get() - current.min_y.get()
                        }
                    };
                    if is_better {
                        best = Some(ch.clone());
                    }
                }
            }
            ControlFlow::Continue(())
        });
    Ok(best)
}
/// Extracts one line of text from the current text section and turns it into
/// a `ParsedTextLine` whose `element` is a `text-line` XML element.
///
/// * `start_char` - optional character that is always part of the line.
/// * `start_min_y` - expected `min_y` of the line's regular-sized characters;
///   also the vertical anchor of the search rectangle.
/// * `min_x` / `max_x` - horizontal extent searched (the left edge is widened
///   by half the regular font size); `min_x` is also the position from which
///   the gap before the first character is measured.
/// * `fonts` - font set used to classify characters and to size spaces.
/// * `preceding_blank_lines` - stored unchanged in the result.
/// * `skip_initial_spaces` - when true, no spaces are emitted in front of the
///   first character.
/// * `allowed_start_min_y_error` - tolerance for the final `start_min_y`
///   check; defaults to `0.01`.
///
/// Returns `Ok(None)` if no characters are found, or if a character's font
/// kind cannot be determined (details are printed to stdout).
///
/// # Errors
///
/// * `ExtractInsnsError::Other` if `page()` or `unprocessed_chars()` fails.
/// * `ExtractInsnsError::PageParseError` if the line's `regular_min_y`
///   differs from `start_min_y` by more than the allowed error. Note that by
///   then the line's characters have already been removed from the
///   unprocessed set.
fn extract_text_line(
&mut self,
start_char: Option<Char>,
mut start_min_y: f32,
min_x: f32,
max_x: f32,
fonts: TextLineFonts,
preceding_blank_lines: u32,
mut skip_initial_spaces: bool,
allowed_start_min_y_error: Option<f32>,
) -> Result<Option<ParsedTextLine>, ExtractInsnsError> {
// `chars` keeps the collected characters; `chars_set` is used to reject
// duplicates while collecting.
let mut chars: Vec<Char> = Vec::new();
let mut chars_set: IndexSet<Char> = IndexSet::new();
if let Some(start_char) = start_char.clone() {
chars.push(start_char.clone());
chars_set.insert(start_char);
}
// Special case: a "*" in a subscript font on page 168 re-anchors the line
// relative to the character's top edge.
// NOTE(review): presumably works around one specific spot in the source
// document — confirm why page 168 needs this.
if let Some(start_char) = start_char
&& start_char.text == "*"
&& self.text_section.page_num == 168
&& fonts
.subscript()
.is_some_and(|v| v.contains(&start_char.font))
{
start_min_y = start_char.max_y.get() - fonts.regular()[0].size();
}
let page = self.page().map_err(ExtractInsnsError::Other)?;
let unprocessed_chars = self.unprocessed_chars().map_err(ExtractInsnsError::Other)?;
// Collect every not-yet-processed character in a band around
// `start_min_y` (0.4 font sizes below to 0.6 above). The break type is
// `Infallible`, so the visitor can only continue and the pattern is
// irrefutable.
let ControlFlow::<Infallible>::Continue(()) = page.qt[&self.text_section].range(
min_x - fonts.regular()[0].size() * 0.5,
max_x,
start_min_y - fonts.regular()[0].size() * 0.4,
start_min_y + fonts.regular()[0].size() * 0.6,
|x, y, ch| {
let PageItem::Char(ch) = ch else {
return ControlFlow::Continue(());
};
if !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch)
|| chars_set.contains(ch)
{
return ControlFlow::Continue(());
}
chars_set.insert(ch.clone());
chars.push(ch.clone());
ControlFlow::Continue(())
},
);
if chars.is_empty() {
return Ok(None);
}
// Order left to right, using the text as a tie-breaker.
chars.sort_by(|a, b| (a.min_x, &a.text).cmp(&(b.min_x, &b.text)));
// The vertical extent of the line's regular text: taken from the first
// character whose font kind is not sub/superscript, falling back to the
// first character if there is none.
let mut regular_min_y = chars[0].min_y.get();
let mut regular_max_y = chars[0].max_y.get();
for ch in &chars {
let Some(kind) = fonts.get_kind(ch.font.clone(), BaselinePos::Below) else {
continue;
};
if kind.sub_super() == FontVariantSubSuper::NotSubSuper {
regular_min_y = ch.min_y.get();
regular_max_y = ch.max_y.get();
break;
}
}
let mut retval = ParsedTextLine {
element: xml_tree::Element::new("text-line".into(), []),
regular_min_y,
regular_max_y,
fonts,
chars,
preceding_blank_lines,
};
// Pieces of text paired with the tag stack they should be written under;
// turned into the element body after the loop.
let mut text_and_tag_stacks: Vec<(String, Vec<&str>)> = Vec::new();
let mut last_max_x = min_x;
let mut last_kind = None;
let mut last_char: Option<Char> = None;
for ch in &retval.chars {
// A character whose vertical midpoint lies above the regular text's
// midpoint is classified as `Above`, otherwise `Below`.
let baseline_pos = if (ch.max_y.get() + ch.min_y.get()) * 0.5
> (retval.regular_max_y + retval.regular_min_y) * 0.5
{
BaselinePos::Above
} else {
BaselinePos::Below
};
let Some(kind) = fonts.get_kind(ch.font.clone(), baseline_pos) else {
println!(
"font kind is None:\n\
regular_min_y={}\n\
fonts={fonts:?}\n\
ch={ch:?}\n\
baseline_pos={baseline_pos:?}\n\
chars[0]={:?}",
retval.regular_min_y, retval.chars[0],
);
return Ok(None);
};
// Spaces between two characters of different kinds are attributed to the
// regular kind; otherwise they share the character's kind.
let space_kind = match last_kind {
None => kind,
Some(last_kind) if last_kind != kind => TextLineFontKind::Regular,
_ => kind,
};
let (space_fonts, _) = fonts
.get_fonts(space_kind)
.unwrap_or((fonts.regular(), None));
// Estimate how many spaces fit in the horizontal gap since the previous
// character (or since `min_x` for the first one).
let space_width = ch.min_x.get() - last_max_x;
let space_count_f = space_width / space_fonts[0].space_width();
let mut space_count = space_count_f.round() as usize;
// A gap that rounds to zero but exceeds 0.35 space widths still counts as
// one space.
if space_count == 0 && space_count_f > 0.35 {
space_count = 1
}
// Diagnostic only: report gaps that are far from a whole number of spaces.
if space_count_f > 0.25 && f32::abs(space_count as f32 - space_count_f) > 0.15 {
println!("spaces: space_count_f={space_count_f} space_width={space_width}");
}
if space_count > 0 && !skip_initial_spaces {
text_and_tag_stacks.push((
" ".repeat(space_count),
space_kind.text_line_tags().collect(),
));
}
// Only the gap before the very first character can be skipped.
skip_initial_spaces = false;
// "=" immediately overlaid by U+0338 (combining long solidus overlay) at
// the same position is merged into a single U+2260 (not-equal sign),
// replacing the entry pushed for the "=" (with an empty tag stack).
if ch.text == "\u{0338}"
&& let Some(last_char) = last_char
&& last_char.text == "="
&& f32::abs(ch.min_x.get() - last_char.min_x.get()) < 0.01
&& f32::abs(ch.min_y.get() - last_char.min_y.get()) < 0.01
{
*text_and_tag_stacks
.last_mut()
.expect("known to be non-empty") = ("\u{2260}".into(), Vec::new());
last_max_x = last_char.max_x.get();
} else {
// Expand Latin ligature code points into their plain letters.
let char_text = match &*ch.text {
"\u{fb00}" => "ff",
"\u{fb01}" => "fi",
"\u{fb02}" => "fl",
"\u{fb03}" => "ffi",
"\u{fb04}" => "ffl",
v => v,
};
text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect()));
last_max_x = ch.max_x.get();
}
last_kind = Some(kind);
last_char = Some(ch.clone());
}
// Write the collected pieces into the element body, switching the tag
// stack for each piece.
ElementBodyBuilder::scope(
&mut ElementBodyBuilder::new(&mut retval.element),
|body_builder| {
for (text, tag_stack) in text_and_tag_stacks {
body_builder.set_tag_stack(tag_stack);
body_builder.write_text(text)
}
},
);
// Mark the line's characters as processed. This happens before the
// position check below, so they stay consumed even if that check fails.
for ch in &retval.chars {
RefCell::borrow_mut(&*unprocessed_chars)
.get_mut(&ch.font)
.expect("known to exist")
.shift_remove(ch);
}
// Reject the line if its regular text is not where the caller expected.
let allowed_start_min_y_error = allowed_start_min_y_error.unwrap_or(0.01);
if f32::abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error {
return Err(ExtractInsnsError::PageParseError(
format!(
"start_min_y={start_min_y} regular_min_y={}\n\
start_min_y error: {}\n\
allowed_start_min_y_error={allowed_start_min_y_error}",
retval.regular_min_y,
start_min_y - retval.regular_min_y,
),
Backtrace::capture(),
));
}
Ok(Some(retval))
}
/*fn extract_insn(&mut self, header_start_char: Char) -> Result<Insn, ExtractInsnsError> {
assert_eq!(header_start_char.font, Font::InsnHeader);
println!("{header_start_char:?}");
let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields(
header_start_char.min_y.get(),
header_start_char,
)? else {
return Err(ExtractInsnsError::PageParseError("can't find header text line".into(), Backtrace::capture()));
};
let next_start_min_y = header.min_y.get() - 5.0;
let mut headers = vec![header];
let mut code_lines: Vec<ParsedTextLine> = Vec::new();
let mut desc_lines: Vec<ParsedTextLine> = Vec::new();
let mut sp_regs_altered = None;
loop {
let search_min_y = next_start_min_y - 70.0;
let Some(next_char) = self.find_top_left_char_in_range(
min_x=self.text_section.min_x.get() - 5.0,
max_x=self.text_section.max_x.get() + 5.0,
min_y=max(search_min_y, self.text_section.min_y),
max_y=next_start_min_y,
allow_processed=False,
)?;
if next_char is None:
if search_min_y <= self.text_section.min_y \
and self.text_section.next is not None and \
self.text_section.next.page_num in self.pages:
# go to next section
self.text_section = self.text_section.next
next_start_min_y = self.text_section.max_y
continue
else:
raise InsnParseError("can't find insn code or description text")
match next_char.font:
case font if font in TextLineFonts.INSN_CODE_FONTS.fonts:
next_section = _InsnParseSection.CODE
case font if font in TextLineFonts.INSN_DESC_FONTS.fonts:
next_section = _InsnParseSection.DESC
case Font.INSN_HEADER:
next_section = _InsnParseSection.HEADER
case font:
raise InsnParseError(f"can't find insn code or description text\nfont={font}")
match next_section:
case _InsnParseSection.CODE:
if len(desc_lines) != 0:
break
code_line = self.extract_text_line(
start_char=next_char,
start_min_y=next_char.min_y,
min_x=next_char.min_x,
max_x=self.text_section.max_x,
fonts=TextLineFonts.INSN_CODE_FONTS,
preceding_blank_lines=0 if len(code_lines) == 0 else 1,
)
if code_line is None:
raise InsnParseError("can't find insn code text line")
more_code_lines = self.extract_following_text_lines(
first_text_line=code_line,
min_x=code_line.chars[0].min_x,
max_x=self.text_section.max_x,
allowed_start_min_y_error=0.05,
)
print("more insn code lines:")
print("\n".join(map(str, more_code_lines)))
code_lines.extend(more_code_lines)
next_start_min_y = code_lines[-1].regular_min_y - 5
case _InsnParseSection.HEADER:
if len(code_lines) != 0 or len(desc_lines) != 0:
break
header = self.extract_insn_header_mnemonics_and_bit_fields(
start_min_y=next_char.min_y,
header_start_char=next_char,
)
if header is None:
raise InsnParseError("can't find header text line")
headers.append(header)
next_start_min_y = header.min_y - 5
case _InsnParseSection.DESC:
desc_line = self.extract_text_line(
start_char=next_char,
start_min_y=next_char.min_y,
min_x=next_char.min_x,
max_x=self.text_section.max_x,
fonts=TextLineFonts.INSN_DESC_FONTS,
preceding_blank_lines=0 if len(desc_lines) == 0 else 1,
allowed_start_min_y_error=3,
)
if desc_line is None:
raise InsnParseError("can't find insn desc text line")
match desc_line.get_header_text():
case None:
more_desc_lines = self.extract_following_text_lines(
first_text_line=desc_line,
min_x=desc_line.chars[0].min_x,
max_x=self.text_section.max_x,
allowed_start_min_y_error=3.5,
)
print("more insn desc lines:")
print("\n".join(map(str, more_desc_lines)))
desc_lines.extend(more_desc_lines)
next_start_min_y = desc_lines[-1].regular_min_y - 5
case "Special Registers Altered:":
sp_regs_altered = self.extract_insn_sp_regs_altered(
sp_regs_altered_text=desc_line,
)
next_start_min_y = sp_regs_altered.final_regular_min_y
break
case header_text:
raise AssertionError(f"unhandled header text: {header_text!r}\n{desc_line}")
case _:
assert_never(next_section)
}
print("insn code lines:")
print("\n".join(map(str, code_lines)))
print("insn desc lines:")
print("\n".join(map(str, desc_lines)))
print("sp_regs_altered:")
print(sp_regs_altered)
# TODO: finish
return Insn(
headers=tuple(headers),
code_lines=tuple(code_lines),
desc_lines=tuple(desc_lines),
sp_regs_altered=sp_regs_altered,
)
}*/
fn extract_insns(&mut self) -> Result<(), ExtractInsnsError> {
loop {
let Some(header_start_char) =
RefCell::borrow(&*self.unprocessed_chars().map_err(ExtractInsnsError::Other)?)
.get(&Font::InsnHeader)
.and_then(|v| v.first().cloned())
else {
return Ok(());
};
let insn = self.extract_insn(header_start_char)?;
self.insns.push(insn);
}
}
}