wip
This commit is contained in:
parent
45e8925d34
commit
442afe5f06
1 changed files with 412 additions and 7 deletions
419
src/main.rs
419
src/main.rs
|
|
@ -5,12 +5,15 @@ use crate::quad_tree::QuadTree;
|
|||
use indexmap::IndexSet;
|
||||
use non_nan_float::NonNaNF32;
|
||||
use std::{
|
||||
backtrace::Backtrace,
|
||||
borrow::{Borrow, Cow},
|
||||
cell::RefCell,
|
||||
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
||||
convert::Infallible,
|
||||
error::Error,
|
||||
fmt,
|
||||
num::NonZero,
|
||||
ops::ControlFlow,
|
||||
rc::Rc,
|
||||
sync::OnceLock,
|
||||
};
|
||||
|
|
@ -1902,6 +1905,45 @@ struct Parser {
|
|||
insns: Vec<Insn>,
|
||||
}
|
||||
|
||||
/// Failure modes of instruction extraction.
///
/// The two parse variants carry a human-readable message together with a
/// backtrace; `Other` wraps any unrelated boxed error.
#[derive(Debug)]
enum ExtractInsnsError {
    /// An instruction could not be parsed: `(message, backtrace)`.
    InsnParseError(String, std::backtrace::Backtrace),
    /// A page could not be parsed: `(message, backtrace)`.
    PageParseError(String, std::backtrace::Backtrace),
    /// Any other error.
    Other(Box<dyn Error>),
}

impl fmt::Display for ExtractInsnsError {
    /// Formats parse errors as `"<kind> parse error: <message>"` on its own
    /// line followed by the backtrace; `Other` is rendered exactly like the
    /// wrapped error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (label, msg, backtrace) = match self {
            Self::InsnParseError(msg, backtrace) => ("instruction parse error", msg, backtrace),
            Self::PageParseError(msg, backtrace) => ("page parse error", msg, backtrace),
            Self::Other(e) => return fmt::Display::fmt(e, f),
        };
        writeln!(f, "{label}: {msg}")?;
        // `Backtrace` implements both `Debug` and `Display`; name the trait
        // explicitly so the call neither depends on which traits happen to be
        // imported nor becomes ambiguous when both are.
        fmt::Display::fmt(backtrace, f)
    }
}
|
||||
|
||||
/// An error value paired with a free-form note that gives extra context.
#[derive(Clone, Debug)]
struct ErrorWithNote<E> {
    /// The underlying error.
    error: E,
    /// Additional context, printed on its own line after the error.
    note: String,
}

impl<E: fmt::Display> fmt::Display for ErrorWithNote<E> {
    /// Renders the wrapped error (honouring the caller's formatter options),
    /// then a newline and `note: <note>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(&self.error, f)?;
        f.write_str("\nnote: ")?;
        f.write_str(&self.note)
    }
}

impl<E: fmt::Display + fmt::Debug> Error for ErrorWithNote<E> {}
|
||||
|
||||
impl Parser {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
|
|
@ -1971,14 +2013,14 @@ impl Parser {
|
|||
}
|
||||
}
|
||||
}
|
||||
fn note_text_section(
|
||||
fn note_text_section<E>(
|
||||
&mut self,
|
||||
f: impl FnOnce(&mut Self) -> Result<(), Box<dyn Error>>,
|
||||
) -> Result<(), Box<dyn Error>> {
|
||||
f: impl FnOnce(&mut Self) -> Result<(), E>,
|
||||
) -> Result<(), ErrorWithNote<E>> {
|
||||
let start_text_section = self.text_section;
|
||||
match f(self) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(e) => {
|
||||
Err(error) => {
|
||||
let note = if self.text_section == start_text_section {
|
||||
format!("text_section={:?}", self.text_section)
|
||||
} else {
|
||||
|
|
@ -1987,12 +2029,375 @@ impl Parser {
|
|||
self.text_section
|
||||
)
|
||||
};
|
||||
Err(format!("{e}\nnote: {note}").into())
|
||||
Err(ErrorWithNote { error, note })
|
||||
}
|
||||
}
|
||||
}
|
||||
fn parse_text_section(&mut self) -> Result<(), Box<dyn Error>> {
|
||||
todo!()
|
||||
fn parse_text_section(&mut self) -> Result<(), ErrorWithNote<Box<dyn Error>>> {
|
||||
match self.note_text_section(Self::extract_insns) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(
|
||||
e @ ErrorWithNote {
|
||||
error:
|
||||
ExtractInsnsError::InsnParseError(_) | ExtractInsnsError::PageParseError(_),
|
||||
..
|
||||
},
|
||||
) => {
|
||||
println!("{e}");
|
||||
Ok(())
|
||||
}
|
||||
Err(ErrorWithNote {
|
||||
error: ExtractInsnsError::Other(error),
|
||||
note,
|
||||
}) => Err(ErrorWithNote { error, note }),
|
||||
}
|
||||
}
|
||||
fn find_top_left_char_in_range(
|
||||
&mut self,
|
||||
min_x: f32,
|
||||
max_x: f32,
|
||||
min_y: f32,
|
||||
max_y: f32,
|
||||
allow_processed: bool,
|
||||
) -> Result<Option<Char>, Box<dyn Error>> {
|
||||
let mut retval = None;
|
||||
let page = self.page()?;
|
||||
let unprocessed_chars = self.unprocessed_chars()?;
|
||||
let ControlFlow::<Infallible>::Continue(()) =
|
||||
page.qt[&self.text_section].range(min_x, max_x, min_y, max_y, |x, y, ch| {
|
||||
let PageItem::Char(ch) = ch else {
|
||||
return ControlFlow::Continue(());
|
||||
};
|
||||
if !allow_processed && !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch)
|
||||
{
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
match &mut retval {
|
||||
None => retval = Some(ch.clone()),
|
||||
Some(retval)
|
||||
if ch.min_x.get() - ch.min_y.get()
|
||||
< retval.min_x.get() - retval.min_y.get() =>
|
||||
{
|
||||
*retval = ch.clone();
|
||||
}
|
||||
Some(_) => {}
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
});
|
||||
Ok(retval)
|
||||
}
|
||||
fn extract_text_line(
|
||||
&mut self,
|
||||
start_char: Option<Char>,
|
||||
mut start_min_y: f32,
|
||||
min_x: f32,
|
||||
max_x: f32,
|
||||
fonts: TextLineFonts,
|
||||
preceding_blank_lines: u32,
|
||||
mut skip_initial_spaces: bool,
|
||||
allowed_start_min_y_error: Option<f32>,
|
||||
) -> Result<Option<ParsedTextLine>, ExtractInsnsError> {
|
||||
let mut chars: Vec<Char> = Vec::new();
|
||||
let mut chars_set: IndexSet<Char> = IndexSet::new();
|
||||
if let Some(start_char) = start_char.clone() {
|
||||
chars.push(start_char.clone());
|
||||
chars_set.insert(start_char);
|
||||
}
|
||||
if let Some(start_char) = start_char
|
||||
&& start_char.text == "*"
|
||||
&& self.text_section.page_num == 168
|
||||
&& fonts
|
||||
.subscript()
|
||||
.is_some_and(|v| v.contains(&start_char.font))
|
||||
{
|
||||
start_min_y = start_char.max_y.get() - fonts.regular()[0].size();
|
||||
}
|
||||
let page = self.page().map_err(ExtractInsnsError::Other)?;
|
||||
let unprocessed_chars = self.unprocessed_chars().map_err(ExtractInsnsError::Other)?;
|
||||
let ControlFlow::<Infallible>::Continue(()) = page.qt[&self.text_section].range(
|
||||
min_x - fonts.regular()[0].size() * 0.5,
|
||||
max_x,
|
||||
start_min_y - fonts.regular()[0].size() * 0.4,
|
||||
start_min_y + fonts.regular()[0].size() * 0.6,
|
||||
|x, y, ch| {
|
||||
let PageItem::Char(ch) = ch else {
|
||||
return ControlFlow::Continue(());
|
||||
};
|
||||
if !RefCell::borrow(&*unprocessed_chars)[&ch.font].contains(ch)
|
||||
|| chars_set.contains(ch)
|
||||
{
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
chars_set.insert(ch.clone());
|
||||
chars.push(ch.clone());
|
||||
ControlFlow::Continue(())
|
||||
},
|
||||
);
|
||||
if chars.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
chars.sort_by(|a, b| (a.min_x, &a.text).cmp(&(b.min_x, &b.text)));
|
||||
let mut regular_min_y = chars[0].min_y.get();
|
||||
let mut regular_max_y = chars[0].max_y.get();
|
||||
for ch in &chars {
|
||||
let Some(kind) = fonts.get_kind(ch.font.clone(), BaselinePos::Below) else {
|
||||
continue;
|
||||
};
|
||||
if kind.sub_super() == FontVariantSubSuper::NotSubSuper {
|
||||
regular_min_y = ch.min_y.get();
|
||||
regular_max_y = ch.max_y.get();
|
||||
break;
|
||||
}
|
||||
}
|
||||
let mut retval = ParsedTextLine {
|
||||
element: xml_tree::Element::new("text-line".into(), []),
|
||||
regular_min_y,
|
||||
regular_max_y,
|
||||
fonts,
|
||||
chars,
|
||||
preceding_blank_lines,
|
||||
};
|
||||
let mut text_and_tag_stacks: Vec<(String, Vec<&str>)> = Vec::new();
|
||||
let mut last_max_x = min_x;
|
||||
let mut last_kind = None;
|
||||
let mut last_char: Option<Char> = None;
|
||||
for ch in &retval.chars {
|
||||
let baseline_pos = if (ch.max_y.get() + ch.min_y.get()) * 0.5
|
||||
> (retval.regular_max_y + retval.regular_min_y) * 0.5
|
||||
{
|
||||
BaselinePos::Above
|
||||
} else {
|
||||
BaselinePos::Below
|
||||
};
|
||||
let Some(kind) = fonts.get_kind(ch.font.clone(), baseline_pos) else {
|
||||
println!(
|
||||
"font kind is None:\n\
|
||||
regular_min_y={}\n\
|
||||
fonts={fonts:?}\n\
|
||||
ch={ch:?}\n\
|
||||
baseline_pos={baseline_pos:?}\n\
|
||||
chars[0]={:?}",
|
||||
retval.regular_min_y, retval.chars[0],
|
||||
);
|
||||
return Ok(None);
|
||||
};
|
||||
let space_kind = match last_kind {
|
||||
None => kind,
|
||||
Some(last_kind) if last_kind != kind => TextLineFontKind::Regular,
|
||||
_ => kind,
|
||||
};
|
||||
let (space_fonts, _) = fonts
|
||||
.get_fonts(space_kind)
|
||||
.unwrap_or((fonts.regular(), None));
|
||||
let space_width = ch.min_x.get() - last_max_x;
|
||||
let space_count_f = space_width / space_fonts[0].space_width();
|
||||
let mut space_count = space_count_f.round() as usize;
|
||||
if space_count == 0 && space_count_f > 0.35 {
|
||||
space_count = 1
|
||||
}
|
||||
if space_count_f > 0.25 && f32::abs(space_count as f32 - space_count_f) > 0.15 {
|
||||
println!("spaces: space_count_f={space_count_f} space_width={space_width}");
|
||||
}
|
||||
if space_count > 0 && !skip_initial_spaces {
|
||||
text_and_tag_stacks.push((
|
||||
" ".repeat(space_count),
|
||||
space_kind.text_line_tags().collect(),
|
||||
));
|
||||
}
|
||||
skip_initial_spaces = false;
|
||||
if ch.text == "\u{0338}"
|
||||
&& let Some(last_char) = last_char
|
||||
&& last_char.text == "="
|
||||
&& f32::abs(ch.min_x.get() - last_char.min_x.get()) < 0.01
|
||||
&& f32::abs(ch.min_y.get() - last_char.min_y.get()) < 0.01
|
||||
{
|
||||
*text_and_tag_stacks
|
||||
.last_mut()
|
||||
.expect("known to be non-empty") = ("\u{2260}".into(), Vec::new());
|
||||
last_max_x = last_char.max_x.get();
|
||||
} else {
|
||||
let char_text = match &*ch.text {
|
||||
"\u{fb00}" => "ff",
|
||||
"\u{fb01}" => "fi",
|
||||
"\u{fb02}" => "fl",
|
||||
"\u{fb03}" => "ffi",
|
||||
"\u{fb04}" => "ffl",
|
||||
v => v,
|
||||
};
|
||||
text_and_tag_stacks.push((char_text.into(), kind.text_line_tags().collect()));
|
||||
last_max_x = ch.max_x.get();
|
||||
}
|
||||
last_kind = Some(kind);
|
||||
last_char = Some(ch.clone());
|
||||
}
|
||||
ElementBodyBuilder::scope(
|
||||
&mut ElementBodyBuilder::new(&mut retval.element),
|
||||
|body_builder| {
|
||||
for (text, tag_stack) in text_and_tag_stacks {
|
||||
body_builder.set_tag_stack(tag_stack);
|
||||
body_builder.write_text(text)
|
||||
}
|
||||
},
|
||||
);
|
||||
for ch in &retval.chars {
|
||||
RefCell::borrow_mut(&*unprocessed_chars)
|
||||
.get_mut(&ch.font)
|
||||
.expect("known to exist")
|
||||
.shift_remove(ch);
|
||||
}
|
||||
let allowed_start_min_y_error = allowed_start_min_y_error.unwrap_or(0.01);
|
||||
if f32::abs(start_min_y - retval.regular_min_y) > allowed_start_min_y_error {
|
||||
return Err(ExtractInsnsError::PageParseError(
|
||||
format!(
|
||||
"start_min_y={start_min_y} regular_min_y={}\n\
|
||||
start_min_y error: {}\n\
|
||||
allowed_start_min_y_error={allowed_start_min_y_error}",
|
||||
retval.regular_min_y,
|
||||
start_min_y - retval.regular_min_y,
|
||||
),
|
||||
Backtrace::capture(),
|
||||
));
|
||||
}
|
||||
Ok(Some(retval))
|
||||
}
|
||||
/*fn extract_insn(&mut self, header_start_char: Char) -> Result<Insn, ExtractInsnsError> {
|
||||
assert_eq!(header_start_char.font, Font::InsnHeader);
|
||||
println!("{header_start_char:?}");
|
||||
let Some(header) = self.extract_insn_header_mnemonics_and_bit_fields(
|
||||
header_start_char.min_y.get(),
|
||||
header_start_char,
|
||||
)? else {
|
||||
return Err(ExtractInsnsError::PageParseError("can't find header text line".into(), Backtrace::capture()));
|
||||
};
|
||||
let next_start_min_y = header.min_y.get() - 5.0;
|
||||
let mut headers = vec![header];
|
||||
let mut code_lines: Vec<ParsedTextLine> = Vec::new();
|
||||
let mut desc_lines: Vec<ParsedTextLine> = Vec::new();
|
||||
let mut sp_regs_altered = None;
|
||||
loop {
|
||||
let search_min_y = next_start_min_y - 70.0;
|
||||
let Some(next_char) = self.find_top_left_char_in_range(
|
||||
min_x=self.text_section.min_x.get() - 5.0,
|
||||
max_x=self.text_section.max_x.get() + 5.0,
|
||||
min_y=max(search_min_y, self.text_section.min_y),
|
||||
max_y=next_start_min_y,
|
||||
allow_processed=False,
|
||||
)?;
|
||||
if next_char is None:
|
||||
if search_min_y <= self.text_section.min_y \
|
||||
and self.text_section.next is not None and \
|
||||
self.text_section.next.page_num in self.pages:
|
||||
# go to next section
|
||||
self.text_section = self.text_section.next
|
||||
next_start_min_y = self.text_section.max_y
|
||||
continue
|
||||
else:
|
||||
raise InsnParseError("can't find insn code or description text")
|
||||
match next_char.font:
|
||||
case font if font in TextLineFonts.INSN_CODE_FONTS.fonts:
|
||||
next_section = _InsnParseSection.CODE
|
||||
case font if font in TextLineFonts.INSN_DESC_FONTS.fonts:
|
||||
next_section = _InsnParseSection.DESC
|
||||
case Font.INSN_HEADER:
|
||||
next_section = _InsnParseSection.HEADER
|
||||
case font:
|
||||
raise InsnParseError(f"can't find insn code or description text\nfont={font}")
|
||||
match next_section:
|
||||
case _InsnParseSection.CODE:
|
||||
if len(desc_lines) != 0:
|
||||
break
|
||||
code_line = self.extract_text_line(
|
||||
start_char=next_char,
|
||||
start_min_y=next_char.min_y,
|
||||
min_x=next_char.min_x,
|
||||
max_x=self.text_section.max_x,
|
||||
fonts=TextLineFonts.INSN_CODE_FONTS,
|
||||
preceding_blank_lines=0 if len(code_lines) == 0 else 1,
|
||||
)
|
||||
if code_line is None:
|
||||
raise InsnParseError("can't find insn code text line")
|
||||
more_code_lines = self.extract_following_text_lines(
|
||||
first_text_line=code_line,
|
||||
min_x=code_line.chars[0].min_x,
|
||||
max_x=self.text_section.max_x,
|
||||
allowed_start_min_y_error=0.05,
|
||||
)
|
||||
print("more insn code lines:")
|
||||
print("\n".join(map(str, more_code_lines)))
|
||||
code_lines.extend(more_code_lines)
|
||||
next_start_min_y = code_lines[-1].regular_min_y - 5
|
||||
case _InsnParseSection.HEADER:
|
||||
if len(code_lines) != 0 or len(desc_lines) != 0:
|
||||
break
|
||||
header = self.extract_insn_header_mnemonics_and_bit_fields(
|
||||
start_min_y=next_char.min_y,
|
||||
header_start_char=next_char,
|
||||
)
|
||||
if header is None:
|
||||
raise InsnParseError("can't find header text line")
|
||||
headers.append(header)
|
||||
next_start_min_y = header.min_y - 5
|
||||
case _InsnParseSection.DESC:
|
||||
desc_line = self.extract_text_line(
|
||||
start_char=next_char,
|
||||
start_min_y=next_char.min_y,
|
||||
min_x=next_char.min_x,
|
||||
max_x=self.text_section.max_x,
|
||||
fonts=TextLineFonts.INSN_DESC_FONTS,
|
||||
preceding_blank_lines=0 if len(desc_lines) == 0 else 1,
|
||||
allowed_start_min_y_error=3,
|
||||
)
|
||||
if desc_line is None:
|
||||
raise InsnParseError("can't find insn desc text line")
|
||||
match desc_line.get_header_text():
|
||||
case None:
|
||||
more_desc_lines = self.extract_following_text_lines(
|
||||
first_text_line=desc_line,
|
||||
min_x=desc_line.chars[0].min_x,
|
||||
max_x=self.text_section.max_x,
|
||||
allowed_start_min_y_error=3.5,
|
||||
)
|
||||
print("more insn desc lines:")
|
||||
print("\n".join(map(str, more_desc_lines)))
|
||||
desc_lines.extend(more_desc_lines)
|
||||
next_start_min_y = desc_lines[-1].regular_min_y - 5
|
||||
case "Special Registers Altered:":
|
||||
sp_regs_altered = self.extract_insn_sp_regs_altered(
|
||||
sp_regs_altered_text=desc_line,
|
||||
)
|
||||
next_start_min_y = sp_regs_altered.final_regular_min_y
|
||||
break
|
||||
case header_text:
|
||||
raise AssertionError(f"unhandled header text: {header_text!r}\n{desc_line}")
|
||||
case _:
|
||||
assert_never(next_section)
|
||||
}
|
||||
print("insn code lines:")
|
||||
print("\n".join(map(str, code_lines)))
|
||||
print("insn desc lines:")
|
||||
print("\n".join(map(str, desc_lines)))
|
||||
print("sp_regs_altered:")
|
||||
print(sp_regs_altered)
|
||||
# TODO: finish
|
||||
return Insn(
|
||||
headers=tuple(headers),
|
||||
code_lines=tuple(code_lines),
|
||||
desc_lines=tuple(desc_lines),
|
||||
sp_regs_altered=sp_regs_altered,
|
||||
)
|
||||
}*/
|
||||
fn extract_insns(&mut self) -> Result<(), ExtractInsnsError> {
|
||||
loop {
|
||||
let Some(header_start_char) =
|
||||
RefCell::borrow(&*self.unprocessed_chars().map_err(ExtractInsnsError::Other)?)
|
||||
.get(&Font::InsnHeader)
|
||||
.and_then(|v| v.first().cloned())
|
||||
else {
|
||||
return Ok(());
|
||||
};
|
||||
let insn = self.extract_insn(header_start_char)?;
|
||||
self.insns.push(insn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue