diff --git a/Cargo.lock b/Cargo.lock index e85021f..0281106 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -222,9 +222,11 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" name = "parse_powerisa_pdf" version = "0.1.0" dependencies = [ + "indexmap", "libm", "mupdf", "quick-xml", + "serde", ] [[package]] @@ -258,6 +260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" dependencies = [ "memchr", + "serde", ] [[package]] @@ -310,6 +313,16 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + [[package]] name = "serde_core" version = "1.0.228" diff --git a/Cargo.toml b/Cargo.toml index dd8f2bc..224dad3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,8 @@ categories = [] rust-version = "1.89.0" [dependencies] +indexmap = "2.12.1" libm = "0.2.15" mupdf = { version = "0.5.0", default-features = false } -quick-xml = "0.38.4" +quick-xml = { version = "0.38.4", features = ["serialize"] } +serde = { version = "1.0.228", features = ["derive"] } diff --git a/src/main.rs b/src/main.rs index 21e3a60..d9c54ec 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,11 +2,13 @@ // See Notices.txt for copyright information use crate::quad_tree::QuadTree; +use indexmap::IndexSet; use non_nan_float::NonNaNF32; use std::{ - borrow::Borrow, + borrow::{Borrow, Cow}, cell::RefCell, collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + error::Error, fmt, num::NonZero, rc::Rc, @@ -560,7 +562,7 @@ impl Char { fn height(&self) -> f32 { self.max_y.get() - self.min_y.get() } - fn top_down_left_to_right_sort_key(&self) -> impl Ord { + fn top_down_left_to_right_sort_key(&self) -> impl Ord + use<> { (-self.min_y, self.min_x) } } @@ -1463,8 +1465,87 @@ enum LineOrRect { struct Page { page_num: u32, qt: BTreeMap>, - unprocessed_chars: BTreeMap>>, - unprocessed_non_text: BTreeSet, + unprocessed_chars: + Rc>>>>>>, + unprocessed_non_text: Rc>>, +} + +struct Pages { + pages_gen: Option>>>>, + pages: BTreeMap>, + max_page_num: u32, +} + +impl fmt::Debug for Pages { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + pages_gen, + pages, + max_page_num, + } = self; + f.debug_struct("Pages") + .field( + "pages_gen", + &pages_gen.is_some().then_some(format_args!("...")), + ) + .field("pages", pages) + .field("max_page_num", max_page_num) + .finish() + } +} + +impl Pages { + fn new(pages_gen: Option>>>>) -> Self { + Self { + pages_gen, + pages: BTreeMap::new(), + max_page_num: 0, + } + } + fn close(&mut self) { + self.pages_gen = None; + } + fn is_past_end(&mut self, page_num: u32) -> Result> { + while self.pages_gen.is_some() && page_num > self.max_page_num { + self.fill_page()?; + } + Ok(page_num > self.max_page_num) + } + fn fill_page(&mut self) -> Result> { + let Some(pages_gen) = &mut self.pages_gen else { + return Ok(false); + }; + let page = pages_gen.next(); + let Some(page) = page else { + self.close(); + return Ok(false); + }; + let page = page?; + let page_num = page.page_num; + assert!( + page_num > self.max_page_num, + "page numbers must be a strictly-increasing positive integer sequence:\n\ + got {page_num} which isn't more than {}", + self.max_page_num + ); + self.pages.insert(page_num, Rc::new(page)); + self.max_page_num = page_num; + Ok(true) + } + fn get(&mut self, page_num: u32) -> Result>, Box> { + loop { + if let Some(page) = self.pages.get(&page_num) { + return Ok(Some(page.clone())); + } + if self.pages_gen.is_none() { + return Ok(None); + } + if page_num < self.max_page_num { + return Ok(None); + } + self.fill_page()?; + } + } } #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -1720,53 +1801,179 @@ impl TextSection { .expect("page_num out of range") .get_or_init(|| Self::page_sections_helper(page_num)) } + fn for_position(page_num: u32, x: f32, y: f32) -> Option { + for &i in Self::page_sections(page_num) { + if i.min_x.get() <= x && x <= i.max_x.get() && i.min_y.get() <= y && y <= i.max_y.get() + { + return Some(i); + } + } + None + } +} + +#[derive(Debug, Clone)] +struct InsnHeader { + header_lines: Vec, + mnemonic_lines: Vec, + bit_fields: InsnBitFields, +} + +impl InsnHeader { + fn min_y(&self) -> f32 { + self.bit_fields.box_min_y + } + fn write_xml(&self, parent: &mut xml_tree::Element) { + let header = parent.sub_element("header".into(), []); + header.text = "\n".into(); + header.tail = "\n".into(); + let title = header.sub_element("title".into(), []); + title.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.header_lines, title, false, false); + let mnemonics = header.sub_element("mnemonics".into(), []); + mnemonics.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.mnemonic_lines, mnemonics, false, false); + self.bit_fields.write_xml(header); + } +} + +#[derive(Debug, Clone)] +struct Insn { + headers: Vec, + code_lines: Vec, + desc_lines: Vec, + sp_regs_altered: Option, +} + +impl Insn { + fn write_xml(&self, parent: &mut xml_tree::Element) { + let insn = parent.sub_element("instruction".into(), []); + insn.text = "\n".into(); + insn.tail = "\n".into(); + for header in &self.headers { + header.write_xml(insn); + } + if !self.code_lines.is_empty() { + let code = insn.sub_element("code".into(), []); + code.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.code_lines, code, false, false); + } + if !self.desc_lines.is_empty() { + let desc = insn.sub_element("description".into(), []); + desc.tail = "\n".into(); + ParsedTextLine::write_xml_lines(&self.desc_lines, desc, false, false); + } + if let Some(sp_regs_altered) = &self.sp_regs_altered { + sp_regs_altered.write_xml(insn); + } + } +} + +#[derive(Debug)] +struct Parser { + pages: Pages, + text_section: TextSection, + insns: Vec, +} + +impl Parser { + fn new() -> Self { + Self { + pages: Pages::new(None), + text_section: TextSection::first(), + insns: Vec::new(), + } + } + fn page(&mut self) -> Result, Box> { + Ok(self + .pages + .get(self.text_section.page_num)? + .ok_or("page_num is out of range")?) + } + fn unprocessed_chars( + &mut self, + ) -> Result>>>, Box> { + Ok(self + .page()? + .unprocessed_chars + .borrow_mut() + .entry(self.text_section) + .or_default() + .clone()) + } + fn pages_gen( + file: &str, + page_numbers: Option>>, + ) -> Result>>>, Box> { + let page_numbers = page_numbers.map(|page_numbers| { + let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1)); + retval.sort(); + retval + }); + let document = mupdf::Document::open(file)?; + let pages: Vec = document.pages().and_then(|pages| pages.collect())?; + Ok(Box::new(pages.into_iter().enumerate().map( + move |(i, page)| { + let page_num = match &page_numbers { + Some(page_numbers) => page_numbers[i] + 1, + None => i as u32 + 1, + }; + println!("page {page_num}"); + Ok(Page::from_mupdf_page(page_num, page) + .map_err(|e| format!("error reading pdf page {page_num}: {e}"))?) + }, + ))) + } + fn parse_pdf>>( + &mut self, + file: &str, + page_numbers: Option, + ) -> Result<(), Box> { + self.pages = Pages::new(Some(Self::pages_gen( + file, + page_numbers.map(|v| v.into_iter().collect()), + )?)); + self.text_section = TextSection::first(); + loop { + self.text_section = self.text_section.next(); + if self.pages.is_past_end(self.text_section.page_num)? { + return Ok(()); + } + if self.pages.get(self.text_section.page_num)?.is_some() { + println!("section {:?}", self.text_section); + self.note_text_section(Self::parse_text_section)?; + } + } + } + fn note_text_section( + &mut self, + f: impl FnOnce(&mut Self) -> Result<(), Box>, + ) -> Result<(), Box> { + let start_text_section = self.text_section; + match f(self) { + Ok(()) => Ok(()), + Err(e) => { + let note = if self.text_section == start_text_section { + format!("text_section={:?}", self.text_section) + } else { + format!( + "start_text_section={start_text_section:?}\ntext_section={:?}", + self.text_section + ) + }; + Err(format!("{e}\nnote: {note}").into()) + } + } + } + fn parse_text_section(&mut self) -> Result<(), Box> { + todo!() + } } #[derive(Clone, Debug, Default)] struct MyDevice { - chars: Rc>>, -} - -impl MyDevice { - fn text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix) { - for span in text.spans() { - let span_font = span.font(); - let font_name = span_font.name(); - const ROUND_FACTOR: f32 = 1000.0; - let Some(size) = - NonNaNF32::new((span.trm().expansion() * ROUND_FACTOR).round() / ROUND_FACTOR) - else { - continue; - }; - let font = Font::new(font_name, size); - for item in span.items() { - let Some(ch) = u32::try_from(item.ucs()) - .ok() - .and_then(|v| char::try_from(v).ok()) - else { - continue; - }; - let mut m = span.trm(); - m.e = item.x(); - m.f = item.y(); - m.concat(cmt); - let (min_x, min_y, max_x, max_y) = match span.wmode() { - mupdf::WriteMode::Horizontal => { - todo!(); - } - mupdf::WriteMode::Vertical => todo!(), - }; - self.chars.borrow_mut().push(Char { - font, - text: String::from(ch), - min_x, - min_y, - max_x, - max_y, - }); - } - } - } + qt: Rc>>>, + unprocessed_non_text: Rc>>, } impl mupdf::NativeDevice for MyDevice { @@ -1815,48 +2022,190 @@ impl mupdf::NativeDevice for MyDevice { ) { // TODO } +} - fn fill_text( - &mut self, - text: &mupdf::Text, - cmt: mupdf::Matrix, - _color_space: &mupdf::Colorspace, - _color: &[f32], - _alpha: f32, - _cp: mupdf::ColorParams, - ) { - self.text(text, cmt); - } +#[derive(serde::Deserialize, Debug)] +enum MuPdfXml<'a> { + #[serde(rename = "page")] + Page(MuPdfXmlPage<'a>), +} - fn stroke_text( - &mut self, - text: &mupdf::Text, - _stroke_state: &mupdf::StrokeState, - cmt: mupdf::Matrix, - _color_space: &mupdf::Colorspace, - _color: &[f32], - _alpha: f32, - _cp: mupdf::ColorParams, - ) { - self.text(text, cmt); - } +#[derive(serde::Deserialize, Debug)] +struct MuPdfXmlPage<'a> { + #[serde(rename = "@id")] + id: Cow<'a, str>, + #[serde(rename = "@width")] + width: f32, + #[serde(rename = "@height")] + height: f32, + block: Vec>, +} - fn clip_text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix, _scissor: mupdf::Rect) { - self.text(text, cmt); - } +#[derive(serde::Deserialize, Debug)] +struct MuPdfXmlBlock<'a> { + #[serde(rename = "@bbox")] + bbox: [f32; 4], + #[serde(rename = "@justify")] + justify: Cow<'a, str>, + line: Vec>, +} - fn clip_stroke_text( - &mut self, - text: &mupdf::Text, - _stroke_state: &mupdf::StrokeState, - cmt: mupdf::Matrix, - _scissor: mupdf::Rect, - ) { - self.text(text, cmt); - } +#[derive(serde::Deserialize, Debug)] +struct MuPdfXmlLine<'a> { + #[serde(rename = "@bbox")] + bbox: [f32; 4], + #[serde(rename = "@wmode")] + wmode: u8, + #[serde(rename = "@dir")] + dir: [f32; 2], + #[serde(rename = "@text")] + text: Cow<'a, str>, + font: Vec>, +} - fn ignore_text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix) { - self.text(text, cmt); +#[derive(serde::Deserialize, Debug)] +struct MuPdfXmlFont<'a> { + #[serde(rename = "@name")] + name: Cow<'a, str>, + #[serde(rename = "@size")] + size: f32, + char: Vec>, +} + +#[derive(serde::Deserialize, Debug)] +struct MuPdfXmlChar<'a> { + #[serde(rename = "@quad")] + quad: [f32; 8], + #[serde(rename = "@x")] + x: f32, + #[serde(rename = "@y")] + y: f32, + #[serde(rename = "@bidi")] + bidi: u16, + #[serde(rename = "@color")] + color: Cow<'a, str>, + #[serde(rename = "@alpha")] + alpha: Cow<'a, str>, + #[serde(rename = "@flags")] + flags: u32, + #[serde(rename = "@c")] + c: Cow<'a, str>, +} + +impl Page { + fn from_mupdf_page( + page_num: u32, + page: mupdf::Page, + ) -> Result> { + let device = MyDevice::default(); + page.run( + &mupdf::Device::from_native(device.clone())?, + &mupdf::Matrix::IDENTITY, + )?; + let MyDevice { + qt, + unprocessed_non_text, + } = device; + let mut qt = Rc::try_unwrap(qt) + .ok() + .expect("already dropped all other references") + .into_inner(); + let unprocessed_chars: Rc< + RefCell>>>>>, + > = Rc::default(); + // we convert to xml and parse that becuase the mupdf rust crate doesn't include all the API surface we need. + let xml = page.to_xml()?; + let MuPdfXml::Page(xml_page) = quick_xml::de::from_str(&xml)?; + for xml_block in xml_page.block { + for xml_line in xml_block.line { + for xml_font in xml_line.font { + const ROUND_FACTOR: f32 = 1000.0; + let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR; + let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?; + let font = Font::new(&xml_font.name, font_size); + for xml_char in xml_font.char { + let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad; + let min_x = x0.min(x1).min(x2).min(x3); + let max_x = x0.max(x1).max(x2).max(x3); + let min_y = y0.min(y1).min(y2).min(y3); + let max_y = y0.max(y1).max(y2).max(y3); + let Some(text_section) = TextSection::for_position( + page_num, + (min_x + max_x) * 0.5, + (min_y + max_y) * 0.5, + ) else { + if PAGE_BODY_MIN_Y <= min_y && min_y <= PAGE_BODY_MAX_Y { + if page_num != 1072 { + // page 1072 has characters in the margins + return Err( + format!("char not in text section: {xml_char:?}\npage_num={page_num}").into(), + ); + } + } + continue; + }; + let char = Char { + font: font.clone(), + text: xml_char.c.into_owned(), + min_x: NonNaNF32::new(min_x).ok_or("char position shouldn't be NaN")?, + min_y: NonNaNF32::new(min_y).ok_or("char position shouldn't be NaN")?, + max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?, + max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?, + }; + qt.entry(text_section).or_default().insert( + min_x, + min_y, + PageItem::Char(char.clone()), + ); + unprocessed_chars + .borrow_mut() + .entry(text_section) + .or_default() + .borrow_mut() + .entry(char.font.clone()) + .or_default() + .insert(char); + } + } + } + } + for i in unprocessed_chars.borrow_mut().values_mut() { + for j in i.borrow_mut().values_mut() { + j.sort_by_key(Char::top_down_left_to_right_sort_key); + } + } + let mut unknown_fonts = Vec::new(); + let mut unknown_font_errors = Vec::new(); + for i in RefCell::borrow(&unprocessed_chars).values() { + for (font, chars) in RefCell::borrow(i).iter() { + if font.known_font_group().is_none() { + let mut text = String::new(); + for char in chars { + text += &char.text; + } + unknown_fonts.push(format!("{font:?},")); + unknown_font_errors.push(format!( + "unknown font {font:?}\nlast char: {:?}\ntext: {text:?}", + chars.last() + )); + } + } + } + unknown_fonts.sort(); + if !unknown_fonts.is_empty() { + return Err(format!( + "\nunknown fonts:\n{}\n\n{}", + unknown_fonts.join("\n"), + unknown_font_errors.join("\n") + ) + .into()); + } + Ok(Self { + page_num, + qt, + unprocessed_chars, + unprocessed_non_text, + }) } } @@ -1880,20 +2229,25 @@ fn main() -> Result<(), Box> { } else { None }; - let document = mupdf::Document::open(&args[1])?; - let pages: Vec<_> = document.pages()?.collect::>()?; - let page_numbers = page_numbers.unwrap_or_else(|| { - Box::new( - (0..pages.len()).map(|i| NonZero::new((i + 1) as u32).expect("known to be non-zero")), - ) - }); - for page_num in page_numbers { - let device = MyDevice::default(); - pages[page_num.get() as usize - 1].run( - &mupdf::Device::from_native(device.clone())?, - &mupdf::Matrix::IDENTITY, - )?; - println!("{device:?}"); + let mut parser = Parser::new(); + let is_subset = page_numbers.is_some(); + let file_name = &args[1]; + parser.parse_pdf(file_name, page_numbers)?; + let mut insns = xml_tree::Element::new( + "instructions".into(), + [("is-subset".into(), is_subset.to_string())], + ); + insns.text = "\n".into(); + insns.tail = "\n".into(); + let mut comment = + xml_tree::Element::comment(format!(" Automatically generated from {file_name} ")); + comment.tail = "\n".into(); + insns.children.push(comment); + for insn in parser.insns { + insn.write_xml(&mut insns); } + let mut output = Vec::new(); + insns.write(&mut output, true)?; + std::fs::write("powerisa-instructions.xml", output)?; Ok(()) } diff --git a/src/xml_tree.rs b/src/xml_tree.rs index 1fce103..6139ca6 100644 --- a/src/xml_tree.rs +++ b/src/xml_tree.rs @@ -3,7 +3,7 @@ use quick_xml::{ Writer, - events::{BytesText, Event}, + events::{BytesDecl, BytesText, Event}, }; use std::fmt; @@ -110,47 +110,7 @@ pub(crate) struct Element { impl fmt::Display for Element { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut writer = Writer::new(FmtToIoAdaptor::new(f)); - fn helper( - element: &Element, - writer: &mut Writer, - ) -> std::io::Result<()> { - let Element { - tag, - attrib, - text, - children, - tail, - } = element; - match tag { - ElementTag::Comment => { - writer.write_event(Event::Comment(BytesText::new(text)))?; - } - ElementTag::Normal(tag) if tag.is_empty() => { - writer.write_event(Event::Text(BytesText::new(text)))?; - } - ElementTag::Normal(tag) => { - let mut element_writer = writer.create_element(tag); - for (name, value) in attrib { - element_writer = - element_writer.with_attribute((name.as_str(), value.as_str())); - } - if text.is_empty() && children.is_empty() { - element_writer.write_empty()?; - } else { - element_writer.write_inner_content(|writer| { - writer.write_event(Event::Text(BytesText::new(text)))?; - for child in children { - helper(child, writer)?; - } - Ok(()) - })?; - } - } - } - writer.write_event(Event::Text(BytesText::new(tail)))?; - Ok(()) - } - helper(self, &mut writer).map_err(|_| fmt::Error)?; + self.write_to(&mut writer).map_err(|_| fmt::Error)?; writer.into_inner().finish()?; Ok(()) } @@ -166,6 +126,16 @@ impl Element { tail: String::new(), } } + /// equivalent of python's `xml.etree.ElementTree.Comment()` + pub(crate) fn comment(text: String) -> Self { + Self { + tag: ElementTag::Comment, + attrib: Vec::new(), + text, + children: Vec::new(), + tail: String::new(), + } + } /// equivalent to python `"".join(self.itertext())` pub(crate) fn inner_text(&self) -> String { let mut retval = String::new(); @@ -198,4 +168,53 @@ impl Element { self.children.push(Self::new(tag, attrib)); self.children.last_mut().expect("just pushed") } + pub(crate) fn write_to(&self, writer: &mut Writer) -> std::io::Result<()> { + let Element { + tag, + attrib, + text, + children, + tail, + } = self; + match tag { + ElementTag::Comment => { + writer.write_event(Event::Comment(BytesText::new(text)))?; + } + ElementTag::Normal(tag) if tag.is_empty() => { + writer.write_event(Event::Text(BytesText::new(text)))?; + } + ElementTag::Normal(tag) => { + let mut element_writer = writer.create_element(tag); + for (name, value) in attrib { + element_writer = element_writer.with_attribute((name.as_str(), value.as_str())); + } + if text.is_empty() && children.is_empty() { + element_writer.write_empty()?; + } else { + element_writer.write_inner_content(|writer| { + writer.write_event(Event::Text(BytesText::new(text)))?; + for child in children { + child.write_to(writer)?; + } + Ok(()) + })?; + } + } + } + writer.write_event(Event::Text(BytesText::new(tail)))?; + Ok(()) + } + /// equivalent of python's `xml.etree.ElementTree(self).write(writer, encoding='utf-8', xml_declaration=xml_declaration)` + pub(crate) fn write( + &self, + writer: impl std::io::Write, + xml_declaration: bool, + ) -> std::io::Result<()> { + let mut writer = Writer::new(writer); + if xml_declaration { + writer.write_event(Event::Decl(BytesDecl::new("1.0", Some("utf-8"), None)))?; + writer.write_event(Event::Text(BytesText::new("\n")))?; + } + self.write_to(&mut writer) + } }