wip porting to rust
Some checks failed
/ test (push) Failing after 27s

This commit is contained in:
Jacob Lifshay 2026-01-02 16:09:43 -08:00
parent 718de40b09
commit c8cd234d8f
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ
4 changed files with 529 additions and 141 deletions

13
Cargo.lock generated
View file

@ -222,9 +222,11 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
name = "parse_powerisa_pdf"
version = "0.1.0"
dependencies = [
"indexmap",
"libm",
"mupdf",
"quick-xml",
"serde",
]
[[package]]
@ -258,6 +260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
dependencies = [
"memchr",
"serde",
]
[[package]]
@ -310,6 +313,16 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"

View file

@ -11,6 +11,8 @@ categories = []
rust-version = "1.89.0"
[dependencies]
indexmap = "2.12.1"
libm = "0.2.15"
mupdf = { version = "0.5.0", default-features = false }
quick-xml = "0.38.4"
quick-xml = { version = "0.38.4", features = ["serialize"] }
serde = { version = "1.0.228", features = ["derive"] }

View file

@ -2,11 +2,13 @@
// See Notices.txt for copyright information
use crate::quad_tree::QuadTree;
use indexmap::IndexSet;
use non_nan_float::NonNaNF32;
use std::{
borrow::Borrow,
borrow::{Borrow, Cow},
cell::RefCell,
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
error::Error,
fmt,
num::NonZero,
rc::Rc,
@ -560,7 +562,7 @@ impl Char {
/// Returns the difference between this character's `max_y` and `min_y`.
fn height(&self) -> f32 {
    let upper = self.max_y.get();
    let lower = self.min_y.get();
    upper - lower
}
fn top_down_left_to_right_sort_key(&self) -> impl Ord {
fn top_down_left_to_right_sort_key(&self) -> impl Ord + use<> {
(-self.min_y, self.min_x)
}
}
@ -1463,8 +1465,87 @@ enum LineOrRect {
struct Page {
page_num: u32,
qt: BTreeMap<TextSection, QuadTree<PageItem>>,
unprocessed_chars: BTreeMap<TextSection, BTreeMap<Font, BTreeSet<Char>>>,
unprocessed_non_text: BTreeSet<LineOrRect>,
unprocessed_chars:
Rc<RefCell<BTreeMap<TextSection, Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>>>>,
unprocessed_non_text: Rc<RefCell<IndexSet<LineOrRect>>>,
}
/// Cache of loaded pages, keyed by page number, filled on demand from an
/// optional page iterator.
struct Pages {
/// Source of pages that have not been loaded yet; `None` when there is no
/// source, or once it has been closed or has run out of pages.
pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>>,
/// Every page loaded so far, keyed by its page number.
pages: BTreeMap<u32, Rc<Page>>,
/// Page number of the most recently loaded page (`0` before any page is loaded).
max_page_num: u32,
}
impl fmt::Debug for Pages {
    /// Formats all three fields like a derived `Debug` would, except that the
    /// page iterator is rendered as `Some(...)` or `None` instead of its
    /// contents.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut debug_struct = f.debug_struct("Pages");
        // only show whether a page source is still attached
        debug_struct.field(
            "pages_gen",
            &self.pages_gen.is_some().then_some(format_args!("...")),
        );
        debug_struct.field("pages", &self.pages);
        debug_struct.field("max_page_num", &self.max_page_num);
        debug_struct.finish()
    }
}
impl Pages {
    /// Creates an empty page cache that pulls pages from `pages_gen` on demand.
    fn new(pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>>) -> Self {
        Self {
            max_page_num: 0,
            pages: BTreeMap::new(),
            pages_gen,
        }
    }
    /// Drops the page source; no further pages will be loaded.
    fn close(&mut self) {
        drop(self.pages_gen.take());
    }
    /// Returns `true` when `page_num` is greater than every page number the
    /// source can produce, loading pages as needed to find out.
    ///
    /// # Errors
    /// Propagates any error produced while loading a page.
    fn is_past_end(&mut self, page_num: u32) -> Result<bool, Box<dyn Error>> {
        loop {
            let beyond_loaded = page_num > self.max_page_num;
            if self.pages_gen.is_none() || !beyond_loaded {
                return Ok(beyond_loaded);
            }
            self.fill_page()?;
        }
    }
    /// Loads the next page from the source into the cache.
    ///
    /// Returns `Ok(true)` when a page was loaded and `Ok(false)` when there
    /// is no source or it has run out of pages (in which case it is closed).
    ///
    /// # Errors
    /// Propagates the error if the source yields one.
    ///
    /// # Panics
    /// Panics if the loaded page's number is not greater than the previous
    /// maximum.
    fn fill_page(&mut self) -> Result<bool, Box<dyn Error>> {
        let next_page = match self.pages_gen.as_mut() {
            Some(source) => source.next(),
            None => return Ok(false),
        };
        let loaded = match next_page {
            Some(result) => result?,
            None => {
                self.close();
                return Ok(false);
            }
        };
        let page_num = loaded.page_num;
        assert!(
            page_num > self.max_page_num,
            "page numbers must be a strictly-increasing positive integer sequence:\n\
            got {page_num} which isn't more than {}",
            self.max_page_num
        );
        self.max_page_num = page_num;
        self.pages.insert(page_num, Rc::new(loaded));
        Ok(true)
    }
    /// Returns the page numbered `page_num`, loading pages from the source
    /// until it is found.
    ///
    /// Returns `Ok(None)` when the source is gone or has already moved past
    /// `page_num` without producing it.
    ///
    /// # Errors
    /// Propagates any error produced while loading a page.
    fn get(&mut self, page_num: u32) -> Result<Option<Rc<Page>>, Box<dyn Error>> {
        while !self.pages.contains_key(&page_num) {
            if self.pages_gen.is_none() || page_num < self.max_page_num {
                return Ok(None);
            }
            self.fill_page()?;
        }
        Ok(self.pages.get(&page_num).cloned())
    }
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
@ -1720,53 +1801,179 @@ impl TextSection {
.expect("page_num out of range")
.get_or_init(|| Self::page_sections_helper(page_num))
}
/// Returns the first text section of page `page_num` whose bounds (inclusive
/// on every side) contain the point (`x`, `y`), or `None` if no section does.
fn for_position(page_num: u32, x: f32, y: f32) -> Option<Self> {
    Self::page_sections(page_num)
        .iter()
        .copied()
        .find(|section| {
            let within_x = section.min_x.get() <= x && x <= section.max_x.get();
            let within_y = section.min_y.get() <= y && y <= section.max_y.get();
            within_x && within_y
        })
}
}
/// One header of an instruction; written out as a `<header>` XML element.
#[derive(Debug, Clone)]
struct InsnHeader {
/// Lines written inside the header's `<title>` element.
header_lines: Vec<ParsedTextLine>,
/// Lines written inside the header's `<mnemonics>` element.
mnemonic_lines: Vec<ParsedTextLine>,
/// Bit-field description, written after the mnemonics; its `box_min_y` is
/// what [`InsnHeader::min_y`] returns.
bit_fields: InsnBitFields,
}
impl InsnHeader {
    /// Returns the `box_min_y` of this header's bit fields.
    fn min_y(&self) -> f32 {
        self.bit_fields.box_min_y
    }
    /// Appends a `<header>` element to `parent` containing, in order, a
    /// `<title>` element, a `<mnemonics>` element, and the bit fields.
    fn write_xml(&self, parent: &mut xml_tree::Element) {
        let header = parent.sub_element("header".into(), []);
        header.text = "\n".into();
        header.tail = "\n".into();
        let text_children = [
            ("title", &self.header_lines),
            ("mnemonics", &self.mnemonic_lines),
        ];
        for (tag, lines) in text_children {
            let child = header.sub_element(tag.into(), []);
            child.tail = "\n".into();
            ParsedTextLine::write_xml_lines(lines, child, false, false);
        }
        self.bit_fields.write_xml(header);
    }
}
/// One parsed instruction; written out as an `<instruction>` XML element.
#[derive(Debug, Clone)]
struct Insn {
/// Headers of the instruction, written first and in order.
headers: Vec<InsnHeader>,
/// Lines written inside `<code>`; the element is omitted when this is empty.
code_lines: Vec<ParsedTextLine>,
/// Lines written inside `<description>`; the element is omitted when this is empty.
desc_lines: Vec<ParsedTextLine>,
/// Optional trailing section, written last when present.
sp_regs_altered: Option<InsnSpRegsAltered>,
}
impl Insn {
    /// Appends an `<instruction>` element to `parent` containing the headers,
    /// then `<code>` and `<description>` (each only when non-empty), then the
    /// optional special-registers-altered section.
    fn write_xml(&self, parent: &mut xml_tree::Element) {
        let insn = parent.sub_element("instruction".into(), []);
        insn.text = "\n".into();
        insn.tail = "\n".into();
        for header in &self.headers {
            header.write_xml(insn);
        }
        let optional_sections = [
            ("code", &self.code_lines),
            ("description", &self.desc_lines),
        ];
        for (tag, lines) in optional_sections {
            if lines.is_empty() {
                continue;
            }
            let section = insn.sub_element(tag.into(), []);
            section.tail = "\n".into();
            ParsedTextLine::write_xml_lines(lines, section, false, false);
        }
        if let Some(sp_regs_altered) = &self.sp_regs_altered {
            sp_regs_altered.write_xml(insn);
        }
    }
}
/// State of one PDF parsing run.
#[derive(Debug)]
struct Parser {
/// Lazily-loaded pages of the PDF being parsed.
pages: Pages,
/// Text section the parser is currently positioned at.
text_section: TextSection,
/// Instructions collected so far.
insns: Vec<Insn>,
}
impl Parser {
fn new() -> Self {
Self {
pages: Pages::new(None),
text_section: TextSection::first(),
insns: Vec::new(),
}
}
fn page(&mut self) -> Result<Rc<Page>, Box<dyn Error>> {
Ok(self
.pages
.get(self.text_section.page_num)?
.ok_or("page_num is out of range")?)
}
fn unprocessed_chars(
&mut self,
) -> Result<Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>, Box<dyn Error>> {
Ok(self
.page()?
.unprocessed_chars
.borrow_mut()
.entry(self.text_section)
.or_default()
.clone())
}
fn pages_gen(
file: &str,
page_numbers: Option<Vec<NonZero<u32>>>,
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> {
let page_numbers = page_numbers.map(|page_numbers| {
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1));
retval.sort();
retval
});
let document = mupdf::Document::open(file)?;
let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?;
Ok(Box::new(pages.into_iter().enumerate().map(
move |(i, page)| {
let page_num = match &page_numbers {
Some(page_numbers) => page_numbers[i] + 1,
None => i as u32 + 1,
};
println!("page {page_num}");
Ok(Page::from_mupdf_page(page_num, page)
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?)
},
)))
}
fn parse_pdf<I: Iterator<Item = NonZero<u32>>>(
&mut self,
file: &str,
page_numbers: Option<I>,
) -> Result<(), Box<dyn Error>> {
self.pages = Pages::new(Some(Self::pages_gen(
file,
page_numbers.map(|v| v.into_iter().collect()),
)?));
self.text_section = TextSection::first();
loop {
self.text_section = self.text_section.next();
if self.pages.is_past_end(self.text_section.page_num)? {
return Ok(());
}
if self.pages.get(self.text_section.page_num)?.is_some() {
println!("section {:?}", self.text_section);
self.note_text_section(Self::parse_text_section)?;
}
}
}
fn note_text_section(
&mut self,
f: impl FnOnce(&mut Self) -> Result<(), Box<dyn Error>>,
) -> Result<(), Box<dyn Error>> {
let start_text_section = self.text_section;
match f(self) {
Ok(()) => Ok(()),
Err(e) => {
let note = if self.text_section == start_text_section {
format!("text_section={:?}", self.text_section)
} else {
format!(
"start_text_section={start_text_section:?}\ntext_section={:?}",
self.text_section
)
};
Err(format!("{e}\nnote: {note}").into())
}
}
}
fn parse_text_section(&mut self) -> Result<(), Box<dyn Error>> {
todo!()
}
}
#[derive(Clone, Debug, Default)]
struct MyDevice {
chars: Rc<RefCell<Vec<Char>>>,
}
impl MyDevice {
/// Appends one [`Char`] to `self.chars` for every item of every span in
/// `text`. Work in progress: computing the character bounds is still
/// `todo!()`, so this panics as soon as an item with a valid code point is
/// reached.
fn text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix) {
for span in text.spans() {
let span_font = span.font();
let font_name = span_font.name();
// the font size is rounded to 1/ROUND_FACTOR
const ROUND_FACTOR: f32 = 1000.0;
// spans whose computed size is NaN are skipped entirely
let Some(size) =
NonNaNF32::new((span.trm().expansion() * ROUND_FACTOR).round() / ROUND_FACTOR)
else {
continue;
};
let font = Font::new(font_name, size);
for item in span.items() {
// items whose code point is not a valid `char` are skipped
let Some(ch) = u32::try_from(item.ucs())
.ok()
.and_then(|v| char::try_from(v).ok())
else {
continue;
};
// span matrix with the item's position as translation, combined with `cmt`
let mut m = span.trm();
m.e = item.x();
m.f = item.y();
m.concat(cmt);
let (min_x, min_y, max_x, max_y) = match span.wmode() {
mupdf::WriteMode::Horizontal => {
todo!();
}
mupdf::WriteMode::Vertical => todo!(),
};
self.chars.borrow_mut().push(Char {
font,
text: String::from(ch),
min_x,
min_y,
max_x,
max_y,
});
}
}
}
qt: Rc<RefCell<BTreeMap<TextSection, QuadTree<PageItem>>>>,
unprocessed_non_text: Rc<RefCell<IndexSet<LineOrRect>>>,
}
impl mupdf::NativeDevice for MyDevice {
@ -1815,48 +2022,190 @@ impl mupdf::NativeDevice for MyDevice {
) {
// TODO
}
}
/// Forwards `text` and `cmt` to `self.text`; the colour, alpha and colour
/// parameter arguments are ignored.
fn fill_text(
&mut self,
text: &mupdf::Text,
cmt: mupdf::Matrix,
_color_space: &mupdf::Colorspace,
_color: &[f32],
_alpha: f32,
_cp: mupdf::ColorParams,
) {
self.text(text, cmt);
}
/// Root of the XML document deserialized from a page's `to_xml()` output.
#[derive(serde::Deserialize, Debug)]
enum MuPdfXml<'a> {
/// The root `page` element.
#[serde(rename = "page")]
Page(MuPdfXmlPage<'a>),
}
/// Forwards `text` and `cmt` to `self.text`; the stroke state, colour, alpha
/// and colour parameter arguments are ignored.
fn stroke_text(
&mut self,
text: &mupdf::Text,
_stroke_state: &mupdf::StrokeState,
cmt: mupdf::Matrix,
_color_space: &mupdf::Colorspace,
_color: &[f32],
_alpha: f32,
_cp: mupdf::ColorParams,
) {
self.text(text, cmt);
}
/// The `page` element. Fields renamed with a leading `@` are read from XML
/// attributes (quick-xml's serde convention); the others are child elements.
#[derive(serde::Deserialize, Debug)]
struct MuPdfXmlPage<'a> {
/// The `id` attribute.
#[serde(rename = "@id")]
id: Cow<'a, str>,
/// The `width` attribute.
#[serde(rename = "@width")]
width: f32,
/// The `height` attribute.
#[serde(rename = "@height")]
height: f32,
/// All `block` child elements, in document order.
block: Vec<MuPdfXmlBlock<'a>>,
}
/// Forwards `text` and `cmt` to `self.text`; the scissor rectangle is ignored.
fn clip_text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix, _scissor: mupdf::Rect) {
self.text(text, cmt);
}
/// A `block` element of a page. Fields renamed with a leading `@` are read
/// from XML attributes; the others are child elements.
#[derive(serde::Deserialize, Debug)]
struct MuPdfXmlBlock<'a> {
/// The `bbox` attribute, parsed as four floats.
#[serde(rename = "@bbox")]
bbox: [f32; 4],
/// The `justify` attribute.
#[serde(rename = "@justify")]
justify: Cow<'a, str>,
/// All `line` child elements, in document order.
line: Vec<MuPdfXmlLine<'a>>,
}
/// Forwards `text` and `cmt` to `self.text`; the stroke state and scissor
/// rectangle are ignored.
fn clip_stroke_text(
&mut self,
text: &mupdf::Text,
_stroke_state: &mupdf::StrokeState,
cmt: mupdf::Matrix,
_scissor: mupdf::Rect,
) {
self.text(text, cmt);
}
/// A `line` element of a block. Fields renamed with a leading `@` are read
/// from XML attributes; the others are child elements.
#[derive(serde::Deserialize, Debug)]
struct MuPdfXmlLine<'a> {
/// The `bbox` attribute, parsed as four floats.
#[serde(rename = "@bbox")]
bbox: [f32; 4],
/// The `wmode` attribute.
#[serde(rename = "@wmode")]
wmode: u8,
/// The `dir` attribute, parsed as two floats.
#[serde(rename = "@dir")]
dir: [f32; 2],
/// The `text` attribute.
#[serde(rename = "@text")]
text: Cow<'a, str>,
/// All `font` child elements, in document order.
font: Vec<MuPdfXmlFont<'a>>,
}
fn ignore_text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix) {
self.text(text, cmt);
/// A `font` element of a line. Fields renamed with a leading `@` are read
/// from XML attributes; the others are child elements.
#[derive(serde::Deserialize, Debug)]
struct MuPdfXmlFont<'a> {
/// The `name` attribute; passed to `Font::new` when building a page.
#[serde(rename = "@name")]
name: Cow<'a, str>,
/// The `size` attribute; rounded and passed to `Font::new` when building a page.
#[serde(rename = "@size")]
size: f32,
/// All `char` child elements, in document order.
char: Vec<MuPdfXmlChar<'a>>,
}
/// A `char` element of a font run. Every field is read from the XML
/// attribute of the same name (leading `@` in the serde rename).
#[derive(serde::Deserialize, Debug)]
struct MuPdfXmlChar<'a> {
/// The `quad` attribute, parsed as eight floats; consumed as four
/// `(x, y)` corner pairs when computing the character's bounds.
#[serde(rename = "@quad")]
quad: [f32; 8],
/// The `x` attribute.
#[serde(rename = "@x")]
x: f32,
/// The `y` attribute.
#[serde(rename = "@y")]
y: f32,
/// The `bidi` attribute.
#[serde(rename = "@bidi")]
bidi: u16,
/// The `color` attribute, kept as text.
#[serde(rename = "@color")]
color: Cow<'a, str>,
/// The `alpha` attribute, kept as text.
#[serde(rename = "@alpha")]
alpha: Cow<'a, str>,
/// The `flags` attribute.
#[serde(rename = "@flags")]
flags: u32,
/// The `c` attribute; becomes the text of the resulting `Char`.
#[serde(rename = "@c")]
c: Cow<'a, str>,
}
impl Page {
    /// Builds a [`Page`] numbered `page_num` from a mupdf page.
    ///
    /// The page is first run through a [`MyDevice`], then its characters are
    /// read from mupdf's XML output, assigned to text sections, and recorded
    /// both in the per-section quad trees and in the per-section, per-font
    /// sets of unprocessed characters (each sorted top-down, left-to-right).
    ///
    /// # Errors
    /// Fails if mupdf or the XML deserializer fails, if a font size or
    /// character position is NaN, if a character inside the page body lies
    /// outside every text section (except on page 1072), or if any character
    /// uses a font with no known font group.
    ///
    /// # Panics
    /// Panics if the device's quad-tree map is still shared after the page
    /// has been run.
    fn from_mupdf_page(
        page_num: u32,
        page: mupdf::Page,
    ) -> Result<Self, Box<dyn std::error::Error>> {
        let device = MyDevice::default();
        page.run(
            &mupdf::Device::from_native(device.clone())?,
            &mupdf::Matrix::IDENTITY,
        )?;
        let MyDevice {
            qt,
            unprocessed_non_text,
        } = device;
        let mut qt = Rc::try_unwrap(qt)
            .ok()
            .expect("already dropped all other references")
            .into_inner();
        let unprocessed_chars: Rc<
            RefCell<BTreeMap<TextSection, Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>>>,
        > = Rc::default();
        // we convert to xml and parse that because the mupdf rust crate doesn't include all the API surface we need.
        let xml = page.to_xml()?;
        let MuPdfXml::Page(xml_page) = quick_xml::de::from_str(&xml)?;
        // every font run on the page, in document order
        let xml_fonts = xml_page
            .block
            .into_iter()
            .flat_map(|xml_block| xml_block.line)
            .flat_map(|xml_line| xml_line.font);
        let non_nan = |v: f32| NonNaNF32::new(v).ok_or("char position shouldn't be NaN");
        for xml_font in xml_fonts {
            const ROUND_FACTOR: f32 = 1000.0;
            let rounded_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR;
            let font = Font::new(
                &xml_font.name,
                NonNaNF32::new(rounded_size).ok_or("font size must not be NaN")?,
            );
            for xml_char in xml_font.char {
                // axis-aligned bounds of the character's quad
                let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad;
                let min_x = [x1, x2, x3].into_iter().fold(x0, f32::min);
                let max_x = [x1, x2, x3].into_iter().fold(x0, f32::max);
                let min_y = [y1, y2, y3].into_iter().fold(y0, f32::min);
                let max_y = [y1, y2, y3].into_iter().fold(y0, f32::max);
                // the section is chosen by the center of the bounds
                let center_x = (min_x + max_x) * 0.5;
                let center_y = (min_y + max_y) * 0.5;
                let Some(text_section) = TextSection::for_position(page_num, center_x, center_y)
                else {
                    let in_page_body = PAGE_BODY_MIN_Y <= min_y && min_y <= PAGE_BODY_MAX_Y;
                    // page 1072 has characters in the margins
                    if in_page_body && page_num != 1072 {
                        return Err(format!(
                            "char not in text section: {xml_char:?}\npage_num={page_num}"
                        )
                        .into());
                    }
                    continue;
                };
                let char = Char {
                    font: font.clone(),
                    text: xml_char.c.into_owned(),
                    min_x: non_nan(min_x)?,
                    min_y: non_nan(min_y)?,
                    max_x: non_nan(max_x)?,
                    max_y: non_nan(max_y)?,
                };
                qt.entry(text_section)
                    .or_default()
                    .insert(min_x, min_y, PageItem::Char(char.clone()));
                let font_key = char.font.clone();
                unprocessed_chars
                    .borrow_mut()
                    .entry(text_section)
                    .or_default()
                    .borrow_mut()
                    .entry(font_key)
                    .or_default()
                    .insert(char);
            }
        }
        for per_font in unprocessed_chars.borrow_mut().values_mut() {
            for chars in per_font.borrow_mut().values_mut() {
                chars.sort_by_key(Char::top_down_left_to_right_sort_key);
            }
        }
        // collect every font without a known font group so they can all be
        // reported in a single error
        let mut unknown_fonts = Vec::new();
        let mut unknown_font_errors = Vec::new();
        for per_font in RefCell::borrow(&unprocessed_chars).values() {
            for (font, chars) in RefCell::borrow(per_font).iter() {
                if font.known_font_group().is_some() {
                    continue;
                }
                let text: String = chars.iter().map(|char| char.text.as_str()).collect();
                unknown_fonts.push(format!("{font:?},"));
                unknown_font_errors.push(format!(
                    "unknown font {font:?}\nlast char: {:?}\ntext: {text:?}",
                    chars.last()
                ));
            }
        }
        unknown_fonts.sort();
        if !unknown_fonts.is_empty() {
            return Err(format!(
                "\nunknown fonts:\n{}\n\n{}",
                unknown_fonts.join("\n"),
                unknown_font_errors.join("\n")
            )
            .into());
        }
        Ok(Self {
            page_num,
            qt,
            unprocessed_chars,
            unprocessed_non_text,
        })
    }
}
@ -1880,20 +2229,25 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
} else {
None
};
let document = mupdf::Document::open(&args[1])?;
let pages: Vec<_> = document.pages()?.collect::<Result<_, _>>()?;
let page_numbers = page_numbers.unwrap_or_else(|| {
Box::new(
(0..pages.len()).map(|i| NonZero::new((i + 1) as u32).expect("known to be non-zero")),
)
});
for page_num in page_numbers {
let device = MyDevice::default();
pages[page_num.get() as usize - 1].run(
&mupdf::Device::from_native(device.clone())?,
&mupdf::Matrix::IDENTITY,
)?;
println!("{device:?}");
let mut parser = Parser::new();
let is_subset = page_numbers.is_some();
let file_name = &args[1];
parser.parse_pdf(file_name, page_numbers)?;
let mut insns = xml_tree::Element::new(
"instructions".into(),
[("is-subset".into(), is_subset.to_string())],
);
insns.text = "\n".into();
insns.tail = "\n".into();
let mut comment =
xml_tree::Element::comment(format!(" Automatically generated from {file_name} "));
comment.tail = "\n".into();
insns.children.push(comment);
for insn in parser.insns {
insn.write_xml(&mut insns);
}
let mut output = Vec::new();
insns.write(&mut output, true)?;
std::fs::write("powerisa-instructions.xml", output)?;
Ok(())
}

View file

@ -3,7 +3,7 @@
use quick_xml::{
Writer,
events::{BytesText, Event},
events::{BytesDecl, BytesText, Event},
};
use std::fmt;
@ -110,47 +110,7 @@ pub(crate) struct Element {
impl fmt::Display for Element {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut writer = Writer::new(FmtToIoAdaptor::new(f));
fn helper(
element: &Element,
writer: &mut Writer<impl std::io::Write>,
) -> std::io::Result<()> {
let Element {
tag,
attrib,
text,
children,
tail,
} = element;
match tag {
ElementTag::Comment => {
writer.write_event(Event::Comment(BytesText::new(text)))?;
}
ElementTag::Normal(tag) if tag.is_empty() => {
writer.write_event(Event::Text(BytesText::new(text)))?;
}
ElementTag::Normal(tag) => {
let mut element_writer = writer.create_element(tag);
for (name, value) in attrib {
element_writer =
element_writer.with_attribute((name.as_str(), value.as_str()));
}
if text.is_empty() && children.is_empty() {
element_writer.write_empty()?;
} else {
element_writer.write_inner_content(|writer| {
writer.write_event(Event::Text(BytesText::new(text)))?;
for child in children {
helper(child, writer)?;
}
Ok(())
})?;
}
}
}
writer.write_event(Event::Text(BytesText::new(tail)))?;
Ok(())
}
helper(self, &mut writer).map_err(|_| fmt::Error)?;
self.write_to(&mut writer).map_err(|_| fmt::Error)?;
writer.into_inner().finish()?;
Ok(())
}
@ -166,6 +126,16 @@ impl Element {
tail: String::new(),
}
}
/// equivalent of python's `xml.etree.ElementTree.Comment()`
///
/// Builds a comment node whose content is `text`, with no attributes, no
/// children and an empty tail.
pub(crate) fn comment(text: String) -> Self {
    let (attrib, children) = (Vec::new(), Vec::new());
    Self {
        tag: ElementTag::Comment,
        text,
        tail: String::new(),
        attrib,
        children,
    }
}
/// equivalent to python `"".join(self.itertext())`
pub(crate) fn inner_text(&self) -> String {
let mut retval = String::new();
@ -198,4 +168,53 @@ impl Element {
self.children.push(Self::new(tag, attrib));
self.children.last_mut().expect("just pushed")
}
/// Writes this node, its descendants and its tail text to `writer`.
///
/// A comment node is written as an XML comment, a node with an empty tag
/// name is written as bare text, and any other node is written as an element
/// (self-closing when it has neither text nor children). The tail text is
/// always written after the node itself.
///
/// # Errors
/// Propagates any I/O error reported by `writer`.
pub(crate) fn write_to(&self, writer: &mut Writer<impl std::io::Write>) -> std::io::Result<()> {
    match &self.tag {
        ElementTag::Comment => {
            writer.write_event(Event::Comment(BytesText::new(&self.text)))?;
        }
        ElementTag::Normal(name) if name.is_empty() => {
            writer.write_event(Event::Text(BytesText::new(&self.text)))?;
        }
        ElementTag::Normal(name) => {
            // start tag carrying every attribute, in order
            let element_writer = self.attrib.iter().fold(
                writer.create_element(name),
                |element_writer, (key, value)| {
                    element_writer.with_attribute((key.as_str(), value.as_str()))
                },
            );
            let has_content = !(self.text.is_empty() && self.children.is_empty());
            if has_content {
                element_writer.write_inner_content(|inner| {
                    inner.write_event(Event::Text(BytesText::new(&self.text)))?;
                    for child in &self.children {
                        child.write_to(inner)?;
                    }
                    Ok(())
                })?;
            } else {
                element_writer.write_empty()?;
            }
        }
    }
    writer.write_event(Event::Text(BytesText::new(&self.tail)))?;
    Ok(())
}
/// equivalent of python's `xml.etree.ElementTree(self).write(writer, encoding='utf-8', xml_declaration=xml_declaration)`
///
/// When `xml_declaration` is `true`, an XML 1.0 / utf-8 declaration followed
/// by a newline is written before the element tree.
///
/// # Errors
/// Propagates any I/O error reported by `writer`.
pub(crate) fn write(
    &self,
    writer: impl std::io::Write,
    xml_declaration: bool,
) -> std::io::Result<()> {
    let mut xml_writer = Writer::new(writer);
    if xml_declaration {
        let prologue = [
            Event::Decl(BytesDecl::new("1.0", Some("utf-8"), None)),
            Event::Text(BytesText::new("\n")),
        ];
        for event in prologue {
            xml_writer.write_event(event)?;
        }
    }
    self.write_to(&mut xml_writer)
}
}