parent
718de40b09
commit
c8cd234d8f
4 changed files with 529 additions and 141 deletions
13
Cargo.lock
generated
13
Cargo.lock
generated
|
|
@ -222,9 +222,11 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
||||||
name = "parse_powerisa_pdf"
|
name = "parse_powerisa_pdf"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"indexmap",
|
||||||
"libm",
|
"libm",
|
||||||
"mupdf",
|
"mupdf",
|
||||||
"quick-xml",
|
"quick-xml",
|
||||||
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -258,6 +260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
|
checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"memchr",
|
"memchr",
|
||||||
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -310,6 +313,16 @@ version = "1.0.22"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||||
|
dependencies = [
|
||||||
|
"serde_core",
|
||||||
|
"serde_derive",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_core"
|
name = "serde_core"
|
||||||
version = "1.0.228"
|
version = "1.0.228"
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,8 @@ categories = []
|
||||||
rust-version = "1.89.0"
|
rust-version = "1.89.0"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
indexmap = "2.12.1"
|
||||||
libm = "0.2.15"
|
libm = "0.2.15"
|
||||||
mupdf = { version = "0.5.0", default-features = false }
|
mupdf = { version = "0.5.0", default-features = false }
|
||||||
quick-xml = "0.38.4"
|
quick-xml = { version = "0.38.4", features = ["serialize"] }
|
||||||
|
serde = { version = "1.0.228", features = ["derive"] }
|
||||||
|
|
|
||||||
550
src/main.rs
550
src/main.rs
|
|
@ -2,11 +2,13 @@
|
||||||
// See Notices.txt for copyright information
|
// See Notices.txt for copyright information
|
||||||
|
|
||||||
use crate::quad_tree::QuadTree;
|
use crate::quad_tree::QuadTree;
|
||||||
|
use indexmap::IndexSet;
|
||||||
use non_nan_float::NonNaNF32;
|
use non_nan_float::NonNaNF32;
|
||||||
use std::{
|
use std::{
|
||||||
borrow::Borrow,
|
borrow::{Borrow, Cow},
|
||||||
cell::RefCell,
|
cell::RefCell,
|
||||||
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
||||||
|
error::Error,
|
||||||
fmt,
|
fmt,
|
||||||
num::NonZero,
|
num::NonZero,
|
||||||
rc::Rc,
|
rc::Rc,
|
||||||
|
|
@ -560,7 +562,7 @@ impl Char {
|
||||||
fn height(&self) -> f32 {
|
fn height(&self) -> f32 {
|
||||||
self.max_y.get() - self.min_y.get()
|
self.max_y.get() - self.min_y.get()
|
||||||
}
|
}
|
||||||
fn top_down_left_to_right_sort_key(&self) -> impl Ord {
|
fn top_down_left_to_right_sort_key(&self) -> impl Ord + use<> {
|
||||||
(-self.min_y, self.min_x)
|
(-self.min_y, self.min_x)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1463,8 +1465,87 @@ enum LineOrRect {
|
||||||
struct Page {
|
struct Page {
|
||||||
page_num: u32,
|
page_num: u32,
|
||||||
qt: BTreeMap<TextSection, QuadTree<PageItem>>,
|
qt: BTreeMap<TextSection, QuadTree<PageItem>>,
|
||||||
unprocessed_chars: BTreeMap<TextSection, BTreeMap<Font, BTreeSet<Char>>>,
|
unprocessed_chars:
|
||||||
unprocessed_non_text: BTreeSet<LineOrRect>,
|
Rc<RefCell<BTreeMap<TextSection, Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>>>>,
|
||||||
|
unprocessed_non_text: Rc<RefCell<IndexSet<LineOrRect>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Lazily-filled cache of parsed pages, keyed by page number, sitting in front of an iterator
/// that produces the pages one at a time.
struct Pages {
|
||||||
|
/// source of pages that haven't been read yet; set to `None` by `close()`, which `fill_page()`
/// calls once the iterator is exhausted
pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>>,
|
||||||
|
/// every page read from `pages_gen` so far, keyed by its page number
pages: BTreeMap<u32, Rc<Page>>,
|
||||||
|
/// page number of the most recently read page; `0` until the first page has been read
max_page_num: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Debug for Pages {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
let Self {
|
||||||
|
pages_gen,
|
||||||
|
pages,
|
||||||
|
max_page_num,
|
||||||
|
} = self;
|
||||||
|
f.debug_struct("Pages")
|
||||||
|
.field(
|
||||||
|
"pages_gen",
|
||||||
|
&pages_gen.is_some().then_some(format_args!("...")),
|
||||||
|
)
|
||||||
|
.field("pages", pages)
|
||||||
|
.field("max_page_num", max_page_num)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Pages {
|
||||||
|
fn new(pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>>) -> Self {
|
||||||
|
Self {
|
||||||
|
pages_gen,
|
||||||
|
pages: BTreeMap::new(),
|
||||||
|
max_page_num: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn close(&mut self) {
|
||||||
|
self.pages_gen = None;
|
||||||
|
}
|
||||||
|
fn is_past_end(&mut self, page_num: u32) -> Result<bool, Box<dyn Error>> {
|
||||||
|
while self.pages_gen.is_some() && page_num > self.max_page_num {
|
||||||
|
self.fill_page()?;
|
||||||
|
}
|
||||||
|
Ok(page_num > self.max_page_num)
|
||||||
|
}
|
||||||
|
fn fill_page(&mut self) -> Result<bool, Box<dyn Error>> {
|
||||||
|
let Some(pages_gen) = &mut self.pages_gen else {
|
||||||
|
return Ok(false);
|
||||||
|
};
|
||||||
|
let page = pages_gen.next();
|
||||||
|
let Some(page) = page else {
|
||||||
|
self.close();
|
||||||
|
return Ok(false);
|
||||||
|
};
|
||||||
|
let page = page?;
|
||||||
|
let page_num = page.page_num;
|
||||||
|
assert!(
|
||||||
|
page_num > self.max_page_num,
|
||||||
|
"page numbers must be a strictly-increasing positive integer sequence:\n\
|
||||||
|
got {page_num} which isn't more than {}",
|
||||||
|
self.max_page_num
|
||||||
|
);
|
||||||
|
self.pages.insert(page_num, Rc::new(page));
|
||||||
|
self.max_page_num = page_num;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
/// Looks up the page numbered `page_num`, reading more pages from the source if needed.
///
/// Returns `Ok(None)` when the page isn't cached and can no longer appear: either the source
/// is closed, or pages past `page_num` have already been read.
///
/// # Errors
///
/// Propagates any error from `fill_page()`.
fn get(&mut self, page_num: u32) -> Result<Option<Rc<Page>>, Box<dyn Error>> {
    while !self.pages.contains_key(&page_num) {
        let source_closed = self.pages_gen.is_none();
        let already_skipped = page_num < self.max_page_num;
        if source_closed || already_skipped {
            return Ok(None);
        }
        self.fill_page()?;
    }
    Ok(self.pages.get(&page_num).cloned())
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
|
|
@ -1720,53 +1801,179 @@ impl TextSection {
|
||||||
.expect("page_num out of range")
|
.expect("page_num out of range")
|
||||||
.get_or_init(|| Self::page_sections_helper(page_num))
|
.get_or_init(|| Self::page_sections_helper(page_num))
|
||||||
}
|
}
|
||||||
|
fn for_position(page_num: u32, x: f32, y: f32) -> Option<Self> {
|
||||||
|
for &i in Self::page_sections(page_num) {
|
||||||
|
if i.min_x.get() <= x && x <= i.max_x.get() && i.min_y.get() <= y && y <= i.max_y.get()
|
||||||
|
{
|
||||||
|
return Some(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct InsnHeader {
|
||||||
|
header_lines: Vec<ParsedTextLine>,
|
||||||
|
mnemonic_lines: Vec<ParsedTextLine>,
|
||||||
|
bit_fields: InsnBitFields,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl InsnHeader {
|
||||||
|
fn min_y(&self) -> f32 {
|
||||||
|
self.bit_fields.box_min_y
|
||||||
|
}
|
||||||
|
fn write_xml(&self, parent: &mut xml_tree::Element) {
|
||||||
|
let header = parent.sub_element("header".into(), []);
|
||||||
|
header.text = "\n".into();
|
||||||
|
header.tail = "\n".into();
|
||||||
|
let title = header.sub_element("title".into(), []);
|
||||||
|
title.tail = "\n".into();
|
||||||
|
ParsedTextLine::write_xml_lines(&self.header_lines, title, false, false);
|
||||||
|
let mnemonics = header.sub_element("mnemonics".into(), []);
|
||||||
|
mnemonics.tail = "\n".into();
|
||||||
|
ParsedTextLine::write_xml_lines(&self.mnemonic_lines, mnemonics, false, false);
|
||||||
|
self.bit_fields.write_xml(header);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct Insn {
|
||||||
|
headers: Vec<InsnHeader>,
|
||||||
|
code_lines: Vec<ParsedTextLine>,
|
||||||
|
desc_lines: Vec<ParsedTextLine>,
|
||||||
|
sp_regs_altered: Option<InsnSpRegsAltered>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Insn {
|
||||||
|
fn write_xml(&self, parent: &mut xml_tree::Element) {
|
||||||
|
let insn = parent.sub_element("instruction".into(), []);
|
||||||
|
insn.text = "\n".into();
|
||||||
|
insn.tail = "\n".into();
|
||||||
|
for header in &self.headers {
|
||||||
|
header.write_xml(insn);
|
||||||
|
}
|
||||||
|
if !self.code_lines.is_empty() {
|
||||||
|
let code = insn.sub_element("code".into(), []);
|
||||||
|
code.tail = "\n".into();
|
||||||
|
ParsedTextLine::write_xml_lines(&self.code_lines, code, false, false);
|
||||||
|
}
|
||||||
|
if !self.desc_lines.is_empty() {
|
||||||
|
let desc = insn.sub_element("description".into(), []);
|
||||||
|
desc.tail = "\n".into();
|
||||||
|
ParsedTextLine::write_xml_lines(&self.desc_lines, desc, false, false);
|
||||||
|
}
|
||||||
|
if let Some(sp_regs_altered) = &self.sp_regs_altered {
|
||||||
|
sp_regs_altered.write_xml(insn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// State of one parsing run over a PDF: the lazily-read pages, the text section currently being
/// looked at, and the instructions collected for output.
#[derive(Debug)]
|
||||||
|
struct Parser {
|
||||||
|
/// pages of the PDF being parsed; replaced by `parse_pdf()`
pages: Pages,
|
||||||
|
/// the section currently being processed; `parse_pdf()` advances it with `TextSection::next()`
text_section: TextSection,
|
||||||
|
/// instructions written out by `main`
/// NOTE(review): nothing in the visible code pushes to this yet -- presumably
/// `parse_text_section()` will once it is implemented
insns: Vec<Insn>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Parser {
|
||||||
|
fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
pages: Pages::new(None),
|
||||||
|
text_section: TextSection::first(),
|
||||||
|
insns: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// Returns the page that the current text section is on.
///
/// # Errors
///
/// Fails when reading pages fails, or with `"page_num is out of range"` when that page isn't
/// available.
fn page(&mut self) -> Result<Rc<Page>, Box<dyn Error>> {
    let page_num = self.text_section.page_num;
    match self.pages.get(page_num)? {
        Some(page) => Ok(page),
        None => Err("page_num is out of range".into()),
    }
}
|
||||||
|
fn unprocessed_chars(
|
||||||
|
&mut self,
|
||||||
|
) -> Result<Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>, Box<dyn Error>> {
|
||||||
|
Ok(self
|
||||||
|
.page()?
|
||||||
|
.unprocessed_chars
|
||||||
|
.borrow_mut()
|
||||||
|
.entry(self.text_section)
|
||||||
|
.or_default()
|
||||||
|
.clone())
|
||||||
|
}
|
||||||
|
fn pages_gen(
|
||||||
|
file: &str,
|
||||||
|
page_numbers: Option<Vec<NonZero<u32>>>,
|
||||||
|
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> {
|
||||||
|
let page_numbers = page_numbers.map(|page_numbers| {
|
||||||
|
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1));
|
||||||
|
retval.sort();
|
||||||
|
retval
|
||||||
|
});
|
||||||
|
let document = mupdf::Document::open(file)?;
|
||||||
|
let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?;
|
||||||
|
Ok(Box::new(pages.into_iter().enumerate().map(
|
||||||
|
move |(i, page)| {
|
||||||
|
let page_num = match &page_numbers {
|
||||||
|
Some(page_numbers) => page_numbers[i] + 1,
|
||||||
|
None => i as u32 + 1,
|
||||||
|
};
|
||||||
|
println!("page {page_num}");
|
||||||
|
Ok(Page::from_mupdf_page(page_num, page)
|
||||||
|
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?)
|
||||||
|
},
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
fn parse_pdf<I: Iterator<Item = NonZero<u32>>>(
|
||||||
|
&mut self,
|
||||||
|
file: &str,
|
||||||
|
page_numbers: Option<I>,
|
||||||
|
) -> Result<(), Box<dyn Error>> {
|
||||||
|
self.pages = Pages::new(Some(Self::pages_gen(
|
||||||
|
file,
|
||||||
|
page_numbers.map(|v| v.into_iter().collect()),
|
||||||
|
)?));
|
||||||
|
self.text_section = TextSection::first();
|
||||||
|
loop {
|
||||||
|
self.text_section = self.text_section.next();
|
||||||
|
if self.pages.is_past_end(self.text_section.page_num)? {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
if self.pages.get(self.text_section.page_num)?.is_some() {
|
||||||
|
println!("section {:?}", self.text_section);
|
||||||
|
self.note_text_section(Self::parse_text_section)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn note_text_section(
|
||||||
|
&mut self,
|
||||||
|
f: impl FnOnce(&mut Self) -> Result<(), Box<dyn Error>>,
|
||||||
|
) -> Result<(), Box<dyn Error>> {
|
||||||
|
let start_text_section = self.text_section;
|
||||||
|
match f(self) {
|
||||||
|
Ok(()) => Ok(()),
|
||||||
|
Err(e) => {
|
||||||
|
let note = if self.text_section == start_text_section {
|
||||||
|
format!("text_section={:?}", self.text_section)
|
||||||
|
} else {
|
||||||
|
format!(
|
||||||
|
"start_text_section={start_text_section:?}\ntext_section={:?}",
|
||||||
|
self.text_section
|
||||||
|
)
|
||||||
|
};
|
||||||
|
Err(format!("{e}\nnote: {note}").into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// Parses the current text section.
///
/// Not implemented yet: calling this panics via `todo!()`. `parse_pdf()` calls it for every
/// text section whose page is available.
fn parse_text_section(&mut self) -> Result<(), Box<dyn Error>> {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Default)]
|
#[derive(Clone, Debug, Default)]
|
||||||
struct MyDevice {
|
struct MyDevice {
|
||||||
chars: Rc<RefCell<Vec<Char>>>,
|
qt: Rc<RefCell<BTreeMap<TextSection, QuadTree<PageItem>>>>,
|
||||||
}
|
unprocessed_non_text: Rc<RefCell<IndexSet<LineOrRect>>>,
|
||||||
|
|
||||||
impl MyDevice {
|
|
||||||
fn text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix) {
|
|
||||||
for span in text.spans() {
|
|
||||||
let span_font = span.font();
|
|
||||||
let font_name = span_font.name();
|
|
||||||
const ROUND_FACTOR: f32 = 1000.0;
|
|
||||||
let Some(size) =
|
|
||||||
NonNaNF32::new((span.trm().expansion() * ROUND_FACTOR).round() / ROUND_FACTOR)
|
|
||||||
else {
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
let font = Font::new(font_name, size);
|
|
||||||
for item in span.items() {
|
|
||||||
let Some(ch) = u32::try_from(item.ucs())
|
|
||||||
.ok()
|
|
||||||
.and_then(|v| char::try_from(v).ok())
|
|
||||||
else {
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
let mut m = span.trm();
|
|
||||||
m.e = item.x();
|
|
||||||
m.f = item.y();
|
|
||||||
m.concat(cmt);
|
|
||||||
let (min_x, min_y, max_x, max_y) = match span.wmode() {
|
|
||||||
mupdf::WriteMode::Horizontal => {
|
|
||||||
todo!();
|
|
||||||
}
|
|
||||||
mupdf::WriteMode::Vertical => todo!(),
|
|
||||||
};
|
|
||||||
self.chars.borrow_mut().push(Char {
|
|
||||||
font,
|
|
||||||
text: String::from(ch),
|
|
||||||
min_x,
|
|
||||||
min_y,
|
|
||||||
max_x,
|
|
||||||
max_y,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl mupdf::NativeDevice for MyDevice {
|
impl mupdf::NativeDevice for MyDevice {
|
||||||
|
|
@ -1815,48 +2022,190 @@ impl mupdf::NativeDevice for MyDevice {
|
||||||
) {
|
) {
|
||||||
// TODO
|
// TODO
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn fill_text(
|
#[derive(serde::Deserialize, Debug)]
|
||||||
&mut self,
|
enum MuPdfXml<'a> {
|
||||||
text: &mupdf::Text,
|
#[serde(rename = "page")]
|
||||||
cmt: mupdf::Matrix,
|
Page(MuPdfXmlPage<'a>),
|
||||||
_color_space: &mupdf::Colorspace,
|
}
|
||||||
_color: &[f32],
|
|
||||||
_alpha: f32,
|
|
||||||
_cp: mupdf::ColorParams,
|
|
||||||
) {
|
|
||||||
self.text(text, cmt);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn stroke_text(
|
#[derive(serde::Deserialize, Debug)]
|
||||||
&mut self,
|
struct MuPdfXmlPage<'a> {
|
||||||
text: &mupdf::Text,
|
#[serde(rename = "@id")]
|
||||||
_stroke_state: &mupdf::StrokeState,
|
id: Cow<'a, str>,
|
||||||
cmt: mupdf::Matrix,
|
#[serde(rename = "@width")]
|
||||||
_color_space: &mupdf::Colorspace,
|
width: f32,
|
||||||
_color: &[f32],
|
#[serde(rename = "@height")]
|
||||||
_alpha: f32,
|
height: f32,
|
||||||
_cp: mupdf::ColorParams,
|
block: Vec<MuPdfXmlBlock<'a>>,
|
||||||
) {
|
}
|
||||||
self.text(text, cmt);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn clip_text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix, _scissor: mupdf::Rect) {
|
#[derive(serde::Deserialize, Debug)]
|
||||||
self.text(text, cmt);
|
struct MuPdfXmlBlock<'a> {
|
||||||
}
|
#[serde(rename = "@bbox")]
|
||||||
|
bbox: [f32; 4],
|
||||||
|
#[serde(rename = "@justify")]
|
||||||
|
justify: Cow<'a, str>,
|
||||||
|
line: Vec<MuPdfXmlLine<'a>>,
|
||||||
|
}
|
||||||
|
|
||||||
fn clip_stroke_text(
|
#[derive(serde::Deserialize, Debug)]
|
||||||
&mut self,
|
struct MuPdfXmlLine<'a> {
|
||||||
text: &mupdf::Text,
|
#[serde(rename = "@bbox")]
|
||||||
_stroke_state: &mupdf::StrokeState,
|
bbox: [f32; 4],
|
||||||
cmt: mupdf::Matrix,
|
#[serde(rename = "@wmode")]
|
||||||
_scissor: mupdf::Rect,
|
wmode: u8,
|
||||||
) {
|
#[serde(rename = "@dir")]
|
||||||
self.text(text, cmt);
|
dir: [f32; 2],
|
||||||
}
|
#[serde(rename = "@text")]
|
||||||
|
text: Cow<'a, str>,
|
||||||
|
font: Vec<MuPdfXmlFont<'a>>,
|
||||||
|
}
|
||||||
|
|
||||||
fn ignore_text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix) {
|
#[derive(serde::Deserialize, Debug)]
|
||||||
self.text(text, cmt);
|
struct MuPdfXmlFont<'a> {
|
||||||
|
#[serde(rename = "@name")]
|
||||||
|
name: Cow<'a, str>,
|
||||||
|
#[serde(rename = "@size")]
|
||||||
|
size: f32,
|
||||||
|
char: Vec<MuPdfXmlChar<'a>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// One character entry (the `char` items of a `MuPdfXmlFont`) of the XML that
/// `Page::from_mupdf_page` gets from `page.to_xml()` and deserializes.
///
/// Every field is renamed to an `@`-prefixed name -- presumably quick-xml's convention for
/// mapping a field to an XML attribute; confirm against the quick-xml serde docs.
/// `from_mupdf_page` reads `quad` as four `(x, y)` corner pairs for the character's bounding box
/// and uses `c` as the character's text.
#[derive(serde::Deserialize, Debug)]
|
||||||
|
struct MuPdfXmlChar<'a> {
|
||||||
|
#[serde(rename = "@quad")]
|
||||||
|
quad: [f32; 8],
|
||||||
|
#[serde(rename = "@x")]
|
||||||
|
x: f32,
|
||||||
|
#[serde(rename = "@y")]
|
||||||
|
y: f32,
|
||||||
|
#[serde(rename = "@bidi")]
|
||||||
|
bidi: u16,
|
||||||
|
#[serde(rename = "@color")]
|
||||||
|
color: Cow<'a, str>,
|
||||||
|
#[serde(rename = "@alpha")]
|
||||||
|
alpha: Cow<'a, str>,
|
||||||
|
#[serde(rename = "@flags")]
|
||||||
|
flags: u32,
|
||||||
|
#[serde(rename = "@c")]
|
||||||
|
c: Cow<'a, str>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Page {
|
||||||
|
fn from_mupdf_page(
|
||||||
|
page_num: u32,
|
||||||
|
page: mupdf::Page,
|
||||||
|
) -> Result<Self, Box<dyn std::error::Error>> {
|
||||||
|
let device = MyDevice::default();
|
||||||
|
page.run(
|
||||||
|
&mupdf::Device::from_native(device.clone())?,
|
||||||
|
&mupdf::Matrix::IDENTITY,
|
||||||
|
)?;
|
||||||
|
let MyDevice {
|
||||||
|
qt,
|
||||||
|
unprocessed_non_text,
|
||||||
|
} = device;
|
||||||
|
let mut qt = Rc::try_unwrap(qt)
|
||||||
|
.ok()
|
||||||
|
.expect("already dropped all other references")
|
||||||
|
.into_inner();
|
||||||
|
let unprocessed_chars: Rc<
|
||||||
|
RefCell<BTreeMap<TextSection, Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>>>,
|
||||||
|
> = Rc::default();
|
||||||
|
// we convert to xml and parse that because the mupdf rust crate doesn't include all the API surface we need.
|
||||||
|
let xml = page.to_xml()?;
|
||||||
|
let MuPdfXml::Page(xml_page) = quick_xml::de::from_str(&xml)?;
|
||||||
|
for xml_block in xml_page.block {
|
||||||
|
for xml_line in xml_block.line {
|
||||||
|
for xml_font in xml_line.font {
|
||||||
|
const ROUND_FACTOR: f32 = 1000.0;
|
||||||
|
let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR;
|
||||||
|
let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?;
|
||||||
|
let font = Font::new(&xml_font.name, font_size);
|
||||||
|
for xml_char in xml_font.char {
|
||||||
|
let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad;
|
||||||
|
let min_x = x0.min(x1).min(x2).min(x3);
|
||||||
|
let max_x = x0.max(x1).max(x2).max(x3);
|
||||||
|
let min_y = y0.min(y1).min(y2).min(y3);
|
||||||
|
let max_y = y0.max(y1).max(y2).max(y3);
|
||||||
|
let Some(text_section) = TextSection::for_position(
|
||||||
|
page_num,
|
||||||
|
(min_x + max_x) * 0.5,
|
||||||
|
(min_y + max_y) * 0.5,
|
||||||
|
) else {
|
||||||
|
if PAGE_BODY_MIN_Y <= min_y && min_y <= PAGE_BODY_MAX_Y {
|
||||||
|
if page_num != 1072 {
|
||||||
|
// page 1072 has characters in the margins
|
||||||
|
return Err(
|
||||||
|
format!("char not in text section: {xml_char:?}\npage_num={page_num}").into(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
let char = Char {
|
||||||
|
font: font.clone(),
|
||||||
|
text: xml_char.c.into_owned(),
|
||||||
|
min_x: NonNaNF32::new(min_x).ok_or("char position shouldn't be NaN")?,
|
||||||
|
min_y: NonNaNF32::new(min_y).ok_or("char position shouldn't be NaN")?,
|
||||||
|
max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?,
|
||||||
|
max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?,
|
||||||
|
};
|
||||||
|
qt.entry(text_section).or_default().insert(
|
||||||
|
min_x,
|
||||||
|
min_y,
|
||||||
|
PageItem::Char(char.clone()),
|
||||||
|
);
|
||||||
|
unprocessed_chars
|
||||||
|
.borrow_mut()
|
||||||
|
.entry(text_section)
|
||||||
|
.or_default()
|
||||||
|
.borrow_mut()
|
||||||
|
.entry(char.font.clone())
|
||||||
|
.or_default()
|
||||||
|
.insert(char);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i in unprocessed_chars.borrow_mut().values_mut() {
|
||||||
|
for j in i.borrow_mut().values_mut() {
|
||||||
|
j.sort_by_key(Char::top_down_left_to_right_sort_key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut unknown_fonts = Vec::new();
|
||||||
|
let mut unknown_font_errors = Vec::new();
|
||||||
|
for i in RefCell::borrow(&unprocessed_chars).values() {
|
||||||
|
for (font, chars) in RefCell::borrow(i).iter() {
|
||||||
|
if font.known_font_group().is_none() {
|
||||||
|
let mut text = String::new();
|
||||||
|
for char in chars {
|
||||||
|
text += &char.text;
|
||||||
|
}
|
||||||
|
unknown_fonts.push(format!("{font:?},"));
|
||||||
|
unknown_font_errors.push(format!(
|
||||||
|
"unknown font {font:?}\nlast char: {:?}\ntext: {text:?}",
|
||||||
|
chars.last()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unknown_fonts.sort();
|
||||||
|
if !unknown_fonts.is_empty() {
|
||||||
|
return Err(format!(
|
||||||
|
"\nunknown fonts:\n{}\n\n{}",
|
||||||
|
unknown_fonts.join("\n"),
|
||||||
|
unknown_font_errors.join("\n")
|
||||||
|
)
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
Ok(Self {
|
||||||
|
page_num,
|
||||||
|
qt,
|
||||||
|
unprocessed_chars,
|
||||||
|
unprocessed_non_text,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1880,20 +2229,25 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
let document = mupdf::Document::open(&args[1])?;
|
let mut parser = Parser::new();
|
||||||
let pages: Vec<_> = document.pages()?.collect::<Result<_, _>>()?;
|
let is_subset = page_numbers.is_some();
|
||||||
let page_numbers = page_numbers.unwrap_or_else(|| {
|
let file_name = &args[1];
|
||||||
Box::new(
|
parser.parse_pdf(file_name, page_numbers)?;
|
||||||
(0..pages.len()).map(|i| NonZero::new((i + 1) as u32).expect("known to be non-zero")),
|
let mut insns = xml_tree::Element::new(
|
||||||
)
|
"instructions".into(),
|
||||||
});
|
[("is-subset".into(), is_subset.to_string())],
|
||||||
for page_num in page_numbers {
|
);
|
||||||
let device = MyDevice::default();
|
insns.text = "\n".into();
|
||||||
pages[page_num.get() as usize - 1].run(
|
insns.tail = "\n".into();
|
||||||
&mupdf::Device::from_native(device.clone())?,
|
let mut comment =
|
||||||
&mupdf::Matrix::IDENTITY,
|
xml_tree::Element::comment(format!(" Automatically generated from {file_name} "));
|
||||||
)?;
|
comment.tail = "\n".into();
|
||||||
println!("{device:?}");
|
insns.children.push(comment);
|
||||||
|
for insn in parser.insns {
|
||||||
|
insn.write_xml(&mut insns);
|
||||||
}
|
}
|
||||||
|
let mut output = Vec::new();
|
||||||
|
insns.write(&mut output, true)?;
|
||||||
|
std::fs::write("powerisa-instructions.xml", output)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
||||||
103
src/xml_tree.rs
103
src/xml_tree.rs
|
|
@ -3,7 +3,7 @@
|
||||||
|
|
||||||
use quick_xml::{
|
use quick_xml::{
|
||||||
Writer,
|
Writer,
|
||||||
events::{BytesText, Event},
|
events::{BytesDecl, BytesText, Event},
|
||||||
};
|
};
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|
||||||
|
|
@ -110,47 +110,7 @@ pub(crate) struct Element {
|
||||||
impl fmt::Display for Element {
|
impl fmt::Display for Element {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
let mut writer = Writer::new(FmtToIoAdaptor::new(f));
|
let mut writer = Writer::new(FmtToIoAdaptor::new(f));
|
||||||
fn helper(
|
self.write_to(&mut writer).map_err(|_| fmt::Error)?;
|
||||||
element: &Element,
|
|
||||||
writer: &mut Writer<impl std::io::Write>,
|
|
||||||
) -> std::io::Result<()> {
|
|
||||||
let Element {
|
|
||||||
tag,
|
|
||||||
attrib,
|
|
||||||
text,
|
|
||||||
children,
|
|
||||||
tail,
|
|
||||||
} = element;
|
|
||||||
match tag {
|
|
||||||
ElementTag::Comment => {
|
|
||||||
writer.write_event(Event::Comment(BytesText::new(text)))?;
|
|
||||||
}
|
|
||||||
ElementTag::Normal(tag) if tag.is_empty() => {
|
|
||||||
writer.write_event(Event::Text(BytesText::new(text)))?;
|
|
||||||
}
|
|
||||||
ElementTag::Normal(tag) => {
|
|
||||||
let mut element_writer = writer.create_element(tag);
|
|
||||||
for (name, value) in attrib {
|
|
||||||
element_writer =
|
|
||||||
element_writer.with_attribute((name.as_str(), value.as_str()));
|
|
||||||
}
|
|
||||||
if text.is_empty() && children.is_empty() {
|
|
||||||
element_writer.write_empty()?;
|
|
||||||
} else {
|
|
||||||
element_writer.write_inner_content(|writer| {
|
|
||||||
writer.write_event(Event::Text(BytesText::new(text)))?;
|
|
||||||
for child in children {
|
|
||||||
helper(child, writer)?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
writer.write_event(Event::Text(BytesText::new(tail)))?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
helper(self, &mut writer).map_err(|_| fmt::Error)?;
|
|
||||||
writer.into_inner().finish()?;
|
writer.into_inner().finish()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -166,6 +126,16 @@ impl Element {
|
||||||
tail: String::new(),
|
tail: String::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/// equivalent of python's `xml.etree.ElementTree.Comment()`
|
||||||
|
pub(crate) fn comment(text: String) -> Self {
|
||||||
|
Self {
|
||||||
|
tag: ElementTag::Comment,
|
||||||
|
attrib: Vec::new(),
|
||||||
|
text,
|
||||||
|
children: Vec::new(),
|
||||||
|
tail: String::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
/// equivalent to python `"".join(self.itertext())`
|
/// equivalent to python `"".join(self.itertext())`
|
||||||
pub(crate) fn inner_text(&self) -> String {
|
pub(crate) fn inner_text(&self) -> String {
|
||||||
let mut retval = String::new();
|
let mut retval = String::new();
|
||||||
|
|
@ -198,4 +168,53 @@ impl Element {
|
||||||
self.children.push(Self::new(tag, attrib));
|
self.children.push(Self::new(tag, attrib));
|
||||||
self.children.last_mut().expect("just pushed")
|
self.children.last_mut().expect("just pushed")
|
||||||
}
|
}
|
||||||
|
pub(crate) fn write_to(&self, writer: &mut Writer<impl std::io::Write>) -> std::io::Result<()> {
|
||||||
|
let Element {
|
||||||
|
tag,
|
||||||
|
attrib,
|
||||||
|
text,
|
||||||
|
children,
|
||||||
|
tail,
|
||||||
|
} = self;
|
||||||
|
match tag {
|
||||||
|
ElementTag::Comment => {
|
||||||
|
writer.write_event(Event::Comment(BytesText::new(text)))?;
|
||||||
|
}
|
||||||
|
ElementTag::Normal(tag) if tag.is_empty() => {
|
||||||
|
writer.write_event(Event::Text(BytesText::new(text)))?;
|
||||||
|
}
|
||||||
|
ElementTag::Normal(tag) => {
|
||||||
|
let mut element_writer = writer.create_element(tag);
|
||||||
|
for (name, value) in attrib {
|
||||||
|
element_writer = element_writer.with_attribute((name.as_str(), value.as_str()));
|
||||||
|
}
|
||||||
|
if text.is_empty() && children.is_empty() {
|
||||||
|
element_writer.write_empty()?;
|
||||||
|
} else {
|
||||||
|
element_writer.write_inner_content(|writer| {
|
||||||
|
writer.write_event(Event::Text(BytesText::new(text)))?;
|
||||||
|
for child in children {
|
||||||
|
child.write_to(writer)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
})?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
writer.write_event(Event::Text(BytesText::new(tail)))?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
/// equivalent of python's `xml.etree.ElementTree(self).write(writer, encoding='utf-8', xml_declaration=xml_declaration)`
|
||||||
|
pub(crate) fn write(
|
||||||
|
&self,
|
||||||
|
writer: impl std::io::Write,
|
||||||
|
xml_declaration: bool,
|
||||||
|
) -> std::io::Result<()> {
|
||||||
|
let mut writer = Writer::new(writer);
|
||||||
|
if xml_declaration {
|
||||||
|
writer.write_event(Event::Decl(BytesDecl::new("1.0", Some("utf-8"), None)))?;
|
||||||
|
writer.write_event(Event::Text(BytesText::new("\n")))?;
|
||||||
|
}
|
||||||
|
self.write_to(&mut writer)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue