parent
718de40b09
commit
c8cd234d8f
4 changed files with 529 additions and 141 deletions
13
Cargo.lock
generated
13
Cargo.lock
generated
|
|
@ -222,9 +222,11 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
|||
name = "parse_powerisa_pdf"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"libm",
|
||||
"mupdf",
|
||||
"quick-xml",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -258,6 +260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -310,6 +313,16 @@ version = "1.0.22"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_core"
|
||||
version = "1.0.228"
|
||||
|
|
|
|||
|
|
@ -11,6 +11,8 @@ categories = []
|
|||
rust-version = "1.89.0"
|
||||
|
||||
[dependencies]
|
||||
indexmap = "2.12.1"
|
||||
libm = "0.2.15"
|
||||
mupdf = { version = "0.5.0", default-features = false }
|
||||
quick-xml = "0.38.4"
|
||||
quick-xml = { version = "0.38.4", features = ["serialize"] }
|
||||
serde = { version = "1.0.228", features = ["derive"] }
|
||||
|
|
|
|||
550
src/main.rs
550
src/main.rs
|
|
@ -2,11 +2,13 @@
|
|||
// See Notices.txt for copyright information
|
||||
|
||||
use crate::quad_tree::QuadTree;
|
||||
use indexmap::IndexSet;
|
||||
use non_nan_float::NonNaNF32;
|
||||
use std::{
|
||||
borrow::Borrow,
|
||||
borrow::{Borrow, Cow},
|
||||
cell::RefCell,
|
||||
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
||||
error::Error,
|
||||
fmt,
|
||||
num::NonZero,
|
||||
rc::Rc,
|
||||
|
|
@ -560,7 +562,7 @@ impl Char {
|
|||
fn height(&self) -> f32 {
|
||||
self.max_y.get() - self.min_y.get()
|
||||
}
|
||||
fn top_down_left_to_right_sort_key(&self) -> impl Ord {
|
||||
fn top_down_left_to_right_sort_key(&self) -> impl Ord + use<> {
|
||||
(-self.min_y, self.min_x)
|
||||
}
|
||||
}
|
||||
|
|
@ -1463,8 +1465,87 @@ enum LineOrRect {
|
|||
struct Page {
|
||||
page_num: u32,
|
||||
qt: BTreeMap<TextSection, QuadTree<PageItem>>,
|
||||
unprocessed_chars: BTreeMap<TextSection, BTreeMap<Font, BTreeSet<Char>>>,
|
||||
unprocessed_non_text: BTreeSet<LineOrRect>,
|
||||
unprocessed_chars:
|
||||
Rc<RefCell<BTreeMap<TextSection, Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>>>>,
|
||||
unprocessed_non_text: Rc<RefCell<IndexSet<LineOrRect>>>,
|
||||
}
|
||||
|
||||
struct Pages {
|
||||
pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>>,
|
||||
pages: BTreeMap<u32, Rc<Page>>,
|
||||
max_page_num: u32,
|
||||
}
|
||||
|
||||
impl fmt::Debug for Pages {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let Self {
|
||||
pages_gen,
|
||||
pages,
|
||||
max_page_num,
|
||||
} = self;
|
||||
f.debug_struct("Pages")
|
||||
.field(
|
||||
"pages_gen",
|
||||
&pages_gen.is_some().then_some(format_args!("...")),
|
||||
)
|
||||
.field("pages", pages)
|
||||
.field("max_page_num", max_page_num)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Pages {
|
||||
fn new(pages_gen: Option<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>>) -> Self {
|
||||
Self {
|
||||
pages_gen,
|
||||
pages: BTreeMap::new(),
|
||||
max_page_num: 0,
|
||||
}
|
||||
}
|
||||
fn close(&mut self) {
|
||||
self.pages_gen = None;
|
||||
}
|
||||
fn is_past_end(&mut self, page_num: u32) -> Result<bool, Box<dyn Error>> {
|
||||
while self.pages_gen.is_some() && page_num > self.max_page_num {
|
||||
self.fill_page()?;
|
||||
}
|
||||
Ok(page_num > self.max_page_num)
|
||||
}
|
||||
fn fill_page(&mut self) -> Result<bool, Box<dyn Error>> {
|
||||
let Some(pages_gen) = &mut self.pages_gen else {
|
||||
return Ok(false);
|
||||
};
|
||||
let page = pages_gen.next();
|
||||
let Some(page) = page else {
|
||||
self.close();
|
||||
return Ok(false);
|
||||
};
|
||||
let page = page?;
|
||||
let page_num = page.page_num;
|
||||
assert!(
|
||||
page_num > self.max_page_num,
|
||||
"page numbers must be a strictly-increasing positive integer sequence:\n\
|
||||
got {page_num} which isn't more than {}",
|
||||
self.max_page_num
|
||||
);
|
||||
self.pages.insert(page_num, Rc::new(page));
|
||||
self.max_page_num = page_num;
|
||||
Ok(true)
|
||||
}
|
||||
fn get(&mut self, page_num: u32) -> Result<Option<Rc<Page>>, Box<dyn Error>> {
|
||||
loop {
|
||||
if let Some(page) = self.pages.get(&page_num) {
|
||||
return Ok(Some(page.clone()));
|
||||
}
|
||||
if self.pages_gen.is_none() {
|
||||
return Ok(None);
|
||||
}
|
||||
if page_num < self.max_page_num {
|
||||
return Ok(None);
|
||||
}
|
||||
self.fill_page()?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
|
|
@ -1720,53 +1801,179 @@ impl TextSection {
|
|||
.expect("page_num out of range")
|
||||
.get_or_init(|| Self::page_sections_helper(page_num))
|
||||
}
|
||||
fn for_position(page_num: u32, x: f32, y: f32) -> Option<Self> {
|
||||
for &i in Self::page_sections(page_num) {
|
||||
if i.min_x.get() <= x && x <= i.max_x.get() && i.min_y.get() <= y && y <= i.max_y.get()
|
||||
{
|
||||
return Some(i);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct InsnHeader {
|
||||
header_lines: Vec<ParsedTextLine>,
|
||||
mnemonic_lines: Vec<ParsedTextLine>,
|
||||
bit_fields: InsnBitFields,
|
||||
}
|
||||
|
||||
impl InsnHeader {
|
||||
fn min_y(&self) -> f32 {
|
||||
self.bit_fields.box_min_y
|
||||
}
|
||||
fn write_xml(&self, parent: &mut xml_tree::Element) {
|
||||
let header = parent.sub_element("header".into(), []);
|
||||
header.text = "\n".into();
|
||||
header.tail = "\n".into();
|
||||
let title = header.sub_element("title".into(), []);
|
||||
title.tail = "\n".into();
|
||||
ParsedTextLine::write_xml_lines(&self.header_lines, title, false, false);
|
||||
let mnemonics = header.sub_element("mnemonics".into(), []);
|
||||
mnemonics.tail = "\n".into();
|
||||
ParsedTextLine::write_xml_lines(&self.mnemonic_lines, mnemonics, false, false);
|
||||
self.bit_fields.write_xml(header);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct Insn {
|
||||
headers: Vec<InsnHeader>,
|
||||
code_lines: Vec<ParsedTextLine>,
|
||||
desc_lines: Vec<ParsedTextLine>,
|
||||
sp_regs_altered: Option<InsnSpRegsAltered>,
|
||||
}
|
||||
|
||||
impl Insn {
|
||||
fn write_xml(&self, parent: &mut xml_tree::Element) {
|
||||
let insn = parent.sub_element("instruction".into(), []);
|
||||
insn.text = "\n".into();
|
||||
insn.tail = "\n".into();
|
||||
for header in &self.headers {
|
||||
header.write_xml(insn);
|
||||
}
|
||||
if !self.code_lines.is_empty() {
|
||||
let code = insn.sub_element("code".into(), []);
|
||||
code.tail = "\n".into();
|
||||
ParsedTextLine::write_xml_lines(&self.code_lines, code, false, false);
|
||||
}
|
||||
if !self.desc_lines.is_empty() {
|
||||
let desc = insn.sub_element("description".into(), []);
|
||||
desc.tail = "\n".into();
|
||||
ParsedTextLine::write_xml_lines(&self.desc_lines, desc, false, false);
|
||||
}
|
||||
if let Some(sp_regs_altered) = &self.sp_regs_altered {
|
||||
sp_regs_altered.write_xml(insn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Parser {
|
||||
pages: Pages,
|
||||
text_section: TextSection,
|
||||
insns: Vec<Insn>,
|
||||
}
|
||||
|
||||
impl Parser {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
pages: Pages::new(None),
|
||||
text_section: TextSection::first(),
|
||||
insns: Vec::new(),
|
||||
}
|
||||
}
|
||||
fn page(&mut self) -> Result<Rc<Page>, Box<dyn Error>> {
|
||||
Ok(self
|
||||
.pages
|
||||
.get(self.text_section.page_num)?
|
||||
.ok_or("page_num is out of range")?)
|
||||
}
|
||||
fn unprocessed_chars(
|
||||
&mut self,
|
||||
) -> Result<Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>, Box<dyn Error>> {
|
||||
Ok(self
|
||||
.page()?
|
||||
.unprocessed_chars
|
||||
.borrow_mut()
|
||||
.entry(self.text_section)
|
||||
.or_default()
|
||||
.clone())
|
||||
}
|
||||
fn pages_gen(
|
||||
file: &str,
|
||||
page_numbers: Option<Vec<NonZero<u32>>>,
|
||||
) -> Result<Box<dyn Iterator<Item = Result<Page, Box<dyn Error>>>>, Box<dyn Error>> {
|
||||
let page_numbers = page_numbers.map(|page_numbers| {
|
||||
let mut retval = Vec::from_iter(page_numbers.into_iter().map(|v| v.get() - 1));
|
||||
retval.sort();
|
||||
retval
|
||||
});
|
||||
let document = mupdf::Document::open(file)?;
|
||||
let pages: Vec<mupdf::Page> = document.pages().and_then(|pages| pages.collect())?;
|
||||
Ok(Box::new(pages.into_iter().enumerate().map(
|
||||
move |(i, page)| {
|
||||
let page_num = match &page_numbers {
|
||||
Some(page_numbers) => page_numbers[i] + 1,
|
||||
None => i as u32 + 1,
|
||||
};
|
||||
println!("page {page_num}");
|
||||
Ok(Page::from_mupdf_page(page_num, page)
|
||||
.map_err(|e| format!("error reading pdf page {page_num}: {e}"))?)
|
||||
},
|
||||
)))
|
||||
}
|
||||
fn parse_pdf<I: Iterator<Item = NonZero<u32>>>(
|
||||
&mut self,
|
||||
file: &str,
|
||||
page_numbers: Option<I>,
|
||||
) -> Result<(), Box<dyn Error>> {
|
||||
self.pages = Pages::new(Some(Self::pages_gen(
|
||||
file,
|
||||
page_numbers.map(|v| v.into_iter().collect()),
|
||||
)?));
|
||||
self.text_section = TextSection::first();
|
||||
loop {
|
||||
self.text_section = self.text_section.next();
|
||||
if self.pages.is_past_end(self.text_section.page_num)? {
|
||||
return Ok(());
|
||||
}
|
||||
if self.pages.get(self.text_section.page_num)?.is_some() {
|
||||
println!("section {:?}", self.text_section);
|
||||
self.note_text_section(Self::parse_text_section)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
fn note_text_section(
|
||||
&mut self,
|
||||
f: impl FnOnce(&mut Self) -> Result<(), Box<dyn Error>>,
|
||||
) -> Result<(), Box<dyn Error>> {
|
||||
let start_text_section = self.text_section;
|
||||
match f(self) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(e) => {
|
||||
let note = if self.text_section == start_text_section {
|
||||
format!("text_section={:?}", self.text_section)
|
||||
} else {
|
||||
format!(
|
||||
"start_text_section={start_text_section:?}\ntext_section={:?}",
|
||||
self.text_section
|
||||
)
|
||||
};
|
||||
Err(format!("{e}\nnote: {note}").into())
|
||||
}
|
||||
}
|
||||
}
|
||||
fn parse_text_section(&mut self) -> Result<(), Box<dyn Error>> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct MyDevice {
|
||||
chars: Rc<RefCell<Vec<Char>>>,
|
||||
}
|
||||
|
||||
impl MyDevice {
|
||||
fn text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix) {
|
||||
for span in text.spans() {
|
||||
let span_font = span.font();
|
||||
let font_name = span_font.name();
|
||||
const ROUND_FACTOR: f32 = 1000.0;
|
||||
let Some(size) =
|
||||
NonNaNF32::new((span.trm().expansion() * ROUND_FACTOR).round() / ROUND_FACTOR)
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
let font = Font::new(font_name, size);
|
||||
for item in span.items() {
|
||||
let Some(ch) = u32::try_from(item.ucs())
|
||||
.ok()
|
||||
.and_then(|v| char::try_from(v).ok())
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
let mut m = span.trm();
|
||||
m.e = item.x();
|
||||
m.f = item.y();
|
||||
m.concat(cmt);
|
||||
let (min_x, min_y, max_x, max_y) = match span.wmode() {
|
||||
mupdf::WriteMode::Horizontal => {
|
||||
todo!();
|
||||
}
|
||||
mupdf::WriteMode::Vertical => todo!(),
|
||||
};
|
||||
self.chars.borrow_mut().push(Char {
|
||||
font,
|
||||
text: String::from(ch),
|
||||
min_x,
|
||||
min_y,
|
||||
max_x,
|
||||
max_y,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
qt: Rc<RefCell<BTreeMap<TextSection, QuadTree<PageItem>>>>,
|
||||
unprocessed_non_text: Rc<RefCell<IndexSet<LineOrRect>>>,
|
||||
}
|
||||
|
||||
impl mupdf::NativeDevice for MyDevice {
|
||||
|
|
@ -1815,48 +2022,190 @@ impl mupdf::NativeDevice for MyDevice {
|
|||
) {
|
||||
// TODO
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_text(
|
||||
&mut self,
|
||||
text: &mupdf::Text,
|
||||
cmt: mupdf::Matrix,
|
||||
_color_space: &mupdf::Colorspace,
|
||||
_color: &[f32],
|
||||
_alpha: f32,
|
||||
_cp: mupdf::ColorParams,
|
||||
) {
|
||||
self.text(text, cmt);
|
||||
}
|
||||
#[derive(serde::Deserialize, Debug)]
|
||||
enum MuPdfXml<'a> {
|
||||
#[serde(rename = "page")]
|
||||
Page(MuPdfXmlPage<'a>),
|
||||
}
|
||||
|
||||
fn stroke_text(
|
||||
&mut self,
|
||||
text: &mupdf::Text,
|
||||
_stroke_state: &mupdf::StrokeState,
|
||||
cmt: mupdf::Matrix,
|
||||
_color_space: &mupdf::Colorspace,
|
||||
_color: &[f32],
|
||||
_alpha: f32,
|
||||
_cp: mupdf::ColorParams,
|
||||
) {
|
||||
self.text(text, cmt);
|
||||
}
|
||||
#[derive(serde::Deserialize, Debug)]
|
||||
struct MuPdfXmlPage<'a> {
|
||||
#[serde(rename = "@id")]
|
||||
id: Cow<'a, str>,
|
||||
#[serde(rename = "@width")]
|
||||
width: f32,
|
||||
#[serde(rename = "@height")]
|
||||
height: f32,
|
||||
block: Vec<MuPdfXmlBlock<'a>>,
|
||||
}
|
||||
|
||||
fn clip_text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix, _scissor: mupdf::Rect) {
|
||||
self.text(text, cmt);
|
||||
}
|
||||
#[derive(serde::Deserialize, Debug)]
|
||||
struct MuPdfXmlBlock<'a> {
|
||||
#[serde(rename = "@bbox")]
|
||||
bbox: [f32; 4],
|
||||
#[serde(rename = "@justify")]
|
||||
justify: Cow<'a, str>,
|
||||
line: Vec<MuPdfXmlLine<'a>>,
|
||||
}
|
||||
|
||||
fn clip_stroke_text(
|
||||
&mut self,
|
||||
text: &mupdf::Text,
|
||||
_stroke_state: &mupdf::StrokeState,
|
||||
cmt: mupdf::Matrix,
|
||||
_scissor: mupdf::Rect,
|
||||
) {
|
||||
self.text(text, cmt);
|
||||
}
|
||||
#[derive(serde::Deserialize, Debug)]
|
||||
struct MuPdfXmlLine<'a> {
|
||||
#[serde(rename = "@bbox")]
|
||||
bbox: [f32; 4],
|
||||
#[serde(rename = "@wmode")]
|
||||
wmode: u8,
|
||||
#[serde(rename = "@dir")]
|
||||
dir: [f32; 2],
|
||||
#[serde(rename = "@text")]
|
||||
text: Cow<'a, str>,
|
||||
font: Vec<MuPdfXmlFont<'a>>,
|
||||
}
|
||||
|
||||
fn ignore_text(&mut self, text: &mupdf::Text, cmt: mupdf::Matrix) {
|
||||
self.text(text, cmt);
|
||||
#[derive(serde::Deserialize, Debug)]
|
||||
struct MuPdfXmlFont<'a> {
|
||||
#[serde(rename = "@name")]
|
||||
name: Cow<'a, str>,
|
||||
#[serde(rename = "@size")]
|
||||
size: f32,
|
||||
char: Vec<MuPdfXmlChar<'a>>,
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize, Debug)]
|
||||
struct MuPdfXmlChar<'a> {
|
||||
#[serde(rename = "@quad")]
|
||||
quad: [f32; 8],
|
||||
#[serde(rename = "@x")]
|
||||
x: f32,
|
||||
#[serde(rename = "@y")]
|
||||
y: f32,
|
||||
#[serde(rename = "@bidi")]
|
||||
bidi: u16,
|
||||
#[serde(rename = "@color")]
|
||||
color: Cow<'a, str>,
|
||||
#[serde(rename = "@alpha")]
|
||||
alpha: Cow<'a, str>,
|
||||
#[serde(rename = "@flags")]
|
||||
flags: u32,
|
||||
#[serde(rename = "@c")]
|
||||
c: Cow<'a, str>,
|
||||
}
|
||||
|
||||
impl Page {
|
||||
fn from_mupdf_page(
|
||||
page_num: u32,
|
||||
page: mupdf::Page,
|
||||
) -> Result<Self, Box<dyn std::error::Error>> {
|
||||
let device = MyDevice::default();
|
||||
page.run(
|
||||
&mupdf::Device::from_native(device.clone())?,
|
||||
&mupdf::Matrix::IDENTITY,
|
||||
)?;
|
||||
let MyDevice {
|
||||
qt,
|
||||
unprocessed_non_text,
|
||||
} = device;
|
||||
let mut qt = Rc::try_unwrap(qt)
|
||||
.ok()
|
||||
.expect("already dropped all other references")
|
||||
.into_inner();
|
||||
let unprocessed_chars: Rc<
|
||||
RefCell<BTreeMap<TextSection, Rc<RefCell<BTreeMap<Font, IndexSet<Char>>>>>>,
|
||||
> = Rc::default();
|
||||
// we convert to xml and parse that becuase the mupdf rust crate doesn't include all the API surface we need.
|
||||
let xml = page.to_xml()?;
|
||||
let MuPdfXml::Page(xml_page) = quick_xml::de::from_str(&xml)?;
|
||||
for xml_block in xml_page.block {
|
||||
for xml_line in xml_block.line {
|
||||
for xml_font in xml_line.font {
|
||||
const ROUND_FACTOR: f32 = 1000.0;
|
||||
let font_size = (xml_font.size * ROUND_FACTOR).round() / ROUND_FACTOR;
|
||||
let font_size = NonNaNF32::new(font_size).ok_or("font size must not be NaN")?;
|
||||
let font = Font::new(&xml_font.name, font_size);
|
||||
for xml_char in xml_font.char {
|
||||
let [x0, y0, x1, y1, x2, y2, x3, y3] = xml_char.quad;
|
||||
let min_x = x0.min(x1).min(x2).min(x3);
|
||||
let max_x = x0.max(x1).max(x2).max(x3);
|
||||
let min_y = y0.min(y1).min(y2).min(y3);
|
||||
let max_y = y0.max(y1).max(y2).max(y3);
|
||||
let Some(text_section) = TextSection::for_position(
|
||||
page_num,
|
||||
(min_x + max_x) * 0.5,
|
||||
(min_y + max_y) * 0.5,
|
||||
) else {
|
||||
if PAGE_BODY_MIN_Y <= min_y && min_y <= PAGE_BODY_MAX_Y {
|
||||
if page_num != 1072 {
|
||||
// page 1072 has characters in the margins
|
||||
return Err(
|
||||
format!("char not in text section: {xml_char:?}\npage_num={page_num}").into(),
|
||||
);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
};
|
||||
let char = Char {
|
||||
font: font.clone(),
|
||||
text: xml_char.c.into_owned(),
|
||||
min_x: NonNaNF32::new(min_x).ok_or("char position shouldn't be NaN")?,
|
||||
min_y: NonNaNF32::new(min_y).ok_or("char position shouldn't be NaN")?,
|
||||
max_x: NonNaNF32::new(max_x).ok_or("char position shouldn't be NaN")?,
|
||||
max_y: NonNaNF32::new(max_y).ok_or("char position shouldn't be NaN")?,
|
||||
};
|
||||
qt.entry(text_section).or_default().insert(
|
||||
min_x,
|
||||
min_y,
|
||||
PageItem::Char(char.clone()),
|
||||
);
|
||||
unprocessed_chars
|
||||
.borrow_mut()
|
||||
.entry(text_section)
|
||||
.or_default()
|
||||
.borrow_mut()
|
||||
.entry(char.font.clone())
|
||||
.or_default()
|
||||
.insert(char);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for i in unprocessed_chars.borrow_mut().values_mut() {
|
||||
for j in i.borrow_mut().values_mut() {
|
||||
j.sort_by_key(Char::top_down_left_to_right_sort_key);
|
||||
}
|
||||
}
|
||||
let mut unknown_fonts = Vec::new();
|
||||
let mut unknown_font_errors = Vec::new();
|
||||
for i in RefCell::borrow(&unprocessed_chars).values() {
|
||||
for (font, chars) in RefCell::borrow(i).iter() {
|
||||
if font.known_font_group().is_none() {
|
||||
let mut text = String::new();
|
||||
for char in chars {
|
||||
text += &char.text;
|
||||
}
|
||||
unknown_fonts.push(format!("{font:?},"));
|
||||
unknown_font_errors.push(format!(
|
||||
"unknown font {font:?}\nlast char: {:?}\ntext: {text:?}",
|
||||
chars.last()
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
unknown_fonts.sort();
|
||||
if !unknown_fonts.is_empty() {
|
||||
return Err(format!(
|
||||
"\nunknown fonts:\n{}\n\n{}",
|
||||
unknown_fonts.join("\n"),
|
||||
unknown_font_errors.join("\n")
|
||||
)
|
||||
.into());
|
||||
}
|
||||
Ok(Self {
|
||||
page_num,
|
||||
qt,
|
||||
unprocessed_chars,
|
||||
unprocessed_non_text,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1880,20 +2229,25 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
} else {
|
||||
None
|
||||
};
|
||||
let document = mupdf::Document::open(&args[1])?;
|
||||
let pages: Vec<_> = document.pages()?.collect::<Result<_, _>>()?;
|
||||
let page_numbers = page_numbers.unwrap_or_else(|| {
|
||||
Box::new(
|
||||
(0..pages.len()).map(|i| NonZero::new((i + 1) as u32).expect("known to be non-zero")),
|
||||
)
|
||||
});
|
||||
for page_num in page_numbers {
|
||||
let device = MyDevice::default();
|
||||
pages[page_num.get() as usize - 1].run(
|
||||
&mupdf::Device::from_native(device.clone())?,
|
||||
&mupdf::Matrix::IDENTITY,
|
||||
)?;
|
||||
println!("{device:?}");
|
||||
let mut parser = Parser::new();
|
||||
let is_subset = page_numbers.is_some();
|
||||
let file_name = &args[1];
|
||||
parser.parse_pdf(file_name, page_numbers)?;
|
||||
let mut insns = xml_tree::Element::new(
|
||||
"instructions".into(),
|
||||
[("is-subset".into(), is_subset.to_string())],
|
||||
);
|
||||
insns.text = "\n".into();
|
||||
insns.tail = "\n".into();
|
||||
let mut comment =
|
||||
xml_tree::Element::comment(format!(" Automatically generated from {file_name} "));
|
||||
comment.tail = "\n".into();
|
||||
insns.children.push(comment);
|
||||
for insn in parser.insns {
|
||||
insn.write_xml(&mut insns);
|
||||
}
|
||||
let mut output = Vec::new();
|
||||
insns.write(&mut output, true)?;
|
||||
std::fs::write("powerisa-instructions.xml", output)?;
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
103
src/xml_tree.rs
103
src/xml_tree.rs
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
use quick_xml::{
|
||||
Writer,
|
||||
events::{BytesText, Event},
|
||||
events::{BytesDecl, BytesText, Event},
|
||||
};
|
||||
use std::fmt;
|
||||
|
||||
|
|
@ -110,47 +110,7 @@ pub(crate) struct Element {
|
|||
impl fmt::Display for Element {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let mut writer = Writer::new(FmtToIoAdaptor::new(f));
|
||||
fn helper(
|
||||
element: &Element,
|
||||
writer: &mut Writer<impl std::io::Write>,
|
||||
) -> std::io::Result<()> {
|
||||
let Element {
|
||||
tag,
|
||||
attrib,
|
||||
text,
|
||||
children,
|
||||
tail,
|
||||
} = element;
|
||||
match tag {
|
||||
ElementTag::Comment => {
|
||||
writer.write_event(Event::Comment(BytesText::new(text)))?;
|
||||
}
|
||||
ElementTag::Normal(tag) if tag.is_empty() => {
|
||||
writer.write_event(Event::Text(BytesText::new(text)))?;
|
||||
}
|
||||
ElementTag::Normal(tag) => {
|
||||
let mut element_writer = writer.create_element(tag);
|
||||
for (name, value) in attrib {
|
||||
element_writer =
|
||||
element_writer.with_attribute((name.as_str(), value.as_str()));
|
||||
}
|
||||
if text.is_empty() && children.is_empty() {
|
||||
element_writer.write_empty()?;
|
||||
} else {
|
||||
element_writer.write_inner_content(|writer| {
|
||||
writer.write_event(Event::Text(BytesText::new(text)))?;
|
||||
for child in children {
|
||||
helper(child, writer)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
writer.write_event(Event::Text(BytesText::new(tail)))?;
|
||||
Ok(())
|
||||
}
|
||||
helper(self, &mut writer).map_err(|_| fmt::Error)?;
|
||||
self.write_to(&mut writer).map_err(|_| fmt::Error)?;
|
||||
writer.into_inner().finish()?;
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -166,6 +126,16 @@ impl Element {
|
|||
tail: String::new(),
|
||||
}
|
||||
}
|
||||
/// equivalent of python's `xml.etree.ElementTree.Comment()`
|
||||
pub(crate) fn comment(text: String) -> Self {
|
||||
Self {
|
||||
tag: ElementTag::Comment,
|
||||
attrib: Vec::new(),
|
||||
text,
|
||||
children: Vec::new(),
|
||||
tail: String::new(),
|
||||
}
|
||||
}
|
||||
/// equivalent to python `"".join(self.itertext())`
|
||||
pub(crate) fn inner_text(&self) -> String {
|
||||
let mut retval = String::new();
|
||||
|
|
@ -198,4 +168,53 @@ impl Element {
|
|||
self.children.push(Self::new(tag, attrib));
|
||||
self.children.last_mut().expect("just pushed")
|
||||
}
|
||||
pub(crate) fn write_to(&self, writer: &mut Writer<impl std::io::Write>) -> std::io::Result<()> {
|
||||
let Element {
|
||||
tag,
|
||||
attrib,
|
||||
text,
|
||||
children,
|
||||
tail,
|
||||
} = self;
|
||||
match tag {
|
||||
ElementTag::Comment => {
|
||||
writer.write_event(Event::Comment(BytesText::new(text)))?;
|
||||
}
|
||||
ElementTag::Normal(tag) if tag.is_empty() => {
|
||||
writer.write_event(Event::Text(BytesText::new(text)))?;
|
||||
}
|
||||
ElementTag::Normal(tag) => {
|
||||
let mut element_writer = writer.create_element(tag);
|
||||
for (name, value) in attrib {
|
||||
element_writer = element_writer.with_attribute((name.as_str(), value.as_str()));
|
||||
}
|
||||
if text.is_empty() && children.is_empty() {
|
||||
element_writer.write_empty()?;
|
||||
} else {
|
||||
element_writer.write_inner_content(|writer| {
|
||||
writer.write_event(Event::Text(BytesText::new(text)))?;
|
||||
for child in children {
|
||||
child.write_to(writer)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
writer.write_event(Event::Text(BytesText::new(tail)))?;
|
||||
Ok(())
|
||||
}
|
||||
/// equivalent of python's `xml.etree.ElementTree(self).write(writer, encoding='utf-8', xml_declaration=xml_declaration)`
|
||||
pub(crate) fn write(
|
||||
&self,
|
||||
writer: impl std::io::Write,
|
||||
xml_declaration: bool,
|
||||
) -> std::io::Result<()> {
|
||||
let mut writer = Writer::new(writer);
|
||||
if xml_declaration {
|
||||
writer.write_event(Event::Decl(BytesDecl::new("1.0", Some("utf-8"), None)))?;
|
||||
writer.write_event(Event::Text(BytesText::new("\n")))?;
|
||||
}
|
||||
self.write_to(&mut writer)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue