// parse_powerisa_pdf/src/pdf/font/to_unicode_parse.rs
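
//! Parser for ToUnicode CMap streams (PDF 32000-1:2008, section 9.10.3),
//! which map a font's character codes to Unicode text.
//!
//! The parser accepts streams of the fixed shape sketched below — an
//! illustrative minimal example matching what [`ToUnicodeParser::parse`]
//! expects, not a stream taken from a real PDF:
//!
//! ```text
//! /CIDInit /ProcSet findresource begin
//! 12 dict begin
//! begincmap
//! /CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def
//! /CMapName /Adobe-Identity-UCS def
//! /CMapType 2 def
//! 1 begincodespacerange
//! <0000> <FFFF>
//! endcodespacerange
//! 1 beginbfchar
//! <0041> <0041>
//! endbfchar
//! endcmap
//! CMapName currentdict /CMap defineresource pop
//! end
//! end
//! ```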

use std::{collections::BTreeMap, sync::Arc};

use crate::{
    pdf::{
        font::{
            PdfFontToUnicode,
            type_1_parse::{PsFile, Token},
        },
        object::{PdfName, PdfObjectDirect, PdfString},
        parse::{PdfInputPosition, PdfParseError},
    },
    util::ArcOrRef,
};
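
/// Streaming parser over the tokenized PostScript of a ToUnicode CMap;
/// see the module docs for the expected stream shape.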
pub(crate) struct ToUnicodeParser {
    tokenizer: PsFile,
}

/// Builds the standard `InvalidTokenInToUnicodeStream` error for an
/// unexpected (or missing, `None`) token at `pos`.
#[track_caller]
fn invalid_token_err<T>(pos: PdfInputPosition, token: Option<Token>) -> Result<T, PdfParseError> {
    Err(PdfParseError::InvalidTokenInToUnicodeStream {
        pos,
        token: format!("{token:?}"),
    })
}

impl ToUnicodeParser {
    pub(crate) fn new(tokenizer: PsFile) -> Self {
        Self { tokenizer }
    }
    /// Consumes the next token, which must be a string of any length.
    fn expect_any_string(&mut self) -> Result<Vec<u8>, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::String(string)) => Ok(string),
            token => invalid_token_err(pos, token),
        }
    }
    /// Consumes the next token, which must be a string of exactly `expected_len` bytes.
    pub(crate) fn expect_string_with_len(
        &mut self,
        expected_len: usize,
    ) -> Result<Vec<u8>, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::String(string)) if string.len() == expected_len => Ok(string),
            token => invalid_token_err(pos, token),
        }
    }
    /// Consumes the next token, which must be the literal name `/expected_name`.
    pub(crate) fn expect_literal_name(
        &mut self,
        expected_name: &[u8],
    ) -> Result<(), PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::LiteralName(name)) if name == expected_name => Ok(()),
            token => invalid_token_err(pos, token),
        }
    }
    /// Consumes the next token, which must be a literal name; returns its bytes.
    pub(crate) fn expect_any_literal_name(&mut self) -> Result<Vec<u8>, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::LiteralName(name)) => Ok(name),
            token => invalid_token_err(pos, token),
        }
    }
    /// Consumes the next token, which must be the executable name `expected_name`.
    pub(crate) fn expect_executable_name(
        &mut self,
        expected_name: &[u8],
    ) -> Result<(), PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::ExecutableName(name)) if name == expected_name => Ok(()),
            token => invalid_token_err(pos, token),
        }
    }
    /// Consumes the next token, which must equal `expected_token` exactly.
    pub(crate) fn expect(&mut self, expected_token: Token) -> Result<(), PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(token) if token == expected_token => Ok(()),
            token => invalid_token_err(pos, token),
        }
    }
    /// Consumes the next token, which must be an integer; returns its value.
    pub(crate) fn expect_integer(&mut self) -> Result<i128, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::Integer(value)) => Ok(value),
            token => invalid_token_err(pos, token),
        }
    }
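    /// Parses a PostScript dictionary (`<< /Key value ... >>`), invoking
    /// `entry_callback` once per key/value pair.
    ///
    /// A minimal usage sketch (hypothetical callback, mirroring the
    /// `/CIDSystemInfo` handling in [`ToUnicodeParser::parse`]):
    ///
    /// ```ignore
    /// let mut supplement = None;
    /// parser.parse_dict(|name, value_pos, value| match &*name {
    ///     b"Supplement" => match value {
    ///         Token::Integer(v) => {
    ///             supplement = Some(v);
    ///             Ok(())
    ///         }
    ///         _ => invalid_token_err(value_pos, Some(value)),
    ///     },
    ///     _ => Ok(()),
    /// })?;
    /// ```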
    pub(crate) fn parse_dict(
        &mut self,
        mut entry_callback: impl FnMut(Vec<u8>, PdfInputPosition, Token) -> Result<(), PdfParseError>,
    ) -> Result<(), PdfParseError> {
        self.expect(Token::DictStart)?;
        loop {
            self.tokenizer.skip_comments_and_whitespace();
            let name_pos = self.tokenizer.pos();
            match self.tokenizer.next_token()? {
                Some(Token::DictEnd) => return Ok(()),
                Some(Token::LiteralName(name)) => {
                    self.tokenizer.skip_comments_and_whitespace();
                    let value_pos = self.tokenizer.pos();
                    let Some(value) = self.tokenizer.next_token()? else {
                        return invalid_token_err(value_pos, None);
                    };
                    entry_callback(name, value_pos, value)?;
                }
                token => {
                    return invalid_token_err(name_pos, token);
                }
            }
        }
    }
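    /// Parses a complete ToUnicode CMap stream into a [`PdfFontToUnicode`],
    /// passing the optional `base_map` object through to the result unchanged.
    ///
    /// A minimal usage sketch (`ps_file` construction elided):
    ///
    /// ```ignore
    /// let to_unicode = ToUnicodeParser::new(ps_file).parse(None)?;
    /// for (code, text) in to_unicode.to_unicode_map.iter() {
    ///     println!("{code:?} -> {text:?}");
    /// }
    /// ```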
    pub(crate) fn parse(
        mut self,
        base_map: Option<PdfObjectDirect>,
    ) -> Result<PdfFontToUnicode, PdfParseError> {
        // Fixed CMap prologue:
        // `/CIDInit /ProcSet findresource begin N dict begin begincmap`.
        self.tokenizer.skip_comments_and_whitespace();
        self.expect_literal_name(b"CIDInit")?;
        self.expect_literal_name(b"ProcSet")?;
        self.expect_executable_name(b"findresource")?;
        self.expect_executable_name(b"begin")?;
        self.expect_integer()?;
        self.expect_executable_name(b"dict")?;
        self.expect_executable_name(b"begin")?;
        self.expect_executable_name(b"begincmap")?;
        // `/CIDSystemInfo << /Registry (...) /Ordering (...) /Supplement N >> def`.
        // The three entries are captured for validation but not otherwise used here.
        self.expect_literal_name(b"CIDSystemInfo")?;
        let mut registry = None;
        let mut ordering = None;
        let mut supplement = None;
        self.parse_dict(|name, value_pos, value| match &*name {
            b"Registry" => {
                let Token::String(v) = value else {
                    return invalid_token_err(value_pos, Some(value));
                };
                registry = Some(v);
                Ok(())
            }
            b"Ordering" => {
                let Token::String(v) = value else {
                    return invalid_token_err(value_pos, Some(value));
                };
                ordering = Some(v);
                Ok(())
            }
            b"Supplement" => {
                let Token::Integer(v) = value else {
                    return invalid_token_err(value_pos, Some(value));
                };
                supplement = Some(v);
                Ok(())
            }
            _ => todo!("{}: {value:?}", name.escape_ascii()),
        })?;
        self.expect_executable_name(b"def")?;
        self.expect_literal_name(b"CMapName")?;
        self.tokenizer.skip_comments_and_whitespace();
        let char_map_name_pos = self.tokenizer.pos();
        let char_map_name = self.expect_any_literal_name()?;
        self.expect_executable_name(b"def")?;
        // ToUnicode CMaps always have `/CMapType 2`.
        self.expect_literal_name(b"CMapType")?;
        self.expect(Token::Integer(2))?;
        self.expect_executable_name(b"def")?;
        // Only a single codespace range is supported:
        // `1 begincodespacerange <low> <high> endcodespacerange`,
        // with non-empty, equal-length endpoints.
        self.expect(Token::Integer(1))?;
        self.expect_executable_name(b"begincodespacerange")?;
        self.tokenizer.skip_comments_and_whitespace();
        let range_start_pos = self.tokenizer.pos();
        let range_start = self.expect_any_string()?;
        if range_start.is_empty() {
            return invalid_token_err(range_start_pos, Some(Token::String(range_start)));
        }
        self.tokenizer.skip_comments_and_whitespace();
        let range_end_pos = self.tokenizer.pos();
        let range_end = self.expect_string_with_len(range_start.len())?;
        self.expect_executable_name(b"endcodespacerange")?;
        let mut to_unicode_map: BTreeMap<PdfString, Arc<str>> = BTreeMap::new();
        let mut dest_str = String::new();
        // Decodes the UTF-16BE destination bytes and records `src -> dest`.
        let mut insert_mapping = |src_pos: PdfInputPosition,
                                  src: &[u8],
                                  dest_pos: PdfInputPosition,
                                  dest_utf16_be: &[u8]|
         -> Result<(), PdfParseError> {
            dest_str.clear();
            for ch in char::decode_utf16(
                dest_utf16_be
                    .chunks(2)
                    .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]])),
            ) {
                match ch {
                    Ok(ch) => dest_str.push(ch),
                    Err(_) => {
                        return Err(PdfParseError::InvalidUtf16 { pos: dest_pos });
                    }
                }
            }
            to_unicode_map.insert(
                PdfString::new(src_pos, ArcOrRef::Arc(src.into())),
                dest_str.as_str().into(),
            );
            Ok(())
        };
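        // The CMap body is a sequence of `N beginbfrange ... endbfrange` and
        // `N beginbfchar ... endbfchar` sections, terminated by `endcmap`.
        // An illustrative fragment (shapes modeled on the PDF specification's
        // ToUnicode examples, not taken from an actual input):
        //
        //   2 beginbfrange
        //   <0000> <005E> <0020>
        //   <005F> <0061> [<00660066> <00660069> <00660066006C>]
        //   endbfrange
        //   1 beginbfchar
        //   <3A51> <D840DC3E>
        //   endbfchar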
        loop {
            match self.tokenizer.next_token()? {
                Some(Token::Integer(size)) => match self.tokenizer.next_token()? {
                    Some(Token::ExecutableName(name)) if name == b"beginbfrange" => {
                        for _ in 0..size {
                            // Each bfrange entry is `<src_low> <src_high> dest`, where
                            // the endpoints may differ only in their last byte.
                            self.tokenizer.skip_comments_and_whitespace();
                            let src_pos = self.tokenizer.pos();
                            let src_low = self.expect_string_with_len(range_start.len())?;
                            self.tokenizer.skip_comments_and_whitespace();
                            let src_high_pos = self.tokenizer.pos();
                            let src_high = self.expect_string_with_len(range_start.len())?;
                            if src_low.split_last().map(|(_, prefix)| prefix)
                                != src_high.split_last().map(|(_, prefix)| prefix)
                            {
                                return invalid_token_err(
                                    src_high_pos,
                                    Some(Token::String(src_high)),
                                );
                            }
                            let src_last_range = *src_low.last().expect("known to be non-empty")
                                ..=*src_high.last().expect("known to be non-empty");
                            self.tokenizer.skip_comments_and_whitespace();
                            let dest_pos = self.tokenizer.pos();
                            match self.tokenizer.next_token()? {
                                // `<src_low> <src_high> <dest>`: map each code in the
                                // range to `dest` with its last byte incremented in step.
                                Some(Token::String(dest))
                                    if dest.len() >= 2 && dest.len() % 2 == 0 =>
                                {
                                    let mut src = src_low;
                                    for (index, src_last_byte) in src_last_range.enumerate() {
                                        *src.last_mut().expect("known to be non-empty") =
                                            src_last_byte;
                                        let mut dest = dest.clone();
                                        let [.., last] = &mut *dest else {
                                            unreachable!();
                                        };
                                        *last += index as u8;
                                        insert_mapping(src_pos, &src, dest_pos, &dest)?;
                                    }
                                }
                                Some(token @ Token::String(_)) => {
                                    todo!("odd number of dest bytes: {token:?}");
                                }
                                // `<src_low> <src_high> [<dest> ...]`: one explicit
                                // destination string per code in the range.
                                Some(Token::ArrayStart) => {
                                    let mut src = src_low;
                                    for src_last_byte in src_last_range {
                                        *src.last_mut().expect("known to be non-empty") =
                                            src_last_byte;
                                        self.tokenizer.skip_comments_and_whitespace();
                                        let dest_pos = self.tokenizer.pos();
                                        match self.tokenizer.next_token()? {
                                            Some(Token::String(dest))
                                                if dest.len() >= 2 && dest.len() % 2 == 0 =>
                                            {
                                                insert_mapping(src_pos, &src, dest_pos, &dest)?;
                                            }
                                            Some(token @ Token::String(_)) => {
                                                todo!("odd number of dest bytes: {token:?}");
                                            }
                                            token => return invalid_token_err(dest_pos, token),
                                        }
                                    }
                                    self.expect(Token::ArrayEnd)?;
                                }
                                token => return invalid_token_err(dest_pos, token),
                            }
                        }
                        self.expect_executable_name(b"endbfrange")?;
                    }
                    Some(Token::ExecutableName(name)) if name == b"beginbfchar" => {
                        // Each bfchar entry is a single `<src> <dest>` pair.
                        for _ in 0..size {
                            self.tokenizer.skip_comments_and_whitespace();
                            let src_pos = self.tokenizer.pos();
                            let src = self.expect_string_with_len(range_start.len())?;
                            self.tokenizer.skip_comments_and_whitespace();
                            let dest_pos = self.tokenizer.pos();
                            match self.tokenizer.next_token()? {
                                Some(Token::String(dest)) if dest.len() % 2 == 0 => {
                                    insert_mapping(src_pos, &src, dest_pos, &dest)?;
                                }
                                Some(token @ Token::String(_)) => {
                                    todo!("odd number of dest bytes: {token:?}");
                                }
                                token => return invalid_token_err(dest_pos, token),
                            }
                        }
                        self.expect_executable_name(b"endbfchar")?;
                    }
                    token => todo!("{token:?}"),
                },
                Some(Token::ExecutableName(name)) if name == b"endcmap" => {
                    break;
                }
                token => todo!("{token:?}"),
            }
        }
self.expect_executable_name(b"CMapName")?;
self.expect_executable_name(b"currentdict")?;
self.expect_literal_name(b"CMap")?;
self.expect_executable_name(b"defineresource")?;
self.expect_executable_name(b"pop")?;
self.expect_executable_name(b"end")?;
self.expect_executable_name(b"end")?;
self.tokenizer.skip_comments_and_whitespace();
let eof_pos = self.tokenizer.pos();
if let token @ Some(_) = self.tokenizer.next_token()? {
return invalid_token_err(eof_pos, token);
}
Ok(PdfFontToUnicode {
base_map,
char_map_name: PdfName::new(char_map_name_pos, Arc::<[u8]>::from(char_map_name)),
src_ranges: Arc::new([
PdfString::new(range_start_pos, ArcOrRef::Arc(range_start.into()))
..=PdfString::new(range_end_pos, ArcOrRef::Arc(range_end.into())),
]),
to_unicode_map: Arc::new(to_unicode_map),
})
}
}