325 lines
14 KiB
Rust
325 lines
14 KiB
Rust
use std::{collections::BTreeMap, sync::Arc};
|
|
|
|
use crate::{
|
|
pdf::{
|
|
font::{
|
|
PdfFontToUnicode,
|
|
type_1_parse::{PsFile, Token},
|
|
},
|
|
object::{PdfName, PdfObjectDirect, PdfString},
|
|
parse::{PdfInputPosition, PdfParseError},
|
|
},
|
|
util::ArcOrRef,
|
|
};
|
|
|
|
/// Parser for a PDF `ToUnicode` CMap stream — the embedded PostScript program
/// that maps a font's character codes to Unicode text.
///
/// Construct with [`ToUnicodeParser::new`] over a [`PsFile`] tokenizer, then
/// consume with [`ToUnicodeParser::parse`].
pub(crate) struct ToUnicodeParser {
    // PostScript token stream the parser consumes; all `expect_*` helpers pull
    // tokens from here.
    tokenizer: PsFile,
}
|
|
|
|
#[track_caller]
|
|
fn invalid_token_err<T>(pos: PdfInputPosition, token: Option<Token>) -> Result<T, PdfParseError> {
|
|
Err(PdfParseError::InvalidTokenInToUnicodeStream {
|
|
pos,
|
|
token: format!("{token:?}"),
|
|
})
|
|
}
|
|
|
|
impl ToUnicodeParser {
    /// Creates a parser over an already-constructed PostScript tokenizer.
    pub(crate) fn new(tokenizer: PsFile) -> Self {
        Self { tokenizer }
    }

    /// Consumes the next token, requiring a string of any length.
    ///
    /// Skips comments/whitespace first; returns the string's raw bytes, or an
    /// `InvalidTokenInToUnicodeStream` error for any other token (or EOF).
    fn expect_any_string(&mut self) -> Result<Vec<u8>, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::String(string)) => Ok(string),
            token => invalid_token_err(pos, token),
        }
    }

    /// Consumes the next token, requiring a string of exactly `expected_len`
    /// bytes; any other token, or a string of the wrong length, is an error.
    pub(crate) fn expect_string_with_len(
        &mut self,
        expected_len: usize,
    ) -> Result<Vec<u8>, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::String(string)) if string.len() == expected_len => Ok(string),
            token => invalid_token_err(pos, token),
        }
    }

    /// Consumes the next token, requiring the literal (slash-prefixed) name
    /// `expected_name`, e.g. `/CIDInit`.
    pub(crate) fn expect_literal_name(
        &mut self,
        expected_name: &[u8],
    ) -> Result<(), PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::LiteralName(name)) if name == expected_name => Ok(()),
            token => invalid_token_err(pos, token),
        }
    }

    /// Consumes the next token, requiring a literal name of any value, and
    /// returns its bytes (used for the CMap's `/CMapName`).
    pub(crate) fn expect_any_literal_name(&mut self) -> Result<Vec<u8>, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::LiteralName(name)) => Ok(name),
            token => invalid_token_err(pos, token),
        }
    }

    /// Consumes the next token, requiring the executable (bare) name
    /// `expected_name`, e.g. `begincmap`.
    pub(crate) fn expect_executable_name(
        &mut self,
        expected_name: &[u8],
    ) -> Result<(), PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::ExecutableName(name)) if name == expected_name => Ok(()),
            token => invalid_token_err(pos, token),
        }
    }

    /// Consumes the next token, requiring it to equal `expected_token` exactly.
    pub(crate) fn expect(&mut self, expected_token: Token) -> Result<(), PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(token) if token == expected_token => Ok(()),
            token => invalid_token_err(pos, token),
        }
    }

    /// Consumes the next token, requiring an integer, and returns its value.
    pub(crate) fn expect_integer(&mut self) -> Result<i128, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::Integer(value)) => Ok(value),
            token => invalid_token_err(pos, token),
        }
    }

    /// Parses a PostScript dictionary `<< /Key value ... >>`, invoking
    /// `entry_callback(name, value_pos, value)` for each key/value pair.
    ///
    /// Keys must be literal names; each key must be followed by exactly one
    /// token as its value (composite values spanning multiple tokens are not
    /// handled here). Returns when `>>` is reached, or errors on any other
    /// token where a key was expected, or on EOF where a value was expected.
    pub(crate) fn parse_dict(
        &mut self,
        mut entry_callback: impl FnMut(Vec<u8>, PdfInputPosition, Token) -> Result<(), PdfParseError>,
    ) -> Result<(), PdfParseError> {
        self.expect(Token::DictStart)?;
        loop {
            self.tokenizer.skip_comments_and_whitespace();
            let name_pos = self.tokenizer.pos();
            match self.tokenizer.next_token()? {
                Some(Token::DictEnd) => return Ok(()),
                Some(Token::LiteralName(name)) => {
                    self.tokenizer.skip_comments_and_whitespace();
                    let value_pos = self.tokenizer.pos();
                    // EOF in value position is reported as an invalid (missing) token.
                    let Some(value) = self.tokenizer.next_token()? else {
                        return invalid_token_err(value_pos, None);
                    };
                    entry_callback(name, value_pos, value)?;
                }
                token => {
                    return invalid_token_err(name_pos, token);
                }
            }
        }
    }

    /// Parses a complete `ToUnicode` CMap stream and returns the resulting
    /// code→Unicode mapping.
    ///
    /// Expects the conventional wrapper emitted by PDF producers:
    /// `/CIDInit /ProcSet findresource begin N dict begin begincmap ...
    /// endcmap CMapName currentdict /CMap defineresource pop end end`,
    /// with exactly one codespace range and a `/CMapType` of 2.
    ///
    /// `base_map` is stored unchanged on the returned [`PdfFontToUnicode`]
    /// (per the `/UseCMap`-style layering implied by the field; the base map
    /// is not consulted during parsing).
    ///
    /// # Errors
    /// Returns `PdfParseError` on any token that deviates from the expected
    /// structure, on UTF-16 decode failure of a destination string, or if
    /// tokens remain after the final `end`.
    ///
    /// # Panics
    /// Several not-yet-implemented shapes hit `todo!()`: unknown
    /// `/CIDSystemInfo` keys, odd-length destination strings, and any
    /// section other than `beginbfrange`/`beginbfchar`/`endcmap`.
    /// NOTE(review): since CMap streams come from untrusted PDFs, these
    /// panics may be worth converting to errors once the cases are triaged.
    pub(crate) fn parse(
        mut self,
        base_map: Option<PdfObjectDirect>,
    ) -> Result<PdfFontToUnicode, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        // Standard CIDInit preamble: /CIDInit /ProcSet findresource begin
        // N dict begin begincmap.
        self.expect_literal_name(b"CIDInit")?;
        self.expect_literal_name(b"ProcSet")?;
        self.expect_executable_name(b"findresource")?;
        self.expect_executable_name(b"begin")?;
        // Dictionary capacity operand of `N dict`; value is irrelevant here.
        self.expect_integer()?;
        self.expect_executable_name(b"dict")?;
        self.expect_executable_name(b"begin")?;
        self.expect_executable_name(b"begincmap")?;
        // /CIDSystemInfo << /Registry (…) /Ordering (…) /Supplement n >> def
        self.expect_literal_name(b"CIDSystemInfo")?;
        // NOTE(review): these three are validated and captured but never read
        // afterwards — confirm whether they should flow into the result.
        let mut registry = None;
        let mut ordering = None;
        let mut supplement = None;
        self.parse_dict(|name, value_pos, value| match &*name {
            b"Registry" => {
                let Token::String(v) = value else {
                    return invalid_token_err(value_pos, Some(value));
                };
                registry = Some(v);
                Ok(())
            }
            b"Ordering" => {
                let Token::String(v) = value else {
                    return invalid_token_err(value_pos, Some(value));
                };
                ordering = Some(v);
                Ok(())
            }
            b"Supplement" => {
                let Token::Integer(v) = value else {
                    return invalid_token_err(value_pos, Some(value));
                };
                supplement = Some(v);
                Ok(())
            }
            // Unknown CIDSystemInfo keys are unimplemented and panic.
            _ => todo!("{}: {value:?}", name.escape_ascii()),
        })?;
        self.expect_executable_name(b"def")?;
        // /CMapName /<name> def — the name itself is preserved in the result.
        self.expect_literal_name(b"CMapName")?;
        self.tokenizer.skip_comments_and_whitespace();
        let char_map_name_pos = self.tokenizer.pos();
        let char_map_name = self.expect_any_literal_name()?;
        self.expect_executable_name(b"def")?;
        // /CMapType 2 def — only type 2 (the ToUnicode kind) is accepted.
        self.expect_literal_name(b"CMapType")?;
        self.expect(Token::Integer(2))?;
        self.expect_executable_name(b"def")?;
        // `1 begincodespacerange` — exactly one codespace range is supported.
        self.expect(Token::Integer(1))?;
        self.expect_executable_name(b"begincodespacerange")?;
        self.tokenizer.skip_comments_and_whitespace();
        let range_start_pos = self.tokenizer.pos();
        let range_start = self.expect_any_string()?;
        // A non-empty range start is required; its length fixes the code width
        // and justifies every `expect("known to be non-empty")` below.
        if range_start.is_empty() {
            return invalid_token_err(range_start_pos, Some(Token::String(range_start)));
        }
        self.tokenizer.skip_comments_and_whitespace();
        let range_end_pos = self.tokenizer.pos();
        // Range end must match the start's byte length.
        let range_end = self.expect_string_with_len(range_start.len())?;
        self.expect_executable_name(b"endcodespacerange")?;
        let mut to_unicode_map: BTreeMap<PdfString, Arc<str>> = BTreeMap::new();
        // Scratch buffer reused across mappings to avoid re-allocating.
        let mut dest_str = String::new();
        // Decodes `dest_utf16_be` (UTF-16BE; callers guarantee even length, so
        // the chunk[1] index below cannot be out of bounds) and records
        // src → decoded-text in the map. Later entries overwrite earlier ones
        // for the same src code.
        let mut insert_mapping = |src_pos: PdfInputPosition,
                                  src: &[u8],
                                  dest_pos: PdfInputPosition,
                                  dest_utf16_be: &[u8]|
         -> Result<(), PdfParseError> {
            dest_str.clear();
            for ch in char::decode_utf16(
                dest_utf16_be
                    .chunks(2)
                    .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]])),
            ) {
                match ch {
                    Ok(ch) => dest_str.push(ch),
                    // Unpaired surrogate → hard error at the dest's position.
                    Err(_) => {
                        return Err(PdfParseError::InvalidUtf16 { pos: dest_pos });
                    }
                }
            }
            to_unicode_map.insert(
                PdfString::new(src_pos, ArcOrRef::Arc(src.into())),
                dest_str.as_str().into(),
            );
            Ok(())
        };
        // Mapping sections: `N beginbfrange … endbfrange`,
        // `N beginbfchar … endbfchar`, terminated by `endcmap`.
        loop {
            // NOTE(review): unlike the expect_* helpers, no
            // skip_comments_and_whitespace before these two next_token calls —
            // confirm next_token tolerates leading whitespace/comments itself.
            match self.tokenizer.next_token()? {
                Some(Token::Integer(size)) => match self.tokenizer.next_token()? {
                    Some(Token::ExecutableName(name)) if name == b"beginbfrange" => {
                        // Each entry: <srcLow> <srcHigh> (<dstString> | [<dst>...]).
                        for _ in 0..size {
                            self.tokenizer.skip_comments_and_whitespace();
                            let src_pos = self.tokenizer.pos();
                            let src_low = self.expect_string_with_len(range_start.len())?;
                            self.tokenizer.skip_comments_and_whitespace();
                            let src_high_pos = self.tokenizer.pos();
                            let src_high = self.expect_string_with_len(range_start.len())?;
                            // Only ranges varying in the final byte are
                            // supported: all leading bytes must match.
                            if src_low.split_last().map(|(_, prefix)| prefix)
                                != src_high.split_last().map(|(_, prefix)| prefix)
                            {
                                return invalid_token_err(
                                    src_high_pos,
                                    Some(Token::String(src_high)),
                                );
                            }
                            // Inclusive range over the final source byte
                            // (empty, and silently skipped, if high < low).
                            let src_last_range = *src_low.last().expect("known to be non-empty")
                                ..=*src_high.last().expect("known to be non-empty");
                            self.tokenizer.skip_comments_and_whitespace();
                            let dest_pos = self.tokenizer.pos();
                            match self.tokenizer.next_token()? {
                                // String form: dest is incremented in step
                                // with the source code.
                                Some(Token::String(dest))
                                    if dest.len() >= 2 && dest.len() % 2 == 0 =>
                                {
                                    let mut src = src_low;
                                    for (index, src_last_byte) in src_last_range.enumerate() {
                                        *src.last_mut().expect("known to be non-empty") =
                                            src_last_byte;
                                        let mut dest = dest.clone();
                                        // Non-empty: guarded by dest.len() >= 2.
                                        let [.., last] = &mut *dest else {
                                            unreachable!();
                                        };
                                        // NOTE(review): only the final dest
                                        // byte is bumped; this add can
                                        // overflow (debug panic / release
                                        // wrap) when the range crosses a
                                        // 256-boundary — TODO confirm inputs
                                        // can't trigger that.
                                        *last += index as u8;
                                        insert_mapping(src_pos, &src, dest_pos, &dest)?;
                                    }
                                }
                                // Odd-length dest strings are unimplemented.
                                Some(token @ Token::String(_)) => {
                                    todo!("odd number of dest bytes: {token:?}");
                                }
                                // Array form: one explicit dest string per
                                // source code in the range.
                                Some(Token::ArrayStart) => {
                                    let mut src = src_low;
                                    for src_last_byte in src_last_range {
                                        *src.last_mut().expect("known to be non-empty") =
                                            src_last_byte;
                                        self.tokenizer.skip_comments_and_whitespace();
                                        let dest_pos = self.tokenizer.pos();
                                        match self.tokenizer.next_token()? {
                                            Some(Token::String(dest))
                                                if dest.len() >= 2 && dest.len() % 2 == 0 =>
                                            {
                                                insert_mapping(src_pos, &src, dest_pos, &dest)?;
                                            }
                                            Some(token @ Token::String(_)) => {
                                                todo!("odd number of dest bytes: {token:?}");
                                            }
                                            token => return invalid_token_err(dest_pos, token),
                                        }
                                    }
                                    self.expect(Token::ArrayEnd)?;
                                }
                                token => return invalid_token_err(dest_pos, token),
                            }
                        }
                        self.expect_executable_name(b"endbfrange")?;
                    }
                    Some(Token::ExecutableName(name)) if name == b"beginbfchar" => {
                        // Each entry: <srcCode> <dstString>.
                        for _ in 0..size {
                            self.tokenizer.skip_comments_and_whitespace();
                            let src_pos = self.tokenizer.pos();
                            let src = self.expect_string_with_len(range_start.len())?;
                            self.tokenizer.skip_comments_and_whitespace();
                            let dest_pos = self.tokenizer.pos();
                            match self.tokenizer.next_token()? {
                                // NOTE(review): bfchar admits empty dest
                                // strings (only an even-length check) while
                                // bfrange requires len >= 2 — confirm the
                                // asymmetry is intentional.
                                Some(Token::String(dest)) if dest.len() % 2 == 0 => {
                                    insert_mapping(src_pos, &src, dest_pos, &dest)?;
                                }
                                Some(token @ Token::String(_)) => {
                                    todo!("odd number of dest bytes: {token:?}");
                                }
                                token => return invalid_token_err(dest_pos, token),
                            }
                        }
                        self.expect_executable_name(b"endbfchar")?;
                    }
                    // Other `N <op>` sections (e.g. cidrange) unimplemented.
                    token => todo!("{token:?}"),
                },
                Some(Token::ExecutableName(name)) if name == b"endcmap" => {
                    break;
                }
                // Any other top-level token in the section loop is unimplemented.
                token => todo!("{token:?}"),
            }
        }
        // Trailer: CMapName currentdict /CMap defineresource pop end end.
        self.expect_executable_name(b"CMapName")?;
        self.expect_executable_name(b"currentdict")?;
        self.expect_literal_name(b"CMap")?;
        self.expect_executable_name(b"defineresource")?;
        self.expect_executable_name(b"pop")?;
        self.expect_executable_name(b"end")?;
        self.expect_executable_name(b"end")?;
        self.tokenizer.skip_comments_and_whitespace();
        let eof_pos = self.tokenizer.pos();
        // Anything after the final `end` is rejected.
        if let token @ Some(_) = self.tokenizer.next_token()? {
            return invalid_token_err(eof_pos, token);
        }
        Ok(PdfFontToUnicode {
            base_map,
            char_map_name: PdfName::new(char_map_name_pos, Arc::<[u8]>::from(char_map_name)),
            // The single parsed codespace range, as an inclusive PdfString range.
            src_ranges: Arc::new([
                PdfString::new(range_start_pos, ArcOrRef::Arc(range_start.into()))
                    ..=PdfString::new(range_end_pos, ArcOrRef::Arc(range_end.into())),
            ]),
            to_unicode_map: Arc::new(to_unicode_map),
        })
    }
}
|