use std::{collections::BTreeMap, sync::Arc};

use crate::{
    pdf::{
        font::{
            PdfFontToUnicode,
            type_1_parse::{PsFile, Token},
        },
        object::{PdfName, PdfObjectDirect, PdfString},
        parse::{PdfInputPosition, PdfParseError},
    },
    util::ArcOrRef,
};

pub(crate) struct ToUnicodeParser {
    tokenizer: PsFile,
}

#[track_caller]
fn invalid_token_err<T>(pos: PdfInputPosition, token: Option<Token>) -> Result<T, PdfParseError> {
    Err(PdfParseError::InvalidTokenInToUnicodeStream {
        pos,
        token: format!("{token:?}"),
    })
}

impl ToUnicodeParser {
    pub(crate) fn new(tokenizer: PsFile) -> Self {
        Self { tokenizer }
    }

    // Token-level helpers: each skips leading comments/whitespace so that
    // `pos()` points at the start of the token, then asserts the token's shape.
    fn expect_any_string(&mut self) -> Result<Vec<u8>, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::String(string)) => Ok(string),
            token => invalid_token_err(pos, token),
        }
    }

    pub(crate) fn expect_string_with_len(
        &mut self,
        expected_len: usize,
    ) -> Result<Vec<u8>, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::String(string)) if string.len() == expected_len => Ok(string),
            token => invalid_token_err(pos, token),
        }
    }

    pub(crate) fn expect_literal_name(
        &mut self,
        expected_name: &[u8],
    ) -> Result<(), PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::LiteralName(name)) if name == expected_name => Ok(()),
            token => invalid_token_err(pos, token),
        }
    }

    pub(crate) fn expect_any_literal_name(&mut self) -> Result<Vec<u8>, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::LiteralName(name)) => Ok(name),
            token => invalid_token_err(pos, token),
        }
    }

    pub(crate) fn expect_executable_name(
        &mut self,
        expected_name: &[u8],
    ) -> Result<(), PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::ExecutableName(name)) if name == expected_name => Ok(()),
            token => invalid_token_err(pos, token),
        }
    }

    pub(crate) fn expect(&mut self, expected_token: Token) -> Result<(), PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(token) if token == expected_token => Ok(()),
            token => invalid_token_err(pos, token),
        }
    }

    pub(crate) fn expect_integer(&mut self) -> Result<i64, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        match self.tokenizer.next_token()? {
            Some(Token::Integer(value)) => Ok(value),
            token => invalid_token_err(pos, token),
        }
    }

    pub(crate) fn parse_dict(
        &mut self,
        mut entry_callback: impl FnMut(Vec<u8>, PdfInputPosition, Token) -> Result<(), PdfParseError>,
    ) -> Result<(), PdfParseError> {
        self.expect(Token::DictStart)?;
        loop {
            self.tokenizer.skip_comments_and_whitespace();
            let name_pos = self.tokenizer.pos();
            match self.tokenizer.next_token()? {
                Some(Token::DictEnd) => return Ok(()),
                Some(Token::LiteralName(name)) => {
                    self.tokenizer.skip_comments_and_whitespace();
                    let value_pos = self.tokenizer.pos();
                    let Some(value) = self.tokenizer.next_token()? else {
                        return invalid_token_err(value_pos, None);
                    };
                    entry_callback(name, value_pos, value)?;
                }
                token => {
                    return invalid_token_err(name_pos, token);
                }
            }
        }
    }
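
    /// Parses a complete `/ToUnicode` CMap program laid out like the template
    /// in the PDF specification: the `/CIDInit /ProcSet findresource begin`
    /// prologue, `begincmap`, a `CIDSystemInfo` dictionary, `CMapName` and
    /// `CMapType 2` definitions, exactly one codespace range, any number of
    /// `bfchar`/`bfrange` blocks, then `endcmap` and the `defineresource`
    /// epilogue.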
    pub(crate) fn parse(
        mut self,
        base_map: Option<PdfObjectDirect>,
    ) -> Result<PdfFontToUnicode, PdfParseError> {
        self.tokenizer.skip_comments_and_whitespace();
        self.expect_literal_name(b"CIDInit")?;
        self.expect_literal_name(b"ProcSet")?;
        self.expect_executable_name(b"findresource")?;
        self.expect_executable_name(b"begin")?;
        self.expect_integer()?;
        self.expect_executable_name(b"dict")?;
        self.expect_executable_name(b"begin")?;
        self.expect_executable_name(b"begincmap")?;
        self.expect_literal_name(b"CIDSystemInfo")?;
        // CIDSystemInfo entries are validated for shape but currently unused.
        let mut registry = None;
        let mut ordering = None;
        let mut supplement = None;
        self.parse_dict(|name, value_pos, value| match &*name {
            b"Registry" => {
                let Token::String(v) = value else {
                    return invalid_token_err(value_pos, Some(value));
                };
                registry = Some(v);
                Ok(())
            }
            b"Ordering" => {
                let Token::String(v) = value else {
                    return invalid_token_err(value_pos, Some(value));
                };
                ordering = Some(v);
                Ok(())
            }
            b"Supplement" => {
                let Token::Integer(v) = value else {
                    return invalid_token_err(value_pos, Some(value));
                };
                supplement = Some(v);
                Ok(())
            }
            _ => todo!("{}: {value:?}", name.escape_ascii()),
        })?;
        self.expect_executable_name(b"def")?;
        self.expect_literal_name(b"CMapName")?;
        self.tokenizer.skip_comments_and_whitespace();
        let char_map_name_pos = self.tokenizer.pos();
        let char_map_name = self.expect_any_literal_name()?;
        self.expect_executable_name(b"def")?;
        self.expect_literal_name(b"CMapType")?;
        self.expect(Token::Integer(2))?;
        self.expect_executable_name(b"def")?;
        // Exactly one codespace range is supported; its byte length fixes the
        // source-code width for all bfchar/bfrange entries below.
        self.expect(Token::Integer(1))?;
        self.expect_executable_name(b"begincodespacerange")?;
        self.tokenizer.skip_comments_and_whitespace();
        let range_start_pos = self.tokenizer.pos();
        let range_start = self.expect_any_string()?;
        if range_start.is_empty() {
            return invalid_token_err(range_start_pos, Some(Token::String(range_start)));
        }
        self.tokenizer.skip_comments_and_whitespace();
        let range_end_pos = self.tokenizer.pos();
        let range_end = self.expect_string_with_len(range_start.len())?;
        self.expect_executable_name(b"endcodespacerange")?;
        let mut to_unicode_map: BTreeMap<PdfString, Arc<str>> = BTreeMap::new();
        let mut dest_str = String::new();
        // Decodes a UTF-16BE destination string and records the mapping.
        let mut insert_mapping = |src_pos: PdfInputPosition,
                                  src: &[u8],
                                  dest_pos: PdfInputPosition,
                                  dest_utf16_be: &[u8]|
         -> Result<(), PdfParseError> {
            dest_str.clear();
            for ch in char::decode_utf16(
                dest_utf16_be
                    .chunks(2)
                    .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]])),
            ) {
                match ch {
                    Ok(ch) => dest_str.push(ch),
                    Err(_) => {
                        return Err(PdfParseError::InvalidUtf16 { pos: dest_pos });
                    }
                }
            }
            to_unicode_map.insert(
                PdfString::new(src_pos, ArcOrRef::Arc(src.into())),
                dest_str.as_str().into(),
            );
            Ok(())
        };
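        // Mapping blocks are introduced by their entry count, e.g.
        // `2 beginbfrange ... endbfrange`; keep consuming blocks until
        // `endcmap`.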
        loop {
            match self.tokenizer.next_token()? {
                Some(Token::Integer(size)) => match self.tokenizer.next_token()? {
                    Some(Token::ExecutableName(name)) if name == b"beginbfrange" => {
                        for _ in 0..size {
                            self.tokenizer.skip_comments_and_whitespace();
                            let src_pos = self.tokenizer.pos();
                            let src_low = self.expect_string_with_len(range_start.len())?;
                            self.tokenizer.skip_comments_and_whitespace();
                            let src_high_pos = self.tokenizer.pos();
                            let src_high = self.expect_string_with_len(range_start.len())?;
                            // Only the last byte may vary across the range; the
                            // prefixes must match.
                            if src_low.split_last().map(|(_, prefix)| prefix)
                                != src_high.split_last().map(|(_, prefix)| prefix)
                            {
                                return invalid_token_err(
                                    src_high_pos,
                                    Some(Token::String(src_high)),
                                );
                            }
                            let src_last_range = *src_low.last().expect("known to be non-empty")
                                ..=*src_high.last().expect("known to be non-empty");
                            self.tokenizer.skip_comments_and_whitespace();
                            let dest_pos = self.tokenizer.pos();
                            match self.tokenizer.next_token()? {
                                // `<srcLow> <srcHigh> <dstBase>`: consecutive
                                // sources map to destinations whose last byte
                                // is incremented per step.
                                Some(Token::String(dest))
                                    if dest.len() >= 2 && dest.len() % 2 == 0 =>
                                {
                                    let mut src = src_low;
                                    for (index, src_last_byte) in src_last_range.enumerate() {
                                        *src.last_mut().expect("known to be non-empty") =
                                            src_last_byte;
                                        let mut dest = dest.clone();
                                        let [.., last] = &mut *dest else {
                                            unreachable!();
                                        };
                                        *last += index as u8;
                                        insert_mapping(src_pos, &src, dest_pos, &dest)?;
                                    }
                                }
                                Some(token @ Token::String(_)) => {
                                    todo!("odd number of dest bytes: {token:?}");
                                }
                                // `<srcLow> <srcHigh> [<dst> ...]`: one
                                // destination string per source code.
                                Some(Token::ArrayStart) => {
                                    let mut src = src_low;
                                    for src_last_byte in src_last_range {
                                        *src.last_mut().expect("known to be non-empty") =
                                            src_last_byte;
                                        self.tokenizer.skip_comments_and_whitespace();
                                        let dest_pos = self.tokenizer.pos();
                                        match self.tokenizer.next_token()? {
                                            Some(Token::String(dest))
                                                if dest.len() >= 2 && dest.len() % 2 == 0 =>
                                            {
                                                insert_mapping(src_pos, &src, dest_pos, &dest)?;
                                            }
                                            Some(token @ Token::String(_)) => {
                                                todo!("odd number of dest bytes: {token:?}");
                                            }
                                            token => return invalid_token_err(dest_pos, token),
                                        }
                                    }
                                    self.expect(Token::ArrayEnd)?;
                                }
                                token => return invalid_token_err(dest_pos, token),
                            }
                        }
                        self.expect_executable_name(b"endbfrange")?;
                    }
                    Some(Token::ExecutableName(name)) if name == b"beginbfchar" => {
                        for _ in 0..size {
                            self.tokenizer.skip_comments_and_whitespace();
                            let src_pos = self.tokenizer.pos();
                            let src = self.expect_string_with_len(range_start.len())?;
                            self.tokenizer.skip_comments_and_whitespace();
                            let dest_pos = self.tokenizer.pos();
                            match self.tokenizer.next_token()? {
                                Some(Token::String(dest)) if dest.len() % 2 == 0 => {
                                    insert_mapping(src_pos, &src, dest_pos, &dest)?;
                                }
                                Some(token @ Token::String(_)) => {
                                    todo!("odd number of dest bytes: {token:?}");
                                }
                                token => return invalid_token_err(dest_pos, token),
                            }
                        }
                        self.expect_executable_name(b"endbfchar")?;
                    }
                    token => todo!("{token:?}"),
                },
                Some(Token::ExecutableName(name)) if name == b"endcmap" => {
                    break;
                }
                token => todo!("{token:?}"),
            }
        }
        // Epilogue: `CMapName currentdict /CMap defineresource pop end end`,
        // then nothing but trailing comments/whitespace.
        self.expect_executable_name(b"CMapName")?;
        self.expect_executable_name(b"currentdict")?;
        self.expect_literal_name(b"CMap")?;
        self.expect_executable_name(b"defineresource")?;
        self.expect_executable_name(b"pop")?;
        self.expect_executable_name(b"end")?;
        self.expect_executable_name(b"end")?;
        self.tokenizer.skip_comments_and_whitespace();
        let eof_pos = self.tokenizer.pos();
        if let token @ Some(_) = self.tokenizer.next_token()? {
            return invalid_token_err(eof_pos, token);
        }
        Ok(PdfFontToUnicode {
            base_map,
            char_map_name: PdfName::new(char_map_name_pos, Arc::<[u8]>::from(char_map_name)),
            src_ranges: Arc::new([
                PdfString::new(range_start_pos, ArcOrRef::Arc(range_start.into()))
                    ..=PdfString::new(range_end_pos, ArcOrRef::Arc(range_end.into())),
            ]),
            to_unicode_map: Arc::new(to_unicode_map),
        })
    }
}
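
// For reference, a minimal input this parser accepts follows the standard
// ToUnicode CMap template; the concrete values below (the Adobe/UCS registry,
// the CMap name, the single bfchar entry) are illustrative only:
//
//   /CIDInit /ProcSet findresource begin
//   12 dict begin
//   begincmap
//   /CIDSystemInfo << /Registry (Adobe) /Ordering (UCS) /Supplement 0 >> def
//   /CMapName /Adobe-Identity-UCS def
//   /CMapType 2 def
//   1 begincodespacerange
//   <0000> <FFFF>
//   endcodespacerange
//   1 beginbfchar
//   <0041> <0041>
//   endbfchar
//   endcmap
//   CMapName currentdict /CMap defineresource pop
//   end
//   end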