diff --git a/src/pdf/font.rs b/src/pdf/font.rs index 04b62f3..2196bf6 100644 --- a/src/pdf/font.rs +++ b/src/pdf/font.rs @@ -1,5 +1,6 @@ use crate::{ pdf::{ + font::type_1_parse::PsFile, object::{ IsPdfNull, PdfArray, PdfDictionary, PdfMatrix, PdfName, PdfNameOrInteger, PdfObject, PdfObjectDirect, PdfRectangle, PdfStream, PdfString, @@ -15,26 +16,88 @@ use crate::{ use std::{borrow::Cow, collections::BTreeMap, fmt, sync::Arc}; mod tables; +mod to_unicode_parse; mod type_1_parse; pdf_parse! { - #[pdf(transparent)] - #[derive(Clone)] - // TODO: actually parse the stream - pub struct PdfFontToUnicode { - #[pdf] - stream: PdfStream, + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfFontToUnicodeDictionary { + #[pdf(name = "UseCMap")] + pub base_map: Option, // TODO: parse + #[pdf(flatten)] + pub rest: PdfDictionary, } } +#[derive(Clone)] +pub struct PdfFontToUnicode { + pub base_map: Option, // TODO: parse + pub char_map_name: PdfName, + pub src_ranges: Arc<[std::ops::RangeInclusive]>, + pub to_unicode_map: Arc>>, +} + impl fmt::Debug for PdfFontToUnicode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - DagDebugState::scope(|_state| { - let Self { stream } = self; - f.debug_struct("PdfFontToUnicode") - .field("stream", stream) - .finish() - }) + struct DebugFn) -> fmt::Result>(F); + impl) -> fmt::Result> fmt::Debug for DebugFn { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + (self.0)(f) + } + } + let Self { + base_map, + char_map_name, + src_ranges, + to_unicode_map, + } = self; + f.debug_struct("PdfFontToUnicode") + .field("base_map", base_map) + .field("char_map_name", char_map_name) + .field( + "src_ranges", + &DebugFn(|f| { + f.debug_set() + .entries( + src_ranges + .iter() + .map(|v| v.start().bytes_debug()..=v.end().bytes_debug()), + ) + .finish() + }), + ) + .field( + "to_unicode_map", + &DebugFn(|f| { + f.debug_map() + .entries(to_unicode_map.iter().map(|(k, v)| (k.bytes_debug(), v))) + .finish() + }), + ) + .finish() + } +} + +impl IsPdfNull for PdfFontToUnicode { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfFontToUnicode { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("PdfFontToUnicode") + } + fn parse(object: PdfObject) -> Result { + let stream = PdfStream::::parse(object)?; + let base_map = stream.dictionary().rest.base_map.clone(); + let decoded_data = stream.decoded_data().clone()?; + to_unicode_parse::ToUnicodeParser::new(PsFile::from_arc_bytes( + decoded_data, + stream.get_pdf_input_position(), + )) + .parse(base_map) } } @@ -858,6 +921,17 @@ pub struct PdfSimpleFontEncodingTable { pub table: ArcOrRef<'static, [PdfSimpleFontEncodingTableEntry; 0x100]>, } +impl PdfSimpleFontEncodingTable { + pub const fn empty() -> Self { + const EMPTY_ENTRY: PdfSimpleFontEncodingTableEntry = + PdfSimpleFontEncodingTableEntry::new_static(None, None); + const EMPTY_TABLE: &[PdfSimpleFontEncodingTableEntry; 0x100] = &[EMPTY_ENTRY; 0x100]; + Self { + table: ArcOrRef::Ref(EMPTY_TABLE), + } + } +} + #[derive(Clone, Debug)] pub enum PdfSimpleFontEncoding { Predefined(PdfSimpleFontEncodingPredefined), @@ -903,11 +977,12 @@ impl PdfParse for PdfSimpleFontEncoding { #[derive(Clone, Debug)] #[non_exhaustive] pub struct PdfFontType1Program { - pub encoding: Option]>>, - pub font_bbox: Option, + pub encoding: PdfSimpleFontEncodingTable, + pub font_bbox: PdfRectangle, pub font_info: Option, - pub font_matrix: Option, + pub font_matrix: PdfMatrix, pub font_name: Option, + pub vertical_writing_mode: bool, } #[derive(Clone, Debug)] diff --git a/src/pdf/font/to_unicode_parse.rs b/src/pdf/font/to_unicode_parse.rs new file mode 100644 index 0000000..e5c57be --- /dev/null +++ b/src/pdf/font/to_unicode_parse.rs @@ -0,0 +1,325 @@ +use std::{collections::BTreeMap, sync::Arc}; + +use crate::{ + pdf::{ + font::{ + PdfFontToUnicode, + type_1_parse::{PsFile, Token}, + }, + object::{PdfName, PdfObjectDirect, PdfString}, + parse::{PdfInputPosition, PdfParseError}, + }, + util::ArcOrRef, +}; + +pub(crate) struct ToUnicodeParser { + tokenizer: PsFile, +} + +#[track_caller] +fn invalid_token_err(pos: PdfInputPosition, token: Option) -> Result { + Err(PdfParseError::InvalidTokenInToUnicodeStream { + pos, + token: format!("{token:?}"), + }) +} + +impl ToUnicodeParser { + pub(crate) fn new(tokenizer: PsFile) -> Self { + Self { tokenizer } + } + fn expect_any_string(&mut self) -> Result, PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::String(string)) => Ok(string), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_string_with_len( + &mut self, + expected_len: usize, + ) -> Result, PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::String(string)) if string.len() == expected_len => Ok(string), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_literal_name( + &mut self, + expected_name: &[u8], + ) -> Result<(), PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::LiteralName(name)) if name == expected_name => Ok(()), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_any_literal_name(&mut self) -> Result, PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::LiteralName(name)) => Ok(name), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_executable_name( + &mut self, + expected_name: &[u8], + ) -> Result<(), PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::ExecutableName(name)) if name == expected_name => Ok(()), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect(&mut self, expected_token: Token) -> Result<(), PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(token) if token == expected_token => Ok(()), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_integer(&mut self) -> Result { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::Integer(value)) => Ok(value), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn parse_dict( + &mut self, + mut entry_callback: impl FnMut(Vec, PdfInputPosition, Token) -> Result<(), PdfParseError>, + ) -> Result<(), PdfParseError> { + self.expect(Token::DictStart)?; + loop { + self.tokenizer.skip_comments_and_whitespace(); + let name_pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::DictEnd) => return Ok(()), + Some(Token::LiteralName(name)) => { + self.tokenizer.skip_comments_and_whitespace(); + let value_pos = self.tokenizer.pos(); + let Some(value) = self.tokenizer.next_token()? else { + return invalid_token_err(value_pos, None); + }; + entry_callback(name, value_pos, value)?; + } + token => { + return invalid_token_err(name_pos, token); + } + } + } + } + pub(crate) fn parse( + mut self, + base_map: Option, + ) -> Result { + self.tokenizer.skip_comments_and_whitespace(); + self.expect_literal_name(b"CIDInit")?; + self.expect_literal_name(b"ProcSet")?; + self.expect_executable_name(b"findresource")?; + self.expect_executable_name(b"begin")?; + self.expect_integer()?; + self.expect_executable_name(b"dict")?; + self.expect_executable_name(b"begin")?; + self.expect_executable_name(b"begincmap")?; + self.expect_literal_name(b"CIDSystemInfo")?; + let mut registry = None; + let mut ordering = None; + let mut supplement = None; + self.parse_dict(|name, value_pos, value| match &*name { + b"Registry" => { + let Token::String(v) = value else { + return invalid_token_err(value_pos, Some(value)); + }; + registry = Some(v); + Ok(()) + } + b"Ordering" => { + let Token::String(v) = value else { + return invalid_token_err(value_pos, Some(value)); + }; + ordering = Some(v); + Ok(()) + } + b"Supplement" => { + let Token::Integer(v) = value else { + return invalid_token_err(value_pos, Some(value)); + }; + supplement = Some(v); + Ok(()) + } + _ => todo!("{}: {value:?}", name.escape_ascii()), + })?; + self.expect_executable_name(b"def")?; + self.expect_literal_name(b"CMapName")?; + self.tokenizer.skip_comments_and_whitespace(); + let char_map_name_pos = self.tokenizer.pos(); + let char_map_name = self.expect_any_literal_name()?; + self.expect_executable_name(b"def")?; + self.expect_literal_name(b"CMapType")?; + self.expect(Token::Integer(2))?; + self.expect_executable_name(b"def")?; + self.expect(Token::Integer(1))?; + self.expect_executable_name(b"begincodespacerange")?; + self.tokenizer.skip_comments_and_whitespace(); + let range_start_pos = self.tokenizer.pos(); + let range_start = self.expect_any_string()?; + if range_start.is_empty() { + return invalid_token_err(range_start_pos, Some(Token::String(range_start))); + } + self.tokenizer.skip_comments_and_whitespace(); + let range_end_pos = self.tokenizer.pos(); + let range_end = self.expect_string_with_len(range_start.len())?; + self.expect_executable_name(b"endcodespacerange")?; + let mut to_unicode_map: BTreeMap> = BTreeMap::new(); + let mut dest_str = String::new(); + let mut insert_mapping = |src_pos: PdfInputPosition, + src: &[u8], + dest_pos: PdfInputPosition, + dest_utf16_be: &[u8]| + -> Result<(), PdfParseError> { + dest_str.clear(); + for ch in char::decode_utf16( + dest_utf16_be + .chunks(2) + .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]])), + ) { + match ch { + Ok(ch) => dest_str.push(ch), + Err(_) => { + return Err(PdfParseError::InvalidUtf16 { pos: dest_pos }); + } + } + } + to_unicode_map.insert( + PdfString::new(src_pos, ArcOrRef::Arc(src.into())), + dest_str.as_str().into(), + ); + Ok(()) + }; + loop { + match self.tokenizer.next_token()? { + Some(Token::Integer(size)) => match self.tokenizer.next_token()? { + Some(Token::ExecutableName(name)) if name == b"beginbfrange" => { + for _ in 0..size { + self.tokenizer.skip_comments_and_whitespace(); + let src_pos = self.tokenizer.pos(); + let src_low = self.expect_string_with_len(range_start.len())?; + self.tokenizer.skip_comments_and_whitespace(); + let src_high_pos = self.tokenizer.pos(); + let src_high = self.expect_string_with_len(range_start.len())?; + if src_low.split_last().map(|(_, prefix)| prefix) + != src_high.split_last().map(|(_, prefix)| prefix) + { + return invalid_token_err( + src_high_pos, + Some(Token::String(src_high)), + ); + } + let src_last_range = *src_low.last().expect("known to be non-empty") + ..=*src_high.last().expect("known to be non-empty"); + self.tokenizer.skip_comments_and_whitespace(); + let dest_pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::String(dest)) + if dest.len() >= 2 && dest.len() % 2 == 0 => + { + let mut src = src_low; + for (index, src_last_byte) in src_last_range.enumerate() { + *src.last_mut().expect("known to be non-empty") = + src_last_byte; + let mut dest = dest.clone(); + let [.., last] = &mut *dest else { + unreachable!(); + }; + *last += index as u8; + insert_mapping(src_pos, &src, dest_pos, &dest)?; + } + } + Some(token @ Token::String(_)) => { + todo!("odd number of dest bytes: {token:?}"); + } + Some(Token::ArrayStart) => { + let mut src = src_low; + for src_last_byte in src_last_range { + *src.last_mut().expect("known to be non-empty") = + src_last_byte; + self.tokenizer.skip_comments_and_whitespace(); + let dest_pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::String(dest)) + if dest.len() >= 2 && dest.len() % 2 == 0 => + { + insert_mapping(src_pos, &src, dest_pos, &dest)?; + } + Some(token @ Token::String(_)) => { + todo!("odd number of dest bytes: {token:?}"); + } + token => return invalid_token_err(dest_pos, token), + } + } + self.expect(Token::ArrayEnd)?; + } + token => return invalid_token_err(dest_pos, token), + } + } + self.expect_executable_name(b"endbfrange")?; + } + Some(Token::ExecutableName(name)) if name == b"beginbfchar" => { + for _ in 0..size { + self.tokenizer.skip_comments_and_whitespace(); + let src_pos = self.tokenizer.pos(); + let src = self.expect_string_with_len(range_start.len())?; + self.tokenizer.skip_comments_and_whitespace(); + let dest_pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::String(dest)) if dest.len() % 2 == 0 => { + insert_mapping(src_pos, &src, dest_pos, &dest)?; + } + Some(token @ Token::String(_)) => { + todo!("odd number of dest bytes: {token:?}"); + } + token => return invalid_token_err(dest_pos, token), + } + } + self.expect_executable_name(b"endbfchar")?; + } + token => todo!("{token:?}"), + }, + Some(Token::ExecutableName(name)) if name == b"endcmap" => { + break; + } + token => todo!("{token:?}"), + } + } + self.expect_executable_name(b"CMapName")?; + self.expect_executable_name(b"currentdict")?; + self.expect_literal_name(b"CMap")?; + self.expect_executable_name(b"defineresource")?; + self.expect_executable_name(b"pop")?; + self.expect_executable_name(b"end")?; + self.expect_executable_name(b"end")?; + self.tokenizer.skip_comments_and_whitespace(); + let eof_pos = self.tokenizer.pos(); + if let token @ Some(_) = self.tokenizer.next_token()? { + return invalid_token_err(eof_pos, token); + } + Ok(PdfFontToUnicode { + base_map, + char_map_name: PdfName::new(char_map_name_pos, Arc::<[u8]>::from(char_map_name)), + src_ranges: Arc::new([ + PdfString::new(range_start_pos, ArcOrRef::Arc(range_start.into())) + ..=PdfString::new(range_end_pos, ArcOrRef::Arc(range_end.into())), + ]), + to_unicode_map: Arc::new(to_unicode_map), + }) + } +} diff --git a/src/pdf/font/type_1_parse.rs b/src/pdf/font/type_1_parse.rs index c557d5a..1825e42 100644 --- a/src/pdf/font/type_1_parse.rs +++ b/src/pdf/font/type_1_parse.rs @@ -1,7 +1,10 @@ use crate::{ pdf::{ PdfObjects, - font::{PdfFontType1FontInfo, PdfFontType1Program}, + font::{ + PdfFontType1FontInfo, PdfFontType1Program, PdfSimpleFontEncodingTable, + PdfSimpleFontEncodingTableEntry, + }, object::{PdfMatrix, PdfName, PdfRectangle, PdfStreamContents, PdfString, PdfVec2D}, parse::{ PdfInputPosition, PdfInputPositionKnown, PdfInputPositionNoCompare, PdfParseError, @@ -52,7 +55,7 @@ impl PsFileDecryptedSource { #[derive(Clone)] enum PsFileSource { - Bytes(Rc<[u8]>), + Bytes(Arc<[u8]>), Decrypted(Rc>), } @@ -66,7 +69,7 @@ impl PsFileSource { } #[derive(Clone)] -struct PsFile { +pub(crate) struct PsFile { id: u64, source: PsFileSource, pos: Rc>, @@ -119,8 +122,8 @@ fn is_regular_char(v: u8) -> bool { struct NotALineEnd; -#[derive(Clone)] -enum Token { +#[derive(Clone, PartialEq)] +pub(crate) enum Token { Integer(i128), Real(f64), ArrayStart, @@ -131,6 +134,8 @@ enum Token { LiteralName(Vec), ImmediatelyEvaluatedName(Vec), String(Vec), + DictStart, + DictEnd, } impl fmt::Debug for Token { @@ -150,6 +155,8 @@ impl fmt::Debug for Token { Self::String(contents) => { write!(f, "String({})", contents.escape_ascii()) } + Self::DictStart => write!(f, "DictStart"), + Self::DictEnd => write!(f, "DictEnd"), } } } @@ -165,7 +172,10 @@ impl PsFile { })), } } - fn pos(&self) -> PdfInputPosition { + pub(crate) fn from_arc_bytes(bytes: Arc<[u8]>, stream_pos: PdfInputPosition) -> Self { + Self::new(0, PsFileSource::Bytes(bytes), 0, stream_pos) + } + pub(crate) fn pos(&self) -> PdfInputPosition { PdfInputPosition::new(Some(self.pos.get())) } fn peek_byte(&self) -> Option { @@ -200,7 +210,7 @@ impl PsFile { self.next_byte(); } } - fn skip_comments_and_whitespace(&mut self) { + pub(crate) fn skip_comments_and_whitespace(&mut self) { loop { self.skip_whitespace(); let Some(b'%') = self.peek_byte() else { @@ -340,7 +350,41 @@ impl PsFile { } Err(PdfParseError::TruncatedFile { pos: self.pos() }) } - fn next_token(&mut self) -> Result, PdfParseError> { + fn parse_string_after_l_angle(&mut self) -> Result, PdfParseError> { + let mut contents = Vec::new(); + let mut high_digit_value = None; + let mut push_digit_value = |value: u8| { + high_digit_value = match high_digit_value { + Some(high_digit_value) => { + contents.push((high_digit_value << 4) | value); + None + } + None => Some(value), + }; + }; + let string_pos = self.pos(); + loop { + let pos = self.pos(); + match self.next_byte() { + None => { + return Err(PdfParseError::TruncatedFile { pos }); + } + Some(b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ') => {} + Some(b'>') => { + // if we have an odd trailing digit, add the final digit, otherwise doesn't modify contents + push_digit_value(0); + return Ok(contents); + } + Some(b) => { + let Some(value) = (b as char).to_digit(0x10) else { + return Err(PdfParseError::InvalidHexStringDigit { pos }); + }; + push_digit_value(value as u8); + } + } + } + } + pub(crate) fn next_token(&mut self) -> Result, PdfParseError> { self.skip_comments_and_whitespace(); let Some(first_byte) = self.peek_byte() else { return Ok(None); @@ -352,9 +396,26 @@ impl PsFile { } b')' => todo!(), b'<' => { - todo!("encoded string"); + self.next_byte(); + match self.peek_byte() { + Some(b'<') => { + self.next_byte(); + Ok(Some(Token::DictStart)) + } + Some(b'~') => todo!("base 85 encoded string"), + _ => Ok(Some(Token::String(self.parse_string_after_l_angle()?))), + } + } + b'>' => { + self.next_byte(); + match self.peek_byte() { + Some(b'>') => { + self.next_byte(); + Ok(Some(Token::DictEnd)) + } + _ => todo!("stray >"), + } } - b'>' => todo!(), b'[' => { self.next_byte(); Ok(Some(Token::ArrayStart)) @@ -724,12 +785,15 @@ impl PsOperator { let PsObject::Integer(initial) = initial else { todo!("{initial:?}"); }; - let PsObject::Integer(increment @ (..=-1 | 1..)) = increment else { + let PsObject::Integer(increment) = increment else { todo!("{increment:?}"); }; let PsObject::Integer(limit) = limit else { todo!("{limit:?} {:?}", parser.operand_stack); }; + if increment == 0 { + return custom_err("postscript for operator: increment can't be zero"); + }; let mut counter = initial; let proc = proc.into_vec(); loop { @@ -1158,6 +1222,8 @@ impl PsParser { Token::Real(v) => PsObject::Real(PsReal(v)), Token::ArrayStart => PsObject::ExecutableName(PsName(b"[".into())), Token::ArrayEnd => PsObject::ExecutableName(PsName(b"]".into())), + Token::DictStart => PsObject::ExecutableName(PsName(b"<<".into())), + Token::DictEnd => PsObject::ExecutableName(PsName(b">>".into())), Token::ProcedureStart => PsObject::Procedure(self.parse_procedure()?), Token::ProcedureEnd => return Ok(PsArray::from_elements(self, objects)), Token::ExecutableName(name) => PsObject::ExecutableName(PsName(name.into())), @@ -1176,6 +1242,8 @@ impl PsParser { Token::Real(v) => self.operand_stack.push(PsObject::Real(PsReal(v))), Token::ArrayStart => self.run_name(&PsName(b"[".into()))?, Token::ArrayEnd => self.run_name(&PsName(b"]".into()))?, + Token::DictStart => self.run_name(&PsName(b"<<".into()))?, + Token::DictEnd => self.run_name(&PsName(b">>".into()))?, Token::ProcedureStart => { let procedure = self.parse_procedure()?; self.operand_stack.push(PsObject::Procedure(procedure)) @@ -1199,26 +1267,30 @@ impl PsParser { fn parse_font_encoding( &mut self, value: PsArray, - ) -> Result]>, PdfParseError> { + ) -> Result { let value = value.rc(); let value = value.borrow(); - let mut vec = Vec::with_capacity(value.len()); + let mut retval = PdfSimpleFontEncodingTable::empty(); + let mut table_iter = ArcOrRef::make_mut(&mut retval.table).iter_mut(); for entry in value.iter() { match entry { PsObject::Name(name) => { - if name.0 == b".notdef" { - vec.push(None); + let name = if name.0 == b".notdef" { + None } else { - vec.push(Some(PdfName::new( - self.tokenizer.pos(), - Arc::from(&*name.0), - ))); + Some(PdfName::new(self.tokenizer.pos(), Arc::from(&*name.0))) + }; + if let Some(entry) = table_iter.next() { + *entry = PdfSimpleFontEncodingTableEntry { + name, + presumed_unicode: None, + }; } } _ => todo!("{entry:?}"), } } - Ok(Arc::from(vec)) + Ok(retval) } fn parse_font_bbox(&mut self, value: PsArray) -> Result { let value = value.rc(); @@ -1332,6 +1404,7 @@ impl PsParser { let mut font_info = None; let mut font_matrix = None; let mut font_name = None; + let mut vertical_writing_mode = false; for (key, value) in named { match (&*key.0, value) { (b"Encoding", PsObject::Array(value)) => { @@ -1349,6 +1422,7 @@ impl PsParser { (b"FontName", PsObject::Name(value)) => { font_name = Some(value.into()); } + (b"WMode", PsObject::Boolean(v)) => vertical_writing_mode = v, (b"FontType", _) => { // TODO } @@ -1361,12 +1435,22 @@ impl PsParser { for (key, value) in other { todo!("{key:?}: {value:?}"); } + let Some(encoding) = encoding else { + return custom_err("postscript type 1 font must have Encoding"); + }; + let Some(font_bbox) = font_bbox else { + return custom_err("postscript type 1 font must have FontBBox"); + }; + let Some(font_matrix) = font_matrix else { + return custom_err("postscript type 1 font must have FontMatrix"); + }; Ok(PdfFontType1Program { encoding, font_bbox, font_info, font_matrix, font_name, + vertical_writing_mode, }) } fn parse(mut self) -> Result { @@ -1414,7 +1498,7 @@ impl PdfStreamContents for PdfFontType1Program { ) -> Result { PsParser::new(PsFile::new( 0, - PsFileSource::Bytes(Rc::from(data)), + PsFileSource::Bytes(Arc::from(data)), 0, stream_pos, )) diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs index 1d57f5e..4e5502a 100644 --- a/src/pdf/parse.rs +++ b/src/pdf/parse.rs @@ -296,6 +296,13 @@ pub enum PdfParseError { MissingSetFontOperator { pos: PdfInputPosition, }, + InvalidTokenInToUnicodeStream { + pos: PdfInputPosition, + token: String, + }, + InvalidUtf16 { + pos: PdfInputPosition, + }, } impl From for PdfParseError { @@ -345,7 +352,9 @@ impl GetPdfInputPosition for PdfParseError { | PdfParseError::CantRestoreGraphicsStateWithEmptyStack { pos } | PdfParseError::FontResourceNotFound { pos, .. } | PdfParseError::MissingBeginTextOperator { pos } - | PdfParseError::MissingSetFontOperator { pos } => pos, + | PdfParseError::MissingSetFontOperator { pos } + | PdfParseError::InvalidTokenInToUnicodeStream { pos, .. } + | PdfParseError::InvalidUtf16 { pos } => pos, PdfParseError::OperatorNotAllowedHere { ref operator } => operator.pos(), PdfParseError::OperatorHasTooFewOperands { ref operator } | PdfParseError::OperatorHasTooManyOperands { ref operator } => operator.pos(), @@ -488,7 +497,7 @@ impl fmt::Display for PdfParseError { ) } PdfParseError::MissingOperator { pos } => { - write!(f, "at {pos}: stream not allowed here") + write!(f, "at {pos}: missing operator") } PdfParseError::OperatorHasTooFewOperands { ref operator } => { write!( @@ -525,6 +534,12 @@ impl fmt::Display for PdfParseError { "at {pos}: missing set font `Tf` operator before this text showing operator" ) } + PdfParseError::InvalidTokenInToUnicodeStream { pos, ref token } => { + write!(f, "at {pos}: invalid token in ToUnicode stream: {token}") + } + PdfParseError::InvalidUtf16 { pos } => { + write!(f, "at {pos}: invalid UTF-16") + } } } } diff --git a/src/pdf/render.rs b/src/pdf/render.rs index 8ffbcac..586b6c4 100644 --- a/src/pdf/render.rs +++ b/src/pdf/render.rs @@ -35,7 +35,7 @@ use crate::{ PdfOperatorUnparsed, }, document_structure::{PdfPage, PdfResourcesDictionary}, - font::{PdfFont, PdfTodo}, + font::{PdfFont, PdfSimpleFontEncodingTableEntry, PdfTodo}, object::{ IsPdfNull, PdfMatrix, PdfName, PdfNumber, PdfObject, PdfObjectDirect, PdfStringOrNumber, PdfVec2D, @@ -934,13 +934,17 @@ impl PdfRenderOperator for PdfOperatorShowTextWithGlyphPositioning { .font_descriptor() .and_then(|v| v.font_file.as_ref()) .and_then(|v| v.decoded_data().as_ref().ok()) - .and_then(|v| v.encoding.as_ref()) + .map(|v| v.encoding.clone()) else { todo!() }; - todo!("{font_encoding:?}"); + font_encoding }); - todo!("{table:?}"); + let PdfSimpleFontEncodingTableEntry { + name, + presumed_unicode, + } = table.table[usize::from(*glyph)].clone(); + todo!("{name:?} {presumed_unicode:?} {:#?}", font.to_unicode()); } } PdfStringOrNumber::Number(number) => positioning = number.as_f32(),