// NOTE(review): several generic argument lists appear to be missing from this text
// (e.g. `HashMap>`, `Option>`, `Arc::::downgrade`). The code is reproduced token-for-token
// below; recover the type parameters from the canonical source before building.
use crate::{ pdf::{ content_stream::PdfOperatorUnparsed, document_structure::PdfDocumentCatalog, object::{ PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream, PdfStreamDictionary, PdfString, }, parse::{ GetPdfInputPosition, PdfInputPosition, PdfInputPositionKnown, PdfParse, PdfParseError, }, }, pdf_parse, util::{ArcOrRef, DagDebugState}, };
use std::{ any::{Any, TypeId}, collections::{BTreeMap, HashMap}, convert::Infallible, fmt, num::NonZero, str::FromStr, sync::{Arc, Mutex, OnceLock}, };
pub mod content_stream; pub mod document_structure; pub mod font; pub mod object; pub mod parse; pub mod render; pub mod stream_filters;

/// Cache of already-parsed values, keyed by the `TypeId` of the parsed type.
///
/// Entries are upgraded before use and pruned once their `strong_count()` drops to zero,
/// i.e. the cache itself does not keep values alive. `steps_till_next_gc` is the number of
/// cache accesses remaining before the next pruning sweep.
struct ParseCache { parse_results: HashMap>, steps_till_next_gc: usize, }

// Starts empty, with the first sweep scheduled after a single access.
impl Default for ParseCache { fn default() -> Self { Self { parse_results: HashMap::new(), steps_till_next_gc: 1, } } }

impl ParseCache {
    /// Housekeeping step run at the start of every `get` / `get_or_insert`.
    ///
    /// Decrements the countdown; when it has reached zero, removes every entry that no longer
    /// has a strong reference and schedules the next sweep after `max(len, 10) * 20` accesses
    /// (saturating multiplication).
    fn gc(&mut self) { if self.steps_till_next_gc == 0 { self.parse_results.retain(|_k, v| v.strong_count() > 0); let mut adjusted_len = self.parse_results.len(); if adjusted_len < 10 { adjusted_len = 10; } self.steps_till_next_gc = adjusted_len.saturating_mul(20); } else { self.steps_till_next_gc -= 1; } }

    /// Returns the cached value stored under the requested type's `TypeId`, if an entry exists
    /// and can still be upgraded; `None` otherwise.
    ///
    /// A failing downcast is treated as impossible (`unreachable!()`), since entries are keyed
    /// by the `TypeId` of the value that was inserted.
    fn get(&mut self) -> Option> { self.gc(); let Ok(retval) = self .parse_results .get(&TypeId::of::())? .upgrade()? .downcast() else { unreachable!(); }; Some(retval) }

    /// Returns the live cached value for this type if there is one, otherwise caches `value`
    /// (as a downgraded reference) and returns it.
    ///
    /// The second tuple element is `Some(value)` when an existing entry won and the caller's
    /// `value` was not used, and `None` otherwise, so the caller decides when the unused value
    /// gets dropped.
    fn get_or_insert( &mut self, value: Arc, ) -> (Arc, impl Sized + use) { use std::collections::hash_map::Entry; self.gc(); match self.parse_results.entry(TypeId::of::()) { Entry::Occupied(mut entry) => { if let Some(retval) = entry.get().upgrade() { let Ok(retval) = retval.downcast::() else { unreachable!(); }; (retval, Some(value)) } else { entry.insert(Arc::::downgrade(&value)); (value, None) } } Entry::Vacant(entry) => { entry.insert(Arc::::downgrade(&value)); (value, None) } } }
}

/// A parsed `PdfObject` together with a mutex-protected `ParseCache` for values derived from it.
struct PdfObjectAndParseCache { object: PdfObject, parse_cache: Mutex, }

impl PdfObjectAndParseCache {
    /// Wraps `object` with a default (empty) parse cache.
    fn new(object: PdfObject) -> Self { Self { object, parse_cache: Mutex::default(), } }

    /// Locks the cache and looks up the value for the requested type.
    /// Panics with "not poisoned" if the mutex is poisoned.
    fn parse_cache_get(&self) -> Option> { self.parse_cache.lock().expect("not poisoned").get() }

    /// Locks the cache and returns either the already-cached value or `value` after caching it.
    ///
    /// The mutex guard is dropped explicitly before the unused value (if any) is dropped, so
    /// that value's destructor runs without the cache lock held.
    /// Panics with "not poisoned" if the mutex is poisoned.
    fn parse_cache_get_or_insert(&self, value: Arc) -> Arc { let mut parse_cache = self.parse_cache.lock().expect("not poisoned"); let (retval, to_drop_after_unlock) = parse_cache.get_or_insert(value); drop(parse_cache); drop(to_drop_after_unlock); retval }
}

/// Contents of `PdfObjects` once parsing has finished: the indirect objects (`objects`) and the
/// object streams collected during body parsing (`object_streams`, currently only stored).
struct PdfObjectsInner { objects: BTreeMap, #[allow(dead_code)] object_streams: Vec>, }

/// Shared object table of a document. `inner` is a `OnceLock`: it starts empty and is set
/// exactly once by `PdfFileParser::parse_body`.
pub struct PdfObjects { inner: OnceLock, }

/// Version taken from the `%PDF-<major>.<minor>` header comment.
#[derive(Copy, Clone, Debug)] pub struct PdfHeader { pub major: NonZero, pub minor: u16, }

impl PdfHeader {
    /// Text that must precede the version number in the header comment.
    pub const PREFIX: &str = "%PDF-";
}

// Trailer dictionary. Fields are mapped from the dictionary keys named in the `#[pdf(name = ...)]`
// attributes (`Size`, `Prev`, `Root`, `Encrypt`, `Info`, `ID`); `rest` is marked `#[pdf(flatten)]`.
// NOTE(review): exact flatten semantics are defined by the `pdf_parse!` macro, which is not
// visible here — presumably `rest` receives the remaining entries; confirm.
pdf_parse!
{ #[pdf] #[derive(Clone)] pub struct PdfTrailerDictionary { #[pdf(name = "Size")] pub size: usize, #[pdf(name = "Prev")] pub prev: Option, #[pdf(name = "Root")] pub root: PdfDocumentCatalog, #[pdf(name = "Encrypt")] pub encrypt: Option, #[pdf(name = "Info")] pub info: Option, #[pdf(name = "ID")] pub id: Option<[PdfString; 2]>, #[pdf(flatten)] pub rest: PdfDictionary, } }

// Field-by-field struct formatting, run inside `DagDebugState::scope`. The exhaustive
// destructuring makes adding a field without updating this impl a compile error.
impl fmt::Debug for PdfTrailerDictionary { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { DagDebugState::scope(|_state| { let Self { size, prev, root, encrypt, info, id, rest, } = self; f.debug_struct("PdfTrailerDictionary") .field("size", size) .field("prev", prev) .field("root", root) .field("encrypt", encrypt) .field("info", info) .field("id", id) .field("rest", rest) .finish() }) } }

// Single-variant name enum matching the PDF name `XRef` (used for the `Type` key below).
pdf_parse! { #[pdf(name)] #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] pub enum PdfXRefName { #[pdf(name = "XRef")] #[default] XRef, } }

// Dictionary of a cross-reference stream: `Type`, optional `Index` and `W`, plus the flattened
// trailer dictionary entries in `rest`.
pdf_parse! { #[pdf] #[derive(Clone)] pub struct PdfXRefStreamDictionaryRest { #[pdf(name = "Type")] pub ty: PdfXRefName, #[pdf(name = "Index")] pub index: Option>, #[pdf(name = "W")] pub w: Option>, #[pdf(flatten)] pub rest: PdfTrailerDictionary, } }

// Same formatting approach as `PdfTrailerDictionary`'s `Debug` impl.
impl fmt::Debug for PdfXRefStreamDictionaryRest { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { DagDebugState::scope(|_state| { let Self { ty, index, w, rest } = self; f.debug_struct("PdfXRefStreamDictionaryRest") .field("ty", ty) .field("index", index) .field("w", w) .field("rest", rest) .finish() }) } }

/// End-of-file information in one of two forms: an explicit `trailer` dictionary, or a
/// cross-reference stream whose dictionary carries the trailer entries. Both variants record
/// the number that followed the `startxref` keyword.
#[derive(Clone)] pub enum PdfTrailer { Trailer { trailer_dictionary: PdfTrailerDictionary, start_xref: usize, }, Stream { xref_stream: PdfStream, start_xref: usize, }, }

// Formats the active variant as a struct named after the variant, inside `DagDebugState::scope`.
impl fmt::Debug for PdfTrailer { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { DagDebugState::scope(|_state| match self { Self::Trailer { trailer_dictionary, start_xref, } => f .debug_struct("Trailer") .field("trailer_dictionary", trailer_dictionary) .field("start_xref", start_xref) .finish(), Self::Stream { xref_stream, start_xref, } => f .debug_struct("Stream") .field("xref_stream", xref_stream) .field("start_xref", start_xref) .finish(), }) } }

impl PdfTrailer {
    /// Returns the trailer dictionary regardless of variant; for `Stream` it is the flattened
    /// `rest.rest` part of the cross-reference stream's dictionary.
    pub fn trailer_dictionary(&self) -> &PdfTrailerDictionary { match self { PdfTrailer::Trailer { trailer_dictionary, .. } => trailer_dictionary, PdfTrailer::Stream { xref_stream, .. } => &xref_stream.dictionary().rest.rest, } }
}

/// A parsed PDF file: header version, shared object table and trailer.
pub struct Pdf { pub header: PdfHeader, pub objects: Arc, pub trailer: PdfTrailer, }

/// Lexical class of a single input byte, as assigned by `PdfCharCategory::new`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)] enum PdfCharCategory { Regular, Whitespace, LParen, RParen, LAngle, RAngle, LBracket, RBracket, LBrace, RBrace, FSlash, Percent, }

impl PdfCharCategory {
    /// Classifies `b`: NUL, tab, LF, form feed, CR and space are `Whitespace`; each of
    /// `( ) < > [ ] { } / %` has its own category; every other byte is `Regular`.
    fn new(b: u8) -> Self { match b { b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' => Self::Whitespace, b'(' => Self::LParen, b')' => Self::RParen, b'<' => Self::LAngle, b'>' => Self::RAngle, b'[' => Self::LBracket, b']' => Self::RBracket, b'{' => Self::LBrace, b'}' => Self::RBrace, b'/' => Self::FSlash, b'%' => Self::Percent, _ => Self::Regular, } }
}

/// A token borrowed from the input. `Regular` and `Comment` carry the bytes they cover; the
/// other variants are single delimiter bytes.
#[derive(Clone, Copy, PartialEq)] enum PdfToken<'a> { Regular(&'a [u8]), LParen, RParen, LAngle, RAngle, LBracket, RBracket, LBrace, RBrace, FSlash, Comment(&'a [u8]), }

// `Regular` / `Comment` payloads are printed as a string when they are valid UTF-8 and as a
// byte slice otherwise. The two `write!` calls in each arm look identical but format
// different types (`&str` vs `&[u8]`) because `contents` is shadowed in the `if let`.
impl<'a> fmt::Debug for PdfToken<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Regular(contents) => { if let Ok(contents) = str::from_utf8(contents) { write!(f, "Regular({contents:?})") } else { write!(f, "Regular({contents:?})") } } Self::LParen => write!(f, "LParen"), Self::RParen => write!(f, "RParen"), Self::LAngle => write!(f, "LAngle"), Self::RAngle => write!(f, "RAngle"), Self::LBracket => write!(f, "LBracket"), Self::RBracket => write!(f, "RBracket"), Self::LBrace => write!(f, "LBrace"), Self::RBrace => write!(f, "RBrace"), Self::FSlash => write!(f, "FSlash"), Self::Comment(contents) => { if let Ok(contents) = str::from_utf8(contents) { write!(f, "Comment({contents:?})") } else { write!(f, "Comment({contents:?})") } } } } }
/// A token obtained by `PdfTokenizer::peek`, plus the byte offset the tokenizer should move to
/// when that token is eventually consumed.
#[derive(Clone)] struct PdfTokenizerPeek<'a> { token: PdfToken<'a>, pos_after_token: usize, }

/// Byte-level tokenizer over `bytes`. `pos` is the current read position and `peek_cache`
/// holds the result of the last `peek`, if it is still valid.
#[derive(Clone)] struct PdfTokenizer<'a> { bytes: &'a [u8], pos: PdfInputPositionKnown, peek_cache: Option>, }

impl<'a> PdfTokenizer<'a> {
    /// Creates a tokenizer that starts reading `bytes` at `pos`, with an empty peek cache.
    fn new(bytes: &'a [u8], pos: PdfInputPositionKnown) -> Self { Self { bytes, pos, peek_cache: None, } }

    /// Current position wrapped as a `PdfInputPosition`.
    fn pos(&self) -> PdfInputPosition { PdfInputPosition::new(Some(self.pos)) }

    /// Returns the byte at the current position without advancing; `None` at end of input.
    fn peek_byte(&mut self) -> Option { self.bytes.get(self.pos.pos).copied() }

    /// Returns the byte at the current position and advances by one; `None` at end of input.
    /// Consuming a byte invalidates the peek cache.
    fn next_byte(&mut self) -> Option { let b = self.bytes.get(self.pos.pos)?; self.pos.pos += 1; self.peek_cache = None; Some(*b) }

    /// Advances past any run of bytes classified as `PdfCharCategory::Whitespace`.
    fn skip_whitespace(&mut self) { while let Some(PdfCharCategory::Whitespace) = self.peek_byte().map(PdfCharCategory::new) { self.next_byte(); } }

    /// Returns the next token without consuming it.
    ///
    /// On a cache miss the tokenizer is cloned, the clone is advanced by one token, and the
    /// token plus the clone's end position are stored in `peek_cache` so that a following
    /// `next()` can simply jump there.
    fn peek(&mut self) -> Option> { if let Some(PdfTokenizerPeek { token, .. }) = self.peek_cache { return Some(token); } let mut tokenizer = self.clone(); let token = tokenizer.next()?; self.peek_cache = Some(PdfTokenizerPeek { token, pos_after_token: tokenizer.pos.pos, }); Some(token) }

    /// Returns the next `len` raw bytes and advances past them, clearing the peek cache.
    /// Returns `None` (without advancing) when fewer than `len` bytes remain.
    fn read_bytes(&mut self, len: usize) -> Option<&'a [u8]> { let retval = self .bytes .get(self.pos.pos..self.pos.pos.saturating_add(len))?; self.peek_cache = None; self.pos.pos += len; Some(retval) }
}

// Token iteration:
// - a pending peeked token is returned first, jumping to its recorded end position;
// - whitespace bytes are skipped;
// - each delimiter byte yields its own token (so `<<` / `>>` arrive as two tokens and are
//   paired up by the parser);
// - `%` starts a comment that runs through the next LF, CR or CR LF (or end of input); the
//   token's bytes include the leading `%` and the line terminator;
// - any other byte starts a `Regular` token covering the maximal run of `Regular` bytes.
impl<'a> Iterator for PdfTokenizer<'a> { type Item = PdfToken<'a>; fn next(&mut self) -> Option { if let Some(PdfTokenizerPeek { token, pos_after_token, }) = self.peek_cache.take() { self.pos.pos = pos_after_token; return Some(token); } loop { let start_pos = self.pos.pos; break match PdfCharCategory::new(self.next_byte()?) { PdfCharCategory::Whitespace => continue, PdfCharCategory::LParen => Some(PdfToken::LParen), PdfCharCategory::RParen => Some(PdfToken::RParen), PdfCharCategory::LAngle => Some(PdfToken::LAngle), PdfCharCategory::RAngle => Some(PdfToken::RAngle), PdfCharCategory::LBracket => Some(PdfToken::LBracket), PdfCharCategory::RBracket => Some(PdfToken::RBracket), PdfCharCategory::LBrace => Some(PdfToken::LBrace), PdfCharCategory::RBrace => Some(PdfToken::RBrace), PdfCharCategory::FSlash => Some(PdfToken::FSlash), PdfCharCategory::Percent => { loop { match self.next_byte() { None | Some(b'\n') => break, Some(b'\r') => { if let Some(b'\n') = self.peek_byte() { self.pos.pos += 1; } break; } Some(_) => continue, } } Some(PdfToken::Comment(&self.bytes[start_pos..self.pos.pos])) } PdfCharCategory::Regular => { while let Some(PdfCharCategory::Regular) = self.peek_byte().map(PdfCharCategory::new) { self.pos.pos += 1; } Some(PdfToken::Regular(&self.bytes[start_pos..self.pos.pos])) } }; } } }

/// Object-level parser: a tokenizer plus the shared object table that parsed indirect
/// references and streams are bound to.
struct PdfParser<'a> { objects: Arc, tokenizer: PdfTokenizer<'a>, }

/// Result of `PdfParser::parse_object_or_operator`: a complete object, a dictionary that is
/// immediately followed by the `stream` keyword (the stream body is not read yet), or a
/// content-stream operator.
enum PdfObjectOrStreamDictionaryOrOperator { StreamDictionary { dictionary: PdfDictionary, stream_kw_pos: PdfInputPosition, }, Object(PdfObject), Operator(PdfOperatorUnparsed), }

impl PdfObjectOrStreamDictionaryOrOperator {
    /// Unwraps the `Object` variant; the other variants become `StreamNotAllowedHere` /
    /// `OperatorNotAllowedHere` errors.
    fn error_on_stream_or_operator(self) -> Result { match self { PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { dictionary: _, stream_kw_pos, } => Err(PdfParseError::StreamNotAllowedHere { pos: stream_kw_pos }), PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object), PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => { Err(PdfParseError::OperatorNotAllowedHere { operator }) } } }
}

// Methods of `PdfParser`. Some definitions sit inside long physical lines below and cannot
// carry their own doc comment in this layout; they are summarized here:
// - `parse_name_after_f_slash`: collects the `Regular` bytes following `/`, decoding `#xx` as
//   one byte from two hex digits (`InvalidNameEscape` if a digit is missing or not hex).
// - `parse_array_after_l_bracket`: parses objects until `]`; stream dictionaries and operators
//   inside the array are rejected.
// - `parse_object_or_operator`: first tries an indirect reference, then dispatches on the next
//   token: `true` / `false` / `null`, numbers (a token containing `.` becomes `Real`, otherwise
//   `Integer`), literal and hex strings, names, arrays and dictionaries. A dictionary followed
//   by the `stream` keyword is returned as `StreamDictionary` with the keyword left unconsumed.
//   Any other `Regular` token is returned as an `Operator`.
//   NOTE(review): a stray `)`, `>`, `]`, `{` or `}` in object position reaches `todo!()` and
//   panics instead of returning an error.
impl<'a> PdfParser<'a> {
    /// Skips whitespace and any number of comment tokens (and the whitespace after each).
    fn skip_comments_and_whitespace(&mut self) { self.tokenizer.skip_whitespace(); while let Some(PdfToken::Comment(_)) = self.tokenizer.peek() { self.tokenizer.next(); self.tokenizer.skip_whitespace(); } }

    /// Parses one unsigned run of ASCII digits and returns it with its position.
    ///
    /// Returns `Ok(None)` and restores the tokenizer when the next token is not a `Regular`
    /// token made up solely of ASCII digits. When the digits fail `str::parse` (e.g. out of
    /// range for the target type) the tokenizer is restored and `on_parse_failed(pos)`
    /// decides between an error and `Ok(None)`.
    fn parse_digits( &mut self, on_parse_failed: impl FnOnce(PdfInputPosition) -> Result, PdfParseError>, ) -> Result, PdfParseError> { self.skip_comments_and_whitespace(); let old_tokenizer = self.tokenizer.clone(); let pos = self.tokenizer.pos(); let Some(PdfToken::Regular(number)) = self.tokenizer.next() else { self.tokenizer = old_tokenizer; return Ok(None); }; if !number.iter().all(|b| b.is_ascii_digit()) { self.tokenizer = old_tokenizer; return Ok(None); } let Some(number) = str::from_utf8(number).ok().and_then(|v| v.parse().ok()) else { self.tokenizer = old_tokenizer; return Ok(match on_parse_failed(pos)? { None => None, }); }; Ok(Some((pos, number))) }

    /// Parses `<object number> <generation number>`.
    ///
    /// Returns `Ok(None)` with the tokenizer restored if either number is absent. If a number
    /// is present but cannot be parsed, the result is `Ok(None)` when
    /// `return_none_for_out_of_range` is set, otherwise `InvalidObjectNumber` /
    /// `InvalidGenerationNumber`.
    fn parse_object_identifier( &mut self, return_none_for_out_of_range: bool, ) -> Result, PdfParseError> { let old_tokenizer = self.tokenizer.clone(); let Some((pos, object_number)) = self.parse_digits(|pos| { if return_none_for_out_of_range { Ok(None) } else { Err(PdfParseError::InvalidObjectNumber { pos }) } })? else { self.tokenizer = old_tokenizer; return Ok(None); }; let Some((_pos, generation_number)) = self.parse_digits(|pos| { if return_none_for_out_of_range { Ok(None) } else { Err(PdfParseError::InvalidGenerationNumber { pos }) } })? else { self.tokenizer = old_tokenizer; return Ok(None); }; Ok(Some(PdfObjectIdentifier { pos: pos.into(), object_number, generation_number, })) }

    /// Parses an indirect reference `<object number> <generation number> R`.
    ///
    /// Anything else (including two numbers not followed by `R`) yields `Ok(None)` with the
    /// tokenizer restored, so the caller can re-parse the input as plain numbers.
    fn parse_indirect_object(&mut self) -> Result, PdfParseError> { let old_tokenizer = self.tokenizer.clone(); let Some(id) = self.parse_object_identifier(true)? else { self.tokenizer = old_tokenizer; return Ok(None); }; if let Some(PdfToken::Regular(b"R")) = self.tokenizer.next() { Ok(Some(PdfObjectIndirect::new(&self.objects, id))) } else { self.tokenizer = old_tokenizer; Ok(None) } }

    /// Parses the body of a literal string; the opening `(` has already been consumed.
    ///
    /// - Unescaped parentheses nest; the string ends at the `)` that brings the level to zero.
    /// - Unescaped CR, LF and CR LF are each stored as a single LF.
    /// - Backslash escapes: `\n \r \t \b \f \( \) \\`, one to three octal digits, and a
    ///   backslash followed by an end-of-line, which contributes nothing to the string.
    ///   Any other escape is `InvalidStringEscape`.
    /// - Reaching end of input before the closing `)` is `TruncatedFile`.
    fn parse_string_after_l_paren(&mut self) -> Result { let mut contents = Vec::new(); let mut paren_level = NonZero::new(1usize).expect("non-zero"); let string_pos = self.tokenizer.pos(); while let Some(b) = self.tokenizer.next_byte() { contents.push(match b { b'(' => { paren_level = paren_level.checked_add(1).expect("overflow"); b } b')' => { let Some(new_paren_level) = NonZero::new(paren_level.get() - 1) else { return Ok(PdfString::new( string_pos, ArcOrRef::Arc(Arc::from(contents)), )); }; paren_level = new_paren_level; b } b'\r' if self.tokenizer.peek_byte() == Some(b'\n') => { self.tokenizer.next_byte(); b'\n' } b'\r' | b'\n' => b'\n', b'\\' => { let pos = self.tokenizer.pos(); let Some(b) = self.tokenizer.next_byte() else { return Err(PdfParseError::InvalidStringEscape { pos }); }; match b { b'\r' if self.tokenizer.peek_byte() == Some(b'\n') => { self.tokenizer.next_byte(); continue; } b'\r' | b'\n' => continue, b'n' => b'\n', b'r' => b'\r', b't' => b'\t', b'b' => b'\x08', b'f' => b'\x0C', b'(' | b')' | b'\\' => b, b'0'..=b'7' => { const MAX_OCTAL_DIGITS: usize = 3; let mut value = b - b'0'; let mut len = 1; while len < MAX_OCTAL_DIGITS { let Some(b @ b'0'..=b'7') = self.tokenizer.peek_byte() else { break; }; value <<= 3; value |= b - b'0'; len += 1; self.tokenizer.next_byte(); } value } _ => { return Err(PdfParseError::InvalidStringEscape { pos }); } } } _ => b, }); } Err(PdfParseError::TruncatedFile { pos: self.tokenizer.pos(), }) }

    /// Parses the body of a hexadecimal string; the opening `<` has already been consumed.
    ///
    /// Hex digits are combined pairwise into bytes, whitespace between digits is ignored, and
    /// an odd trailing digit is completed with a low nibble of 0 when `>` is reached. A
    /// non-hex byte is `InvalidHexStringDigit`; end of input is `TruncatedFile`.
    fn parse_string_after_l_angle(&mut self) -> Result { let mut contents = Vec::new(); let mut high_digit_value = None; let mut push_digit_value = |value: u8| { high_digit_value = match high_digit_value { Some(high_digit_value) => { contents.push((high_digit_value << 4) | value); None } None => Some(value), };
    }; let string_pos = self.tokenizer.pos(); loop { let pos = self.tokenizer.pos(); match self.tokenizer.next_byte() { None => { return Err(PdfParseError::TruncatedFile { pos }); } Some(b) if PdfCharCategory::new(b) == PdfCharCategory::Whitespace => {} Some(b'>') => { // if we have an odd trailing digit, add the final digit, otherwise doesn't modify contents push_digit_value(0); return Ok(PdfString::new( string_pos, Arc::<[u8]>::from(contents).into(), )); } Some(b) => { let Some(value) = (b as char).to_digit(0x10) else { return Err(PdfParseError::InvalidHexStringDigit { pos }); }; push_digit_value(value as u8); } } } } fn parse_name_after_f_slash(&mut self) -> Result { let mut name = vec![]; let name_pos = self.tokenizer.pos(); loop { let Some(PdfCharCategory::Regular) = self.tokenizer.peek_byte().map(PdfCharCategory::new) else { return Ok(PdfName::new(name_pos, ArcOrRef::Arc(Arc::from(name)))); }; let pos = self.tokenizer.pos(); match self .tokenizer .next_byte() .expect("just checked that it's not None") { b'#' => { let mut value = 0u8; for _ in 0..2 { let Some(digit) = self .tokenizer .next_byte() .and_then(|b| (b as char).to_digit(0x10)) else { return Err(PdfParseError::InvalidNameEscape { pos }); }; value <<= 4; value |= digit as u8; } name.push(value); } b => name.push(b), } } } fn parse_array_after_l_bracket(&mut self) -> Result { let array_pos = self.tokenizer.pos(); let mut contents: Vec = Vec::new(); loop { self.skip_comments_and_whitespace(); if let Some(PdfToken::RBracket) = self.tokenizer.peek() { self.tokenizer.next(); return Ok(PdfArray::from_elements(array_pos, Arc::from(contents))); } contents.push( self.parse_object_or_operator()?
    .error_on_stream_or_operator()?, ); } }
    /// Parses a dictionary body up to and including the closing `>>`.
    ///
    /// Keys must parse as `PdfName`; values are arbitrary objects (no streams or operators).
    /// A repeated key is `DuplicateDictionaryKey`; a single `>` not followed by another `>` is
    /// `InvalidDictionaryClosingDoubleRAngle`.
    /// The first `<` was consumed by the caller; this function consumes the second one.
    /// assumes `self.tokenizer.peek_byte() == Some(b'<')` fn parse_dictionary_after_one_l_angle(&mut self) -> Result { let l_angle = self.tokenizer.next_byte(); assert_eq!(l_angle, Some(b'<')); let dictionary_pos = self.tokenizer.pos(); let mut contents: BTreeMap = BTreeMap::new(); loop { self.skip_comments_and_whitespace(); if let Some(PdfToken::RAngle) = self.tokenizer.peek() { self.tokenizer.next(); let pos = self.tokenizer.pos(); let b'>' = self .tokenizer .next_byte() .ok_or(PdfParseError::TruncatedFile { pos })? else { return Err(PdfParseError::InvalidDictionaryClosingDoubleRAngle { pos }); }; return Ok(PdfDictionary::from_fields( dictionary_pos, Arc::new(contents), )); } let name = PdfName::parse( self.parse_object_or_operator()? .error_on_stream_or_operator()?, )?; let name_pos = name.pos(); match contents.entry(name) { std::collections::btree_map::Entry::Vacant(entry) => { entry.insert( self.parse_object_or_operator()? .error_on_stream_or_operator()?, ); } std::collections::btree_map::Entry::Occupied(entry) => { return Err(PdfParseError::DuplicateDictionaryKey { pos: name_pos, name: entry.key().clone(), }); } } } } fn parse_object_or_operator( &mut self, ) -> Result { self.skip_comments_and_whitespace(); if let Some(indirect) = self.parse_indirect_object()? { return Ok(PdfObjectOrStreamDictionaryOrOperator::Object( indirect.into(), )); } let pos = self.tokenizer.pos(); Ok(PdfObjectOrStreamDictionaryOrOperator::Object( match self .tokenizer .next() .ok_or(PdfParseError::TruncatedFile { pos })? { PdfToken::Regular(b"true") => PdfObject::Boolean(PdfBoolean::new(pos, true)), PdfToken::Regular(b"false") => PdfObject::Boolean(PdfBoolean::new(pos, false)), PdfToken::Regular(b"null") => PdfObject::Null(PdfNull::new(pos)), PdfToken::Regular( number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..]
    | [b'0'..=b'9' | b'.', ..]), ) => { // parse number let Ok(number) = str::from_utf8(number) else { return Err(PdfParseError::InvalidNumber { pos }); }; let mut parts = number .strip_prefix(&['+', '-']) .unwrap_or(number) .split('.'); let integer_part = parts .next() .expect("split always returns at least one part"); let fraction_part = parts.next(); if parts.next().is_some() { return Err(PdfParseError::InvalidNumber { pos }); } if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) { return Err(PdfParseError::InvalidNumber { pos }); } if !integer_part.bytes().all(|v| v.is_ascii_digit()) { return Err(PdfParseError::InvalidNumber { pos }); } if let Some(fraction_part) = fraction_part { if !fraction_part.bytes().all(|v| v.is_ascii_digit()) { return Err(PdfParseError::InvalidNumber { pos }); } PdfObject::Real(PdfReal::new( pos, number .parse() .map_err(|_| PdfParseError::InvalidNumber { pos })?, )) } else { PdfObject::Integer(PdfInteger::new( pos, number .parse() .map_err(|_| PdfParseError::InvalidNumber { pos })?, )) } } PdfToken::Regular(name) => { return Ok(PdfObjectOrStreamDictionaryOrOperator::Operator( PdfOperatorUnparsed::new(pos, ArcOrRef::Arc(name.into())), )); } PdfToken::LParen => PdfObject::String(self.parse_string_after_l_paren()?), PdfToken::RParen => todo!(), PdfToken::LAngle => { if self.tokenizer.peek_byte() == Some(b'<') { let dictionary = self.parse_dictionary_after_one_l_angle()?; self.skip_comments_and_whitespace(); if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() { return Ok(PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { dictionary, stream_kw_pos: self.tokenizer.pos(), }); } else { dictionary.into() } } else { self.parse_string_after_l_angle()?.into() } } PdfToken::RAngle => todo!(), PdfToken::LBracket => self.parse_array_after_l_bracket()?.into(), PdfToken::RBracket => todo!(), PdfToken::LBrace => todo!(), PdfToken::RBrace => todo!(), PdfToken::FSlash => self.parse_name_after_f_slash()?.into(),
    PdfToken::Comment(_) => unreachable!(), }, )) }
}

/// File-level parser: drives `PdfParser` over a whole file and accumulates the indirect
/// objects it defines in `objects_map` until they are published into the shared object table.
struct PdfFileParser<'a> { parser: PdfParser<'a>, objects_map: BTreeMap, }

impl<'a> PdfFileParser<'a> {
    /// Parses the `%PDF-<major>.<minor>` header comment at the start of the input.
    ///
    /// Every failure (first byte is not `%`, non-UTF-8 comment, missing prefix, missing `.`,
    /// unparsable version numbers) is reported as `NotAPdfFile`.
    fn parse_header(&mut self) -> Result { let Some(b'%') = self.parser.tokenizer.bytes.first() else { return Err(PdfParseError::NotAPdfFile); }; let Some(PdfToken::Comment(header)) = self.parser.tokenizer.next() else { unreachable!() }; let Ok(header) = str::from_utf8(header) else { return Err(PdfParseError::NotAPdfFile); }; let header = header.trim_end_matches(['\n', '\r']); let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else { return Err(PdfParseError::NotAPdfFile); }; let Some((major_str, minor_str)) = version.split_once('.') else { return Err(PdfParseError::NotAPdfFile); }; let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else { return Err(PdfParseError::NotAPdfFile); }; Ok(PdfHeader { major, minor }) }

    /// Reads a stream body for an already-parsed stream dictionary.
    ///
    /// Consumes the `stream` keyword, requires LF or CR LF directly after it
    /// (`InvalidOrMissingEolAfterStreamKeyword` otherwise), reads exactly `dictionary.len`
    /// bytes of data (`TruncatedFile` if the input is shorter), then expects `endstream`
    /// after optional whitespace and comments (`MissingEndStreamKeyword` otherwise).
    /// assumes `self.tokenizer.peek() == Some(PdfToken::Regular(b"stream"))` fn parse_stream_after_dictionary( &mut self, dictionary: PdfDictionary, ) -> Result { self.parser.tokenizer.skip_whitespace(); let stream_pos = self.parser.tokenizer.pos(); let stream = self.parser.tokenizer.next(); assert_eq!(stream, Some(PdfToken::Regular(b"stream"))); let dictionary = PdfStreamDictionary::parse(dictionary.into())?; let eol_pos = self.parser.tokenizer.pos(); match self.parser.tokenizer.next_byte() { None => return Err(PdfParseError::TruncatedFile { pos: eol_pos }), Some(b'\r') => { let Some(b'\n') = self.parser.tokenizer.next_byte() else { return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos, }); }; } Some(b'\n') => {} _ => return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos }), } let Some(data) = self.parser.tokenizer.read_bytes(dictionary.len) else { return Err(PdfParseError::TruncatedFile { pos: PdfInputPosition::new(Some(PdfInputPositionKnown { pos: self.parser.tokenizer.bytes.len(), ..self.parser.tokenizer.pos })), }); }; let stream =
    PdfStream::new( stream_pos, &self.parser.objects, dictionary, Arc::from(data), ); self.parser.skip_comments_and_whitespace(); let pos = self.parser.tokenizer.pos(); if let Some(PdfToken::Regular(b"endstream")) = self.parser.tokenizer.next() { Ok(stream) } else { Err(PdfParseError::MissingEndStreamKeyword { pos }) } }

    /// Parses one object, reading the stream body when a stream dictionary is encountered.
    /// Operators are rejected with `OperatorNotAllowedHere`.
    fn parse_object(&mut self) -> Result { match self.parser.parse_object_or_operator()? { PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { dictionary, stream_kw_pos: _, } => Ok(PdfObject::Stream( self.parse_stream_after_dictionary(dictionary)?, )), PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object), PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => { Err(PdfParseError::OperatorNotAllowedHere { operator }) } } }

    /// Parses one `<num> <gen> obj ... endobj` definition and records it in `objects_map`.
    ///
    /// Returns `Ok(None)` when the input does not continue with an object identifier, which
    /// is how `parse_body` detects the end of the body. A missing `obj` / `endobj` keyword or
    /// a second definition of the same identifier is an error.
    fn parse_indirect_object_definition(&mut self) -> Result, PdfParseError> { self.parser.skip_comments_and_whitespace(); let Some(id) = self.parser.parse_object_identifier(false)? else { return Ok(None); }; self.parser.skip_comments_and_whitespace(); let obj_pos = self.parser.tokenizer.pos(); let Some(PdfToken::Regular(b"obj")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingObj { pos: obj_pos }); }; let object = self.parse_object()?; self.parser.skip_comments_and_whitespace(); let end_obj_pos = self.parser.tokenizer.pos(); let Some(PdfToken::Regular(b"endobj")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingEndObj { pos: end_obj_pos }); }; if self .objects_map .insert(id, PdfObjectAndParseCache::new(object)) .is_some() { Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }) } else { Ok(Some(())) } }

    /// Parses the objects embedded in an object stream and records them in `objects_map`.
    ///
    /// The decoded data is read with a fresh `PdfParser` whose positions are relative to the
    /// stream. It starts with `n` pairs of `<object number> <byte position>`, followed by the
    /// objects themselves; every embedded object gets generation number 0.
    /// NOTE(review): the byte positions are parsed but not used — objects are read
    /// back-to-back in header order. Confirm this is acceptable for all inputs.
    fn parse_object_stream( &mut self, object_stream: &PdfStream, ) -> Result<(), PdfParseError> { let data = object_stream.decoded_data().as_ref()?; let mut parser = PdfParser { tokenizer: PdfTokenizer::new( data, PdfInputPositionKnown { pos: 0, containing_streams_pos: Some( object_stream .get_pdf_input_position() .get() .expect("known to be set") .pos, ), }, ), objects: self.parser.objects.clone(), }; let mut object_ids_and_byte_positions = Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n); for _ in 0..object_stream.dictionary().rest.n { parser.skip_comments_and_whitespace(); let Some((pos, object_number)) = parser.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))? else { return Err(PdfParseError::InvalidObjectNumber { pos: parser.tokenizer.pos(), }); }; parser.skip_comments_and_whitespace(); let Some((_, byte_position)) = parser.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))? else { return Err(PdfParseError::InvalidNumber { pos: parser.tokenizer.pos(), }); }; object_ids_and_byte_positions.push(( PdfObjectIdentifier { pos: pos.into(), object_number, generation_number: 0, }, byte_position, )); } for (id, _byte_position) in object_ids_and_byte_positions { let object = parser .parse_object_or_operator()? .error_on_stream_or_operator()?; if self .objects_map .insert(id, PdfObjectAndParseCache::new(object)) .is_some() { return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }); } } Ok(()) }

    /// Parses the file body and publishes the resulting object table.
    ///
    /// Steps: read indirect object definitions until none follows; collect every stream whose
    /// dictionary passes `PdfObjectStreamDictionary::parse_type_from_dictionary`; expand each
    /// of those object streams into `objects_map`; finally move the map and the list of
    /// object streams into `objects.inner`. Setting `inner` is expected to happen exactly
    /// once, hence `unreachable!()` on failure.
    fn parse_body(&mut self) -> Result<(), PdfParseError> { while let Some(()) = self.parse_indirect_object_definition()? {} let mut object_streams: Vec> = Vec::new(); for object in self.objects_map.values_mut() { let stream = match &object.object { PdfObject::Stream(stream) => stream, PdfObject::Boolean(_) | PdfObject::Integer(_) | PdfObject::Real(_) | PdfObject::String(_) | PdfObject::Name(_) | PdfObject::Array(_) | PdfObject::Dictionary(_) | PdfObject::Null(_) | PdfObject::Indirect(_) => continue, }; if PdfObjectStreamDictionary::parse_type_from_dictionary(&stream.dictionary().rest) .is_ok() { object_streams.push(PdfStream::parse(object.object.clone())?); } } for object_stream in &object_streams { self.parse_object_stream(object_stream)?; } let Ok(()) = self.parser.objects.inner.set(PdfObjectsInner { objects: std::mem::take(&mut self.objects_map), object_streams, }) else { unreachable!(); }; Ok(()) }

    /// Handles a classic `xref` table, if one follows.
    ///
    /// Returns `Ok(())` without consuming anything when the next token is not `xref`.
    /// NOTE(review): when an `xref` keyword IS present this hits `todo!()` and panics —
    /// classic cross-reference tables are not implemented yet.
    fn parse_xref_table(&mut self) -> Result<(), PdfParseError> { self.parser.skip_comments_and_whitespace(); let xref_pos = self.parser.tokenizer.pos(); let Some(PdfToken::Regular(b"xref")) = self.parser.tokenizer.peek() else { return Ok(()); }; todo!("{xref_pos}") }

    /// Parses the end of the file: optional `trailer` dictionary, `startxref`, its numeric
    /// value, the `%%EOF` comment, and nothing but whitespace afterwards.
    ///
    /// With a `trailer` dictionary the result is `PdfTrailer::Trailer`. Without one, the
    /// `startxref` value is used as a byte offset into the file: the object identifier found
    /// there is looked up in the object table and parsed as the cross-reference stream,
    /// giving `PdfTrailer::Stream`.
    fn parse_trailer(&mut self) -> Result { self.parser.skip_comments_and_whitespace(); let trailer_pos = self.parser.tokenizer.pos(); let trailer_dictionary = match self.parser.tokenizer.peek() { Some(PdfToken::Regular(b"trailer")) => { self.parser.tokenizer.next(); Some(PdfTrailerDictionary::parse(self.parse_object()?)?) } Some(PdfToken::Regular(b"startxref")) => None, _ => { return Err(PdfParseError::MissingTrailer { pos: trailer_pos }); } }; self.parser.skip_comments_and_whitespace(); let start_xref_kw_pos = self.parser.tokenizer.pos(); let Some(PdfToken::Regular(b"startxref")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingStartXRefKeyword { pos: start_xref_kw_pos, }); }; let start_xref_pos = self.parser.tokenizer.pos(); let Some((start_xref_pos, start_xref)) = self .parser .parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))? else { return Err(PdfParseError::MissingStartXRefValue { pos: start_xref_pos, }); }; self.parser.tokenizer.skip_whitespace(); let eof_comment_pos = self.parser.tokenizer.pos(); let Some(PdfToken::Comment(b"%%EOF" | b"%%EOF\r" | b"%%EOF\r\n" | b"%%EOF\n")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingEofComment { pos: eof_comment_pos, }); }; self.parser.tokenizer.skip_whitespace(); if let Some(byte) = self.parser.tokenizer.peek_byte() { return Err(PdfParseError::UnexpectedByte { pos: self.parser.tokenizer.pos(), byte, }); } if let Some(trailer_dictionary) = trailer_dictionary { return Ok(PdfTrailer::Trailer { trailer_dictionary, start_xref, }); } let id = PdfParser { tokenizer: PdfTokenizer::new( self.parser.tokenizer.bytes, PdfInputPositionKnown { pos: start_xref, containing_streams_pos: None, }, ), objects: self.parser.objects.clone(), } .parse_object_identifier(false); let Some(id) = id? else { return Err(PdfParseError::InvalidStartXRefValue { pos: start_xref_pos, start_xref, }); }; let xref_stream = PdfStream::parse( PdfObjectIndirect::new(&self.parser.objects, id) .get() .into(), )?; Ok(PdfTrailer::Stream { xref_stream, start_xref, }) }

    /// Parses a complete file in order: header, body, optional xref table, trailer.
    fn parse_file(mut self) -> Result { let header = self.parse_header()?; self.parse_body()?; self.parse_xref_table()?; let trailer = self.parse_trailer()?; Ok(Pdf { header, objects: self.parser.objects, trailer, }) }
}

impl Pdf {
    /// Parses a PDF file from an in-memory byte buffer.
    ///
    /// Creates an empty shared object table and a tokenizer positioned at offset 0, then runs
    /// `PdfFileParser::parse_file`.
    /// NOTE(review): some malformed or not-yet-supported inputs currently panic via `todo!()`
    /// rather than returning an error (stray closing delimiters, classic `xref` tables).
    pub fn parse(bytes: impl AsRef<[u8]>) -> Result { PdfFileParser { parser: PdfParser { objects: Arc::new(PdfObjects { inner: OnceLock::new(), }), tokenizer: PdfTokenizer::new( bytes.as_ref(), PdfInputPositionKnown { pos: 0, containing_streams_pos: None, }, ), }, objects_map: BTreeMap::new(), } .parse_file() }
}

// Unit test for `pdf_parse!`-generated struct parsing: named fields are extracted from a
// dictionary and the remaining entries land in the flattened `rest` field. The comparison is
// done on `Debug` output. Note that the expected `rest` omits key `e`, whose input value is
// null — presumably null-valued entries are treated as absent; confirm against `PdfDictionary`.
#[cfg(test)] mod tests { use crate::{ pdf::{ object::{ PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject, PdfString, }, parse::{PdfInputPosition, PdfParse, PdfParseError}, }, util::ArcOrRef, }; #[test] fn test_deserialize_dict() -> Result<(), PdfParseError>
{ crate::pdf::parse::pdf_parse! { #[pdf] #[derive(Debug)] #[allow(dead_code)] struct TestStruct { #[pdf(name = "a")] a: i32, #[pdf(name = "c")] c: i32, #[pdf(name = "b")] b: i32, #[pdf(flatten)] rest: PdfDictionary, } } let v: TestStruct = PdfParse::parse(PdfObject::from(PdfDictionary::from_iter([ ( PdfName::new_static(b"a"), PdfInteger::new(PdfInputPosition::empty(), 1).into(), ), ( PdfName::new_static(b"c"), PdfInteger::new(PdfInputPosition::empty(), 7).into(), ), ( PdfName::new_static(b"b"), PdfInteger::new(PdfInputPosition::empty(), 5).into(), ), ( PdfName::new_static(b"d"), PdfBoolean::new(PdfInputPosition::empty(), false).into(), ), ( PdfName::new_static(b"e"), PdfNull::new(PdfInputPosition::empty()).into(), ), ( PdfName::new_static(b"f"), PdfString::new(PdfInputPosition::empty(), ArcOrRef::Ref(b"test")).into(), ), ])))?; let expected = TestStruct { a: 1, c: 7, b: 5, rest: PdfDictionary::from_iter([ ( PdfName::new_static(b"d"), PdfBoolean::new(PdfInputPosition::empty(), false).into(), ), ( PdfName::new_static(b"f"), PdfString::new(PdfInputPosition::empty(), ArcOrRef::Ref(b"test")).into(), ), ]), }; assert_eq!(format!("{v:?}"), format!("{expected:?}")); Ok(()) } }