diff --git a/src/pdf.rs b/src/pdf.rs index 0ccd293..1933489 100644 --- a/src/pdf.rs +++ b/src/pdf.rs @@ -1,5 +1,6 @@ use crate::{ pdf::{ + content_stream::PdfOperatorUnparsed, document_structure::PdfDocumentCatalog, object::{ PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject, @@ -318,67 +319,35 @@ impl<'a> Iterator for PdfTokenizer<'a> { } struct PdfParser<'a> { - objects_arc: Arc, - objects_map: BTreeMap, - unparsed_stream_dictionaries: Vec>, + objects: Arc, tokenizer: PdfTokenizer<'a>, } +enum PdfObjectOrStreamDictionaryOrOperator { + StreamDictionary { + dictionary: PdfDictionary, + stream_kw_pos: PdfInputPosition, + }, + Object(PdfObject), + Operator(PdfOperatorUnparsed), +} + +impl PdfObjectOrStreamDictionaryOrOperator { + fn error_on_stream_or_operator(self) -> Result { + match self { + PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { + dictionary: _, + stream_kw_pos, + } => Err(PdfParseError::StreamNotAllowedHere { pos: stream_kw_pos }), + PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object), + PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => { + Err(PdfParseError::OperatorNotAllowedHere { operator }) + } + } + } +} + impl<'a> PdfParser<'a> { - fn with_tokenizer<'b, R>( - &mut self, - tokenizer: PdfTokenizer<'b>, - f: impl FnOnce(&mut PdfParser<'b>) -> R, - ) -> R { - let PdfParser { - objects_arc, - objects_map, - unparsed_stream_dictionaries, - tokenizer: _, - } = self; - let objects_arc = objects_arc.clone(); - let objects_map = std::mem::take(objects_map); - let unparsed_stream_dictionaries = std::mem::take(unparsed_stream_dictionaries); - let mut new_parser = PdfParser { - objects_arc, - objects_map, - unparsed_stream_dictionaries, - tokenizer, - }; - let retval = f(&mut new_parser); - let PdfParser { - objects_arc, - objects_map, - unparsed_stream_dictionaries, - tokenizer: _, - } = new_parser; - self.objects_arc = objects_arc; - self.objects_map = objects_map; - 
self.unparsed_stream_dictionaries = unparsed_stream_dictionaries; - retval - } - fn parse_header(&mut self) -> Result { - let Some(b'%') = self.tokenizer.bytes.first() else { - return Err(PdfParseError::NotAPdfFile); - }; - let Some(PdfToken::Comment(header)) = self.tokenizer.next() else { - unreachable!() - }; - let Ok(header) = str::from_utf8(header) else { - return Err(PdfParseError::NotAPdfFile); - }; - let header = header.trim_end_matches(['\n', '\r']); - let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else { - return Err(PdfParseError::NotAPdfFile); - }; - let Some((major_str, minor_str)) = version.split_once('.') else { - return Err(PdfParseError::NotAPdfFile); - }; - let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else { - return Err(PdfParseError::NotAPdfFile); - }; - Ok(PdfHeader { major, minor }) - } fn skip_comments_and_whitespace(&mut self) { self.tokenizer.skip_whitespace(); while let Some(PdfToken::Comment(_)) = self.tokenizer.peek() { @@ -449,7 +418,7 @@ impl<'a> PdfParser<'a> { return Ok(None); }; if let Some(PdfToken::Regular(b"R")) = self.tokenizer.next() { - Ok(Some(PdfObjectIndirect::new(&self.objects_arc, id))) + Ok(Some(PdfObjectIndirect::new(&self.objects, id))) } else { self.tokenizer = old_tokenizer; Ok(None) @@ -604,7 +573,10 @@ impl<'a> PdfParser<'a> { self.tokenizer.next(); return Ok(PdfArray::from_elements(array_pos, Arc::from(contents))); } - contents.push(self.parse_object()?); + contents.push( + self.parse_object_or_operator()? + .error_on_stream_or_operator()?, + ); } } /// assumes `self.tokenizer.peek_byte() == Some(b'<')` @@ -630,11 +602,17 @@ impl<'a> PdfParser<'a> { Arc::new(contents), )); } - let name = PdfName::parse(self.parse_object()?.into())?; + let name = PdfName::parse( + self.parse_object_or_operator()? 
+ .error_on_stream_or_operator()?, + )?; let name_pos = name.pos(); match contents.entry(name) { std::collections::btree_map::Entry::Vacant(entry) => { - entry.insert(self.parse_object()?.into()); + entry.insert( + self.parse_object_or_operator()? + .error_on_stream_or_operator()?, + ); } std::collections::btree_map::Entry::Occupied(entry) => { return Err(PdfParseError::DuplicateDictionaryKey { @@ -645,21 +623,146 @@ impl<'a> PdfParser<'a> { } } } + fn parse_object_or_operator( + &mut self, + ) -> Result { + self.skip_comments_and_whitespace(); + if let Some(indirect) = self.parse_indirect_object()? { + return Ok(PdfObjectOrStreamDictionaryOrOperator::Object( + indirect.into(), + )); + } + let pos = self.tokenizer.pos(); + Ok(PdfObjectOrStreamDictionaryOrOperator::Object( + match self + .tokenizer + .next() + .ok_or(PdfParseError::TruncatedFile { pos })? + { + PdfToken::Regular(b"true") => PdfObject::Boolean(PdfBoolean::new(pos, true)), + PdfToken::Regular(b"false") => PdfObject::Boolean(PdfBoolean::new(pos, false)), + PdfToken::Regular(b"null") => PdfObject::Null(PdfNull::new(pos)), + PdfToken::Regular( + number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] 
| [b'0'..=b'9' | b'.', ..]), + ) => { + // parse number + let Ok(number) = str::from_utf8(number) else { + return Err(PdfParseError::InvalidNumber { pos }); + }; + let mut parts = number + .strip_prefix(&['+', '-']) + .unwrap_or(number) + .split('.'); + let integer_part = parts + .next() + .expect("split always returns at least one part"); + let fraction_part = parts.next(); + if parts.next().is_some() { + return Err(PdfParseError::InvalidNumber { pos }); + } + if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + if !integer_part.bytes().all(|v| v.is_ascii_digit()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + if let Some(fraction_part) = fraction_part { + if !fraction_part.bytes().all(|v| v.is_ascii_digit()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + PdfObject::Real(PdfReal::new( + pos, + number + .parse() + .map_err(|_| PdfParseError::InvalidNumber { pos })?, + )) + } else { + PdfObject::Integer(PdfInteger::new( + pos, + number + .parse() + .map_err(|_| PdfParseError::InvalidNumber { pos })?, + )) + } + } + PdfToken::Regular(name) => { + return Ok(PdfObjectOrStreamDictionaryOrOperator::Operator( + PdfOperatorUnparsed::new(pos, ArcOrRef::Arc(name.into())), + )); + } + PdfToken::LParen => PdfObject::String(self.parse_string_after_l_paren()?), + PdfToken::RParen => todo!(), + PdfToken::LAngle => { + if self.tokenizer.peek_byte() == Some(b'<') { + let dictionary = self.parse_dictionary_after_one_l_angle()?; + self.skip_comments_and_whitespace(); + if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() { + return Ok(PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { + dictionary, + stream_kw_pos: self.tokenizer.pos(), + }); + } else { + dictionary.into() + } + } else { + self.parse_string_after_l_angle()?.into() + } + } + PdfToken::RAngle => todo!(), + PdfToken::LBracket => self.parse_array_after_l_bracket()?.into(), + PdfToken::RBracket => 
todo!(), + PdfToken::LBrace => todo!(), + PdfToken::RBrace => todo!(), + PdfToken::FSlash => self.parse_name_after_f_slash()?.into(), + PdfToken::Comment(_) => unreachable!(), + }, + )) + } +} + +struct PdfFileParser<'a> { + parser: PdfParser<'a>, + objects_map: BTreeMap, +} + +impl<'a> PdfFileParser<'a> { + fn parse_header(&mut self) -> Result { + let Some(b'%') = self.parser.tokenizer.bytes.first() else { + return Err(PdfParseError::NotAPdfFile); + }; + let Some(PdfToken::Comment(header)) = self.parser.tokenizer.next() else { + unreachable!() + }; + let Ok(header) = str::from_utf8(header) else { + return Err(PdfParseError::NotAPdfFile); + }; + let header = header.trim_end_matches(['\n', '\r']); + let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else { + return Err(PdfParseError::NotAPdfFile); + }; + let Some((major_str, minor_str)) = version.split_once('.') else { + return Err(PdfParseError::NotAPdfFile); + }; + let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else { + return Err(PdfParseError::NotAPdfFile); + }; + Ok(PdfHeader { major, minor }) + } /// assumes `self.tokenizer.peek() == Some(PdfToken::Regular(b"stream"))` fn parse_stream_after_dictionary( &mut self, dictionary: PdfDictionary, ) -> Result { - self.tokenizer.skip_whitespace(); - let stream_pos = self.tokenizer.pos(); - let stream = self.tokenizer.next(); + self.parser.tokenizer.skip_whitespace(); + let stream_pos = self.parser.tokenizer.pos(); + let stream = self.parser.tokenizer.next(); assert_eq!(stream, Some(PdfToken::Regular(b"stream"))); - let len = PdfStreamDictionary::parse_len_from_dictionary(&dictionary)?; - let eol_pos = self.tokenizer.pos(); - match self.tokenizer.next_byte() { + let dictionary = PdfStreamDictionary::parse(dictionary.into())?; + let eol_pos = self.parser.tokenizer.pos(); + match self.parser.tokenizer.next_byte() { None => return Err(PdfParseError::TruncatedFile { pos: eol_pos }), Some(b'\r') => { - let Some(b'\n') = self.tokenizer.next_byte() 
else { + let Some(b'\n') = self.parser.tokenizer.next_byte() else { return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos, }); @@ -668,121 +771,56 @@ impl<'a> PdfParser<'a> { Some(b'\n') => {} _ => return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos }), } - let Some(data) = self.tokenizer.read_bytes(len) else { + let Some(data) = self.parser.tokenizer.read_bytes(dictionary.len) else { return Err(PdfParseError::TruncatedFile { pos: PdfInputPosition::new(Some(PdfInputPositionKnown { - pos: self.tokenizer.bytes.len(), - ..self.tokenizer.pos + pos: self.parser.tokenizer.bytes.len(), + ..self.parser.tokenizer.pos })), }); }; - let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data)); - self.unparsed_stream_dictionaries.push(unparsed); - self.skip_comments_and_whitespace(); - let pos = self.tokenizer.pos(); - if let Some(PdfToken::Regular(b"endstream")) = self.tokenizer.next() { + let stream = PdfStream::new( + stream_pos, + &self.parser.objects, + dictionary, + Arc::from(data), + ); + self.parser.skip_comments_and_whitespace(); + let pos = self.parser.tokenizer.pos(); + if let Some(PdfToken::Regular(b"endstream")) = self.parser.tokenizer.next() { Ok(stream) } else { Err(PdfParseError::MissingEndStreamKeyword { pos }) } } fn parse_object(&mut self) -> Result { - self.skip_comments_and_whitespace(); - if let Some(indirect) = self.parse_indirect_object()? { - return Ok(indirect.into()); - } - let pos = self.tokenizer.pos(); - match self - .tokenizer - .next() - .ok_or(PdfParseError::TruncatedFile { pos })? - { - PdfToken::Regular(b"true") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, true))), - PdfToken::Regular(b"false") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, false))), - PdfToken::Regular(b"null") => Ok(PdfObject::Null(PdfNull::new(pos))), - PdfToken::Regular( - number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] 
| [b'0'..=b'9' | b'.', ..]), - ) => { - // parse number - let Ok(number) = str::from_utf8(number) else { - return Err(PdfParseError::InvalidNumber { pos }); - }; - let mut parts = number - .strip_prefix(&['+', '-']) - .unwrap_or(number) - .split('.'); - let integer_part = parts - .next() - .expect("split always returns at least one part"); - let fraction_part = parts.next(); - if parts.next().is_some() { - return Err(PdfParseError::InvalidNumber { pos }); - } - if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) { - return Err(PdfParseError::InvalidNumber { pos }); - } - if !integer_part.bytes().all(|v| v.is_ascii_digit()) { - return Err(PdfParseError::InvalidNumber { pos }); - } - if let Some(fraction_part) = fraction_part { - if !fraction_part.bytes().all(|v| v.is_ascii_digit()) { - return Err(PdfParseError::InvalidNumber { pos }); - } - Ok(PdfObject::Real(PdfReal::new( - pos, - number - .parse() - .map_err(|_| PdfParseError::InvalidNumber { pos })?, - ))) - } else { - Ok(PdfObject::Integer(PdfInteger::new( - pos, - number - .parse() - .map_err(|_| PdfParseError::InvalidNumber { pos })?, - ))) - } + match self.parser.parse_object_or_operator()? 
{ + PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { + dictionary, + stream_kw_pos: _, + } => Ok(PdfObject::Stream( + self.parse_stream_after_dictionary(dictionary)?, + )), + PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object), + PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => { + Err(PdfParseError::OperatorNotAllowedHere { operator }) } - PdfToken::Regular(items) => todo!("{:?}", str::from_utf8(items)), - PdfToken::LParen => self.parse_string_after_l_paren().map(PdfObject::String), - PdfToken::RParen => todo!(), - PdfToken::LAngle => { - if self.tokenizer.peek_byte() == Some(b'<') { - let dictionary = self.parse_dictionary_after_one_l_angle()?; - self.skip_comments_and_whitespace(); - if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() { - self.parse_stream_after_dictionary(dictionary) - .map(PdfObject::Stream) - } else { - Ok(dictionary.into()) - } - } else { - self.parse_string_after_l_angle().map(PdfObject::String) - } - } - PdfToken::RAngle => todo!(), - PdfToken::LBracket => self.parse_array_after_l_bracket().map(PdfObject::Array), - PdfToken::RBracket => todo!(), - PdfToken::LBrace => todo!(), - PdfToken::RBrace => todo!(), - PdfToken::FSlash => self.parse_name_after_f_slash().map(PdfObject::Name), - PdfToken::Comment(_) => unreachable!(), } } fn parse_indirect_object_definition(&mut self) -> Result, PdfParseError> { - self.skip_comments_and_whitespace(); - let Some(id) = self.parse_object_identifier(false)? else { + self.parser.skip_comments_and_whitespace(); + let Some(id) = self.parser.parse_object_identifier(false)? 
else { return Ok(None); }; - self.skip_comments_and_whitespace(); - let obj_pos = self.tokenizer.pos(); - let Some(PdfToken::Regular(b"obj")) = self.tokenizer.next() else { + self.parser.skip_comments_and_whitespace(); + let obj_pos = self.parser.tokenizer.pos(); + let Some(PdfToken::Regular(b"obj")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingObj { pos: obj_pos }); }; let object = self.parse_object()?; - self.skip_comments_and_whitespace(); - let end_obj_pos = self.tokenizer.pos(); - let Some(PdfToken::Regular(b"endobj")) = self.tokenizer.next() else { + self.parser.skip_comments_and_whitespace(); + let end_obj_pos = self.parser.tokenizer.pos(); + let Some(PdfToken::Regular(b"endobj")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingEndObj { pos: end_obj_pos }); }; if self.objects_map.insert(id, object).is_some() { @@ -791,53 +829,13 @@ impl<'a> PdfParser<'a> { Ok(Some(())) } } - fn parse_object_stream_inner( - &mut self, - object_stream: &PdfStream, - ) -> Result<(), PdfParseError> { - let mut object_ids_and_byte_positions = - Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n); - for _ in 0..object_stream.dictionary().rest.n { - self.skip_comments_and_whitespace(); - let Some((pos, object_number)) = - self.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))? - else { - return Err(PdfParseError::InvalidObjectNumber { - pos: self.tokenizer.pos(), - }); - }; - self.skip_comments_and_whitespace(); - let Some((_, byte_position)) = - self.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))? 
- else { - return Err(PdfParseError::InvalidNumber { - pos: self.tokenizer.pos(), - }); - }; - object_ids_and_byte_positions.push(( - PdfObjectIdentifier { - pos: pos.into(), - object_number, - generation_number: 0, - }, - byte_position, - )); - } - for (id, _byte_position) in object_ids_and_byte_positions { - let object = self.parse_object()?; - if self.objects_map.insert(id, object).is_some() { - return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }); - } - } - Ok(()) - } fn parse_object_stream( &mut self, object_stream: &PdfStream, ) -> Result<(), PdfParseError> { let data = object_stream.decoded_data().as_ref()?; - self.with_tokenizer( - PdfTokenizer::new( + let mut parser = PdfParser { + tokenizer: PdfTokenizer::new( data, PdfInputPositionKnown { pos: 0, @@ -850,18 +848,48 @@ impl<'a> PdfParser<'a> { ), }, ), - |parser| parser.parse_object_stream_inner(object_stream), - ) - .map_err(|e| PdfParseError::ObjectStreamParseError { - stream_pos: object_stream.get_pdf_input_position(), - error: Arc::new(e), - }) + objects: self.parser.objects.clone(), + }; + let mut object_ids_and_byte_positions = + Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n); + for _ in 0..object_stream.dictionary().rest.n { + parser.skip_comments_and_whitespace(); + let Some((pos, object_number)) = + parser.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))? + else { + return Err(PdfParseError::InvalidObjectNumber { + pos: parser.tokenizer.pos(), + }); + }; + parser.skip_comments_and_whitespace(); + let Some((_, byte_position)) = + parser.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))? 
+ else { + return Err(PdfParseError::InvalidNumber { + pos: parser.tokenizer.pos(), + }); + }; + object_ids_and_byte_positions.push(( + PdfObjectIdentifier { + pos: pos.into(), + object_number, + generation_number: 0, + }, + byte_position, + )); + } + for (id, _byte_position) in object_ids_and_byte_positions { + let object = parser + .parse_object_or_operator()? + .error_on_stream_or_operator()?; + if self.objects_map.insert(id, object).is_some() { + return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }); + } + } + Ok(()) } fn parse_body(&mut self) -> Result<(), PdfParseError> { while let Some(()) = self.parse_indirect_object_definition()? {} - self.unparsed_stream_dictionaries - .drain(..) - .try_for_each(|v| v.finish_parsing())?; let mut object_streams: Vec> = Vec::new(); for object in self.objects_map.values_mut() { let stream = match object { @@ -885,7 +913,7 @@ impl<'a> PdfParser<'a> { for object_stream in &object_streams { self.parse_object_stream(object_stream)?; } - let Ok(()) = self.objects_arc.inner.set(PdfObjectsInner { + let Ok(()) = self.parser.objects.inner.set(PdfObjectsInner { objects: std::mem::take(&mut self.objects_map), object_streams, }) else { @@ -894,19 +922,19 @@ impl<'a> PdfParser<'a> { Ok(()) } fn parse_xref_table(&mut self) -> Result<(), PdfParseError> { - self.skip_comments_and_whitespace(); - let xref_pos = self.tokenizer.pos(); - let Some(PdfToken::Regular(b"xref")) = self.tokenizer.peek() else { + self.parser.skip_comments_and_whitespace(); + let xref_pos = self.parser.tokenizer.pos(); + let Some(PdfToken::Regular(b"xref")) = self.parser.tokenizer.peek() else { return Ok(()); }; todo!("{xref_pos}") } fn parse_trailer(&mut self) -> Result { - self.skip_comments_and_whitespace(); - let trailer_pos = self.tokenizer.pos(); - let trailer_dictionary = match self.tokenizer.peek() { + self.parser.skip_comments_and_whitespace(); + let trailer_pos = self.parser.tokenizer.pos(); + let trailer_dictionary = match 
self.parser.tokenizer.peek() { Some(PdfToken::Regular(b"trailer")) => { - self.tokenizer.next(); + self.parser.tokenizer.next(); Some(PdfTrailerDictionary::parse(self.parse_object()?)?) } Some(PdfToken::Regular(b"startxref")) => None, @@ -914,34 +942,35 @@ impl<'a> PdfParser<'a> { return Err(PdfParseError::MissingTrailer { pos: trailer_pos }); } }; - self.skip_comments_and_whitespace(); - let start_xref_kw_pos = self.tokenizer.pos(); - let Some(PdfToken::Regular(b"startxref")) = self.tokenizer.next() else { + self.parser.skip_comments_and_whitespace(); + let start_xref_kw_pos = self.parser.tokenizer.pos(); + let Some(PdfToken::Regular(b"startxref")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingStartXRefKeyword { pos: start_xref_kw_pos, }); }; - let start_xref_pos = self.tokenizer.pos(); - let Some((start_xref_pos, start_xref)) = - self.parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))? + let start_xref_pos = self.parser.tokenizer.pos(); + let Some((start_xref_pos, start_xref)) = self + .parser + .parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))? 
else { return Err(PdfParseError::MissingStartXRefValue { pos: start_xref_pos, }); }; - self.tokenizer.skip_whitespace(); - let eof_comment_pos = self.tokenizer.pos(); + self.parser.tokenizer.skip_whitespace(); + let eof_comment_pos = self.parser.tokenizer.pos(); let Some(PdfToken::Comment(b"%%EOF" | b"%%EOF\r" | b"%%EOF\r\n" | b"%%EOF\n")) = - self.tokenizer.next() + self.parser.tokenizer.next() else { return Err(PdfParseError::MissingEofComment { pos: eof_comment_pos, }); }; - self.tokenizer.skip_whitespace(); - if let Some(byte) = self.tokenizer.peek_byte() { + self.parser.tokenizer.skip_whitespace(); + if let Some(byte) = self.parser.tokenizer.peek_byte() { return Err(PdfParseError::UnexpectedByte { - pos: self.tokenizer.pos(), + pos: self.parser.tokenizer.pos(), byte, }); } @@ -951,24 +980,28 @@ impl<'a> PdfParser<'a> { start_xref, }); } - let old_tokenizer = self.tokenizer.clone(); - self.tokenizer = PdfTokenizer::new( - self.tokenizer.bytes, - PdfInputPositionKnown { - pos: start_xref, - containing_streams_pos: None, - }, - ); - let id = self.parse_object_identifier(false); - self.tokenizer = old_tokenizer; + let id = PdfParser { + tokenizer: PdfTokenizer::new( + self.parser.tokenizer.bytes, + PdfInputPositionKnown { + pos: start_xref, + containing_streams_pos: None, + }, + ), + objects: self.parser.objects.clone(), + } + .parse_object_identifier(false); let Some(id) = id? 
else { return Err(PdfParseError::InvalidStartXRefValue { pos: start_xref_pos, start_xref, }); }; - let xref_stream = - PdfStream::parse(PdfObjectIndirect::new(&self.objects_arc, id).get().into())?; + let xref_stream = PdfStream::parse( + PdfObjectIndirect::new(&self.parser.objects, id) + .get() + .into(), + )?; Ok(PdfTrailer::Stream { xref_stream, start_xref, @@ -979,9 +1012,14 @@ impl<'a> PdfParser<'a> { self.parse_body()?; self.parse_xref_table()?; let trailer = self.parse_trailer()?; + for page in trailer.trailer_dictionary().root.pages.pages().iter() { + for content in page.contents.iter() { + content.decoded_data().as_ref()?; + } + } Ok(Pdf { header, - objects: self.objects_arc, + objects: self.parser.objects, trailer, }) } @@ -989,19 +1027,20 @@ impl<'a> PdfParser<'a> { impl Pdf { pub fn parse(bytes: impl AsRef<[u8]>) -> Result { - PdfParser { - objects_arc: Arc::new(PdfObjects { - inner: OnceLock::new(), - }), + PdfFileParser { + parser: PdfParser { + objects: Arc::new(PdfObjects { + inner: OnceLock::new(), + }), + tokenizer: PdfTokenizer::new( + bytes.as_ref(), + PdfInputPositionKnown { + pos: 0, + containing_streams_pos: None, + }, + ), + }, objects_map: BTreeMap::new(), - unparsed_stream_dictionaries: vec![], - tokenizer: PdfTokenizer::new( - bytes.as_ref(), - PdfInputPositionKnown { - pos: 0, - containing_streams_pos: None, - }, - ), } .parse_file() } diff --git a/src/pdf/content_stream.rs b/src/pdf/content_stream.rs index f58737e..2552df7 100644 --- a/src/pdf/content_stream.rs +++ b/src/pdf/content_stream.rs @@ -1,6 +1,813 @@ -use crate::pdf::object::PdfStream; +use crate::{ + pdf::{ + PdfObjectOrStreamDictionaryOrOperator, PdfObjects, PdfParser, PdfTokenizer, + object::{ + NameOr, PdfDictionary, PdfMatrix, PdfName, PdfObject, PdfObjectDirect, PdfRectangle, + PdfStream, PdfStreamContents, PdfString, PdfStringBytesDebug, PdfStringOrNumber, + PdfVec2D, + }, + parse::{ + GetPdfInputPosition, PdfInputPosition, PdfInputPositionKnown, + 
PdfInputPositionNoCompare, PdfParse, PdfParseError, + }, + }, + util::ArcOrRef, +}; +use std::{fmt, sync::Arc}; -pub struct PdfContentStream { - stream: PdfStream, - // TODO +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct PdfOperatorUnparsed { + pos: PdfInputPositionNoCompare, + bytes: ArcOrRef<'static, [u8]>, } + +impl GetPdfInputPosition for PdfOperatorUnparsed { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos() + } +} + +impl fmt::Debug for PdfOperatorUnparsed { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + Self::debug_with_name("PdfOperatorUnparsed", &self.bytes, self.pos.0, f) + } +} + +trait PdfParseIter: Sized { + fn parse_iter(iter: impl IntoIterator) -> Result; +} + +impl PdfParseIter for Arc<[T]> { + fn parse_iter(iter: impl IntoIterator) -> Result { + FromIterator::from_iter(iter.into_iter().map(T::parse)) + } +} + +impl PdfOperatorUnparsed { + pub fn new( + pos: impl Into, + bytes: impl Into>, + ) -> Self { + Self { + pos: pos.into(), + bytes: bytes.into(), + } + } + pub const fn new_static(bytes: &'static [u8]) -> Self { + Self { + pos: PdfInputPositionNoCompare::empty(), + bytes: ArcOrRef::Ref(bytes), + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn bytes(&self) -> &ArcOrRef<'static, [u8]> { + &self.bytes + } + fn debug_with_name( + name: &str, + pdf_name: &[u8], + pos: PdfInputPosition, + f: &mut fmt::Formatter<'_>, + ) -> fmt::Result { + write!(f, "{name}(at {pos}, {})", PdfStringBytesDebug(pdf_name)) + } + pub fn bytes_debug(&self) -> PdfStringBytesDebug<'_> { + PdfStringBytesDebug(&self.bytes) + } +} + +macro_rules! 
make_pdf_operator_enum { + ( + $(#[$($operator_meta:tt)*])* + $operator_enum_vis:vis enum $PdfOperator:ident; + + $(#[$($operator_and_operands_meta:tt)*])* + $enum_vis:vis enum $PdfOperatorAndOperands:ident { + $(#[$($unknown_variant_meta:tt)*])* + $Unknown:ident { + $(#[$($unknown_operands_meta:tt)*])* + $unknown_operands:ident: $unknown_operands_ty:ty, + $(#[$($unknown_operator_meta:tt)*])* + $unknown_operator:ident: $unknown_operator_ty:ty, + }, + $( + #[kw = $kw:literal] + $(#[$($variant_meta:tt)*])* + $Variant:ident($VariantStruct:ident { + $pos:ident: PdfInputPositionNoCompare, + $( + #[$field_parse:ident($($parse_args:tt)*)] + $(#[$($field_meta:tt)*])* + $field:ident: $field_ty:ty, + )* + }), + )* + } + ) => { + $(#[$($operator_meta)*])* + $operator_enum_vis enum $PdfOperator { + $(#[$($unknown_variant_meta)*])* + $Unknown($unknown_operator_ty), + $( + $(#[$($variant_meta)*])* + $Variant(PdfInputPositionNoCompare), + )* + } + + impl $PdfOperator { + $operator_enum_vis fn parse(self, operands: impl IntoIterator) -> Result<$PdfOperatorAndOperands, PdfParseError> { + let operands = operands.into_iter(); + Ok(match self { + Self::$Unknown(operator) => $PdfOperatorAndOperands::$Unknown { + operands: FromIterator::from_iter(operands.map(Into::into)), + operator, + }, + $(Self::$Variant(pos) => $VariantStruct::parse(pos, operands)?.into(),)* + }) + } + $operator_enum_vis fn pos(&self) -> PdfInputPosition { + match *self { + Self::$Unknown(ref operator) => operator.pos(), + $(Self::$Variant(pos) => pos.0,)* + } + } + } + + impl fmt::Debug for $PdfOperator { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::$Unknown(operator) => PdfOperatorUnparsed::debug_with_name("Unknown", &operator.bytes, operator.pos.0, f), + $(Self::$Variant(pos) => PdfOperatorUnparsed::debug_with_name(stringify!($Variant), $kw, pos.0, f),)* + } + } + } + + impl From<$PdfOperator> for PdfOperatorUnparsed { + fn from(v: $PdfOperator) -> PdfOperatorUnparsed { + 
match v { + $PdfOperator::$Unknown(operator) => operator, + $($PdfOperator::$Variant(pos) => PdfOperatorUnparsed { pos, bytes: ArcOrRef::Ref($kw) },)* + } + } + } + + impl From for $PdfOperator { + fn from(v: PdfOperatorUnparsed) -> $PdfOperator { + match &**v.bytes() { + $($kw => Self::$Variant(v.pos),)* + _ => Self::$Unknown(v), + } + } + } + + $(#[derive(Clone)] + $(#[$($variant_meta)*])* + $enum_vis struct $VariantStruct { + $enum_vis $pos: PdfInputPositionNoCompare, + $( + $(#[$($field_meta)*])* + $enum_vis $field: $field_ty, + )* + } + + impl fmt::Debug for $VariantStruct { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct(stringify!($VariantStruct)).field("pos", &self.pos)$(.field(stringify!($field), &self.$field))*.finish() + } + } + + impl GetPdfInputPosition for $VariantStruct { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos() + } + } + + impl From<$VariantStruct> for $PdfOperatorAndOperands { + fn from(v: $VariantStruct) -> Self { + Self::$Variant(v) + } + } + + impl $VariantStruct { + $enum_vis fn operator_from_pos(pos: impl Into) -> $PdfOperator { + $PdfOperator::$Variant(pos.into()) + } + $enum_vis fn operator(&self) -> $PdfOperator { + $PdfOperator::$Variant(self.pos) + } + $enum_vis fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + } + + make_pdf_operator_enum! 
{ + @impl_variant_parse + $enum_vis enum; + struct $VariantStruct { + $pos: PdfInputPositionNoCompare, + $( + #[$field_parse($($parse_args)*)] + $(#[$($field_meta)*])* + $field: $field_ty, + )* + } + })* + + $(#[$($operator_and_operands_meta)*])* + $enum_vis enum $PdfOperatorAndOperands { + $(#[$($unknown_variant_meta)*])* + $Unknown { + $(#[$($unknown_operands_meta)*])* + $unknown_operands: $unknown_operands_ty, + $(#[$($unknown_operator_meta)*])* + $unknown_operator: $unknown_operator_ty, + }, + $( + $(#[$($variant_meta)*])* + $Variant($VariantStruct), + )* + } + + impl $PdfOperatorAndOperands { + $enum_vis fn operator(&self) -> $PdfOperator { + match self { + Self::Unknown { operator, .. } => $PdfOperator::Unknown(operator.clone()), + $(Self::$Variant(v) => v.operator(),)* + } + } + $enum_vis fn pos(&self) -> PdfInputPosition { + match self { + Self::$Unknown { operator, .. } => operator.pos(), + $(Self::$Variant(v) => v.pos(),)* + } + } + } + + impl fmt::Debug for $PdfOperatorAndOperands { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::$Unknown { + operands, + operator, + } => f.debug_struct("Unknown").field("operator", operator).field("operands", operands).finish(), + $(Self::$Variant($VariantStruct { + $pos, + $($field,)* + }) => f.debug_struct(stringify!($Variant)).field("pos", $pos)$(.field(stringify!($field), $field))*.finish(),)* + } + } + } + }; + ( + @impl_variant_parse + $enum_vis:vis enum; + struct $VariantStruct:ident { + $pos:ident: PdfInputPositionNoCompare, + $( + #[$field_parse:ident($($parse_args:ident),* $(,)?)] + $(#[$($field_meta:tt)*])* + $field:ident: $field_ty:ty, + )* + } + ) => { + impl $VariantStruct { + $enum_vis fn parse(pos: impl Into, operands: impl IntoIterator) -> Result { + let pos = pos.into(); + let mut operands = operands.into_iter(); + $($(let Some($parse_args) = operands.next() else { + return Err(PdfParseError::OperatorHasTooFewOperands { operator: Self::operator_from_pos(pos) }); + 
};)*)* + if operands.next().is_some() { + return Err(PdfParseError::OperatorHasTooManyOperands { operator: Self::operator_from_pos(pos) }); + } + Ok(Self { + pos, + $($field: <$field_ty>::$field_parse($($parse_args),*)?,)* + }) + } + } + }; + ( + @impl_variant_parse + $enum_vis:vis enum; + struct $VariantStruct:ident { + $pos:ident: PdfInputPositionNoCompare, + #[$field_parse:ident(...)] + $(#[$($field_meta:tt)*])* + $field:ident: $field_ty:ty, + } + ) => { + impl $VariantStruct { + $enum_vis fn parse(pos: impl Into, operands: impl IntoIterator) -> Result { + let pos = pos.into(); + let operands = operands.into_iter(); + Ok(Self { + pos, + $field: <$field_ty>::$field_parse(operands)?, + }) + } + } + }; +} + +make_pdf_operator_enum! { + #[derive(Clone)] + pub enum PdfOperator; + #[derive(Clone)] + pub enum PdfOperatorAndOperands { + Unknown { + operands: Arc<[PdfObjectDirect]>, + operator: PdfOperatorUnparsed, + }, + #[kw = b"b"] + CloseFillAndStrokePath(PdfOperatorCloseFillAndStrokePath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"B"] + FillAndStrokePath(PdfOperatorFillAndStrokePath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"b*"] + CloseFillAndStrokePathEvenOdd(PdfOperatorCloseFillAndStrokePathEvenOdd { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"B*"] + FillAndStrokePathEvenOdd(PdfOperatorFillAndStrokePathEvenOdd { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"BDC"] + BeginMarkedContentWithProperties(PdfOperatorBeginMarkedContentWithProperties { + pos: PdfInputPositionNoCompare, + #[parse(tag)] + tag: PdfName, + #[parse(properties)] + properties: NameOr, + }), + #[kw = b"BI"] + BeginInlineImage(PdfOperatorBeginInlineImage { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"BMC"] + BeginMarkedContent(PdfOperatorBeginMarkedContent { + pos: PdfInputPositionNoCompare, + #[parse(tag)] + tag: PdfName, + }), + #[kw = b"BT"] + BeginText(PdfOperatorBeginText { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"BX"] + 
BeginCompatibilitySection(PdfOperatorBeginCompatibilitySection { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"c"] + CurveTo(PdfOperatorCurveTo { + pos: PdfInputPositionNoCompare, + #[parse(x1, y1)] + p1: PdfVec2D, + #[parse(x2, y2)] + p2: PdfVec2D, + #[parse(x3, y3)] + p3: PdfVec2D, + }), + #[kw = b"cm"] + ConcatMatrix(PdfOperatorConcatMatrix { + pos: PdfInputPositionNoCompare, + #[parse_flat(a, b, c, d, e, f)] + matrix: PdfMatrix, + }), + #[kw = b"CS"] + SetStrokeColorSpace(PdfOperatorSetStrokeColorSpace { + pos: PdfInputPositionNoCompare, + #[parse(name)] + name: PdfName, + }), + #[kw = b"cs"] + SetNonStrokeColorSpace(PdfOperatorSetNonStrokeColorSpace { + pos: PdfInputPositionNoCompare, + #[parse(name)] + name: PdfName, + }), + #[kw = b"d"] + SetLineDashPattern(PdfOperatorSetLineDashPattern { + pos: PdfInputPositionNoCompare, + #[parse(dash_array)] + dash_array: PdfObject, // TODO: actually parse + #[parse(dash_phase)] + dash_phase: PdfObject, // TODO: actually parse + }), + #[kw = b"d0"] + FontType3SetWidth(PdfOperatorFontType3SetWidth { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + width: PdfVec2D, + }), + #[kw = b"d1"] + FontType3SetWidthAndBBox(PdfOperatorFontType3SetWidthAndBBox { + pos: PdfInputPositionNoCompare, + #[parse(width_x, width_y)] + width: PdfVec2D, + #[parse_flat(lower_left_x, lower_left_y, upper_right_x, upper_right_y)] + bbox: PdfRectangle, + }), + #[kw = b"Do"] + PaintXObject(PdfOperatorPaintXObject { + pos: PdfInputPositionNoCompare, + #[parse(name)] + name: PdfName, + }), + #[kw = b"DP"] + DesignateMarkedContentPointWithProperties(PdfOperatorDesignateMarkedContentPointWithProperties { + pos: PdfInputPositionNoCompare, + #[parse(tag)] + tag: PdfName, + #[parse(properties)] + properties: NameOr, + }), + #[kw = b"EI"] + EndInlineImage(PdfOperatorEndInlineImage { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"EMC"] + EndMarkedContent(PdfOperatorEndMarkedContent { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"ET"] + 
EndText(PdfOperatorEndText { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"EX"] + EndCompatibilitySection(PdfOperatorEndCompatibilitySection { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"f"] + FillPath(PdfOperatorFillPath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"F"] + FillPathObsolete(PdfOperatorFillPathObsolete { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"f*"] + FillPathEvenOdd(PdfOperatorFillPathEvenOdd { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"G"] + SetStrokeGray(PdfOperatorSetStrokeGray { + pos: PdfInputPositionNoCompare, + #[parse(gray)] + gray: f32, + }), + #[kw = b"g"] + SetNonStrokeGray(PdfOperatorSetNonStrokeGray { + pos: PdfInputPositionNoCompare, + #[parse(gray)] + gray: f32, + }), + #[kw = b"gs"] + SetGraphicsState(PdfOperatorSetGraphicsState { + pos: PdfInputPositionNoCompare, + #[parse(dictionary_name)] + dictionary_name: PdfName, + }), + #[kw = b"h"] + CloseSubpath(PdfOperatorCloseSubpath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"i"] + SetFlatnessTolerance(PdfOperatorSetFlatnessTolerance { + pos: PdfInputPositionNoCompare, + #[parse(flatness)] + flatness: f32, + }), + #[kw = b"ID"] + BeginInlineImageData(PdfOperatorBeginInlineImageData { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"j"] + SetLineJoinStyle(PdfOperatorSetLineJoinStyle { + pos: PdfInputPositionNoCompare, + #[parse(line_join_style)] + line_join_style: u8, // TODO parse + }), + #[kw = b"J"] + SetLineCapStyle(PdfOperatorSetLineCapStyle { + pos: PdfInputPositionNoCompare, + #[parse(line_cap_style)] + line_cap_style: u8, // TODO parse + }), + #[kw = b"K"] + SetStrokeCmyk(PdfOperatorSetStrokeCmyk { + pos: PdfInputPositionNoCompare, + #[parse(c)] + c: f32, + #[parse(m)] + m: f32, + #[parse(y)] + y: f32, + #[parse(k)] + k: f32, + }), + #[kw = b"k"] + SetNonStrokeCmyk(PdfOperatorSetNonStrokeCmyk { + pos: PdfInputPositionNoCompare, + #[parse(c)] + c: f32, + #[parse(m)] + m: f32, + #[parse(y)] + y: f32, + #[parse(k)] + k: f32, + }), + #[kw 
= b"l"] + LineTo(PdfOperatorLineTo { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + to: PdfVec2D, + }), + #[kw = b"m"] + MoveTo(PdfOperatorMoveTo { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + to: PdfVec2D, + }), + #[kw = b"M"] + SetMiterLimit(PdfOperatorSetMiterLimit { + pos: PdfInputPositionNoCompare, + #[parse(limit)] + limit: f32, + }), + #[kw = b"MP"] + DesignateMarkedContentPoint(PdfOperatorDesignateMarkedContentPoint { + pos: PdfInputPositionNoCompare, + #[parse(tag)] + tag: PdfName, + }), + #[kw = b"n"] + EndPath(PdfOperatorEndPath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"q"] + SaveGraphicsState(PdfOperatorSaveGraphicsState { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"Q"] + RestoreGraphicsState(PdfOperatorRestoreGraphicsState { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"re"] + Rectangle(PdfOperatorRectangle { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + p: PdfVec2D, + #[parse(width, height)] + size: PdfVec2D, + }), + #[kw = b"RG"] + SetStrokeRgb(PdfOperatorSetStrokeRgb { + pos: PdfInputPositionNoCompare, + #[parse(r)] + r: f32, + #[parse(g)] + g: f32, + #[parse(b)] + b: f32, + }), + #[kw = b"rg"] + SetNonStrokeRgb(PdfOperatorSetNonStrokeRgb { + pos: PdfInputPositionNoCompare, + #[parse(r)] + r: f32, + #[parse(g)] + g: f32, + #[parse(b)] + b: f32, + }), + #[kw = b"ri"] + SetColorRenderingIntent(PdfOperatorSetColorRenderingIntent { + pos: PdfInputPositionNoCompare, + #[parse(intent)] + intent: PdfName, + }), + #[kw = b"s"] + CloseAndStrokePath(PdfOperatorCloseAndStrokePath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"S"] + StrokePath(PdfOperatorStrokePath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"SC"] + SetStrokeColor(PdfOperatorSetStrokeColor { + pos: PdfInputPositionNoCompare, + #[parse_iter(...)] + color: Arc<[f32]>, + }), + #[kw = b"sc"] + SetNonStrokeColor(PdfOperatorSetNonStrokeColor { + pos: PdfInputPositionNoCompare, + #[parse_iter(...)] + color: Arc<[f32]>, + }), + #[kw = b"SCN"] + 
SetStrokeColorWithName(PdfOperatorSetStrokeColorWithName { + pos: PdfInputPositionNoCompare, + #[parse_iter(...)] + color_and_name: Arc<[NameOr]>, + }), + #[kw = b"scn"] + SetNonStrokeColorWithName(PdfOperatorSetNonStrokeColorWithName { + pos: PdfInputPositionNoCompare, + #[parse_iter(...)] + color_and_name: Arc<[NameOr]>, + }), + #[kw = b"sh"] + Shade(PdfOperatorShade { + pos: PdfInputPositionNoCompare, + #[parse(name)] + name: PdfName, + }), + #[kw = b"T*"] + TextNextLine(PdfOperatorTextNextLine { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"Tc"] + SetCharacterSpacing(PdfOperatorSetCharacterSpacing { + pos: PdfInputPositionNoCompare, + #[parse(char_space)] + char_space: f32, + }), + #[kw = b"Td"] + TextNextLineWithOffset(PdfOperatorTextNextLineWithOffset { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + offset: PdfVec2D, + }), + #[kw = b"TD"] + TextNextLineWithOffsetAndLeading(PdfOperatorTextNextLineWithOffsetAndLeading { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + offset: PdfVec2D, + }), + #[kw = b"Tf"] + SetFontAndSize(PdfOperatorSetFontAndSize { + pos: PdfInputPositionNoCompare, + #[parse(font)] + font: PdfName, + #[parse(size)] + size: f32, + }), + #[kw = b"Tj"] + ShowText(PdfOperatorShowText { + pos: PdfInputPositionNoCompare, + #[parse(text)] + text: PdfString, + }), + #[kw = b"TJ"] + ShowTextWithGlyphPositioning(PdfOperatorShowTextWithGlyphPositioning { + pos: PdfInputPositionNoCompare, + #[parse(text_and_positioning)] + text_and_positioning: Arc<[PdfStringOrNumber]>, + }), + #[kw = b"TL"] + SetTextLeading(PdfOperatorSetTextLeading { + pos: PdfInputPositionNoCompare, + #[parse(leading)] + leading: f32, + }), + #[kw = b"Tm"] + SetTextMatrix(PdfOperatorSetTextMatrix { + pos: PdfInputPositionNoCompare, + #[parse_flat(a, b, c, d, e, f)] + matrix: PdfMatrix, + }), + #[kw = b"Tr"] + SetTextRenderingMode(PdfOperatorSetTextRenderingMode { + pos: PdfInputPositionNoCompare, + #[parse(rendering_mode)] + rendering_mode: u8, // TODO: parse + }), + #[kw = b"Ts"] +
SetTextRise(PdfOperatorSetTextRise { + pos: PdfInputPositionNoCompare, + #[parse(rise)] + rise: f32, + }), + #[kw = b"Tw"] + SetWordSpacing(PdfOperatorSetWordSpacing { + pos: PdfInputPositionNoCompare, + #[parse(word_space)] + word_space: f32, + }), + #[kw = b"Tz"] + SetTextHorizontalScaling(PdfOperatorSetTextHorizontalScaling { + pos: PdfInputPositionNoCompare, + #[parse(scale_percent)] + scale_percent: f32, + }), + #[kw = b"v"] + CurveTo23(PdfOperatorCurveTo23 { + pos: PdfInputPositionNoCompare, + #[parse(x2, y2)] + p2: PdfVec2D, + #[parse(x3, y3)] + p3: PdfVec2D, + }), + #[kw = b"w"] + SetLineWidth(PdfOperatorSetLineWidth { + pos: PdfInputPositionNoCompare, + #[parse(line_width)] + line_width: f32, + }), + #[kw = b"W"] + Clip(PdfOperatorClip { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"W*"] + ClipEvenOdd(PdfOperatorClipEvenOdd { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"y"] + CurveTo13(PdfOperatorCurveTo13 { + pos: PdfInputPositionNoCompare, + #[parse(x1, y1)] + p1: PdfVec2D, + #[parse(x3, y3)] + p3: PdfVec2D, + }), + #[kw = b"'"] + TextNextLineAndShow(PdfOperatorTextNextLineAndShow { + pos: PdfInputPositionNoCompare, + #[parse(text)] + text: PdfString, + }), + #[kw = b"\""] + SetSpacingThenTextNextLineAndShow(PdfOperatorSetSpacingThenTextNextLineAndShow { + pos: PdfInputPositionNoCompare, + #[parse(word_space)] + word_space: f32, + #[parse(char_space)] + char_space: f32, + #[parse(text)] + text: PdfString, + }), + } +} + +impl GetPdfInputPosition for PdfOperator { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos() + } +} + +impl GetPdfInputPosition for PdfOperatorAndOperands { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos() + } +} + +#[derive(Debug, Clone)] +pub struct PdfContentStreamData { + pub operators: Arc<[PdfOperatorAndOperands]>, +} + +impl PdfStreamContents for PdfContentStreamData { + fn parse( + data: &[u8], + stream_pos: PdfInputPosition, + objects: Arc, + ) -> Result { + let mut parser = PdfParser { + objects, + tokenizer: PdfTokenizer::new( + data, + PdfInputPositionKnown { + pos: 0, + containing_streams_pos: 
stream_pos.get().map(|v| v.pos), + }, + ), + }; + let mut operands = Vec::new(); + let mut operators = Vec::new(); + loop { + parser.skip_comments_and_whitespace(); + if parser.tokenizer.peek().is_none() { + break; + } + match parser.parse_object_or_operator()? { + PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { + stream_kw_pos, .. + } => return Err(PdfParseError::StreamNotAllowedHere { pos: stream_kw_pos }), + PdfObjectOrStreamDictionaryOrOperator::Object(object) => operands.push(object), + PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => { + operators.push(PdfOperator::from(operator).parse(operands.drain(..))?); + } + } + } + if operands.is_empty() { + Ok(Self { + operators: operators.into(), + }) + } else { + Err(PdfParseError::MissingOperator { + pos: parser.tokenizer.pos(), + }) + } + } +} + +pub type PdfContentStream = PdfStream; diff --git a/src/pdf/document_structure.rs b/src/pdf/document_structure.rs index 265182c..13c0de3 100644 --- a/src/pdf/document_structure.rs +++ b/src/pdf/document_structure.rs @@ -2,6 +2,7 @@ use core::fmt; use std::{borrow::Cow, sync::Arc}; use crate::pdf::{ + content_stream::PdfContentStream, font::PdfFont, object::{ IsPdfNull, MaybeArray, PdfDate, PdfDictionary, PdfInteger, PdfName, PdfObject, @@ -238,7 +239,7 @@ pdf_parse! 
{ #[pdf(name = "BoxColorInfo")] pub box_color_info: Option, #[pdf(name = "Contents")] - pub contents: MaybeArray, + pub contents: MaybeArray, #[pdf(name = "Group")] pub group: Option, #[pdf(name = "Thumb")] @@ -388,7 +389,7 @@ pub struct PdfPage { pub trim_box: PdfRectangle, pub art_box: PdfRectangle, pub box_color_info: Option, - pub contents: Arc<[PdfStream]>, + pub contents: Arc<[PdfContentStream]>, pub rotate: PdfPageRotation, pub group: Option, pub thumbnail: Option, diff --git a/src/pdf/object.rs b/src/pdf/object.rs index dad6e49..de3b6da 100644 --- a/src/pdf/object.rs +++ b/src/pdf/object.rs @@ -34,7 +34,7 @@ impl std::fmt::Debug for PdfString { } #[derive(Clone, Copy)] -pub struct PdfStringBytesDebug<'a>(&'a [u8]); +pub struct PdfStringBytesDebug<'a>(pub &'a [u8]); impl<'a> fmt::Display for PdfStringBytesDebug<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -407,6 +407,81 @@ impl PdfParse for PdfNumber { } } +#[derive(Clone)] +pub enum PdfStringOrNumber { + String(PdfString), + Number(PdfNumber), +} + +impl fmt::Debug for PdfStringOrNumber { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::String(v) => v.fmt(f), + Self::Number(v) => v.fmt(f), + } + } +} + +impl PdfStringOrNumber { + pub fn pos(self) -> PdfInputPosition { + match self { + Self::String(v) => v.pos(), + Self::Number(v) => v.pos(), + } + } +} + +impl PdfObjectDirect { + pub fn string_or_number(&self) -> Option { + match *self { + PdfObjectDirect::Integer(v) => Some(PdfStringOrNumber::Number(PdfNumber::Integer(v))), + PdfObjectDirect::Real(v) => Some(PdfStringOrNumber::Number(PdfNumber::Real(v))), + PdfObjectDirect::String(ref v) => Some(PdfStringOrNumber::String(v.clone())), + PdfObjectDirect::Boolean(_) + | PdfObjectDirect::Name(_) + | PdfObjectDirect::Array(_) + | PdfObjectDirect::Dictionary(_) + | PdfObjectDirect::Stream(_) + | PdfObjectDirect::Null(_) => None, + } + } +} + +impl PdfObjectNonNull { + pub fn string_or_number(&self) -> 
Option { + match *self { + PdfObjectNonNull::Integer(v) => Some(PdfStringOrNumber::Number(PdfNumber::Integer(v))), + PdfObjectNonNull::Real(v) => Some(PdfStringOrNumber::Number(PdfNumber::Real(v))), + PdfObjectNonNull::String(ref v) => Some(PdfStringOrNumber::String(v.clone())), + PdfObjectNonNull::Boolean(_) + | PdfObjectNonNull::Name(_) + | PdfObjectNonNull::Array(_) + | PdfObjectNonNull::Dictionary(_) + | PdfObjectNonNull::Stream(_) => None, + } + } +} + +impl IsPdfNull for PdfStringOrNumber { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfStringOrNumber { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("string or number") + } + fn parse(object: PdfObject) -> Result { + let object = PdfObjectDirect::from(object); + object.string_or_number().ok_or(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "string or number", + }) + } +} + macro_rules! make_pdf_object { ( $( @@ -818,34 +893,35 @@ impl PdfObjectIndirect { } } pub fn get(&self) -> PdfObjectDirect { - if let Some(objects) = self.objects.upgrade() { - if let Some(objects) = objects.inner.get() { - let final_id = self.final_id.get().copied(); - let limit = if final_id.is_some() { 1 } else { 1000usize }; - let mut id = final_id.unwrap_or(self.id); - for _ in 0..limit { - if let Some(object) = objects.objects.get(&self.id) { - let retval = match object { - PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v), - PdfObject::Integer(v) => PdfObjectDirect::Integer(*v), - PdfObject::Real(v) => PdfObjectDirect::Real(*v), - PdfObject::String(v) => PdfObjectDirect::String(v.clone()), - PdfObject::Name(v) => PdfObjectDirect::Name(v.clone()), - PdfObject::Array(v) => PdfObjectDirect::Array(v.clone()), - PdfObject::Dictionary(v) => PdfObjectDirect::Dictionary(v.clone()), - PdfObject::Stream(v) => PdfObjectDirect::Stream(v.clone()), - PdfObject::Null(v) => PdfObjectDirect::Null(*v), - PdfObject::Indirect(v) => { - id = v.id; - continue; - } - }; - // 
we could be racing with another thread, so set can fail but that's not a problem - let _ = self.final_id.set(id); - return retval; - } else { - return PdfObjectDirect::Null(PdfNull::new(id.pos)); - } + let Some(objects) = self.objects.upgrade() else { + panic!("PdfObjects is no longer available"); + }; + if let Some(objects) = objects.inner.get() { + let final_id = self.final_id.get().copied(); + let limit = if final_id.is_some() { 1 } else { 1000usize }; + let mut id = final_id.unwrap_or(self.id); + for _ in 0..limit { + if let Some(object) = objects.objects.get(&id) { /* look up the current `id`, not `self.id`, so chained indirect references are followed */ + let retval = match object { + PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v), + PdfObject::Integer(v) => PdfObjectDirect::Integer(*v), + PdfObject::Real(v) => PdfObjectDirect::Real(*v), + PdfObject::String(v) => PdfObjectDirect::String(v.clone()), + PdfObject::Name(v) => PdfObjectDirect::Name(v.clone()), + PdfObject::Array(v) => PdfObjectDirect::Array(v.clone()), + PdfObject::Dictionary(v) => PdfObjectDirect::Dictionary(v.clone()), + PdfObject::Stream(v) => PdfObjectDirect::Stream(v.clone()), + PdfObject::Null(v) => PdfObjectDirect::Null(*v), + PdfObject::Indirect(v) => { + id = v.id; + continue; + } + }; + // we could be racing with another thread, so set can fail but that's not a problem + let _ = self.final_id.set(id); + return retval; + } else { + return PdfObjectDirect::Null(PdfNull::new(id.pos)); } } } @@ -1213,21 +1289,150 @@ impl<'a, T> IntoIterator for &'a MaybeArray { } } +#[derive(Clone)] +pub enum NameOr { + Name(PdfName), + Value(T), +} + +impl NameOr { + pub fn into_resolved(self, resolve: impl FnOnce(PdfName) -> Result) -> Result { + match self { + Self::Name(name) => resolve(name), + Self::Value(v) => Ok(v), + } + } + pub fn replace_with_resolved( + &mut self, + resolve: impl FnOnce(&PdfName) -> Result, + ) -> Result<&mut T, E> { + match self { + Self::Name(name) => { + *self = Self::Value(resolve(name)?); + let Self::Value(v) = self else { + unreachable!(); + }; + 
Ok(v) + } + Self::Value(v) => Ok(v), + } + } +} + +impl fmt::Debug for NameOr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Name(v) => v.fmt(f), + Self::Value(v) => v.fmt(f), + } + } +} + +impl GetPdfInputPosition for NameOr { + fn get_pdf_input_position(&self) -> PdfInputPosition { + match self { + Self::Name(v) => v.pos(), + Self::Value(v) => v.get_pdf_input_position(), + } + } +} + +impl IsPdfNull for NameOr { + fn is_pdf_null(&self) -> bool { + match self { + Self::Name(_) => false, + Self::Value(v) => v.is_pdf_null(), + } + } +} + +impl PdfParse for NameOr { + fn type_name() -> Cow<'static, str> { + Cow::Owned(format!("NameOr<{}>", T::type_name())) + } + fn parse(object: PdfObject) -> Result { + Ok(match PdfObjectDirect::from(object) { + PdfObjectDirect::Name(name) => Self::Name(name), + object => Self::Value(T::parse(object.into())?), + }) + } +} + #[derive(Copy, Clone, PartialEq)] -pub struct PdfPoint { +pub struct PdfMatrix { + pub pos: PdfInputPositionNoCompare, + pub elements: [f32; 6], +} + +impl fmt::Debug for PdfMatrix { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { pos, elements } = *self; + write!(f, "PdfMatrix(at {pos}, {elements:?})") + } +} + +impl IsPdfNull for PdfMatrix { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfMatrix { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("matrix") + } + fn parse(object: PdfObject) -> Result { + Ok(Self { + pos: object.pos().into(), + elements: PdfParse::parse(object)?, + }) + } +} + +impl PdfMatrix { + pub fn parse_flat( + a: PdfObject, + b: PdfObject, + c: PdfObject, + d: PdfObject, + e: PdfObject, + f: PdfObject, + ) -> Result { + Ok(Self { + pos: a.pos().into(), + elements: [ + PdfParse::parse(a)?, + PdfParse::parse(b)?, + PdfParse::parse(c)?, + PdfParse::parse(d)?, + PdfParse::parse(e)?, + PdfParse::parse(f)?, + ], + }) + } +} + +impl GetPdfInputPosition for PdfMatrix { + fn get_pdf_input_position(&self) 
-> PdfInputPosition { + self.pos.0 + } +} + +#[derive(Copy, Clone, PartialEq)] +pub struct PdfVec2D { pub pos: PdfInputPositionNoCompare, pub x: f32, pub y: f32, } -impl fmt::Debug for PdfPoint { +impl fmt::Debug for PdfVec2D { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let Self { pos, x, y } = *self; - write!(f, "PdfPoint(at {pos}, {x}, {y})") + write!(f, "PdfVec2D(at {pos}, {x}, {y})") } } -impl PdfPoint { +impl PdfVec2D { pub fn parse(x: PdfObject, y: PdfObject) -> Result { Ok(Self { pos: x.pos().into(), @@ -1237,7 +1442,7 @@ impl PdfPoint { } } -impl GetPdfInputPosition for PdfPoint { +impl GetPdfInputPosition for PdfVec2D { fn get_pdf_input_position(&self) -> PdfInputPosition { self.pos.0 } @@ -1246,13 +1451,13 @@ impl GetPdfInputPosition for PdfPoint { #[derive(Copy, Clone, Debug)] pub struct PdfRectangle { /// the corner with the smaller x and y coordinates - smaller: PdfPoint, + smaller: PdfVec2D, /// the corner with the larger x and y coordinates - larger: PdfPoint, + larger: PdfVec2D, } impl PdfRectangle { - pub fn new(mut smaller: PdfPoint, mut larger: PdfPoint) -> Self { + pub fn new(mut smaller: PdfVec2D, mut larger: PdfVec2D) -> Self { // `pos` follows the `x` coordinate if smaller.x.is_nan() { smaller.pos = larger.pos; @@ -1262,12 +1467,12 @@ impl PdfRectangle { std::mem::swap(&mut smaller.pos, &mut larger.pos); } Self { - smaller: PdfPoint { + smaller: PdfVec2D { pos: smaller.pos, x: smaller.x.min(larger.x), y: smaller.y.min(larger.y), }, - larger: PdfPoint { + larger: PdfVec2D { pos: larger.pos, x: smaller.x.max(larger.x), y: smaller.y.max(larger.y), @@ -1275,13 +1480,24 @@ impl PdfRectangle { } } /// return the corner with the smaller x and y coordinates - pub fn smaller(&self) -> PdfPoint { + pub fn smaller(&self) -> PdfVec2D { self.smaller } /// return the corner with the larger x and y coordinates - pub fn larger(&self) -> PdfPoint { + pub fn larger(&self) -> PdfVec2D { self.larger } + pub fn parse_flat( + lower_left_x: 
PdfObject, + lower_left_y: PdfObject, + upper_right_x: PdfObject, + upper_right_y: PdfObject, + ) -> Result { + Ok(Self::new( + PdfVec2D::parse(lower_left_x, lower_left_y)?, + PdfVec2D::parse(upper_right_x, upper_right_y)?, + )) + } } impl GetPdfInputPosition for PdfRectangle { @@ -1317,10 +1533,12 @@ impl PdfParse for PdfRectangle { expected_ty: "rectangle", }); }; - Ok(Self::new( - PdfPoint::parse(lower_left_x.clone(), lower_left_y.clone())?, - PdfPoint::parse(upper_right_x.clone(), upper_right_y.clone())?, - )) + Self::parse_flat( + lower_left_x.clone(), + lower_left_y.clone(), + upper_right_x.clone(), + upper_right_y.clone(), + ) } } @@ -1366,7 +1584,7 @@ pdf_parse! { #[pdf] #[derive(Clone, Debug)] pub struct PdfStreamDictionary { - #[pdf(name = PdfStreamDictionary::LENGTH_NAME)] + #[pdf(name = "Length")] pub len: usize, #[pdf(name = "Filter")] pub filters: MaybeArray, @@ -1385,15 +1603,6 @@ pdf_parse! { } } -impl PdfStreamDictionary { - pub const LENGTH_NAME: &str = "Length"; - pub(crate) fn parse_len_from_dictionary( - dictionary: &PdfDictionary, - ) -> Result { - PdfParse::parse(dictionary.get_or_null(Self::LENGTH_NAME.as_bytes())) - } -} - #[derive(Debug, Clone, Default)] pub struct PdfStreamDictionaryFiltersAndParms<'a> { filters: std::iter::Enumerate>, @@ -1505,12 +1714,45 @@ impl UnparsedPdfStreamDictionary { } } +pub trait PdfStreamContents: Sized + fmt::Debug + 'static { + fn parse( + data: &[u8], + stream_pos: PdfInputPosition, + objects: Arc, + ) -> Result; + fn parse_arc( + data: Arc<[u8]>, + stream_pos: PdfInputPosition, + objects: Arc, + ) -> Result { + Self::parse(&*data, stream_pos, objects) + } +} + +impl PdfStreamContents for Arc<[u8]> { + fn parse( + data: &[u8], + _stream_pos: PdfInputPosition, + _objects: Arc, + ) -> Result { + Ok(Arc::from(data)) + } + fn parse_arc( + data: Arc<[u8]>, + _stream_pos: PdfInputPosition, + _objects: Arc, + ) -> Result { + Ok(data) + } +} + #[derive(Clone)] -pub struct PdfStream { +pub struct PdfStream> 
{ pos: PdfInputPositionNoCompare, - dictionary: Arc>>, + objects: std::sync::Weak, + dictionary: PdfStreamDictionary, encoded_data: Arc<[u8]>, - decoded_data: Arc, PdfParseError>>>, + decoded_data: Arc>>, } struct DumpBytes<'a>(&'a [u8]); @@ -1542,25 +1784,30 @@ impl fmt::Display for DumpBytes<'_> { } } -impl fmt::Debug for PdfStream { +impl fmt::Debug for PdfStream { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let Self { pos, + objects: _, dictionary, encoded_data, decoded_data, } = self; let mut debug_struct = f.debug_struct("PdfStream"); debug_struct.field("pos", pos); - if let Some(dictionary) = dictionary.get() { - debug_struct.field("dictionary", dictionary); - } else { - debug_struct.field("dictionary", &format_args!("")); - } + debug_struct.field("dictionary", dictionary); debug_struct.field("encoded_data", &DumpBytes(encoded_data)); if let Some(decoded_data) = decoded_data.get() { match decoded_data { - Ok(decoded_data) => debug_struct.field("decoded_data", &DumpBytes(decoded_data)), + Ok(decoded_data) => { + if let Some(decoded_data) = + ::downcast_ref::>(decoded_data) + { + debug_struct.field("decoded_data", &DumpBytes(&**decoded_data)) + } else { + debug_struct.field("decoded_data", decoded_data) + } + } Err(e) => debug_struct.field("decoded_data", &Err::<(), _>(e)), }; } else { @@ -1570,47 +1817,31 @@ impl fmt::Debug for PdfStream { } } -impl PdfStream { +impl PdfStream { pub fn new( pos: impl Into, + objects: &Arc, dictionary: PdfStreamDictionary, encoded_data: Arc<[u8]>, ) -> Self { Self { pos: pos.into(), - dictionary: Arc::new(OnceLock::from(dictionary)), + objects: Arc::downgrade(objects), + dictionary, encoded_data, decoded_data: Arc::new(OnceLock::new()), } } - pub(crate) fn new_unparsed( - pos: impl Into, - unparsed_dictionary: PdfDictionary, - encoded_data: Arc<[u8]>, - ) -> (Self, UnparsedPdfStreamDictionary) { - let dictionary = Arc::new(OnceLock::new()); - ( - Self { - pos: pos.into(), - dictionary: dictionary.clone(), - 
encoded_data, - decoded_data: Arc::new(OnceLock::new()), - }, - UnparsedPdfStreamDictionary { - unparsed_dictionary, - dictionary, - }, - ) - } pub fn dictionary(&self) -> &PdfStreamDictionary { - self.dictionary - .get() - .expect("haven't finished parsing all pdf object definitions yet") + &self.dictionary } pub fn encoded_data(&self) -> &Arc<[u8]> { &self.encoded_data } - fn try_decode_data(&self) -> Result, PdfParseError> { + fn try_decode_data(&self) -> Result { + let Some(objects) = self.objects.upgrade() else { + panic!("PdfObjects is no longer available"); + }; let dictionary = self.dictionary(); let (data, filters) = if let Some(file) = &dictionary.file { todo!() @@ -1618,7 +1849,7 @@ impl PdfStream { (&self.encoded_data, dictionary.filters_and_parms()) }; if filters.len() == 0 { - return Ok(data.clone()); + return Data::parse_arc(data.clone(), self.pos.0, objects); } let mut data: &[u8] = data; let mut buffer; @@ -1626,26 +1857,26 @@ impl PdfStream { buffer = filter.decode_stream_data(filter_parms.clone(), self.pos.0, &data)?; data = &buffer; } - Ok(Arc::from(data)) + Data::parse(data, self.pos.0, objects) } - pub fn decoded_data(&self) -> &Result, PdfParseError> { + pub fn decoded_data(&self) -> &Result { self.decoded_data.get_or_init(|| self.try_decode_data()) } } -impl GetPdfInputPosition for PdfStream { +impl GetPdfInputPosition for PdfStream { fn get_pdf_input_position(&self) -> PdfInputPosition { self.pos.0 } } -impl IsPdfNull for PdfStream { +impl IsPdfNull for PdfStream { fn is_pdf_null(&self) -> bool { false } } -impl PdfParse for PdfStream { +impl PdfParse for PdfStream { fn type_name() -> Cow<'static, str> { if TypeId::of::() == TypeId::of::() { Cow::Borrowed("stream") @@ -1655,38 +1886,56 @@ impl PdfParse for PdfStream { } fn parse(object: PdfObject) -> Result { match PdfObjectDirect::from(object) { - PdfObjectDirect::Stream(stream) => Ok(PdfStream { - pos: stream.pos, - dictionary: if let Some(dictionary) = ::downcast_ref::< - Arc>>, - 
>(&stream.dictionary) - { - dictionary.clone() - } else { - let PdfStreamDictionary { - len, - filters, - decode_parms, - file, - file_filters, - file_decode_parms, - decoded_len, - rest, - } = stream.dictionary(); - Arc::new(OnceLock::from(PdfStreamDictionary { - len: *len, - filters: filters.clone(), - decode_parms: decode_parms.clone(), - file: file.clone(), - file_filters: file_filters.clone(), - file_decode_parms: file_decode_parms.clone(), - decoded_len: *decoded_len, - rest: Rest::parse(rest.clone().into())?, - })) - }, - encoded_data: stream.encoded_data, - decoded_data: stream.decoded_data, - }), + PdfObjectDirect::Stream(stream) => { + Ok(PdfStream { + pos: stream.pos, + dictionary: { + let PdfStreamDictionary { + len, + filters, + decode_parms, + file, + file_filters, + file_decode_parms, + decoded_len, + rest, + } = stream.dictionary; + PdfStreamDictionary { + len, + filters, + decode_parms, + file, + file_filters, + file_decode_parms, + decoded_len, + rest: Rest::parse(rest.into())?, + } + }, + encoded_data: stream.encoded_data, + decoded_data: if let Some(decoded_data) = + ::downcast_ref(&stream.decoded_data) + { + Arc::clone(decoded_data) + } else { + let Some(objects) = stream.objects.upgrade() else { + panic!("PdfObjects is no longer available"); + }; + Arc::new( + stream + .decoded_data + .get() + .cloned() + .map(|data| { + OnceLock::from(data.and_then(|data| { + Data::parse_arc(data, stream.pos.0, objects) + })) + }) + .unwrap_or_default(), + ) + }, + objects: stream.objects, + }) + } object => Err(PdfParseError::InvalidType { pos: object.get_pdf_input_position(), ty: object.type_name(), diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs index 8e5a7fc..95e58ac 100644 --- a/src/pdf/parse.rs +++ b/src/pdf/parse.rs @@ -1,6 +1,9 @@ -use crate::pdf::object::{ - IsPdfNull, MaybeArray, PdfInteger, PdfName, PdfNull, PdfNumber, PdfObject, PdfObjectDirect, - PdfObjectIdentifier, PdfObjectIndirect, PdfObjectNonNull, +use crate::pdf::{ + 
content_stream::{PdfOperator, PdfOperatorUnparsed}, + object::{ + IsPdfNull, MaybeArray, PdfInteger, PdfName, PdfNull, PdfNumber, PdfObject, PdfObjectDirect, + PdfObjectIdentifier, PdfObjectIndirect, PdfObjectNonNull, + }, }; use std::{any::Any, borrow::Cow, fmt, mem, num::NonZero, sync::Arc}; @@ -265,9 +268,20 @@ pub enum PdfParseError { filter: PdfName, error: String, }, - ObjectStreamParseError { - stream_pos: PdfInputPosition, - error: Arc, + StreamNotAllowedHere { + pos: PdfInputPosition, + }, + OperatorNotAllowedHere { + operator: PdfOperatorUnparsed, + }, + MissingOperator { + pos: PdfInputPosition, + }, + OperatorHasTooFewOperands { + operator: PdfOperator, + }, + OperatorHasTooManyOperands { + operator: PdfOperator, }, } @@ -313,9 +327,11 @@ impl GetPdfInputPosition for PdfParseError { | PdfParseError::InvalidStartXRefValue { pos, .. } | PdfParseError::UnknownStreamFilter { pos, .. } | PdfParseError::StreamFilterError { pos, .. } - | PdfParseError::ObjectStreamParseError { - stream_pos: pos, .. 
- } => pos, + | PdfParseError::StreamNotAllowedHere { pos } + | PdfParseError::MissingOperator { pos } => pos, + PdfParseError::OperatorNotAllowedHere { ref operator } => operator.pos(), + PdfParseError::OperatorHasTooFewOperands { ref operator } + | PdfParseError::OperatorHasTooManyOperands { ref operator } => operator.pos(), } } } @@ -443,12 +459,33 @@ impl fmt::Display for PdfParseError { } => { write!(f, "at {pos}: stream filter {filter} error: {error}") } - PdfParseError::ObjectStreamParseError { - stream_pos, - ref error, - } => { - write!(f, "at {stream_pos}: object stream error: ")?; - error.fmt(f) + PdfParseError::StreamNotAllowedHere { pos } => { + write!(f, "at {pos}: stream not allowed here") + } + PdfParseError::OperatorNotAllowedHere { ref operator } => { + write!( + f, + "at {}: operator not allowed here: {}", + operator.pos(), + operator.bytes_debug() + ) + } + PdfParseError::MissingOperator { pos } => { + write!(f, "at {pos}: missing operator") + } + PdfParseError::OperatorHasTooFewOperands { ref operator } => { + write!( + f, + "at {}: operator has too few operands: {operator:?}", + operator.pos(), + ) + } + PdfParseError::OperatorHasTooManyOperands { ref operator } => { + write!( + f, + "at {}: operator has too many operands: {operator:?}", + operator.pos(), + ) } } }