diff --git a/Cargo.lock b/Cargo.lock
index 3cb67c2..07f112f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,56 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "adler2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "crc32fast"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "flate2"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
+dependencies = [
+ "adler2",
+ "simd-adler32",
+]
+
 [[package]]
 name = "parse_powerisa_pdf"
 version = "0.1.0"
+dependencies = [
+ "flate2",
+]
+
+[[package]]
+name = "simd-adler32"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2"
diff --git a/Cargo.toml b/Cargo.toml
index 125e5e2..c5d18eb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,4 +5,5 @@ edition = "2024"
 license = "LGPL-3.0-or-later"
 
 [dependencies]
+flate2 = "1.1.5"
diff --git a/src/pdf.rs b/src/pdf.rs
index 4ba785b..d4a4922 100644
--- a/src/pdf.rs
+++ b/src/pdf.rs
@@ -1,11 +1,11 @@
 use crate::{
     pdf::{
         object::{
-            MaybeArray, PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull,
-            PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfReal, PdfStream,
+            PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject,
+            PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream,
             PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary,
         },
-        parse::{PdfInputPosition, PdfParse, PdfParseError},
+        parse::{GetPdfInputPosition, PdfInputPosition, PdfParse, PdfParseError},
     },
     pdf_parse,
     util::ArcOrRef,
@@ -21,9 +21,15 @@ use std::{
 pub mod object;
 pub mod parse;
+pub mod stream_filters;
+
+struct PdfObjectsInner {
+    objects: BTreeMap<PdfObjectIdentifier, PdfObject>,
+    object_streams: Vec<PdfStream<PdfObjectStreamDictionary>>,
+}
 
 pub struct PdfObjects {
-    objects: OnceLock<BTreeMap<PdfObjectIdentifier, PdfObject>>,
+    inner: OnceLock<PdfObjectsInner>,
 }
 
 #[derive(Copy, Clone, Debug)]
@@ -70,24 +76,12 @@ pdf_parse! {
     pub struct PdfXRefStreamDictionaryRest {
         #[pdf(name = "Type")]
         pub ty: PdfXRefName,
-        #[pdf(name = "Size")]
-        pub size: usize,
         #[pdf(name = "Index")]
         pub index: Option>,
-        #[pdf(name = "Prev")]
-        pub prev: Option,
         #[pdf(name = "W")]
         pub w: Option>,
-        #[pdf(name = "Root")]
-        pub root: PdfDictionary,
-        #[pdf(name = "Encrypt")]
-        pub encrypt: Option,
-        #[pdf(name = "Info")]
-        pub info: Option,
-        #[pdf(name = "ID")]
-        pub id: Option<[PdfString; 2]>,
         #[pdf(flatten)]
-        pub rest: PdfDictionary,
+        pub rest: PdfTrailerDictionary,
     }
 }
@@ -308,6 +302,38 @@ struct PdfParser<'a> {
 }
 impl<'a> PdfParser<'a> {
+    fn with_tokenizer<'b, R>(
+        &mut self,
+        tokenizer: PdfTokenizer<'b>,
+        f: impl FnOnce(&mut PdfParser<'b>) -> R,
+    ) -> R {
+        let PdfParser {
+            objects_arc,
+            objects_map,
+            unparsed_stream_dictionaries,
+            tokenizer: _,
+        } = self;
+        let objects_arc = objects_arc.clone();
+        let objects_map = std::mem::take(objects_map);
+        let unparsed_stream_dictionaries = std::mem::take(unparsed_stream_dictionaries);
+        let mut new_parser = PdfParser {
+            objects_arc,
+            objects_map,
+            unparsed_stream_dictionaries,
+            tokenizer,
+        };
+        let retval = f(&mut new_parser);
+        let PdfParser {
+            objects_arc,
+            objects_map,
+            unparsed_stream_dictionaries,
+            tokenizer: _,
+        } = new_parser;
+        self.objects_arc = objects_arc;
+        self.objects_map = objects_map;
+        self.unparsed_stream_dictionaries = unparsed_stream_dictionaries;
+        retval
+    }
     fn parse_header(&mut self) -> Result {
         let Some(b'%') = self.tokenizer.bytes.first() else {
             return Err(PdfParseError::NotAPdfFile);
         };
@@ -739,18 +765,94 @@ impl<'a> PdfParser<'a> {
             Ok(Some(()))
         }
     }
+    fn parse_object_stream_inner(
+        &mut self,
+        object_stream: &PdfStream<PdfObjectStreamDictionary>,
+    ) -> Result<(), PdfParseError> {
+        let mut object_ids_and_byte_positions =
+            Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n);
+        for _ in 0..object_stream.dictionary().rest.n {
+            self.skip_comments_and_whitespace();
+            let Some((pos, object_number)) =
+                self.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))?
+            else {
+                return Err(PdfParseError::InvalidObjectNumber {
+                    pos: self.tokenizer.pos(),
+                });
+            };
+            self.skip_comments_and_whitespace();
+            let Some((_, byte_position)) =
+                self.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))?
+            else {
+                return Err(PdfParseError::InvalidNumber {
+                    pos: self.tokenizer.pos(),
+                });
+            };
+            object_ids_and_byte_positions.push((
+                PdfObjectIdentifier {
+                    pos: pos.into(),
+                    object_number,
+                    generation_number: 0,
+                },
+                byte_position,
+            ));
+        }
+        for (id, _byte_position) in object_ids_and_byte_positions {
+            let object = self.parse_object()?;
+            if self.objects_map.insert(id, object).is_some() {
+                return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id });
+            }
+        }
+        Ok(())
+    }
+    fn parse_object_stream(
+        &mut self,
+        object_stream: &PdfStream<PdfObjectStreamDictionary>,
+    ) -> Result<(), PdfParseError> {
+        let data = object_stream.decoded_data().as_ref()?;
+        self.with_tokenizer(PdfTokenizer::new(data, 0), |parser| {
+            parser.parse_object_stream_inner(object_stream)
+        })
+        .map_err(|e| PdfParseError::ObjectStreamParseError {
+            stream_pos: object_stream.get_pdf_input_position(),
+            error: Arc::new(e),
+        })
+    }
     fn parse_body(&mut self) -> Result<(), PdfParseError> {
         while let Some(()) = self.parse_indirect_object_definition()? {}
-        let Ok(()) = self
-            .objects_arc
-            .objects
-            .set(std::mem::take(&mut self.objects_map))
-        else {
-            unreachable!();
-        };
         self.unparsed_stream_dictionaries
             .drain(..)
-            .try_for_each(|v| v.finish_parsing())
+            .try_for_each(|v| v.finish_parsing())?;
+        let mut object_streams: Vec<PdfStream<PdfObjectStreamDictionary>> = Vec::new();
+        for object in self.objects_map.values_mut() {
+            let stream = match object {
+                PdfObject::Stream(stream) => stream,
+                PdfObject::Boolean(_)
+                | PdfObject::Integer(_)
+                | PdfObject::Real(_)
+                | PdfObject::String(_)
+                | PdfObject::Name(_)
+                | PdfObject::Array(_)
+                | PdfObject::Dictionary(_)
+                | PdfObject::Null(_)
+                | PdfObject::Indirect(_) => continue,
+            };
+            if PdfObjectStreamDictionary::parse_type_from_dictionary(&stream.dictionary().rest)
+                .is_ok()
+            {
+                object_streams.push(PdfStream::parse(object.clone())?);
+            }
+        }
+        for object_stream in &object_streams {
+            self.parse_object_stream(object_stream)?;
+        }
+        let Ok(()) = self.objects_arc.inner.set(PdfObjectsInner {
+            objects: std::mem::take(&mut self.objects_map),
+            object_streams,
+        }) else {
+            unreachable!();
+        };
+        Ok(())
     }
     fn parse_xref_table(&mut self) -> Result<(), PdfParseError> {
         self.skip_comments_and_whitespace();
@@ -844,7 +946,7 @@ impl Pdf {
     pub fn parse(bytes: impl AsRef<[u8]>) -> Result {
         PdfParser {
             objects_arc: Arc::new(PdfObjects {
-                objects: OnceLock::new(),
+                inner: OnceLock::new(),
             }),
             objects_map: BTreeMap::new(),
             unparsed_stream_dictionaries: vec![],
diff --git a/src/pdf/object.rs b/src/pdf/object.rs
index d3979d8..2a17df5 100644
--- a/src/pdf/object.rs
+++ b/src/pdf/object.rs
@@ -5,15 +5,17 @@ use crate::{
             GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse,
             PdfParseError,
         },
+        stream_filters::PdfStreamFilter,
     },
     pdf_parse,
     util::ArcOrRef,
 };
 use std::{
     any::TypeId,
-    borrow::Cow,
+    borrow::{Borrow, Cow},
     collections::BTreeMap,
     fmt::{self, Write},
+    iter::FusedIterator,
     num::NonZero,
     sync::{Arc, OnceLock},
 };
@@ -61,6 +63,12 @@ pub struct PdfName {
     bytes: ArcOrRef<'static, [u8]>,
 }
 
+impl Borrow<[u8]> for PdfName {
+    fn borrow(&self) -> &[u8] {
+        &self.bytes
+    }
+}
+
 impl PdfName {
     pub fn try_new(
         pos: impl Into,
@@ -218,24 +226,51 @@ macro_rules! make_pdf_object {
             $Variant:ident($ty:ty),
         )+
     ) => {
-        #[derive(Clone, Debug)]
+        #[derive(Clone)]
         pub enum PdfObjectNonNull {
             $($Variant($ty),)*
         }
 
-        #[derive(Clone, Debug)]
+        impl fmt::Debug for PdfObjectNonNull {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                match self {
+                    $(Self::$Variant(v) => v.fmt(f),)*
+                }
+            }
+        }
+
+        #[derive(Clone)]
         pub enum PdfObjectDirect {
             $($Variant($ty),)*
             Null(PdfNull),
         }
 
-        #[derive(Clone, Debug)]
+        impl fmt::Debug for PdfObjectDirect {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                match self {
+                    $(Self::$Variant(v) => v.fmt(f),)*
+                    Self::Null(v) => v.fmt(f),
+                }
+            }
+        }
+
+        #[derive(Clone)]
         pub enum PdfObject {
             $($Variant($ty),)*
             Null(PdfNull),
             Indirect(PdfObjectIndirect),
         }
 
+        impl fmt::Debug for PdfObject {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                match self {
+                    $(Self::$Variant(v) => v.fmt(f),)*
+                    Self::Null(v) => v.fmt(f),
+                    Self::Indirect(v) => v.fmt(f),
+                }
+            }
+        }
+
         $(
             impl From<$ty> for PdfObjectNonNull {
                 fn from(value: $ty) -> Self {
@@ -546,12 +581,12 @@ impl PdfObjectIndirect {
     }
     pub fn get(&self) -> PdfObjectDirect {
         if let Some(objects) = self.objects.upgrade() {
-            if let Some(objects) = objects.objects.get() {
+            if let Some(objects) = objects.inner.get() {
                 let final_id = self.final_id.get().copied();
                 let limit = if final_id.is_some() { 1 } else { 1000usize };
                 let mut id = final_id.unwrap_or(self.id);
                 for _ in 0..limit {
-                    if let Some(object) = objects.get(&self.id) {
+                    if let Some(object) = objects.objects.get(&self.id) {
                         let retval = match object {
                             PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v),
                             PdfObject::Integer(v) => PdfObjectDirect::Integer(*v),
@@ -628,18 +663,27 @@ impl PdfDictionary {
     }
     pub fn contains_key<Q: ?Sized>(&self, key: &Q) -> bool
     where
-        PdfName: std::borrow::Borrow<Q> + Ord,
+        PdfName: std::borrow::Borrow<Q>,
         Q: Ord,
     {
         self.fields.contains_key(key)
     }
     pub fn get<Q: ?Sized>(&self, key: &Q) -> Option<&PdfObject>
     where
-        PdfName: std::borrow::Borrow<Q> + Ord,
+        PdfName: std::borrow::Borrow<Q>,
         Q: Ord,
     {
         self.fields.get(key)
     }
+    pub fn get_or_null<Q: ?Sized>(&self, key: &Q) -> PdfObject
+    where
+        PdfName: std::borrow::Borrow<Q>,
+        Q: Ord,
+    {
+        self.get(key)
+            .cloned()
+            .unwrap_or(PdfObject::Null(PdfNull(self.pos)))
+    }
     pub fn pos(&self) -> PdfInputPosition {
         self.pos.0
     }
@@ -842,35 +886,6 @@ impl std::ops::DerefMut for MaybeArray {
     }
 }
 
-pdf_parse! {
-    #[derive(Clone, Debug, PartialEq, Eq)]
-    #[non_exhaustive]
-    pub enum PdfStreamFilter {
-        #[pdf(name = "ASCIIHexDecode")]
-        AsciiHexDecode,
-        #[pdf(name = "ASCII85Decode")]
-        Ascii85Decode,
-        #[pdf(name = "LZWDecode")]
-        LzwDecode,
-        #[pdf(name = "FlateDecode")]
-        FlateDecode,
-        #[pdf(name = "RunLengthDecode")]
-        RunLengthDecode,
-        #[pdf(name = "CCITTFaxDecode")]
-        CcittFaxDecode,
-        #[pdf(name = "JBIG2Decode")]
-        Jbig2Decode,
-        #[pdf(name = "DCTDecode")]
-        DctDecode,
-        #[pdf(name = "JPXDecode")]
-        JpxDecode,
-        #[pdf(name = "Crypt")]
-        Crypt,
-        #[pdf(other)]
-        Unknown(PdfName),
-    }
-}
-
 impl Default for MaybeArray {
     fn default() -> Self {
         Self(Arc::default())
     }
 }
@@ -936,47 +951,101 @@ impl PdfStreamDictionary {
     pub(crate) fn parse_len_from_dictionary(
         dictionary: &PdfDictionary,
     ) -> Result {
-        PdfParse::parse(
-            dictionary
-                .get(&PdfName::new_static(Self::LENGTH_NAME.as_bytes()))
-                .cloned()
-                .unwrap_or_default(),
+        PdfParse::parse(dictionary.get_or_null(Self::LENGTH_NAME.as_bytes()))
+    }
+}
+
+#[derive(Debug, Clone, Default)]
+pub struct PdfStreamDictionaryFiltersAndParms<'a> {
+    filters: std::iter::Enumerate<std::slice::Iter<'a, PdfStreamFilter>>,
+    decode_parms: &'a [Option<PdfDictionary>],
+}
+
+impl<'a> PdfStreamDictionaryFiltersAndParms<'a> {
+    fn item_helper(
+        filter: (usize, &'a PdfStreamFilter),
+        decode_parms: &'a [Option<PdfDictionary>],
+    ) -> (&'a PdfStreamFilter, &'a PdfDictionary) {
+        static EMPTY_DICTIONARY: OnceLock<PdfDictionary> = OnceLock::new();
+        let (index, filter) = filter;
+        (
+            filter,
+            match decode_parms.get(index) {
+                Some(Some(v)) => v,
+                _ => EMPTY_DICTIONARY.get_or_init(PdfDictionary::default),
+            },
         )
     }
 }
 
-impl PdfStreamDictionary {
-    pub fn filters_and_parms(
-        &self,
-    ) -> impl Clone + ExactSizeIterator + DoubleEndedIterator
-    {
-        self.filters.iter().enumerate().map(|(index, filter)| {
-            (
-                filter.clone(),
-                self.decode_parms
-                    .0
-                    .get(index)
-                    .cloned()
-                    .flatten()
-                    .unwrap_or_default(),
-            )
-        })
+impl<'a> Iterator for PdfStreamDictionaryFiltersAndParms<'a> {
+    type Item = (&'a PdfStreamFilter, &'a PdfDictionary);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.filters
+            .next()
+            .map(|filter| Self::item_helper(filter, self.decode_parms))
     }
-    pub fn file_filters_and_parms(
-        &self,
-    ) -> impl Clone + ExactSizeIterator + DoubleEndedIterator
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.filters.size_hint()
+    }
+
+    fn nth(&mut self, n: usize) -> Option<Self::Item> {
+        self.filters
+            .nth(n)
+            .map(|filter| Self::item_helper(filter, self.decode_parms))
+    }
+
+    fn fold<B, F>(self, init: B, f: F) -> B
+    where
+        F: FnMut(B, Self::Item) -> B,
     {
-        self.file_filters.iter().enumerate().map(|(index, filter)| {
-            (
-                filter.clone(),
-                self.file_decode_parms
-                    .0
-                    .get(index)
-                    .cloned()
-                    .flatten()
-                    .unwrap_or_default(),
-            )
-        })
+        self.filters
+            .map(|filter| Self::item_helper(filter, self.decode_parms))
+            .fold(init, f)
+    }
+}
+
+impl<'a> FusedIterator for PdfStreamDictionaryFiltersAndParms<'a> {}
+
+impl<'a> ExactSizeIterator for PdfStreamDictionaryFiltersAndParms<'a> {}
+
+impl<'a> DoubleEndedIterator for PdfStreamDictionaryFiltersAndParms<'a> {
+    fn next_back(&mut self) -> Option<Self::Item> {
+        self.filters
+            .next_back()
+            .map(|filter| Self::item_helper(filter, self.decode_parms))
+    }
+
+    fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
+        self.filters
+            .nth_back(n)
+            .map(|filter| Self::item_helper(filter, self.decode_parms))
+    }
+
+    fn rfold<B, F>(self, init: B, f: F) -> B
+    where
+        F: FnMut(B, Self::Item) -> B,
+    {
+        self.filters
+            .map(|filter| Self::item_helper(filter, self.decode_parms))
+            .rfold(init, f)
+    }
+}
+
+impl PdfStreamDictionary {
+    pub fn filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> {
+        PdfStreamDictionaryFiltersAndParms {
+            filters: self.filters.iter().enumerate(),
+            decode_parms: &self.decode_parms,
+        }
+    }
+    pub fn file_filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> {
+        PdfStreamDictionaryFiltersAndParms {
+            filters: self.file_filters.iter().enumerate(),
+            decode_parms: &self.file_decode_parms,
+        }
+    }
 }
@@ -1001,16 +1070,64 @@ impl UnparsedPdfStreamDictionary {
 pub struct PdfStream {
     pos: PdfInputPositionNoCompare,
     dictionary: Arc>>,
-    data: Arc<[u8]>,
+    encoded_data: Arc<[u8]>,
+    decoded_data: Arc<OnceLock<Result<Arc<[u8]>, PdfParseError>>>,
+}
+
+struct DumpBytes<'a>(&'a [u8]);
+
+impl<'a> fmt::Debug for DumpBytes<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Display::fmt(self, f)
+    }
+}
+
+impl fmt::Display for DumpBytes<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut first = true;
+        let mut fmt_chunk = |chunk: &[u8]| -> fmt::Result {
+            if first {
+                first = false;
+            } else {
+                f.write_str("\n")?;
+            }
+            write!(f, "\"{}\"", chunk.escape_ascii())
+        };
+        if self.0.is_empty() {
+            return fmt_chunk(self.0);
+        }
+        for chunk in self.0.chunks(32) {
+            fmt_chunk(chunk)?;
+        }
+        Ok(())
+    }
 }
 
 impl fmt::Debug for PdfStream {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("PdfStream")
-            .field("pos", &self.pos)
-            .field("dictionary", &self.dictionary)
-            .field("data", &format_args!("{:02x?}", self.data))
-            .finish()
+        let Self {
+            pos,
+            dictionary,
+            encoded_data,
+            decoded_data,
+        } = self;
+        let mut debug_struct = f.debug_struct("PdfStream");
+        debug_struct.field("pos", pos);
+        if let Some(dictionary) = dictionary.get() {
+            debug_struct.field("dictionary", dictionary);
+        } else {
+            debug_struct.field("dictionary", &format_args!(""));
+        }
+        debug_struct.field("encoded_data", &DumpBytes(encoded_data));
+        if let Some(decoded_data) = decoded_data.get() {
+            match decoded_data {
+                Ok(decoded_data) => debug_struct.field("decoded_data", &DumpBytes(decoded_data)),
+                Err(e) => debug_struct.field("decoded_data", &Err::<(), _>(e)),
+            };
+        } else {
+            debug_struct.field("decoded_data", &format_args!(""));
+        }
+        debug_struct.finish()
     }
 }
 
@@ -1018,25 +1135,27 @@ impl PdfStream {
     pub fn new(
         pos: impl Into,
         dictionary: PdfStreamDictionary,
-        data: Arc<[u8]>,
+        encoded_data: Arc<[u8]>,
     ) -> Self {
         Self {
             pos: pos.into(),
             dictionary: Arc::new(OnceLock::from(dictionary)),
-            data,
+            encoded_data,
+            decoded_data: Arc::new(OnceLock::new()),
         }
     }
     pub(crate) fn new_unparsed(
         pos: impl Into,
         unparsed_dictionary: PdfDictionary,
-        data: Arc<[u8]>,
+        encoded_data: Arc<[u8]>,
     ) -> (Self, UnparsedPdfStreamDictionary) {
         let dictionary = Arc::new(OnceLock::new());
         (
             Self {
                 pos: pos.into(),
                 dictionary: dictionary.clone(),
-                data,
+                encoded_data,
+                decoded_data: Arc::new(OnceLock::new()),
             },
             UnparsedPdfStreamDictionary {
                 unparsed_dictionary,
@@ -1049,8 +1168,29 @@ impl PdfStream {
             .get()
             .expect("haven't finished parsing all pdf object definitions yet")
     }
-    pub fn data(&self) -> &Arc<[u8]> {
-        &self.data
+    pub fn encoded_data(&self) -> &Arc<[u8]> {
+        &self.encoded_data
+    }
+    fn try_decode_data(&self) -> Result<Arc<[u8]>, PdfParseError> {
+        let dictionary = self.dictionary();
+        let (data, filters) = if let Some(file) = &dictionary.file {
+            todo!()
+        } else {
+            (&self.encoded_data, dictionary.filters_and_parms())
+        };
+        if filters.len() == 0 {
+            return Ok(data.clone());
+        }
+        let mut data: &[u8] = data;
+        let mut buffer;
+        for (filter, filter_parms) in filters {
+            buffer = filter.decode_stream_data(filter_parms.clone(), self.pos.0, &data)?;
+            data = &buffer;
+        }
+        Ok(Arc::from(data))
+    }
+    pub fn decoded_data(&self) -> &Result<Arc<[u8]>, PdfParseError> {
+        self.decoded_data.get_or_init(|| self.try_decode_data())
     }
 }
@@ -1099,7 +1239,8 @@ impl PdfParse for PdfStream {
                     rest: Rest::parse(rest.clone().into())?,
                 }))
             },
-            data: stream.data,
+            encoded_data: stream.encoded_data,
+            decoded_data: stream.decoded_data,
         }),
         object => Err(PdfParseError::InvalidType {
             pos: object.get_pdf_input_position(),
@@ -1109,3 +1250,37 @@
         }
     }
 }
+
+pdf_parse! {
+    #[derive(Clone, Copy, Debug, Hash, Default, PartialEq, Eq, PartialOrd, Ord)]
+    pub enum PdfObjectStreamType {
+        #[pdf(name = "ObjStm")]
+        #[default]
+        ObjStm,
+    }
+}
+
+pdf_parse! {
+    #[derive(Clone, Debug)]
+    pub struct PdfObjectStreamDictionary {
+        #[pdf(name = Self::TYPE_NAME)]
+        pub ty: PdfObjectStreamType,
+        #[pdf(name = "N")]
+        pub n: usize,
+        #[pdf(name = "First")]
+        pub first: usize,
+        #[pdf(name = "Extends")]
+        pub extends: Option,
+        #[pdf(flatten)]
+        pub rest: PdfDictionary,
+    }
+}
+
+impl PdfObjectStreamDictionary {
+    pub const TYPE_NAME: &str = "Type";
+    pub(crate) fn parse_type_from_dictionary(
+        dictionary: &PdfDictionary,
+    ) -> Result<PdfObjectStreamType, PdfParseError> {
+        PdfParse::parse(dictionary.get_or_null(Self::TYPE_NAME.as_bytes()))
+    }
+}
diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs
index aa5bc3d..2287dbf 100644
--- a/src/pdf/parse.rs
+++ b/src/pdf/parse.rs
@@ -144,7 +144,7 @@ impl PartialEq for PdfInputPositionNoCompare {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 #[non_exhaustive]
 pub enum PdfParseError {
     Custom(String),
@@ -231,6 +231,19 @@
         pos: PdfInputPosition,
         start_xref: usize,
     },
+    UnknownStreamFilter {
+        pos: PdfInputPosition,
+        filter: PdfName,
+    },
+    StreamFilterError {
+        pos: PdfInputPosition,
+        filter: PdfName,
+        error: String,
+    },
+    ObjectStreamParseError {
+        stream_pos: PdfInputPosition,
+        error: Arc<PdfParseError>,
+    },
 }
 
 impl From for PdfParseError {
@@ -239,6 +252,12 @@
     }
 }
 
+impl<'a> From<&'a Self> for PdfParseError {
+    fn from(value: &'a Self) -> Self {
+        value.clone()
+    }
+}
+
 impl GetPdfInputPosition for PdfParseError {
     fn get_pdf_input_position(&self) -> PdfInputPosition {
         match *self {
@@ -266,7 +285,12 @@ impl GetPdfInputPosition for PdfParseError {
             | PdfParseError::MissingStartXRefValue { pos }
             | PdfParseError::MissingEofComment { pos }
            | PdfParseError::UnexpectedByte { pos, .. }
-            | PdfParseError::InvalidStartXRefValue { pos, .. } => pos,
+            | PdfParseError::InvalidStartXRefValue { pos, .. }
+            | PdfParseError::UnknownStreamFilter { pos, .. }
+            | PdfParseError::StreamFilterError { pos, .. }
+            | PdfParseError::ObjectStreamParseError {
+                stream_pos: pos, ..
+            } => pos,
         }
     }
 }
@@ -376,7 +400,7 @@ impl fmt::Display for PdfParseError {
                 write!(f, "at {pos}: missing `%%EOF` comment")
             }
             PdfParseError::UnexpectedByte { pos, byte } => {
-                write!(f, "at {pos}: unexpected byte {}", byte.escape_ascii())
+                write!(f, "at {pos}: unexpected byte '{}'", byte.escape_ascii())
             }
             PdfParseError::InvalidStartXRefValue { pos, start_xref } => {
                 write!(
@@ -384,6 +408,23 @@
                     f,
                     "at {pos}: invalid `startxref` value: {start_xref} ({start_xref:#x})"
                 )
             }
+            PdfParseError::UnknownStreamFilter { pos, ref filter } => {
+                write!(f, "at {pos}: unknown stream filter: {filter}")
+            }
+            PdfParseError::StreamFilterError {
+                pos,
+                ref filter,
+                ref error,
+            } => {
+                write!(f, "at {pos}: stream filter {filter} error: {error}")
+            }
+            PdfParseError::ObjectStreamParseError {
+                stream_pos,
+                ref error,
+            } => {
+                write!(f, "at {stream_pos}: object stream error: ")?;
+                error.fmt(f)
+            }
         }
     }
 }
@@ -785,12 +826,10 @@ macro_rules! pdf_parse {
         [$(#[$($field_meta:tt)*])*]
         $field_name:ident: $field_ty:ty
     ) => {
-        let $field_name = $crate::pdf::object::PdfName::new_static(
-            $crate::__std::convert::AsRef::<[u8]>::as_ref($name),
-        );
+        let $field_name = $crate::__std::convert::AsRef::<[u8]>::as_ref($name);
         let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse(
             $object_mut
-                .remove(&$field_name)
+                .remove($field_name)
                 .unwrap_or($crate::pdf::object::PdfObject::Null($crate::pdf::object::PdfNull::new($pos))),
         )?;
     };
diff --git a/src/pdf/stream_filters.rs b/src/pdf/stream_filters.rs
new file mode 100644
index 0000000..51a3884
--- /dev/null
+++ b/src/pdf/stream_filters.rs
@@ -0,0 +1,65 @@
+use crate::pdf::{
+    object::{PdfDictionary, PdfName},
+    parse::{PdfInputPosition, PdfParse, PdfParseError},
+    pdf_parse,
+};
+
+pub mod flate;
+
+pdf_parse! {
+    #[derive(Clone, Debug, PartialEq, Eq)]
+    #[non_exhaustive]
+    pub enum PdfStreamFilter {
+        #[pdf(name = "ASCIIHexDecode")]
+        AsciiHexDecode,
+        #[pdf(name = "ASCII85Decode")]
+        Ascii85Decode,
+        #[pdf(name = "LZWDecode")]
+        LzwDecode,
+        #[pdf(name = "FlateDecode")]
+        FlateDecode,
+        #[pdf(name = "RunLengthDecode")]
+        RunLengthDecode,
+        #[pdf(name = "CCITTFaxDecode")]
+        CcittFaxDecode,
+        #[pdf(name = "JBIG2Decode")]
+        Jbig2Decode,
+        #[pdf(name = "DCTDecode")]
+        DctDecode,
+        #[pdf(name = "JPXDecode")]
+        JpxDecode,
+        #[pdf(name = "Crypt")]
+        Crypt,
+        #[pdf(other)]
+        Unknown(PdfName),
+    }
+}
+
+impl PdfStreamFilter {
+    pub fn decode_stream_data(
+        &self,
+        filter_parms: PdfDictionary,
+        stream_pos: PdfInputPosition,
+        encoded_data: &[u8],
+    ) -> Result<Vec<u8>, PdfParseError> {
+        match self {
+            PdfStreamFilter::AsciiHexDecode => todo!(),
+            PdfStreamFilter::Ascii85Decode => todo!(),
+            PdfStreamFilter::LzwDecode => todo!(),
+            PdfStreamFilter::FlateDecode => {
+                flate::PdfFilterParmsFlateDecode::parse(filter_parms.into())?
+                    .decode_stream_data(stream_pos, encoded_data)
+            }
+            PdfStreamFilter::RunLengthDecode => todo!(),
+            PdfStreamFilter::CcittFaxDecode => todo!(),
+            PdfStreamFilter::Jbig2Decode => todo!(),
+            PdfStreamFilter::DctDecode => todo!(),
+            PdfStreamFilter::JpxDecode => todo!(),
+            PdfStreamFilter::Crypt => todo!(),
+            PdfStreamFilter::Unknown(filter) => Err(PdfParseError::UnknownStreamFilter {
+                pos: stream_pos,
+                filter: filter.clone(),
+            }),
+        }
+    }
+}
diff --git a/src/pdf/stream_filters/flate.rs b/src/pdf/stream_filters/flate.rs
new file mode 100644
index 0000000..46d01a8
--- /dev/null
+++ b/src/pdf/stream_filters/flate.rs
@@ -0,0 +1,73 @@
+use crate::pdf::{
+    object::PdfDictionary,
+    parse::{PdfInputPosition, PdfParseError},
+    pdf_parse,
+    stream_filters::PdfStreamFilter,
+};
+use std::{io::Read, num::NonZero};
+
+pdf_parse! {
+    #[derive(Clone, Debug, Default)]
+    pub struct PdfFilterParmsFlateDecode {
+        #[pdf(name = "Predictor")]
+        pub predictor: Option<NonZero<usize>>,
+        #[pdf(name = "Colors")]
+        pub colors: Option<NonZero<usize>>,
+        #[pdf(name = "BitsPerComponent")]
+        pub bits_per_component: Option<NonZero<usize>>,
+        #[pdf(name = "Columns")]
+        pub columns: Option<NonZero<usize>>,
+        #[pdf(flatten)]
+        pub rest: PdfDictionary,
+    }
+}
+
+impl PdfFilterParmsFlateDecode {
+    pub const FILTER: PdfStreamFilter = PdfStreamFilter::FlateDecode;
+    pub const DEFAULT_PREDICTOR: NonZero<usize> = const { NonZero::new(1).unwrap() };
+    pub const DEFAULT_COLORS: NonZero<usize> = const { NonZero::new(1).unwrap() };
+    pub const DEFAULT_BITS_PER_COMPONENT: NonZero<usize> = const { NonZero::new(8).unwrap() };
+    pub const DEFAULT_COLUMNS: NonZero<usize> = const { NonZero::new(1).unwrap() };
+    pub fn predictor(&self) -> NonZero<usize> {
+        self.predictor.unwrap_or(Self::DEFAULT_PREDICTOR)
+    }
+    pub fn colors(&self) -> NonZero<usize> {
+        self.colors.unwrap_or(Self::DEFAULT_COLORS)
+    }
+    pub fn bits_per_component(&self) -> NonZero<usize> {
+        self.bits_per_component
+            .unwrap_or(Self::DEFAULT_BITS_PER_COMPONENT)
+    }
+    pub fn columns(&self) -> NonZero<usize> {
+        self.columns.unwrap_or(Self::DEFAULT_COLUMNS)
+    }
+    pub fn decode_stream_data(
+        &self,
+        stream_pos: PdfInputPosition,
+        encoded_data: &[u8],
+    ) -> Result<Vec<u8>, PdfParseError> {
+        let Self {
+            predictor: _,
+            colors: _,
+            bits_per_component: _,
+            columns: _,
+            rest: _,
+        } = self;
+        let mut decoded_data = vec![];
+        flate2::bufread::ZlibDecoder::new(encoded_data)
+            .read_to_end(&mut decoded_data)
+            .map_err(|e| PdfParseError::StreamFilterError {
+                pos: stream_pos,
+                filter: Self::FILTER.into(),
+                error: e.to_string(),
+            })?;
+        let predictor = self.predictor();
+        let colors = self.colors();
+        let bits_per_component = self.bits_per_component();
+        let columns = self.columns();
+        match predictor {
+            Self::DEFAULT_PREDICTOR => Ok(decoded_data),
+            _ => todo!("{predictor}"),
+        }
+    }
+}
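For reference, the FlateDecode path added in src/pdf/stream_filters/flate.rs boils down to running the raw stream bytes through flate2's zlib decoder (PDF FlateDecode bodies are RFC 1950 zlib streams). Below is a minimal standalone sketch of that round trip against the same flate2 1.1.5 API; the function name inflate_stream and the sample payload are made up for illustration and are not code from this patch.

use std::io::{Read, Write};

// Inflate a FlateDecode stream body; mirrors what flate.rs does before any
// predictor handling. `encoded` is assumed to already be the bytes between
// `stream` and `endstream`, truncated to /Length.
fn inflate_stream(encoded: &[u8]) -> Result<Vec<u8>, std::io::Error> {
    let mut decoded = Vec::new();
    // ZlibDecoder (not DeflateDecoder or GzDecoder) matches the zlib framing
    // that FlateDecode uses.
    flate2::bufread::ZlibDecoder::new(encoded).read_to_end(&mut decoded)?;
    Ok(decoded)
}

fn main() -> Result<(), std::io::Error> {
    // Round trip: compress a small content-stream-like payload, then inflate
    // it back with the helper above.
    let mut encoder =
        flate2::write::ZlibEncoder::new(Vec::new(), flate2::Compression::default());
    encoder.write_all(b"BT /F1 12 Tf (Hello) Tj ET")?;
    let encoded = encoder.finish()?;
    assert_eq!(inflate_stream(&encoded)?, b"BT /F1 12 Tf (Hello) Tj ET".to_vec());
    Ok(())
}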
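The new parse_object_stream_inner relies on the layout of a /Type /ObjStm stream: the decoded data starts with /N whitespace-separated integer pairs, "object number" then "byte offset relative to /First", followed by the serialized objects themselves (generation numbers inside an object stream are always 0, hence the hard-coded generation_number: 0). The sketch below parses just that header outside the crate; the helper names and the sample bytes are hypothetical, not the crate's API.

// Read one run of ASCII decimal digits, skipping leading whitespace.
fn read_ascii_uint(data: &mut &[u8]) -> Option<u64> {
    let mut rest = *data;
    while let Some((&first, tail)) = rest.split_first() {
        if first.is_ascii_whitespace() {
            rest = tail;
        } else {
            break;
        }
    }
    let digits = rest.iter().take_while(|b| b.is_ascii_digit()).count();
    if digits == 0 {
        return None;
    }
    let value: u64 = std::str::from_utf8(&rest[..digits]).ok()?.parse().ok()?;
    *data = &rest[digits..];
    Some(value)
}

// Collect the /N (object number, offset) pairs that prefix an object stream.
fn parse_objstm_header(mut data: &[u8], n: usize) -> Option<Vec<(u64, u64)>> {
    let mut pairs = Vec::with_capacity(n);
    for _ in 0..n {
        let object_number = read_ascii_uint(&mut data)?;
        let offset = read_ascii_uint(&mut data)?;
        pairs.push((object_number, offset));
    }
    Some(pairs)
}

fn main() {
    // With /N 2 and /First pointing just past this header, offsets 0 and 32
    // locate the two object bodies that follow.
    let decoded = b"11 0 12 32 <</Type /Catalog /Pages 12 0 R>><</Type /Pages /Count 0>>";
    assert_eq!(
        parse_objstm_header(decoded, 2),
        Some(vec![(11, 0), (12, 32)])
    );
}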
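Note that flate.rs still hits todo!("{predictor}") for any /Predictor other than 1, while cross-reference streams usually use /Predictor 12, i.e. the PNG "Up" filter applied per row: each row of columns * colors * bits_per_component / 8 bytes is prefixed with a PNG filter-type byte. The following is only a hedged sketch of that single case, an assumption about how the todo!() might later be filled in, not code from this patch.

// Undo the PNG "Up" filter (filter-type byte 2) for fixed-width rows, as a
// /Predictor 12 stream would need after inflation. Other PNG filter types are
// deliberately rejected in this sketch.
fn undo_png_up_predictor(data: &[u8], row_len: usize) -> Option<Vec<u8>> {
    let mut out = Vec::with_capacity(data.len());
    let mut prev_row = vec![0u8; row_len];
    for row in data.chunks(row_len + 1) {
        let (&filter_type, row) = row.split_first()?;
        if filter_type != 2 || row.len() != row_len {
            return None; // only the "Up" filter is handled here
        }
        // "Up" adds the byte directly above (modulo 256) to each byte.
        let decoded_row: Vec<u8> = row
            .iter()
            .zip(&prev_row)
            .map(|(&cur, &above)| cur.wrapping_add(above))
            .collect();
        out.extend_from_slice(&decoded_row);
        prev_row = decoded_row;
    }
    Some(out)
}

fn main() {
    // Two "Up"-filtered rows of width 3 (leading filter byte 2 on each row)
    // reconstruct to [1, 2, 3] and [4, 5, 6].
    let filtered: &[u8] = &[2, 1, 2, 3, 2, 3, 3, 3];
    assert_eq!(
        undo_png_up_predictor(filtered, 3),
        Some(vec![1, 2, 3, 4, 5, 6])
    );
}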