diff --git a/Cargo.lock b/Cargo.lock index 2d752ff..3cb67c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,71 +5,3 @@ version = 4 [[package]] name = "parse_powerisa_pdf" version = "0.1.0" -dependencies = [ - "serde", -] - -[[package]] -name = "proc-macro2" -version = "1.0.103" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "quote" -version = "1.0.42" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", - "serde_derive", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "syn" -version = "2.0.111" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "unicode-ident" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" diff --git a/Cargo.toml b/Cargo.toml index d2f159a..125e5e2 100644 --- 
a/Cargo.toml +++ b/Cargo.toml @@ -5,5 +5,4 @@ edition = "2024" license = "LGPL-3.0-or-later" [dependencies] -serde = { version = "1.0.228", features = ["derive"] } diff --git a/src/lib.rs b/src/lib.rs index 938fe11..d0e7860 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,5 @@ -mod pdf; -mod util; +#[doc(hidden)] +pub use std as __std; + +pub mod pdf; +pub mod util; diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..34539a3 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,45 @@ +use parse_powerisa_pdf::pdf::{Pdf, PdfTrailer}; +use std::{ + error::Error, + io::{IsTerminal, Read}, + process::ExitCode, +}; + +fn main() -> Result> { + let args: Vec<_> = std::env::args_os().collect(); + if args + .iter() + .skip(1) + .any(|v| v.as_encoded_bytes().starts_with(b"-") && v != "-") + || args.len() > 2 + || (args.len() == 1 && std::io::stdin().is_terminal()) + { + eprintln!( + "Usage: {} []\n\ + Reads the PDF file passed on the command line,\n\ + Reads stdin if no arguments are passed or if the file name is just a dash `-`.\n\ + If stdin is a terminal, you have to pass `-` explicitly to read from it.", + args[0].display() + ); + return Ok(ExitCode::FAILURE); + } + let file_path = args.get(1).filter(|v| *v != "-"); + let input = if let Some(file_path) = file_path { + std::fs::read(file_path)? 
+ } else { + let mut buf = Vec::new(); + std::io::stdin().lock().read_to_end(&mut buf)?; + buf + }; + let pdf = Pdf::parse(input)?; + if let PdfTrailer::Stream { + xref_stream, + start_xref, + } = pdf.trailer + { + dbg!(xref_stream.dictionary()); + } + + todo!(); + Ok(ExitCode::SUCCESS) +} diff --git a/src/pdf.rs b/src/pdf.rs index 0af9d6b..4ba785b 100644 --- a/src/pdf.rs +++ b/src/pdf.rs @@ -1,956 +1,930 @@ -use crate::util::ArcOrRef; -use serde::{de, forward_to_deserialize_any}; +use crate::{ + pdf::{ + object::{ + MaybeArray, PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, + PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfReal, PdfStream, + PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary, + }, + parse::{PdfInputPosition, PdfParse, PdfParseError}, + }, + pdf_parse, + util::ArcOrRef, +}; use std::{ - cell::RefCell, collections::BTreeMap, convert::Infallible, - fmt::{self, Write}, - iter::FusedIterator, - marker::PhantomData, + fmt, num::NonZero, - sync::{Arc, Weak}, + str::FromStr, + sync::{Arc, OnceLock}, }; -#[derive(Debug)] -pub(crate) enum PdfParseError { - InvalidFieldKind { - containing_ty: &'static str, - field: &'static str, - expected_kind: &'static str, - kind: &'static str, - }, - Custom(String), +pub mod object; +pub mod parse; + +pub struct PdfObjects { + objects: OnceLock>, } -impl From for PdfParseError { - fn from(value: Infallible) -> Self { - match value {} +#[derive(Copy, Clone, Debug)] +pub struct PdfHeader { + pub major: NonZero, + pub minor: u16, +} + +impl PdfHeader { + pub const PREFIX: &str = "%PDF-"; +} + +pdf_parse! 
{ + #[derive(Clone, Debug)] + pub struct PdfTrailerDictionary { + #[pdf(name = "Size")] + pub size: usize, + #[pdf(name = "Prev")] + pub prev: Option, + #[pdf(name = "Root")] + pub root: PdfDictionary, + #[pdf(name = "Encrypt")] + pub encrypt: Option, + #[pdf(name = "Info")] + pub info: Option, + #[pdf(name = "ID")] + pub id: Option<[PdfString; 2]>, + #[pdf(flatten)] + pub rest: PdfDictionary, } } -impl fmt::Display for PdfParseError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - &PdfParseError::InvalidFieldKind { - containing_ty, - field, - expected_kind, - kind, - } => write!( - f, - "invalid field kind: {containing_ty}.{field}: expected {expected_kind}, got {kind}" - ), - PdfParseError::Custom(msg) => f.write_str(msg), - } +pdf_parse! { + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] + pub enum PdfXRefName { + #[pdf(name = "XRef")] + #[default] + XRef, } } -impl std::error::Error for PdfParseError {} - -impl de::Error for PdfParseError { - fn custom(msg: T) -> Self - where - T: fmt::Display, - { - PdfParseError::Custom(msg.to_string()) - } -} - -impl<'de> de::IntoDeserializer<'de, PdfParseError> for PdfName { - type Deserializer = PdfObject; - - fn into_deserializer(self) -> Self::Deserializer { - self.into() - } -} - -impl<'de> de::IntoDeserializer<'de, PdfParseError> for PdfObject { - type Deserializer = Self; - - fn into_deserializer(self) -> Self::Deserializer { - self - } -} - -impl PdfObject { - const SERDE_FIELD_NAME: &str = "__PdfObject__look_in_thread_local"; - const SERDE_NAME_AND_FIELDS: (&str, &[&str]) = ("PdfObject", &[Self::SERDE_FIELD_NAME]); - fn with_thread_local(f: impl FnOnce(&RefCell>) -> R) -> R { - thread_local! 
{ - static CURRENT_OBJECT: RefCell> = const { RefCell::new(None) }; - } - CURRENT_OBJECT.with(f) - } - fn set_thread_local_scoped(self, f: impl FnOnce() -> R) -> R { - Self::with_thread_local(|current_object| { - struct PutBackOnDrop<'a> { - current_object: &'a RefCell>, - old_object: Option, - } - impl Drop for PutBackOnDrop<'_> { - fn drop(&mut self) { - self.current_object.replace(self.old_object.take()); - } - } - let put_back_on_drop = PutBackOnDrop { - current_object, - old_object: current_object.replace(Some(self)), - }; - let retval = f(); - drop(put_back_on_drop); - retval - }) - } - fn take_thread_local() -> Option { - Self::with_thread_local(RefCell::take) - } -} - -trait PdfObjectDeserializeHelperTrait: Sized { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result; - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result; -} - -struct PdfObjectDeserializeHelper(T); - -impl<'de, T: PdfObjectDeserializeHelperTrait> de::Deserialize<'de> - for PdfObjectDeserializeHelper -{ - fn deserialize(deserializer: D) -> Result - where - D: de::Deserializer<'de>, - { - struct PdfObjectVisitor(PhantomData); - fn expected_pdf_object() -> E { - de::Error::invalid_type(de::Unexpected::Map, &PdfObjectVisitor::(PhantomData)) - } - impl<'de, T: PdfObjectDeserializeHelperTrait> de::Visitor<'de> for PdfObjectVisitor { - type Value = PdfObject; - - fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - T::expecting(formatter) - } - - fn visit_map(self, mut map: A) -> Result - where - A: de::MapAccess<'de>, - { - struct Field(PhantomData); - impl<'de, T: PdfObjectDeserializeHelperTrait> de::Deserialize<'de> for Field { - fn deserialize(deserializer: D) -> Result - where - D: de::Deserializer<'de>, - { - deserializer.deserialize_identifier(Field(PhantomData)) - } - } - impl<'de, T: PdfObjectDeserializeHelperTrait> de::Visitor<'de> for Field { - type Value = Self; - - fn expecting(&self, formatter: &mut fmt::Formatter) -> 
fmt::Result { - T::expecting(formatter) - } - - fn visit_str(self, v: &str) -> Result - where - E: de::Error, - { - if v == PdfObject::SERDE_FIELD_NAME { - Ok(self) - } else { - Err(expected_pdf_object::()) - } - } - } - let (Field::(PhantomData), ()) = map - .next_entry()? - .ok_or_else(expected_pdf_object::)?; - let None = map.next_entry::, ()>()? else { - return Err(expected_pdf_object::<_, T>()); - }; - PdfObject::take_thread_local().ok_or_else(expected_pdf_object::<_, T>) - } - } - let (name, fields) = PdfObject::SERDE_NAME_AND_FIELDS; - let pdf_object = - deserializer.deserialize_struct(name, fields, PdfObjectVisitor::(PhantomData))?; - T::from_pdf_object::(pdf_object, &PdfObjectVisitor::(PhantomData)).map(Self) - } -} - -macro_rules! forward_deserialize_to_pdf_object_helper { - ($ty:ty) => { - impl<'de> de::Deserialize<'de> for $ty { - fn deserialize(deserializer: D) -> Result - where - D: de::Deserializer<'de>, - { - let PdfObjectDeserializeHelper(v) = de::Deserialize::deserialize(deserializer)?; - Ok(v) - } - } - }; -} - -forward_deserialize_to_pdf_object_helper!(PdfObject); - -impl PdfObjectDeserializeHelperTrait for PdfObject { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfObject") - } - - fn from_pdf_object( - value: PdfObject, - _expected: &dyn de::Expected, - ) -> Result { - Ok(value) - } -} - -forward_deserialize_to_pdf_object_helper!(PdfObjectIndirect); - -impl PdfObjectDeserializeHelperTrait for PdfObjectIndirect { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfObjectIndirect") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Indirect(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfString); - -impl PdfObjectDeserializeHelperTrait for PdfString { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - 
f.write_str("PdfString") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::String(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfName); - -impl PdfObjectDeserializeHelperTrait for PdfName { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfName") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Name(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfArray); - -impl PdfObjectDeserializeHelperTrait for PdfArray { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfArray") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Array(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfDictionary); - -impl PdfObjectDeserializeHelperTrait for PdfDictionary { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfDictionary") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Dictionary(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfStream); - -impl PdfObjectDeserializeHelperTrait for PdfStream { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfStream") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Stream(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -impl<'de> de::Deserializer<'de> for PdfObject { - type Error = PdfParseError; - - 
fn deserialize_any(self, visitor: V) -> Result - where - V: de::Visitor<'de>, - { - match PdfObjectDirect::from(self) { - PdfObjectDirect::Boolean(v) => visitor.visit_bool(v), - PdfObjectDirect::Integer(v) => visitor.visit_i32(v), - PdfObjectDirect::Real(v) => visitor.visit_f32(v), - v @ (PdfObjectDirect::String(_) | PdfObjectDirect::Stream(_)) => { - Err(de::Error::invalid_type(v.as_unexpected(), &visitor)) - } - PdfObjectDirect::Name(v) => { - if let Ok(v) = str::from_utf8(v.as_bytes()) { - visitor.visit_str(v) - } else { - Err(de::Error::invalid_type( - PdfObject::from(v).as_unexpected(), - &visitor, - )) - } - } - PdfObjectDirect::Array(v) => { - visitor.visit_seq(de::value::SeqDeserializer::new(v.iter().cloned())) - } - PdfObjectDirect::Dictionary(v) => { - visitor.visit_map(de::value::MapDeserializer::new(v.into_iter())) - } - PdfObjectDirect::Null(PdfNull {}) => visitor.visit_unit(), - } - } - - fn deserialize_struct( - self, - name: &'static str, - fields: &'static [&'static str], - visitor: V, - ) -> Result - where - V: de::Visitor<'de>, - { - match (name, fields) { - PdfObject::SERDE_NAME_AND_FIELDS => self.set_thread_local_scoped(|| { - visitor.visit_map(de::value::MapDeserializer::new(std::iter::once(( - PdfObject::SERDE_FIELD_NAME, - (), - )))) - }), - _ => self.deserialize_any(visitor), - } - } - - fn deserialize_option(self, visitor: V) -> Result - where - V: de::Visitor<'de>, - { - let is_null = match self { - Self::Indirect(ref v) => !v.exists(), - Self::Null(_) => true, - _ => false, - }; - if is_null { - visitor.visit_none() - } else { - visitor.visit_some(self) - } - } - - fn deserialize_newtype_struct( - self, - _name: &'static str, - visitor: V, - ) -> Result - where - V: de::Visitor<'de>, - { - visitor.visit_newtype_struct(self) - } - - forward_to_deserialize_any! 
{ - bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string - bytes byte_buf unit unit_struct seq tuple - tuple_struct map enum identifier ignored_any - } -} - -#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord)] -pub(crate) struct PdfString { - bytes: ArcOrRef<'static, [u8]>, -} - -impl std::fmt::Debug for PdfString { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("PdfString") - .field("bytes", &&*self.bytes) - .finish() - } -} - -impl PdfString { - pub(crate) fn new(bytes: ArcOrRef<'static, [u8]>) -> Self { - Self { bytes } - } - pub(crate) fn bytes(&self) -> &ArcOrRef<'static, [u8]> { - &self.bytes - } -} - -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub(crate) struct PdfName { - bytes: ArcOrRef<'static, [u8]>, -} - -impl PdfName { - pub(crate) fn try_new(bytes: impl Into>) -> Option { - let bytes = bytes.into(); - if bytes.contains(&0) { - None - } else { - Some(Self { bytes }) - } - } - #[track_caller] - pub(crate) const fn new_static(bytes: &'static [u8]) -> Self { - let mut i = 0; - while i < bytes.len() { - if bytes[i] == 0 { - panic!("shouldn't contain any nul bytes"); - } - i += 1; - } - Self { - bytes: ArcOrRef::Ref(bytes), - } - } - #[track_caller] - pub(crate) fn new(bytes: ArcOrRef<'static, [u8]>) -> Self { - Self::try_new(bytes).expect("shouldn't contain any nul bytes") - } - pub(crate) fn as_bytes(&self) -> &ArcOrRef<'static, [u8]> { - &self.bytes - } -} - -macro_rules! make_pdf_names { - ( - $vis:vis mod $pdf_names:ident { - $($ident:ident;)* - } - ) => { - $vis mod $pdf_names { - $(#[allow(non_upper_case_globals)] - $vis const $ident: $crate::pdf::PdfName = $crate::pdf::PdfName::new_static(stringify!($ident).as_bytes());)* - } - }; -} - -make_pdf_names! 
{ - pub(crate) mod pdf_names { - DecodeParms; - DL; - F; - FDecodeParms; - FFilter; - Filter; - Length; - } -} - -impl fmt::Debug for PdfName { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "PdfName({self})") - } -} - -impl fmt::Display for PdfName { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("/")?; - for &b in self.bytes.iter() { - match b { - 0x21..=0x7E if b != b'#' => f.write_char(b.into())?, - _ => write!(f, "#{b:02X}")?, - } - } - Ok(()) - } -} - -macro_rules! make_pdf_object { - ( - $( - #[from = $($from:ident)?, as_unexpected = |$as_unexpected_arg:pat_param| $as_unexpected_expr:expr] - $Variant:ident($ty:ty), - )+ - ) => { - #[derive(Clone, Debug, PartialEq)] - pub(crate) enum PdfObjectNonNull { - $($Variant($ty),)* - } - - #[derive(Clone, Debug, PartialEq)] - pub(crate) enum PdfObjectDirect { - $($Variant($ty),)* - Null(PdfNull), - } - - #[derive(Clone, Debug, PartialEq)] - pub(crate) enum PdfObject { - $($Variant($ty),)* - Null(PdfNull), - Indirect(PdfObjectIndirect), - } - - $($( - impl From<$ty> for PdfObjectNonNull { - fn $from(value: $ty) -> Self { - Self::$Variant(value) - } - } - - impl From<$ty> for PdfObjectDirect { - fn $from(value: $ty) -> Self { - Self::$Variant(value) - } - } - - impl From<$ty> for PdfObject { - fn $from(value: $ty) -> Self { - Self::$Variant(value) - } - } - - impl From> for PdfObjectDirect { - fn $from(value: Option<$ty>) -> Self { - match value { - Some(value) => Self::$Variant(value), - None => Self::Null(PdfNull), - } - } - } - - impl From> for PdfObject { - fn $from(value: Option<$ty>) -> Self { - match value { - Some(value) => Self::$Variant(value), - None => Self::Null(PdfNull), - } - } - } - )?)* - - impl From for PdfObjectDirect { - fn from(value: PdfObjectNonNull) -> Self { - match value { - $(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* - } - } - } - - impl From for PdfObject { - fn from(value: PdfObjectNonNull) -> Self { - match value { - 
$(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* - } - } - } - - impl From for PdfObject { - fn from(value: PdfObjectDirect) -> Self { - match value { - $(PdfObjectDirect::$Variant(v) => Self::$Variant(v),)* - PdfObjectDirect::Null(v) => Self::Null(v), - } - } - } - - impl From for PdfObjectDirect { - fn from(value: PdfObject) -> Self { - match value { - $(PdfObject::$Variant(v) => Self::$Variant(v),)* - PdfObject::Null(v) => Self::Null(v), - PdfObject::Indirect(v) => v.into(), - } - } - } - - impl PdfObjectNonNull { - fn as_unexpected(&self) -> de::Unexpected<'static> { - match *self { - $(PdfObjectNonNull::$Variant($as_unexpected_arg) => $as_unexpected_expr,)* - } - } - } - - impl PdfObjectDirect { - fn as_unexpected(&self) -> de::Unexpected<'static> { - match *self { - $(PdfObjectDirect::$Variant($as_unexpected_arg) => $as_unexpected_expr,)* - PdfObjectDirect::Null(_) => de::Unexpected::Option, - } - } - } - - impl PdfObject { - fn as_unexpected(&self) -> de::Unexpected<'static> { - match *self { - $(PdfObject::$Variant($as_unexpected_arg) => $as_unexpected_expr,)* - PdfObject::Null(_) => de::Unexpected::Option, - PdfObject::Indirect(ref v) => v.get().as_unexpected(), - } - } - } - - const _: () = { - fn _assert_impls_deserialize() {} - - $(let _ = _assert_impls_deserialize::<$ty>;)* - }; - }; -} - -make_pdf_object! 
{ - #[from = from, as_unexpected = |v| de::Unexpected::Bool(v)] - Boolean(bool), - #[from = from, as_unexpected = |v| de::Unexpected::Signed(v.into())] - Integer(i32), - #[from = from, as_unexpected = |v| de::Unexpected::Float(v.into())] - Real(f32), - #[from = from, as_unexpected = |_| de::Unexpected::Other("PdfString")] - String(PdfString), - #[from = from, as_unexpected = |_| de::Unexpected::Other("PdfName")] - Name(PdfName), - #[from = from, as_unexpected = |_| de::Unexpected::Seq] - Array(PdfArray), - #[from = from, as_unexpected = |_| de::Unexpected::Map] - Dictionary(PdfDictionary), - #[from = from, as_unexpected = |_| de::Unexpected::Other("PdfStream")] - Stream(PdfStream), -} - -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct PdfNull; - -impl From for PdfObjectDirect { - fn from(v: PdfNull) -> Self { - Self::Null(v) - } -} - -impl From for PdfObject { - fn from(v: PdfNull) -> Self { - Self::Null(v) - } -} - -impl From for PdfObject { - fn from(v: PdfObjectIndirect) -> Self { - Self::Indirect(v) +pdf_parse! 
{ + #[derive(Clone, Debug)] + pub struct PdfXRefStreamDictionaryRest { + #[pdf(name = "Type")] + pub ty: PdfXRefName, + #[pdf(name = "Size")] + pub size: usize, + #[pdf(name = "Index")] + pub index: Option>, + #[pdf(name = "Prev")] + pub prev: Option, + #[pdf(name = "W")] + pub w: Option>, + #[pdf(name = "Root")] + pub root: PdfDictionary, + #[pdf(name = "Encrypt")] + pub encrypt: Option, + #[pdf(name = "Info")] + pub info: Option, + #[pdf(name = "ID")] + pub id: Option<[PdfString; 2]>, + #[pdf(flatten)] + pub rest: PdfDictionary, } } #[derive(Clone, Debug)] -pub(crate) struct PdfObjectIndirect { - xref_table: Weak, - object_number: NonZero, - generation_number: u16, +pub enum PdfTrailer { + Trailer { + trailer_dictionary: PdfTrailerDictionary, + start_xref: usize, + }, + Stream { + xref_stream: PdfStream, + start_xref: usize, + }, } -impl PartialEq for PdfObjectIndirect { - fn eq(&self, other: &Self) -> bool { - let Self { - xref_table, - object_number, - generation_number, - } = self; - xref_table.ptr_eq(&other.xref_table) - && *object_number == other.object_number - && *generation_number == other.generation_number - } +pub struct Pdf { + pub header: PdfHeader, + pub objects: Arc, + pub trailer: PdfTrailer, } -impl PdfObjectIndirect { - pub fn exists(&self) -> bool { - todo!() - } - pub fn get(&self) -> PdfObjectDirect { - todo!() - } +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +enum PdfCharCategory { + Regular, + Whitespace, + LParen, + RParen, + LAngle, + RAngle, + LBracket, + RBracket, + LBrace, + RBrace, + FSlash, + Percent, } -impl From for PdfObjectDirect { - fn from(value: PdfObjectIndirect) -> Self { - value.get() - } -} - -#[derive(Clone, PartialEq)] -pub(crate) struct PdfDictionary { - fields: Arc>, -} - -impl PdfDictionary { - pub(crate) fn fields(&self) -> &Arc> { - &self.fields - } - pub(crate) fn into_fields(self) -> Arc> { - self.fields - } - pub(crate) fn iter(&self) -> std::collections::btree_map::Iter<'_, PdfName, PdfObject> { - 
self.fields.iter() - } - pub(crate) fn contains_key(&self, key: &Q) -> bool - where - PdfName: std::borrow::Borrow + Ord, - Q: Ord, - { - self.fields.contains_key(key) - } - pub(crate) fn get(&self, key: &Q) -> Option<&PdfObject> - where - PdfName: std::borrow::Borrow + Ord, - Q: Ord, - { - self.fields.get(key) - } -} - -impl FromIterator<(PdfName, PdfObject)> for PdfDictionary { - fn from_iter>(iter: T) -> Self { - Self { - fields: Arc::new(BTreeMap::from_iter( - iter.into_iter() - .filter(|(_name, value)| !matches!(value, PdfObject::Null(_))), - )), +impl PdfCharCategory { + fn new(b: u8) -> Self { + match b { + b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' => Self::Whitespace, + b'(' => Self::LParen, + b')' => Self::RParen, + b'<' => Self::LAngle, + b'>' => Self::RAngle, + b'[' => Self::LBracket, + b']' => Self::RBracket, + b'{' => Self::LBrace, + b'}' => Self::RBrace, + b'/' => Self::FSlash, + b'%' => Self::Percent, + _ => Self::Regular, } } } -impl IntoIterator for PdfDictionary { - type Item = (PdfName, PdfObject); - type IntoIter = std::collections::btree_map::IntoIter; - - fn into_iter(self) -> Self::IntoIter { - Arc::unwrap_or_clone(self.fields).into_iter() - } +#[derive(Clone, Copy, PartialEq)] +enum PdfToken<'a> { + Regular(&'a [u8]), + LParen, + RParen, + LAngle, + RAngle, + LBracket, + RBracket, + LBrace, + RBrace, + FSlash, + Comment(&'a [u8]), } -impl<'a> IntoIterator for &'a PdfDictionary { - type Item = (&'a PdfName, &'a PdfObject); - type IntoIter = std::collections::btree_map::Iter<'a, PdfName, PdfObject>; - - fn into_iter(self) -> Self::IntoIter { - self.fields.iter() - } -} - -impl fmt::Debug for PdfDictionary { +impl<'a> fmt::Debug for PdfToken<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_map().entries(self).finish() - } -} - -#[derive(Clone, Default, PartialEq)] -pub(crate) struct PdfArray { - elements: Arc<[PdfObject]>, -} - -impl PdfArray { - pub(crate) fn new() -> Self { - Self::default() - } - pub(crate) fn 
elements(&self) -> &Arc<[PdfObject]> { - &self.elements - } - pub(crate) fn into_elements(self) -> Arc<[PdfObject]> { - self.elements - } - pub(crate) fn iter(&self) -> std::slice::Iter<'_, PdfObject> { - self.elements.iter() - } -} - -impl FromIterator for PdfArray { - fn from_iter>(iter: T) -> Self { - Self { - elements: Arc::from_iter(iter), + match self { + Self::Regular(contents) => { + if let Ok(contents) = str::from_utf8(contents) { + write!(f, "Regular({contents:?})") + } else { + write!(f, "Regular({contents:?})") + } + } + Self::LParen => write!(f, "LParen"), + Self::RParen => write!(f, "RParen"), + Self::LAngle => write!(f, "LAngle"), + Self::RAngle => write!(f, "RAngle"), + Self::LBracket => write!(f, "LBracket"), + Self::RBracket => write!(f, "RBracket"), + Self::LBrace => write!(f, "LBrace"), + Self::RBrace => write!(f, "RBrace"), + Self::FSlash => write!(f, "FSlash"), + Self::Comment(contents) => { + if let Ok(contents) = str::from_utf8(contents) { + write!(f, "Comment({contents:?})") + } else { + write!(f, "Comment({contents:?})") + } + } } } } #[derive(Clone)] -pub(crate) struct PdfArrayIntoIter { - indexes: std::ops::Range, - elements: Arc<[PdfObject]>, +struct PdfTokenizerPeek<'a> { + token: PdfToken<'a>, + pos_after_token: usize, } -impl Iterator for PdfArrayIntoIter { - type Item = PdfObject; +#[derive(Clone)] +struct PdfTokenizer<'a> { + bytes: &'a [u8], + pos: usize, + peek_cache: Option>, +} + +impl<'a> PdfTokenizer<'a> { + fn new(bytes: &'a [u8], pos: usize) -> Self { + Self { + bytes, + pos, + peek_cache: None, + } + } + fn pos(&self) -> PdfInputPosition { + PdfInputPosition::new(self.pos) + } + fn peek_byte(&mut self) -> Option { + self.bytes.get(self.pos).copied() + } + fn next_byte(&mut self) -> Option { + let b = self.bytes.get(self.pos)?; + self.pos += 1; + self.peek_cache = None; + Some(*b) + } + fn skip_whitespace(&mut self) { + while let Some(PdfCharCategory::Whitespace) = self.peek_byte().map(PdfCharCategory::new) { + 
self.next_byte(); + } + } + fn peek(&mut self) -> Option> { + if let Some(PdfTokenizerPeek { token, .. }) = self.peek_cache { + return Some(token); + } + let mut tokenizer = self.clone(); + let token = tokenizer.next()?; + self.peek_cache = Some(PdfTokenizerPeek { + token, + pos_after_token: tokenizer.pos, + }); + Some(token) + } + fn read_bytes(&mut self, len: usize) -> Option<&'a [u8]> { + let retval = self.bytes.get(self.pos..self.pos.saturating_add(len))?; + self.peek_cache = None; + self.pos += len; + Some(retval) + } +} + +impl<'a> Iterator for PdfTokenizer<'a> { + type Item = PdfToken<'a>; fn next(&mut self) -> Option { - self.indexes.next().map(|i| self.elements[i].clone()) - } - - fn size_hint(&self) -> (usize, Option) { - self.indexes.size_hint() - } - - fn nth(&mut self, n: usize) -> Option { - self.indexes.nth(n).map(|i| self.elements[i].clone()) - } - - fn last(self) -> Option { - self.indexes.last().map(|i| self.elements[i].clone()) - } - - fn fold(self, init: B, mut f: F) -> B - where - F: FnMut(B, Self::Item) -> B, - { - self.indexes - .fold(init, |init, i| f(init, self.elements[i].clone())) - } -} - -impl FusedIterator for PdfArrayIntoIter {} - -impl DoubleEndedIterator for PdfArrayIntoIter { - fn next_back(&mut self) -> Option { - self.indexes.next_back().map(|i| self.elements[i].clone()) - } - fn nth_back(&mut self, n: usize) -> Option { - self.indexes.nth_back(n).map(|i| self.elements[i].clone()) - } - fn rfold(self, init: B, mut f: F) -> B - where - F: FnMut(B, Self::Item) -> B, - { - self.indexes - .rfold(init, |init, i| f(init, self.elements[i].clone())) - } -} - -impl ExactSizeIterator for PdfArrayIntoIter {} - -impl IntoIterator for PdfArray { - type Item = PdfObject; - type IntoIter = PdfArrayIntoIter; - - fn into_iter(self) -> Self::IntoIter { - PdfArrayIntoIter { - indexes: 0..self.elements.len(), - elements: self.elements, + if let Some(PdfTokenizerPeek { + token, + pos_after_token, + }) = self.peek_cache.take() + { + self.pos = 
pos_after_token; + return Some(token); + } + loop { + let start_pos = self.pos; + break match PdfCharCategory::new(self.next_byte()?) { + PdfCharCategory::Whitespace => continue, + PdfCharCategory::LParen => Some(PdfToken::LParen), + PdfCharCategory::RParen => Some(PdfToken::RParen), + PdfCharCategory::LAngle => Some(PdfToken::LAngle), + PdfCharCategory::RAngle => Some(PdfToken::RAngle), + PdfCharCategory::LBracket => Some(PdfToken::LBracket), + PdfCharCategory::RBracket => Some(PdfToken::RBracket), + PdfCharCategory::LBrace => Some(PdfToken::LBrace), + PdfCharCategory::RBrace => Some(PdfToken::RBrace), + PdfCharCategory::FSlash => Some(PdfToken::FSlash), + PdfCharCategory::Percent => { + loop { + match self.next_byte() { + None | Some(b'\n') => break, + Some(b'\r') => { + if let Some(b'\n') = self.peek_byte() { + self.pos += 1; + } + break; + } + Some(_) => continue, + } + } + Some(PdfToken::Comment(&self.bytes[start_pos..self.pos])) + } + PdfCharCategory::Regular => { + while let Some(PdfCharCategory::Regular) = + self.peek_byte().map(PdfCharCategory::new) + { + self.pos += 1; + } + Some(PdfToken::Regular(&self.bytes[start_pos..self.pos])) + } + }; } } } -impl<'a> IntoIterator for &'a PdfArray { - type Item = &'a PdfObject; - type IntoIter = std::slice::Iter<'a, PdfObject>; +struct PdfParser<'a> { + objects_arc: Arc, + objects_map: BTreeMap, + unparsed_stream_dictionaries: Vec>, + tokenizer: PdfTokenizer<'a>, +} - fn into_iter(self) -> Self::IntoIter { - self.elements.iter() +impl<'a> PdfParser<'a> { + fn parse_header(&mut self) -> Result { + let Some(b'%') = self.tokenizer.bytes.first() else { + return Err(PdfParseError::NotAPdfFile); + }; + let Some(PdfToken::Comment(header)) = self.tokenizer.next() else { + unreachable!() + }; + let Ok(header) = str::from_utf8(header) else { + return Err(PdfParseError::NotAPdfFile); + }; + let header = header.trim_end_matches(['\n', '\r']); + let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else { + return 
Err(PdfParseError::NotAPdfFile); + }; + let Some((major_str, minor_str)) = version.split_once('.') else { + return Err(PdfParseError::NotAPdfFile); + }; + let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else { + return Err(PdfParseError::NotAPdfFile); + }; + Ok(PdfHeader { major, minor }) + } + fn skip_comments_and_whitespace(&mut self) { + self.tokenizer.skip_whitespace(); + while let Some(PdfToken::Comment(_)) = self.tokenizer.peek() { + self.tokenizer.next(); + self.tokenizer.skip_whitespace(); + } + } + fn parse_digits( + &mut self, + on_parse_failed: impl FnOnce(PdfInputPosition) -> Result, PdfParseError>, + ) -> Result, PdfParseError> { + self.skip_comments_and_whitespace(); + let old_tokenizer = self.tokenizer.clone(); + let pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(number)) = self.tokenizer.next() else { + self.tokenizer = old_tokenizer; + return Ok(None); + }; + if !number.iter().all(|b| b.is_ascii_digit()) { + self.tokenizer = old_tokenizer; + return Ok(None); + } + let Some(number) = str::from_utf8(number).ok().and_then(|v| v.parse().ok()) else { + self.tokenizer = old_tokenizer; + return Ok(match on_parse_failed(pos)? { + None => None, + }); + }; + Ok(Some((pos, number))) + } + fn parse_object_identifier( + &mut self, + return_none_for_out_of_range: bool, + ) -> Result, PdfParseError> { + let old_tokenizer = self.tokenizer.clone(); + let Some((pos, object_number)) = self.parse_digits(|pos| { + if return_none_for_out_of_range { + Ok(None) + } else { + Err(PdfParseError::InvalidObjectNumber { pos }) + } + })? + else { + self.tokenizer = old_tokenizer; + return Ok(None); + }; + let Some((_pos, generation_number)) = self.parse_digits(|pos| { + if return_none_for_out_of_range { + Ok(None) + } else { + Err(PdfParseError::InvalidGenerationNumber { pos }) + } + })? 
+ else { + self.tokenizer = old_tokenizer; + return Ok(None); + }; + Ok(Some(PdfObjectIdentifier { + pos: pos.into(), + object_number, + generation_number, + })) + } + fn parse_indirect_object(&mut self) -> Result, PdfParseError> { + let old_tokenizer = self.tokenizer.clone(); + let Some(id) = self.parse_object_identifier(true)? else { + self.tokenizer = old_tokenizer; + return Ok(None); + }; + if let Some(PdfToken::Regular(b"R")) = self.tokenizer.next() { + Ok(Some(PdfObjectIndirect::new(&self.objects_arc, id))) + } else { + self.tokenizer = old_tokenizer; + Ok(None) + } + } + fn parse_string_after_l_paren(&mut self) -> Result { + let mut contents = Vec::new(); + let mut paren_level = NonZero::new(1usize).expect("non-zero"); + let string_pos = self.tokenizer.pos(); + while let Some(b) = self.tokenizer.next_byte() { + contents.push(match b { + b'(' => { + paren_level = paren_level.checked_add(1).expect("overflow"); + b + } + b')' => { + let Some(new_paren_level) = NonZero::new(paren_level.get() - 1) else { + return Ok(PdfString::new( + string_pos, + ArcOrRef::Arc(Arc::from(contents)), + )); + }; + paren_level = new_paren_level; + b + } + b'\r' if self.tokenizer.peek_byte() == Some(b'\n') => { + self.tokenizer.next_byte(); + b'\n' + } + b'\r' | b'\n' => b'\n', + b'\\' => { + let pos = self.tokenizer.pos(); + let Some(b) = self.tokenizer.next_byte() else { + return Err(PdfParseError::InvalidStringEscape { pos }); + }; + match b { + b'\r' if self.tokenizer.peek_byte() == Some(b'\n') => { + self.tokenizer.next_byte(); + continue; + } + b'\r' | b'\n' => continue, + b'n' => b'\n', + b'r' => b'\r', + b't' => b'\t', + b'b' => b'\x08', + b'f' => b'\x0C', + b'(' | b')' | b'\\' => b, + b'0'..=b'7' => { + const MAX_OCTAL_DIGITS: usize = 3; + let mut value = b - b'0'; + let mut len = 1; + while len < MAX_OCTAL_DIGITS { + let Some(b @ b'0'..=b'7') = self.tokenizer.peek_byte() else { + break; + }; + value <<= 3; + value |= b - b'0'; + len += 1; + self.tokenizer.next_byte(); + } + 
value
+                        }
+                        _ => {
+                            return Err(PdfParseError::InvalidStringEscape { pos });
+                        }
+                    }
+                }
+                _ => b,
+            });
+        }
+        Err(PdfParseError::TruncatedFile {
+            pos: self.tokenizer.pos(),
+        })
+    }
+    /// Parses the body of a hexadecimal string up to and including the closing `>`.
+    ///
+    /// NOTE(review): going by the name and the call in `parse_object`, the opening
+    /// `<` has already been consumed by the caller.
+    ///
+    /// Bytes classified as `PdfCharCategory::Whitespace` are skipped. Hex digits are
+    /// combined in pairs, high nibble first; an unpaired final digit is completed
+    /// with a low nibble of 0.
+    ///
+    /// Returns `PdfParseError::TruncatedFile` when the input ends before `>` and
+    /// `PdfParseError::InvalidHexStringDigit` for any other byte that is not a hex digit.
+    fn parse_string_after_l_angle(&mut self) -> Result {
+        let mut contents = Vec::new();
+        let mut high_digit_value = None;
+        let mut push_digit_value = |value: u8| {
+            high_digit_value = match high_digit_value {
+                Some(high_digit_value) => {
+                    contents.push((high_digit_value << 4) | value);
+                    None
+                }
+                None => Some(value),
+            };
+        };
+        let string_pos = self.tokenizer.pos();
+        loop {
+            let pos = self.tokenizer.pos();
+            match self.tokenizer.next_byte() {
+                None => {
+                    return Err(PdfParseError::TruncatedFile { pos });
+                }
+                Some(b) if PdfCharCategory::new(b) == PdfCharCategory::Whitespace => {}
+                Some(b'>') => {
+                    // if we have an odd trailing digit, add the final digit, otherwise doesn't modify contents
+                    push_digit_value(0);
+                    return Ok(PdfString::new(
+                        string_pos,
+                        Arc::<[u8]>::from(contents).into(),
+                    ));
+                }
+                Some(b) => {
+                    let Some(value) = (b as char).to_digit(0x10) else {
+                        return Err(PdfParseError::InvalidHexStringDigit { pos });
+                    };
+                    push_digit_value(value as u8);
+                }
+            }
+        }
+    }
+    /// Parses the bytes of a name object, stopping (without consuming) at the first
+    /// byte that is not `PdfCharCategory::Regular` or at end of input.
+    ///
+    /// NOTE(review): going by the name and the call in `parse_object`, the leading
+    /// `/` has already been consumed by the caller.
+    ///
+    /// A `#` must be followed by exactly two hex digits, which are decoded into a
+    /// single byte; every other regular byte is copied unchanged.
+    ///
+    /// Returns `PdfParseError::InvalidNameEscape` (positioned at the `#`) when the
+    /// two hex digits are missing or malformed, or when the escape decodes to a nul
+    /// byte, which `PdfName` cannot hold.
+    fn parse_name_after_f_slash(&mut self) -> Result {
+        let mut name = vec![];
+        let name_pos = self.tokenizer.pos();
+        loop {
+            let Some(PdfCharCategory::Regular) =
+                self.tokenizer.peek_byte().map(PdfCharCategory::new)
+            else {
+                return Ok(PdfName::new(name_pos, ArcOrRef::Arc(Arc::from(name))));
+            };
+            let pos = self.tokenizer.pos();
+            match self
+                .tokenizer
+                .next_byte()
+                .expect("just checked that it's not None")
+            {
+                b'#' => {
+                    let mut value = 0u8;
+                    for _ in 0..2 {
+                        let Some(digit) = self
+                            .tokenizer
+                            .next_byte()
+                            .and_then(|b| (b as char).to_digit(0x10))
+                        else {
+                            return Err(PdfParseError::InvalidNameEscape { pos });
+                        };
+                        value <<= 4;
+                        value |= digit as u8;
+                    }
+                    if value == 0 {
+                        // `PdfName::new` panics on nul bytes, so `#00` coming from the
+                        // input file has to be rejected here as a parse error.
+                        return Err(PdfParseError::InvalidNameEscape { pos });
+                    }
+                    name.push(value);
+                }
+                b => name.push(b),
+            }
+        }
+    }
+    fn parse_array_after_l_bracket(&mut self) -> Result {
+        let
array_pos = self.tokenizer.pos(); + let mut contents: Vec = Vec::new(); + loop { + self.skip_comments_and_whitespace(); + if let Some(PdfToken::RBracket) = self.tokenizer.peek() { + self.tokenizer.next(); + return Ok(PdfArray::from_elements(array_pos, Arc::from(contents))); + } + contents.push(self.parse_object()?); + } + } + /// assumes `self.tokenizer.peek_byte() == Some(b'<')` + fn parse_dictionary_after_one_l_angle(&mut self) -> Result { + let l_angle = self.tokenizer.next_byte(); + assert_eq!(l_angle, Some(b'<')); + let dictionary_pos = self.tokenizer.pos(); + let mut contents: BTreeMap = BTreeMap::new(); + loop { + self.skip_comments_and_whitespace(); + if let Some(PdfToken::RAngle) = self.tokenizer.peek() { + self.tokenizer.next(); + let pos = self.tokenizer.pos(); + let b'>' = self + .tokenizer + .next_byte() + .ok_or(PdfParseError::TruncatedFile { pos })? + else { + return Err(PdfParseError::InvalidDictionaryClosingDoubleRAngle { pos }); + }; + return Ok(PdfDictionary::from_fields( + dictionary_pos, + Arc::new(contents), + )); + } + let name = PdfName::parse(self.parse_object()?.into())?; + let name_pos = name.pos(); + match contents.entry(name) { + std::collections::btree_map::Entry::Vacant(entry) => { + entry.insert(self.parse_object()?.into()); + } + std::collections::btree_map::Entry::Occupied(entry) => { + return Err(PdfParseError::DuplicateDictionaryKey { + pos: name_pos, + name: entry.key().clone(), + }); + } + } + } + } + /// assumes `self.tokenizer.peek() == Some(PdfToken::Regular(b"stream"))` + fn parse_stream_after_dictionary( + &mut self, + dictionary: PdfDictionary, + ) -> Result { + self.tokenizer.skip_whitespace(); + let stream_pos = self.tokenizer.pos(); + let stream = self.tokenizer.next(); + assert_eq!(stream, Some(PdfToken::Regular(b"stream"))); + let len = PdfStreamDictionary::parse_len_from_dictionary(&dictionary)?; + let eol_pos = self.tokenizer.pos(); + match self.tokenizer.next_byte() { + None => return 
Err(PdfParseError::TruncatedFile { pos: eol_pos }), + Some(b'\r') => { + let Some(b'\n') = self.tokenizer.next_byte() else { + return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { + pos: eol_pos, + }); + }; + } + Some(b'\n') => {} + _ => return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos }), + } + let Some(data) = self.tokenizer.read_bytes(len) else { + return Err(PdfParseError::TruncatedFile { + pos: PdfInputPosition::new(self.tokenizer.bytes.len()), + }); + }; + let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data)); + self.unparsed_stream_dictionaries.push(unparsed); + self.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + if let Some(PdfToken::Regular(b"endstream")) = self.tokenizer.next() { + Ok(stream) + } else { + Err(PdfParseError::MissingEndStreamKeyword { pos }) + } + } + fn parse_object(&mut self) -> Result { + self.skip_comments_and_whitespace(); + if let Some(indirect) = self.parse_indirect_object()? { + return Ok(indirect.into()); + } + let pos = self.tokenizer.pos(); + match self + .tokenizer + .next() + .ok_or(PdfParseError::TruncatedFile { pos })? + { + PdfToken::Regular(b"true") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, true))), + PdfToken::Regular(b"false") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, false))), + PdfToken::Regular(b"null") => Ok(PdfObject::Null(PdfNull::new(pos))), + PdfToken::Regular( + number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] 
| [b'0'..=b'9' | b'.', ..]), + ) => { + // parse number + let Ok(number) = str::from_utf8(number) else { + return Err(PdfParseError::InvalidNumber { pos }); + }; + let mut parts = number + .strip_prefix(&['+', '-']) + .unwrap_or(number) + .split('.'); + let integer_part = parts + .next() + .expect("split always returns at least one part"); + let fraction_part = parts.next(); + if parts.next().is_some() { + return Err(PdfParseError::InvalidNumber { pos }); + } + if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + if !integer_part.bytes().all(|v| v.is_ascii_digit()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + if let Some(fraction_part) = fraction_part { + if !fraction_part.bytes().all(|v| v.is_ascii_digit()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + Ok(PdfObject::Real(PdfReal::new( + pos, + number + .parse() + .map_err(|_| PdfParseError::InvalidNumber { pos })?, + ))) + } else { + Ok(PdfObject::Integer(PdfInteger::new( + pos, + number + .parse() + .map_err(|_| PdfParseError::InvalidNumber { pos })?, + ))) + } + } + PdfToken::Regular(items) => todo!("{:?}", str::from_utf8(items)), + PdfToken::LParen => self.parse_string_after_l_paren().map(PdfObject::String), + PdfToken::RParen => todo!(), + PdfToken::LAngle => { + if self.tokenizer.peek_byte() == Some(b'<') { + let dictionary = self.parse_dictionary_after_one_l_angle()?; + self.skip_comments_and_whitespace(); + if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() { + self.parse_stream_after_dictionary(dictionary) + .map(PdfObject::Stream) + } else { + Ok(dictionary.into()) + } + } else { + self.parse_string_after_l_angle().map(PdfObject::String) + } + } + PdfToken::RAngle => todo!(), + PdfToken::LBracket => self.parse_array_after_l_bracket().map(PdfObject::Array), + PdfToken::RBracket => todo!(), + PdfToken::LBrace => todo!(), + PdfToken::RBrace => todo!(), + PdfToken::FSlash => 
self.parse_name_after_f_slash().map(PdfObject::Name), + PdfToken::Comment(_) => unreachable!(), + } + } + fn parse_indirect_object_definition(&mut self) -> Result, PdfParseError> { + self.skip_comments_and_whitespace(); + let Some(id) = self.parse_object_identifier(false)? else { + return Ok(None); + }; + self.skip_comments_and_whitespace(); + let obj_pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(b"obj")) = self.tokenizer.next() else { + return Err(PdfParseError::MissingObj { pos: obj_pos }); + }; + let object = self.parse_object()?; + self.skip_comments_and_whitespace(); + let end_obj_pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(b"endobj")) = self.tokenizer.next() else { + return Err(PdfParseError::MissingEndObj { pos: end_obj_pos }); + }; + if self.objects_map.insert(id, object).is_some() { + Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }) + } else { + Ok(Some(())) + } + } + fn parse_body(&mut self) -> Result<(), PdfParseError> { + while let Some(()) = self.parse_indirect_object_definition()? {} + let Ok(()) = self + .objects_arc + .objects + .set(std::mem::take(&mut self.objects_map)) + else { + unreachable!(); + }; + self.unparsed_stream_dictionaries + .drain(..) + .try_for_each(|v| v.finish_parsing()) + } + fn parse_xref_table(&mut self) -> Result<(), PdfParseError> { + self.skip_comments_and_whitespace(); + let xref_pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(b"xref")) = self.tokenizer.peek() else { + return Ok(()); + }; + todo!("{xref_pos}") + } + fn parse_trailer(&mut self) -> Result { + self.skip_comments_and_whitespace(); + let trailer_pos = self.tokenizer.pos(); + let trailer_dictionary = match self.tokenizer.peek() { + Some(PdfToken::Regular(b"trailer")) => { + self.tokenizer.next(); + Some(PdfTrailerDictionary::parse(self.parse_object()?)?) 
+ } + Some(PdfToken::Regular(b"startxref")) => None, + _ => { + return Err(PdfParseError::MissingTrailer { pos: trailer_pos }); + } + }; + self.skip_comments_and_whitespace(); + let start_xref_kw_pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(b"startxref")) = self.tokenizer.next() else { + return Err(PdfParseError::MissingStartXRefKeyword { + pos: start_xref_kw_pos, + }); + }; + let start_xref_pos = self.tokenizer.pos(); + let Some((start_xref_pos, start_xref)) = + self.parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))? + else { + return Err(PdfParseError::MissingStartXRefValue { + pos: start_xref_pos, + }); + }; + self.tokenizer.skip_whitespace(); + let eof_comment_pos = self.tokenizer.pos(); + let Some(PdfToken::Comment(b"%%EOF" | b"%%EOF\r" | b"%%EOF\r\n" | b"%%EOF\n")) = + self.tokenizer.next() + else { + return Err(PdfParseError::MissingEofComment { + pos: eof_comment_pos, + }); + }; + self.tokenizer.skip_whitespace(); + if let Some(byte) = self.tokenizer.peek_byte() { + return Err(PdfParseError::UnexpectedByte { + pos: self.tokenizer.pos(), + byte, + }); + } + if let Some(trailer_dictionary) = trailer_dictionary { + return Ok(PdfTrailer::Trailer { + trailer_dictionary, + start_xref, + }); + } + let old_tokenizer = self.tokenizer.clone(); + self.tokenizer = PdfTokenizer::new(self.tokenizer.bytes, start_xref); + let id = self.parse_object_identifier(false); + self.tokenizer = old_tokenizer; + let Some(id) = id? 
else { + return Err(PdfParseError::InvalidStartXRefValue { + pos: start_xref_pos, + start_xref, + }); + }; + let xref_stream = + PdfStream::parse(PdfObjectIndirect::new(&self.objects_arc, id).get().into())?; + Ok(PdfTrailer::Stream { + xref_stream, + start_xref, + }) + } + fn parse_file(mut self) -> Result { + let header = self.parse_header()?; + self.parse_body()?; + self.parse_xref_table()?; + let trailer = self.parse_trailer()?; + Ok(Pdf { + header, + objects: self.objects_arc, + trailer, + }) } } -impl fmt::Debug for PdfArray { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.elements.fmt(f) +impl Pdf { + pub fn parse(bytes: impl AsRef<[u8]>) -> Result { + PdfParser { + objects_arc: Arc::new(PdfObjects { + objects: OnceLock::new(), + }), + objects_map: BTreeMap::new(), + unparsed_stream_dictionaries: vec![], + tokenizer: PdfTokenizer::new(bytes.as_ref(), 0), + } + .parse_file() } } -#[derive(Clone, Debug, PartialEq)] -pub(crate) struct PdfStream { - dictionary: PdfDictionary, - data: Arc, -} - -pub(crate) enum PdfBody {} - -pub(crate) struct PdfObjects {} - -pub(crate) struct PdfXRefTable {} - -pub(crate) struct Pdf { - pub(crate) header: PdfHeader, - pub(crate) body: PdfBody, -} - -pub(crate) struct PdfHeader {} - #[cfg(test)] - mod tests { - use super::*; + use crate::{ + pdf::{ + object::{ + PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject, PdfString, + }, + parse::{PdfInputPosition, PdfParse, PdfParseError}, + }, + util::ArcOrRef, + }; #[test] fn test_deserialize_dict() -> Result<(), PdfParseError> { - #[derive(serde::Deserialize, Debug, PartialEq)] - struct TestStruct { - a: i32, - c: i32, - b: i32, - #[serde(flatten)] - others: PdfDictionary, + crate::pdf::parse::pdf_parse! 
{ + #[derive(Debug)] + #[allow(dead_code)] + struct TestStruct { + #[pdf(name = "a")] + a: i32, + #[pdf(name = "c")] + c: i32, + #[pdf(name = "b")] + b: i32, + #[pdf(flatten)] + rest: PdfDictionary, + } } - let v: TestStruct = - de::Deserialize::deserialize(PdfObject::from(PdfDictionary::from_iter([ - (PdfName::new_static(b"a"), 1.into()), - (PdfName::new_static(b"c"), 7.into()), - (PdfName::new_static(b"b"), 5.into()), - (PdfName::new_static(b"d"), false.into()), - (PdfName::new_static(b"e"), PdfNull.into()), - ( - PdfName::new_static(b"f"), - PdfString::new(ArcOrRef::Ref(b"test")).into(), - ), - ])))?; + let v: TestStruct = PdfParse::parse(PdfObject::from(PdfDictionary::from_iter([ + ( + PdfName::new_static(b"a"), + PdfInteger::new(PdfInputPosition::empty(), 1).into(), + ), + ( + PdfName::new_static(b"c"), + PdfInteger::new(PdfInputPosition::empty(), 7).into(), + ), + ( + PdfName::new_static(b"b"), + PdfInteger::new(PdfInputPosition::empty(), 5).into(), + ), + ( + PdfName::new_static(b"d"), + PdfBoolean::new(PdfInputPosition::empty(), false).into(), + ), + ( + PdfName::new_static(b"e"), + PdfNull::new(PdfInputPosition::empty()).into(), + ), + ( + PdfName::new_static(b"f"), + PdfString::new(PdfInputPosition::empty(), ArcOrRef::Ref(b"test")).into(), + ), + ])))?; let expected = TestStruct { a: 1, c: 7, b: 5, - others: PdfDictionary::from_iter([ - (PdfName::new_static(b"d"), false.into()), + rest: PdfDictionary::from_iter([ + ( + PdfName::new_static(b"d"), + PdfBoolean::new(PdfInputPosition::empty(), false).into(), + ), ( PdfName::new_static(b"f"), - PdfString::new(ArcOrRef::Ref(b"test")).into(), + PdfString::new(PdfInputPosition::empty(), ArcOrRef::Ref(b"test")).into(), ), ]), }; - assert_eq!(v, expected); + assert_eq!(format!("{v:?}"), format!("{expected:?}")); Ok(()) } } diff --git a/src/pdf/object.rs b/src/pdf/object.rs new file mode 100644 index 0000000..d3979d8 --- /dev/null +++ b/src/pdf/object.rs @@ -0,0 +1,1111 @@ +use crate::{ + pdf::{ + PdfObjects, + 
parse::{ + GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse, + PdfParseError, + }, + }, + pdf_parse, + util::ArcOrRef, +}; +use std::{ + any::TypeId, + borrow::Cow, + collections::BTreeMap, + fmt::{self, Write}, + num::NonZero, + sync::{Arc, OnceLock}, +}; + +#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord)] +pub struct PdfString { + pos: PdfInputPositionNoCompare, + bytes: ArcOrRef<'static, [u8]>, +} + +impl std::fmt::Debug for PdfString { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { pos, bytes } = self; + f.debug_struct("PdfString") + .field("pos", pos) + .field("bytes", &format_args!("b\"{}\"", bytes.escape_ascii())) + .finish() + } +} + +impl PdfString { + pub fn new(pos: impl Into, bytes: ArcOrRef<'static, [u8]>) -> Self { + Self { + pos: pos.into(), + bytes, + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn bytes(&self) -> &ArcOrRef<'static, [u8]> { + &self.bytes + } +} + +impl GetPdfInputPosition for PdfString { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct PdfName { + pos: PdfInputPositionNoCompare, + bytes: ArcOrRef<'static, [u8]>, +} + +impl PdfName { + pub fn try_new( + pos: impl Into, + bytes: impl Into>, + ) -> Option { + let bytes = bytes.into(); + if bytes.contains(&0) { + None + } else { + Some(Self { + pos: pos.into(), + bytes, + }) + } + } + #[track_caller] + pub const fn new_static(bytes: &'static [u8]) -> Self { + let mut i = 0; + while i < bytes.len() { + if bytes[i] == 0 { + panic!("shouldn't contain any nul bytes"); + } + i += 1; + } + Self { + pos: PdfInputPositionNoCompare::empty(), + bytes: ArcOrRef::Ref(bytes), + } + } + #[track_caller] + pub fn new( + pos: impl Into, + bytes: impl Into>, + ) -> Self { + Self::try_new(pos, bytes).expect("shouldn't contain any nul bytes") + } + pub fn as_bytes(&self) -> &ArcOrRef<'static, [u8]> { + &self.bytes 
+    }
+    pub fn pos(&self) -> PdfInputPosition {
+        self.pos.0
+    }
+}
+
+impl GetPdfInputPosition for PdfName {
+    fn get_pdf_input_position(&self) -> PdfInputPosition {
+        self.pos.0
+    }
+}
+
+impl fmt::Debug for PdfName {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "PdfName(at {}: {self})", self.pos)
+    }
+}
+
+impl fmt::Display for PdfName {
+    /// Writes the name as `/` followed by its bytes in PDF name syntax.
+    ///
+    /// Printable ASCII bytes (`0x21..=0x7E`) are written as themselves, except for
+    /// `#` and the delimiter bytes `( ) < > [ ] { } / %`, which the tokenizer does
+    /// not treat as regular characters. Those, and every byte outside the printable
+    /// range, are written as a `#XX` upper-case hex escape so the output reads back
+    /// as the same name.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("/")?;
+        for &b in self.bytes.iter() {
+            match b {
+                0x21..=0x7E
+                    if !matches!(
+                        b,
+                        b'#' | b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%'
+                    ) =>
+                {
+                    f.write_char(b.into())?
+                }
+                _ => write!(f, "#{b:02X}")?,
+            }
+        }
+        Ok(())
+    }
+}
+
+/// A boolean value paired with the input position recorded for it.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Default)]
+pub struct PdfBoolean {
+    pos: PdfInputPositionNoCompare,
+    value: bool,
+}
+
+impl PdfBoolean {
+    /// Creates a boolean with the given position and value.
+    pub fn new(pos: impl Into, value: bool) -> Self {
+        Self {
+            pos: pos.into(),
+            value,
+        }
+    }
+    /// Returns the input position stored with this value.
+    pub fn pos(&self) -> PdfInputPosition {
+        self.pos.0
+    }
+    /// Returns the wrapped `bool`.
+    pub fn value(&self) -> bool {
+        self.value
+    }
+}
+
+impl GetPdfInputPosition for PdfBoolean {
+    fn get_pdf_input_position(&self) -> PdfInputPosition {
+        self.pos.0
+    }
+}
+
+/// An integer value (stored as `i128`) paired with the input position recorded for it.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Default)]
+pub struct PdfInteger {
+    pos: PdfInputPositionNoCompare,
+    value: i128,
+}
+
+impl PdfInteger {
+    /// Creates an integer with the given position and value.
+    pub fn new(pos: impl Into, value: i128) -> Self {
+        Self {
+            pos: pos.into(),
+            value,
+        }
+    }
+    /// Returns the input position stored with this value.
+    pub fn pos(&self) -> PdfInputPosition {
+        self.pos.0
+    }
+    /// Returns the wrapped `i128`.
+    pub fn value(&self) -> i128 {
+        self.value
+    }
+}
+
+impl GetPdfInputPosition for PdfInteger {
+    fn get_pdf_input_position(&self) -> PdfInputPosition {
+        self.pos.0
+    }
+}
+
+/// A real number (stored as `f64`) paired with the input position recorded for it.
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Default)]
+pub struct PdfReal {
+    pos: PdfInputPositionNoCompare,
+    value: f64,
+}
+
+impl PdfReal {
+    /// Creates a real number with the given position and value.
+    pub fn new(pos: impl Into, value: f64) -> Self {
+        Self {
+            pos: pos.into(),
+            value,
+        }
+    }
+    /// Returns the input position stored with this value.
+    pub fn pos(&self) -> PdfInputPosition {
+        self.pos.0
+    }
+    /// Returns the wrapped `f64`.
+    pub fn value(&self) -> f64 {
+        self.value
+    }
+}
+
+impl
GetPdfInputPosition for PdfReal { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +macro_rules! make_pdf_object { + ( + $( + #[parse = $($parse:ident)?, type_name = $type_name:literal] + $Variant:ident($ty:ty), + )+ + ) => { + #[derive(Clone, Debug)] + pub enum PdfObjectNonNull { + $($Variant($ty),)* + } + + #[derive(Clone, Debug)] + pub enum PdfObjectDirect { + $($Variant($ty),)* + Null(PdfNull), + } + + #[derive(Clone, Debug)] + pub enum PdfObject { + $($Variant($ty),)* + Null(PdfNull), + Indirect(PdfObjectIndirect), + } + + $( + impl From<$ty> for PdfObjectNonNull { + fn from(value: $ty) -> Self { + Self::$Variant(value) + } + } + + impl From<$ty> for PdfObjectDirect { + fn from(value: $ty) -> Self { + Self::$Variant(value) + } + } + + impl From<$ty> for PdfObject { + fn from(value: $ty) -> Self { + Self::$Variant(value) + } + } + + impl From> for PdfObjectDirect { + fn from(value: Option<$ty>) -> Self { + match value { + Some(value) => Self::$Variant(value), + None => Self::Null(Default::default()), + } + } + } + + impl From> for PdfObject { + fn from(value: Option<$ty>) -> Self { + match value { + Some(value) => Self::$Variant(value), + None => Self::Null(Default::default()), + } + } + } + + $(impl crate::pdf::parse::PdfParse for $ty { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed($type_name) + } + fn $parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::$Variant(v) => Ok(v), + object => Err(crate::pdf::parse::PdfParseError::InvalidType { + pos: object.get_pdf_input_position(), + ty: object.type_name(), + expected_ty: $type_name, + }), + } + } + })? 
+ )* + + impl From for PdfObjectDirect { + fn from(value: PdfObjectNonNull) -> Self { + match value { + $(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* + } + } + } + + impl From for PdfObject { + fn from(value: PdfObjectNonNull) -> Self { + match value { + $(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* + } + } + } + + impl From for PdfObject { + fn from(value: PdfObjectDirect) -> Self { + match value { + $(PdfObjectDirect::$Variant(v) => Self::$Variant(v),)* + PdfObjectDirect::Null(v) => Self::Null(v), + } + } + } + + impl From for PdfObjectDirect { + fn from(value: PdfObject) -> Self { + match value { + $(PdfObject::$Variant(v) => Self::$Variant(v),)* + PdfObject::Null(v) => Self::Null(v), + PdfObject::Indirect(v) => v.into(), + } + } + } + + impl PdfObjectNonNull { + pub fn type_name(&self) -> &'static str { + match self { + $(PdfObjectNonNull::$Variant(_) => $type_name,)* + } + } + pub fn pos(&self) -> PdfInputPosition { + self.get_pdf_input_position() + } + } + + impl GetPdfInputPosition for PdfObjectNonNull { + fn get_pdf_input_position(&self) -> PdfInputPosition { + match self { + $(PdfObjectNonNull::$Variant(v) => <$ty as GetPdfInputPosition>::get_pdf_input_position(v),)* + } + } + } + + impl From for Option { + fn from(value: PdfObjectDirect) -> Self { + match value { + $(PdfObjectDirect::$Variant(v) => Some(PdfObjectNonNull::$Variant(v)),)* + PdfObjectDirect::Null(_) => None, + } + } + } + + impl From for Option { + fn from(value: PdfObject) -> Self { + PdfObjectDirect::from(value).into() + } + } + + impl PdfObjectDirect { + pub fn is_null(&self) -> bool { + matches!(self, PdfObjectDirect::Null(_)) + } + pub fn type_name(&self) -> &'static str { + match self { + $(PdfObjectDirect::$Variant(_) => $type_name,)* + PdfObjectDirect::Null(_) => "null", + } + } + pub fn pos(&self) -> PdfInputPosition { + self.get_pdf_input_position() + } + } + + impl GetPdfInputPosition for PdfObjectDirect { + fn get_pdf_input_position(&self) -> PdfInputPosition 
{ + match self { + $(PdfObjectDirect::$Variant(v) => <$ty as GetPdfInputPosition>::get_pdf_input_position(v),)* + PdfObjectDirect::Null(v) => ::get_pdf_input_position(v), + } + } + } + + impl PdfObject { + pub fn is_null(&self) -> bool { + matches!(self, PdfObject::Null(_)) + } + pub fn type_name(&self) -> &'static str { + match self { + $(PdfObject::$Variant(_) => $type_name,)* + PdfObject::Null(_) => "null", + PdfObject::Indirect(_) => "indirect object", + } + } + pub fn pos(&self) -> PdfInputPosition { + self.get_pdf_input_position() + } + } + + impl GetPdfInputPosition for PdfObject { + fn get_pdf_input_position(&self) -> PdfInputPosition { + match self { + $(PdfObject::$Variant(v) => <$ty as GetPdfInputPosition>::get_pdf_input_position(v),)* + PdfObject::Null(v) => ::get_pdf_input_position(v), + PdfObject::Indirect(v) => ::get_pdf_input_position(v), + } + } + } + + const _: () = { + fn _assert_parsable() {} + + $(let _ = _assert_parsable::<$ty>;)* + let _ = _assert_parsable::; + let _ = _assert_parsable::; + let _ = _assert_parsable::; + let _ = _assert_parsable::; + let _ = _assert_parsable::; + }; + }; +} + +make_pdf_object! 
{ + #[parse = parse, type_name = "boolean"] + Boolean(PdfBoolean), + #[parse = parse, type_name = "integer"] + Integer(PdfInteger), + #[parse = parse, type_name = "real"] + Real(PdfReal), + #[parse = parse, type_name = "string"] + String(PdfString), + #[parse = parse, type_name = "name"] + Name(PdfName), + #[parse = parse, type_name = "array"] + Array(PdfArray), + #[parse = parse, type_name = "dictionary"] + Dictionary(PdfDictionary), + #[parse =, type_name = "stream"] + Stream(PdfStream), +} + +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct PdfNull(PdfInputPositionNoCompare); + +impl PdfNull { + pub fn new(pos: impl Into) -> Self { + Self(pos.into()) + } +} + +impl GetPdfInputPosition for PdfNull { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.0.0 + } +} + +impl From for PdfObjectDirect { + fn from(v: PdfNull) -> Self { + Self::Null(v) + } +} + +impl Default for PdfObjectDirect { + fn default() -> Self { + Self::Null(PdfNull(PdfInputPositionNoCompare::empty())) + } +} + +impl From for PdfObject { + fn from(v: PdfNull) -> Self { + Self::Null(v) + } +} + +impl Default for PdfObject { + fn default() -> Self { + Self::Null(PdfNull(PdfInputPositionNoCompare::empty())) + } +} + +impl From for PdfObject { + fn from(v: PdfObjectIndirect) -> Self { + Self::Indirect(v) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct PdfObjectIdentifier { + pub pos: PdfInputPositionNoCompare, + pub object_number: NonZero, + pub generation_number: u16, +} + +impl GetPdfInputPosition for PdfObjectIdentifier { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +#[derive(Clone)] +pub struct PdfObjectIndirect { + objects: std::sync::Weak, + id: PdfObjectIdentifier, + final_id: Arc>, +} + +impl fmt::Debug for PdfObjectIndirect { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + objects: _, + id, + final_id: _, + } = self; + 
f.debug_struct("PdfObjectIndirect")
+            .field("id", id)
+            .finish_non_exhaustive()
+    }
+}
+
+impl GetPdfInputPosition for PdfObjectIndirect {
+    fn get_pdf_input_position(&self) -> PdfInputPosition {
+        self.id.get_pdf_input_position()
+    }
+}
+
+impl PartialEq for PdfObjectIndirect {
+    /// Two references are equal when they point at the same `PdfObjects`
+    /// allocation and carry equal identifiers; the `final_id` cache is ignored.
+    fn eq(&self, other: &Self) -> bool {
+        let Self {
+            objects,
+            id,
+            final_id: _,
+        } = self;
+        objects.ptr_eq(&other.objects) && *id == other.id
+    }
+}
+
+impl PdfObjectIndirect {
+    /// Creates a reference to object `id` that holds only a weak handle to
+    /// `objects` and starts with an empty resolution cache.
+    pub fn new(objects: &Arc, id: PdfObjectIdentifier) -> Self {
+        Self {
+            objects: Arc::downgrade(objects),
+            id,
+            final_id: Arc::new(OnceLock::new()),
+        }
+    }
+    /// Resolves this reference to a direct object.
+    ///
+    /// Starting from the cached `final_id` if one is set, otherwise from this
+    /// reference's own id, the object table is looked up repeatedly while the
+    /// result is itself an indirect reference, for at most 1000 lookups (a single
+    /// lookup when the cache is already populated). The id of the first direct
+    /// object found is stored in `final_id`.
+    ///
+    /// Returns a null object when the owning `PdfObjects` has been dropped, when
+    /// its table has not been set yet, when an id on the chain is not in the
+    /// table, or when the lookup limit is exhausted.
+    pub fn get(&self) -> PdfObjectDirect {
+        if let Some(objects) = self.objects.upgrade() {
+            if let Some(objects) = objects.objects.get() {
+                let final_id = self.final_id.get().copied();
+                let limit = if final_id.is_some() { 1 } else { 1000usize };
+                let mut id = final_id.unwrap_or(self.id);
+                for _ in 0..limit {
+                    // Look up the id currently being followed, not `self.id`;
+                    // otherwise a chain of indirect references never advances.
+                    if let Some(object) = objects.get(&id) {
+                        let retval = match object {
+                            PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v),
+                            PdfObject::Integer(v) => PdfObjectDirect::Integer(*v),
+                            PdfObject::Real(v) => PdfObjectDirect::Real(*v),
+                            PdfObject::String(v) => PdfObjectDirect::String(v.clone()),
+                            PdfObject::Name(v) => PdfObjectDirect::Name(v.clone()),
+                            PdfObject::Array(v) => PdfObjectDirect::Array(v.clone()),
+                            PdfObject::Dictionary(v) => PdfObjectDirect::Dictionary(v.clone()),
+                            PdfObject::Stream(v) => PdfObjectDirect::Stream(v.clone()),
+                            PdfObject::Null(v) => PdfObjectDirect::Null(*v),
+                            PdfObject::Indirect(v) => {
+                                id = v.id;
+                                continue;
+                            }
+                        };
+                        // we could be racing with another thread, so set can fail but that's not a problem
+                        let _ = self.final_id.set(id);
+                        return retval;
+                    } else {
+                        return PdfObjectDirect::Null(PdfNull::new(id.pos));
+                    }
+                }
+            }
+        }
+        PdfObjectDirect::Null(PdfNull::new(self.pos()))
+    }
+    /// Returns the identifier this reference was created with.
+    pub fn id(&self) -> PdfObjectIdentifier {
+        self.id
+    }
+    /// Returns the input position stored in the identifier.
+    pub fn pos(&self) -> PdfInputPosition {
+        self.id.pos.0
+
} +} + +impl From for PdfObjectDirect { + fn from(value: PdfObjectIndirect) -> Self { + value.get() + } +} + +#[derive(Clone)] +pub struct PdfDictionary { + pos: PdfInputPositionNoCompare, + fields: Arc>, +} + +impl PdfDictionary { + pub fn new(pos: impl Into) -> Self { + Self { + pos: pos.into(), + fields: Arc::new(BTreeMap::new()), + } + } + pub fn from_fields( + pos: impl Into, + mut fields: Arc>, + ) -> Self { + if fields.values().any(|v| matches!(v, PdfObject::Null(_))) { + Arc::make_mut(&mut fields).retain(|_k, v| !matches!(v, PdfObject::Null(_))); + } + Self { + pos: pos.into(), + fields, + } + } + pub fn fields(&self) -> &Arc> { + &self.fields + } + pub fn into_fields(self) -> Arc> { + self.fields + } + pub fn iter(&self) -> std::collections::btree_map::Iter<'_, PdfName, PdfObject> { + self.fields.iter() + } + pub fn contains_key(&self, key: &Q) -> bool + where + PdfName: std::borrow::Borrow + Ord, + Q: Ord, + { + self.fields.contains_key(key) + } + pub fn get(&self, key: &Q) -> Option<&PdfObject> + where + PdfName: std::borrow::Borrow + Ord, + Q: Ord, + { + self.fields.get(key) + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl GetPdfInputPosition for PdfDictionary { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl Default for PdfDictionary { + fn default() -> Self { + Self::new(PdfInputPosition::empty()) + } +} + +impl FromIterator<(PdfName, PdfObject)> for PdfDictionary { + fn from_iter>(iter: T) -> Self { + Self { + pos: PdfInputPositionNoCompare::empty(), + fields: Arc::new(BTreeMap::from_iter( + iter.into_iter() + .filter(|(_name, value)| !matches!(value, PdfObject::Null(_))), + )), + } + } +} + +impl IntoIterator for PdfDictionary { + type Item = (PdfName, PdfObject); + type IntoIter = std::collections::btree_map::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + Arc::unwrap_or_clone(self.fields).into_iter() + } +} + +impl<'a> IntoIterator for &'a PdfDictionary { + type Item = (&'a 
PdfName, &'a PdfObject); + type IntoIter = std::collections::btree_map::Iter<'a, PdfName, PdfObject>; + + fn into_iter(self) -> Self::IntoIter { + self.fields.iter() + } +} + +impl fmt::Debug for PdfDictionary { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_map().entries(self).finish() + } +} + +#[derive(Clone, Default)] +pub struct PdfArray { + pos: PdfInputPositionNoCompare, + elements: Arc<[PdfObject]>, +} + +impl PdfArray { + pub fn new(pos: impl Into) -> Self { + Self { + pos: pos.into(), + elements: Arc::default(), + } + } + pub fn from_elements( + pos: impl Into, + elements: Arc<[PdfObject]>, + ) -> Self { + Self { + pos: pos.into(), + elements, + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn elements(&self) -> &Arc<[PdfObject]> { + &self.elements + } + pub fn into_elements(self) -> Arc<[PdfObject]> { + self.elements + } + pub fn iter(&self) -> std::slice::Iter<'_, PdfObject> { + self.elements.iter() + } +} + +impl GetPdfInputPosition for PdfArray { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl FromIterator for PdfArray { + fn from_iter>(iter: T) -> Self { + Self { + pos: PdfInputPositionNoCompare::empty(), + elements: Arc::from_iter(iter), + } + } +} + +#[derive(Clone)] +pub struct PdfArrayIntoIter { + indexes: std::ops::Range, + elements: Arc<[PdfObject]>, +} + +impl Iterator for PdfArrayIntoIter { + type Item = PdfObject; + + fn next(&mut self) -> Option { + self.indexes.next().map(|i| self.elements[i].clone()) + } + + fn size_hint(&self) -> (usize, Option) { + self.indexes.size_hint() + } + + fn nth(&mut self, n: usize) -> Option { + self.indexes.nth(n).map(|i| self.elements[i].clone()) + } + + fn last(self) -> Option { + self.indexes.last().map(|i| self.elements[i].clone()) + } + + fn fold(self, init: B, mut f: F) -> B + where + F: FnMut(B, Self::Item) -> B, + { + self.indexes + .fold(init, |init, i| f(init, self.elements[i].clone())) + } +} + +impl 
std::iter::FusedIterator for PdfArrayIntoIter {} + +impl DoubleEndedIterator for PdfArrayIntoIter { + fn next_back(&mut self) -> Option { + self.indexes.next_back().map(|i| self.elements[i].clone()) + } + fn nth_back(&mut self, n: usize) -> Option { + self.indexes.nth_back(n).map(|i| self.elements[i].clone()) + } + fn rfold(self, init: B, mut f: F) -> B + where + F: FnMut(B, Self::Item) -> B, + { + self.indexes + .rfold(init, |init, i| f(init, self.elements[i].clone())) + } +} + +impl ExactSizeIterator for PdfArrayIntoIter {} + +impl IntoIterator for PdfArray { + type Item = PdfObject; + type IntoIter = PdfArrayIntoIter; + + fn into_iter(self) -> Self::IntoIter { + PdfArrayIntoIter { + indexes: 0..self.elements.len(), + elements: self.elements, + } + } +} + +impl<'a> IntoIterator for &'a PdfArray { + type Item = &'a PdfObject; + type IntoIter = std::slice::Iter<'a, PdfObject>; + + fn into_iter(self) -> Self::IntoIter { + self.elements.iter() + } +} + +impl fmt::Debug for PdfArray { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.elements.fmt(f) + } +} + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct MaybeArray(pub Arc<[T]>); + +impl std::ops::Deref for MaybeArray { + type Target = Arc<[T]>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::ops::DerefMut for MaybeArray { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +pdf_parse! 
{ + #[derive(Clone, Debug, PartialEq, Eq)] + #[non_exhaustive] + pub enum PdfStreamFilter { + #[pdf(name = "ASCIIHexDecode")] + AsciiHexDecode, + #[pdf(name = "ASCII85Decode")] + Ascii85Decode, + #[pdf(name = "LZWDecode")] + LzwDecode, + #[pdf(name = "FlateDecode")] + FlateDecode, + #[pdf(name = "RunLengthDecode")] + RunLengthDecode, + #[pdf(name = "CCITTFaxDecode")] + CcittFaxDecode, + #[pdf(name = "JBIG2Decode")] + Jbig2Decode, + #[pdf(name = "DCTDecode")] + DctDecode, + #[pdf(name = "JPXDecode")] + JpxDecode, + #[pdf(name = "Crypt")] + Crypt, + #[pdf(other)] + Unknown(PdfName), + } +} + +impl Default for MaybeArray { + fn default() -> Self { + Self(Arc::default()) + } +} + +impl<'a, T> IntoIterator for &'a MaybeArray { + type Item = &'a T; + type IntoIter = std::slice::Iter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +#[derive(Clone, Debug)] +pub enum PdfFileSpecification { + String(PdfString), + Dictionary(PdfDictionary), +} + +impl PdfParse for PdfFileSpecification { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("file specification") + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::String(v) => Ok(Self::String(v)), + PdfObjectDirect::Dictionary(v) => Ok(Self::Dictionary(v)), + object => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "PdfFileSpecification", + }), + } + } +} + +pdf_parse! 
{ + #[derive(Clone, Debug)] + pub struct PdfStreamDictionary { + #[pdf(name = PdfStreamDictionary::LENGTH_NAME)] + pub len: usize, + #[pdf(name = "Filter")] + pub filters: MaybeArray, + #[pdf(name = "DecodeParms")] + pub decode_parms: MaybeArray>, + #[pdf(name = "F")] + pub file: Option, + #[pdf(name = "FFilter")] + pub file_filters: MaybeArray, + #[pdf(name = "FDecodeParms")] + pub file_decode_parms: MaybeArray>, + #[pdf(name = "DL")] + pub decoded_len: Option, + #[pdf(flatten)] + pub rest: Rest, + } +} + +impl PdfStreamDictionary { + pub const LENGTH_NAME: &str = "Length"; + pub(crate) fn parse_len_from_dictionary( + dictionary: &PdfDictionary, + ) -> Result { + PdfParse::parse( + dictionary + .get(&PdfName::new_static(Self::LENGTH_NAME.as_bytes())) + .cloned() + .unwrap_or_default(), + ) + } +} + +impl PdfStreamDictionary { + pub fn filters_and_parms( + &self, + ) -> impl Clone + ExactSizeIterator + DoubleEndedIterator + { + self.filters.iter().enumerate().map(|(index, filter)| { + ( + filter.clone(), + self.decode_parms + .0 + .get(index) + .cloned() + .flatten() + .unwrap_or_default(), + ) + }) + } + pub fn file_filters_and_parms( + &self, + ) -> impl Clone + ExactSizeIterator + DoubleEndedIterator + { + self.file_filters.iter().enumerate().map(|(index, filter)| { + ( + filter.clone(), + self.file_decode_parms + .0 + .get(index) + .cloned() + .flatten() + .unwrap_or_default(), + ) + }) + } +} + +pub(crate) struct UnparsedPdfStreamDictionary { + unparsed_dictionary: PdfDictionary, + dictionary: Arc>>, +} + +impl UnparsedPdfStreamDictionary { + pub(crate) fn finish_parsing(self) -> Result<(), PdfParseError> { + let Ok(()) = self + .dictionary + .set(PdfParse::parse(self.unparsed_dictionary.into())?) 
+ else { + unreachable!(); + }; + Ok(()) + } +} + +#[derive(Clone)] +pub struct PdfStream { + pos: PdfInputPositionNoCompare, + dictionary: Arc>>, + data: Arc<[u8]>, +} + +impl fmt::Debug for PdfStream { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PdfStream") + .field("pos", &self.pos) + .field("dictionary", &self.dictionary) + .field("data", &format_args!("{:02x?}", self.data)) + .finish() + } +} + +impl PdfStream { + pub fn new( + pos: impl Into, + dictionary: PdfStreamDictionary, + data: Arc<[u8]>, + ) -> Self { + Self { + pos: pos.into(), + dictionary: Arc::new(OnceLock::from(dictionary)), + data, + } + } + pub(crate) fn new_unparsed( + pos: impl Into, + unparsed_dictionary: PdfDictionary, + data: Arc<[u8]>, + ) -> (Self, UnparsedPdfStreamDictionary) { + let dictionary = Arc::new(OnceLock::new()); + ( + Self { + pos: pos.into(), + dictionary: dictionary.clone(), + data, + }, + UnparsedPdfStreamDictionary { + unparsed_dictionary, + dictionary, + }, + ) + } + pub fn dictionary(&self) -> &PdfStreamDictionary { + self.dictionary + .get() + .expect("haven't finished parsing all pdf object definitions yet") + } + pub fn data(&self) -> &Arc<[u8]> { + &self.data + } +} + +impl GetPdfInputPosition for PdfStream { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl PdfParse for PdfStream { + fn type_name() -> Cow<'static, str> { + if TypeId::of::() == TypeId::of::() { + Cow::Borrowed("stream") + } else { + Cow::Owned(format!("PdfStream<{}>", Rest::type_name())) + } + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Stream(stream) => Ok(PdfStream { + pos: stream.pos, + dictionary: if let Some(dictionary) = ::downcast_ref::< + Arc>>, + >(&stream.dictionary) + { + dictionary.clone() + } else { + let PdfStreamDictionary { + len, + filters, + decode_parms, + file, + file_filters, + file_decode_parms, + decoded_len, + rest, + } = stream.dictionary(); + 
Arc::new(OnceLock::from(PdfStreamDictionary { + len: *len, + filters: filters.clone(), + decode_parms: decode_parms.clone(), + file: file.clone(), + file_filters: file_filters.clone(), + file_decode_parms: file_decode_parms.clone(), + decoded_len: *decoded_len, + rest: Rest::parse(rest.clone().into())?, + })) + }, + data: stream.data, + }), + object => Err(PdfParseError::InvalidType { + pos: object.get_pdf_input_position(), + ty: object.type_name(), + expected_ty: "stream", + }), + } + } +} diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs new file mode 100644 index 0000000..aa5bc3d --- /dev/null +++ b/src/pdf/parse.rs @@ -0,0 +1,953 @@ +use crate::pdf::object::{ + MaybeArray, PdfInteger, PdfName, PdfNull, PdfObject, PdfObjectDirect, PdfObjectIdentifier, + PdfObjectIndirect, PdfObjectNonNull, PdfReal, +}; +use std::{any::Any, borrow::Cow, fmt, mem, num::NonZero, sync::Arc}; + +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] +pub struct PdfInputPosition(Option); + +impl fmt::Debug for PdfInputPosition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("PdfInputPosition") + .field(&format_args!("{self}")) + .finish() + } +} + +impl fmt::Display for PdfInputPosition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(pos) = self.0 { + write!(f, "{pos:#x}") + } else { + f.write_str("") + } + } +} + +impl PdfInputPosition { + pub const fn new(pos: usize) -> Self { + Self(Some(pos)) + } + pub const fn empty() -> PdfInputPosition { + Self(None) + } +} + +pub trait GetPdfInputPosition { + fn get_pdf_input_position(&self) -> PdfInputPosition; +} + +impl GetPdfInputPosition for &'_ T { + fn get_pdf_input_position(&self) -> PdfInputPosition { + T::get_pdf_input_position(self) + } +} + +impl GetPdfInputPosition for &'_ mut T { + fn get_pdf_input_position(&self) -> PdfInputPosition { + T::get_pdf_input_position(self) + } +} + +impl GetPdfInputPosition for Box { + fn get_pdf_input_position(&self) -> 
PdfInputPosition { + T::get_pdf_input_position(self) + } +} + +impl GetPdfInputPosition for PdfInputPosition { + fn get_pdf_input_position(&self) -> PdfInputPosition { + *self + } +} + +impl GetPdfInputPosition for bool { + fn get_pdf_input_position(&self) -> PdfInputPosition { + PdfInputPosition::empty() + } +} + +impl GetPdfInputPosition for i128 { + fn get_pdf_input_position(&self) -> PdfInputPosition { + PdfInputPosition::empty() + } +} + +impl GetPdfInputPosition for f64 { + fn get_pdf_input_position(&self) -> PdfInputPosition { + PdfInputPosition::empty() + } +} + +#[derive(Clone, Copy, Default)] +pub struct PdfInputPositionNoCompare(pub PdfInputPosition); + +impl PdfInputPositionNoCompare { + pub const fn empty() -> Self { + Self(PdfInputPosition::empty()) + } + pub const fn new(pos: usize) -> Self { + Self(PdfInputPosition::new(pos)) + } +} + +impl GetPdfInputPosition for PdfInputPositionNoCompare { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.0 + } +} + +impl From for PdfInputPositionNoCompare { + fn from(value: PdfInputPosition) -> Self { + Self(value) + } +} + +impl fmt::Debug for PdfInputPositionNoCompare { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("PdfInputPositionNoCompare") + .field(&format_args!("{self}")) + .finish() + } +} + +impl fmt::Display for PdfInputPositionNoCompare { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +impl Ord for PdfInputPositionNoCompare { + fn cmp(&self, _other: &Self) -> std::cmp::Ordering { + std::cmp::Ordering::Equal + } +} + +impl PartialOrd for PdfInputPositionNoCompare { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl std::hash::Hash for PdfInputPositionNoCompare { + fn hash(&self, _state: &mut H) { + // don't hash anything since Self always compares equal + } +} + +impl Eq for PdfInputPositionNoCompare {} + +impl PartialEq for PdfInputPositionNoCompare { + fn eq(&self, _other: &Self) 
-> bool { + true + } +} + +#[derive(Debug)] +#[non_exhaustive] +pub enum PdfParseError { + Custom(String), + InvalidType { + pos: PdfInputPosition, + ty: &'static str, + expected_ty: &'static str, + }, + InvalidName { + pos: PdfInputPosition, + name: PdfName, + expected_ty: &'static str, + }, + NotAPdfFile, + TruncatedFile { + pos: PdfInputPosition, + }, + InvalidObjectNumber { + pos: PdfInputPosition, + }, + InvalidGenerationNumber { + pos: PdfInputPosition, + }, + InvalidNumber { + pos: PdfInputPosition, + }, + InvalidStringEscape { + pos: PdfInputPosition, + }, + InvalidHexStringDigit { + pos: PdfInputPosition, + }, + DuplicateIndirectObjectDefinition { + pos: PdfInputPosition, + id: PdfObjectIdentifier, + }, + MissingObj { + pos: PdfInputPosition, + }, + MissingEndObj { + pos: PdfInputPosition, + }, + InvalidDictionaryClosingDoubleRAngle { + pos: PdfInputPosition, + }, + DuplicateDictionaryKey { + pos: PdfInputPosition, + name: PdfName, + }, + InvalidNameEscape { + pos: PdfInputPosition, + }, + InvalidOrMissingEolAfterStreamKeyword { + pos: PdfInputPosition, + }, + MissingEndStreamKeyword { + pos: PdfInputPosition, + }, + IntegerOutOfRange { + pos: PdfInputPosition, + }, + MissingTrailer { + pos: PdfInputPosition, + }, + WrongArrayLength { + pos: PdfInputPosition, + len: usize, + expected_len: usize, + }, + MissingStartXRefKeyword { + pos: PdfInputPosition, + }, + MissingStartXRefValue { + pos: PdfInputPosition, + }, + MissingEofComment { + pos: PdfInputPosition, + }, + UnexpectedByte { + pos: PdfInputPosition, + byte: u8, + }, + InvalidStartXRefValue { + pos: PdfInputPosition, + start_xref: usize, + }, +} + +impl From for PdfParseError { + fn from(value: std::convert::Infallible) -> Self { + match value {} + } +} + +impl GetPdfInputPosition for PdfParseError { + fn get_pdf_input_position(&self) -> PdfInputPosition { + match *self { + PdfParseError::Custom(_) | PdfParseError::NotAPdfFile => PdfInputPosition::empty(), + PdfParseError::InvalidType { pos, .. 
} + | PdfParseError::InvalidName { pos, .. } + | PdfParseError::TruncatedFile { pos } + | PdfParseError::InvalidObjectNumber { pos } + | PdfParseError::InvalidGenerationNumber { pos } + | PdfParseError::InvalidNumber { pos } + | PdfParseError::InvalidStringEscape { pos } + | PdfParseError::InvalidHexStringDigit { pos } + | PdfParseError::DuplicateIndirectObjectDefinition { pos, .. } + | PdfParseError::MissingObj { pos } + | PdfParseError::MissingEndObj { pos } + | PdfParseError::InvalidDictionaryClosingDoubleRAngle { pos } + | PdfParseError::DuplicateDictionaryKey { pos, .. } + | PdfParseError::InvalidNameEscape { pos } + | PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos } + | PdfParseError::MissingEndStreamKeyword { pos } + | PdfParseError::IntegerOutOfRange { pos } + | PdfParseError::MissingTrailer { pos } + | PdfParseError::WrongArrayLength { pos, .. } + | PdfParseError::MissingStartXRefKeyword { pos } + | PdfParseError::MissingStartXRefValue { pos } + | PdfParseError::MissingEofComment { pos } + | PdfParseError::UnexpectedByte { pos, .. } + | PdfParseError::InvalidStartXRefValue { pos, .. 
} => pos, + } + } +} + +impl fmt::Display for PdfParseError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + PdfParseError::Custom(ref v) => f.write_str(v), + PdfParseError::InvalidType { + pos, + ty, + expected_ty, + } => { + write!( + f, + "at {pos}: invalid type: expected {expected_ty}, got {ty}" + ) + } + PdfParseError::InvalidName { + pos, + ref name, + expected_ty, + } => { + write!( + f, + "at {pos}: invalid name: expected a(n) {expected_ty}, got {name}" + ) + } + PdfParseError::NotAPdfFile => f.write_str("Not a PDF file"), + PdfParseError::TruncatedFile { pos } => { + write!(f, "at {pos}: PDF file is truncated too early") + } + PdfParseError::InvalidObjectNumber { pos } => { + write!(f, "at {pos}: PDF object number is invalid") + } + PdfParseError::InvalidGenerationNumber { pos } => { + write!( + f, + "at {pos}: PDF object identifier's generation number is invalid" + ) + } + PdfParseError::InvalidNumber { pos } => { + write!(f, "at {pos}: invalid number") + } + PdfParseError::InvalidStringEscape { pos } => { + write!(f, "at {pos}: invalid string escape") + } + PdfParseError::InvalidHexStringDigit { pos } => { + write!(f, "at {pos}: invalid hex string digit") + } + PdfParseError::DuplicateIndirectObjectDefinition { pos, id } => { + write!(f, "at {pos}: duplicate indirect object definition: {id:?}") + } + PdfParseError::MissingObj { pos } => { + write!( + f, + "at {pos}: indirect object definition is missing `obj` keyword" + ) + } + PdfParseError::MissingEndObj { pos } => { + write!( + f, + "at {pos}: indirect object definition is missing `endobj` keyword" + ) + } + PdfParseError::InvalidDictionaryClosingDoubleRAngle { pos } => { + write!(f, "at {pos}: dictionary has an invalid closing `>>` symbol") + } + PdfParseError::DuplicateDictionaryKey { pos, ref name } => { + write!(f, "at {pos}: duplicate dictionary key: {name}") + } + PdfParseError::InvalidNameEscape { pos } => { + write!(f, "at {pos}: invalid name escape") + } + 
PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos } => { + write!( + f, + "at {pos}: invalid or missing end-of-line after `stream` keyword" + ) + } + PdfParseError::MissingEndStreamKeyword { pos } => { + write!(f, "at {pos}: missing `endstream` keyword") + } + PdfParseError::IntegerOutOfRange { pos } => { + write!(f, "at {pos}: integer out of range") + } + PdfParseError::MissingTrailer { pos } => { + write!(f, "at {pos}: missing `trailer` keyword") + } + PdfParseError::WrongArrayLength { + pos, + len, + expected_len, + } => { + write!( + f, + "at {pos}: wrong array length: expected {expected_len}, got {len}" + ) + } + PdfParseError::MissingStartXRefKeyword { pos } => { + write!(f, "at {pos}: missing `startxref` keyword") + } + PdfParseError::MissingStartXRefValue { pos } => { + write!(f, "at {pos}: missing `startxref` value") + } + PdfParseError::MissingEofComment { pos } => { + write!(f, "at {pos}: missing `%%EOF` comment") + } + PdfParseError::UnexpectedByte { pos, byte } => { + write!(f, "at {pos}: unexpected byte {}", byte.escape_ascii()) + } + PdfParseError::InvalidStartXRefValue { pos, start_xref } => { + write!( + f, + "at {pos}: invalid `startxref` value: {start_xref} ({start_xref:#x})" + ) + } + } + } +} + +impl std::error::Error for PdfParseError {} + +pub trait PdfParse: Sized + 'static { + fn type_name() -> Cow<'static, str>; + fn parse(object: PdfObject) -> Result; + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + match object { + PdfObject::Null(_) => Ok(None), + PdfObject::Indirect(ref v) if v.get().is_null() => Ok(None), + PdfObject::Boolean(_) + | PdfObject::Integer(_) + | PdfObject::Real(_) + | PdfObject::String(_) + | PdfObject::Name(_) + | PdfObject::Array(_) + | PdfObject::Dictionary(_) + | PdfObject::Stream(_) + | PdfObject::Indirect(_) => Self::parse(object).map(Some), + } + } +} + +impl PdfParse for Option { + fn type_name() -> Cow<'static, str> { + T::type_name() + } + fn parse(object: PdfObject) -> Result { + 
T::parse_option(object) + } + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + if matches!(object, PdfObject::Null(_)) { + Ok(None) + } else { + Self::parse(object).map(Some) + } + } +} + +macro_rules! impl_pdf_parse_prim_int { + ($ty:ident) => { + impl PdfParse for $ty { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed(stringify!($ty)) + } + fn parse(object: PdfObject) -> Result { + let v: PdfInteger = PdfParse::parse(object)?; + v.value() + .try_into() + .map_err(|_| PdfParseError::IntegerOutOfRange { pos: v.pos() }) + } + } + impl PdfParse for NonZero<$ty> { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed(concat!("NonZero<", stringify!($ty), ">")) + } + fn parse(object: PdfObject) -> Result { + let v: PdfInteger = PdfParse::parse(object)?; + v.value() + .try_into() + .ok() + .and_then(NonZero::new) + .ok_or(PdfParseError::IntegerOutOfRange { pos: v.pos() }) + } + } + }; +} + +impl_pdf_parse_prim_int!(u8); +impl_pdf_parse_prim_int!(i8); +impl_pdf_parse_prim_int!(u16); +impl_pdf_parse_prim_int!(i16); +impl_pdf_parse_prim_int!(u32); +impl_pdf_parse_prim_int!(i32); +impl_pdf_parse_prim_int!(u64); +impl_pdf_parse_prim_int!(i64); +impl_pdf_parse_prim_int!(u128); +impl_pdf_parse_prim_int!(usize); +impl_pdf_parse_prim_int!(isize); + +impl PdfParse for i128 { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("i128") + } + fn parse(object: PdfObject) -> Result { + let v: PdfInteger = PdfParse::parse(object)?; + Ok(v.value().into()) + } +} + +impl PdfParse for NonZero { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("NonZero") + } + fn parse(object: PdfObject) -> Result { + let v: PdfInteger = PdfParse::parse(object)?; + NonZero::new(v.value().into()).ok_or(PdfParseError::IntegerOutOfRange { pos: v.pos() }) + } +} + +impl PdfParse for f64 { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("f64") + } + fn parse(object: PdfObject) -> Result { + Ok(::parse(object)?.value()) + } +} + +impl PdfParse for f32 { + fn type_name() 
-> Cow<'static, str> { + Cow::Borrowed("f32") + } + fn parse(object: PdfObject) -> Result { + Ok(::parse(object)? as f32) + } +} + +impl PdfParse for PdfNull { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("null") + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Null(v) => Ok(v), + object => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "null", + }), + } + } + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + Self::parse(object).map(Some) + } +} + +impl PdfParse for PdfObjectNonNull { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("non-null") + } + fn parse(object: PdfObject) -> Result { + Option::::from(object).ok_or(PdfParseError::InvalidType { + pos: PdfInputPosition::empty(), + ty: "null", + expected_ty: "non-null", + }) + } + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + Ok(object.into()) + } +} + +impl PdfParse for PdfObjectDirect { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("direct object") + } + fn parse(object: PdfObject) -> Result { + Ok(object.into()) + } +} + +impl PdfParse for PdfObject { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("object") + } + fn parse(object: PdfObject) -> Result { + Ok(object) + } +} + +impl PdfParse for PdfObjectIndirect { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("indirect object") + } + fn parse(object: PdfObject) -> Result { + match object { + PdfObject::Indirect(v) => Ok(v), + _ => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "indirect object", + }), + } + } + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + match object { + PdfObject::Indirect(v) => Ok(Some(v)), + PdfObject::Null(_) => Ok(None), + _ => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "indirect object", + }), + } + } +} + +impl PdfParse for [T; N] { + fn 
type_name() -> Cow<'static, str> { + Cow::Owned(format!("[{}; {N}]", T::type_name())) + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Array(array) => { + let array_pos = array.pos(); + let elements = array.into_elements(); + let mut elements: Arc<[PdfObject; N]> = + elements.try_into().map_err(|elements: Arc<[PdfObject]>| { + PdfParseError::WrongArrayLength { + pos: array_pos, + len: elements.len(), + expected_len: N, + } + })?; + let elements: Box<[T]> = if let Some(elements) = Arc::get_mut(&mut elements) { + Result::from_iter(elements.iter_mut().map(|v| T::parse(mem::take(v))))? + } else { + Result::from_iter(elements.iter().map(|v| T::parse(v.clone())))? + }; + Ok(*Box::<[T; N]>::try_from(elements) + .ok() + .expect("already checked length")) + } + object => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "array", + }), + } + } +} + +impl PdfParse for Arc<[T]> { + fn type_name() -> Cow<'static, str> { + Cow::Owned(format!("Arc<[{}]>", T::type_name())) + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Array(array) => { + let mut elements = array.into_elements(); + if let Some(retval) = ::downcast_ref::(&elements) { + return Ok(retval.clone()); + } + if let Some(elements) = Arc::get_mut(&mut elements) { + Result::from_iter(elements.iter_mut().map(|v| T::parse(mem::take(v)))) + } else { + Result::from_iter(elements.iter().map(|v| T::parse(v.clone()))) + } + } + PdfObjectDirect::Null(_) => Ok(Self::default()), + object => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "array", + }), + } + } +} + +impl PdfParse for MaybeArray { + fn type_name() -> Cow<'static, str> { + Cow::Owned(format!("MaybeArray<{}>", T::type_name())) + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Null(_) => Ok(Self::default()), + 
PdfObjectDirect::Array(object) => Ok(Self(PdfParse::parse(object.into())?)), + object => Ok(Self(Arc::new([PdfParse::parse(object.into())?]))), + } + } +} + +#[macro_export] +macro_rules! pdf_parse { + ( + $(#[$($struct_meta:tt)*])* + $struct_vis:vis struct $Struct:ident$(<$($StructParam:ident $(: $StructBound:tt)? $(= $StructParamDefault:ty)?),* $(,)?>)? { + $(#[pdf $($pdf_meta:tt)*] + $(#[$($field_meta:tt)*])* + $field_vis:vis $field_name:ident: $field_ty:ty,)* + } + ) => { + $(#[$($struct_meta)*])* + $struct_vis struct $Struct$(<$($StructParam $(: $StructBound)? $(= $StructParamDefault)?),*>)? { + $($(#[$($field_meta)*])* + $field_vis $field_name: $field_ty,)* + } + + $crate::pdf::parse::pdf_parse! { + @impl + struct $Struct$(<$($StructParam $(: $StructBound)?),*>)? { + $(#[pdf $($pdf_meta)*] + $(#[$($field_meta)*])* + $field_name: $field_ty,)* + } + } + }; + ( + @impl + struct $Struct:ident$(<$($StructParam:ident $(: $StructBound:tt)?),* $(,)?>)? { + $($(#[$($field_meta:tt)*])* + $field_name:ident: $field_ty:ty,)* + } + ) => { + impl$(<$($StructParam: $crate::pdf::parse::PdfParse $(+ $StructBound)?),*>)? $crate::pdf::parse::PdfParse for $Struct$(<$($StructParam),*>)? { + fn type_name() -> $crate::__std::borrow::Cow<'static, $crate::__std::primitive::str> { + let args: &[$crate::__std::borrow::Cow<'static, $crate::__std::primitive::str>] = &[ + $($(<$StructParam as $crate::pdf::parse::PdfParse>::type_name()),*)? 
+ ]; + if args.is_empty() { + $crate::__std::borrow::Cow::Borrowed($crate::__std::stringify!($Struct)) + } else { + let mut retval = $crate::__std::string::String::new(); + retval.push_str($crate::__std::stringify!($Struct)); + retval.push_str("<"); + let mut first = true; + for arg in args { + if first { + first = false; + } else { + retval.push_str(", "); + } + retval.push_str(arg); + } + retval.push_str(">"); + $crate::__std::borrow::Cow::Owned(retval) + } + } + fn parse(object: $crate::pdf::object::PdfObject) -> $crate::__std::result::Result { + let object = $crate::__std::convert::From::from(object); + let $crate::pdf::object::PdfObjectDirect::Dictionary(object) = object else { + return $crate::__std::result::Result::Err($crate::pdf::parse::PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: $crate::__std::stringify!($Struct), + }); + }; + let pos = object.pos(); + let mut object = object.into_fields(); + let object_mut = $crate::__std::sync::Arc::make_mut(&mut object); + let _ = object_mut; + $($crate::pdf::parse::pdf_parse! { + @impl_struct_field(pos, object, object_mut) + [] + $(#[$($field_meta)*])* + $field_name: $field_ty + })* + $crate::__std::result::Result::Ok(Self { + $($field_name,)* + }) + } + } + }; + ( + @impl_struct_field($pos:ident, $object:ident, $object_mut:ident) + [$(#[$($prev_field_meta:tt)*])*] + #[pdf $pdf_meta:tt] + $(#[$($field_meta:tt)*])* + $field_name:ident: $field_ty:ty + ) => { + $crate::pdf::parse::pdf_parse! { + @impl_struct_field($pos, $object, $object_mut, pdf $pdf_meta) + [$(#[$($prev_field_meta)*])*] + $(#[$($field_meta)*])* + $field_name: $field_ty + } + }; + ( + @impl_struct_field($pos:ident, $object:ident, $object_mut:ident $($pdf_meta:tt)*) + [$(#[$($prev_field_meta:tt)*])*] + #[$($next_field_meta:tt)*] + $(#[$($field_meta:tt)*])* + $field_name:ident: $field_ty:ty + ) => { + $crate::pdf::parse::pdf_parse! 
{ + @impl_struct_field($pos, $object, $object_mut $($pdf_meta)*) + [$(#[$($prev_field_meta)*])* #[$($next_field_meta)*]] + $(#[$($field_meta)*])* + $field_name: $field_ty + } + }; + ( + @impl_struct_field($pos:ident, $object:ident, $object_mut:ident, pdf(flatten)) + [$(#[$($field_meta:tt)*])*] + $field_name:ident: $field_ty:ty + ) => { + let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( + $crate::pdf::object::PdfObject::Dictionary( + $crate::pdf::object::PdfDictionary::from_fields($pos, $object), + ), + )?; + }; + ( + @impl_struct_field($pos:ident, $object:ident, $object_mut:ident, pdf(name = $name:expr)) + [$(#[$($field_meta:tt)*])*] + $field_name:ident: $field_ty:ty + ) => { + let $field_name = $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ); + let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( + $object_mut + .remove(&$field_name) + .unwrap_or($crate::pdf::object::PdfObject::Null($crate::pdf::object::PdfNull::new($pos))), + )?; + }; + ( + $(#[$($enum_meta:tt)*])* + $enum_vis:vis enum $Enum:ident { + $(#[pdf $($pdf_meta:tt)*] + $(#[$($variant_meta:tt)*])* + $VariantName:ident $(($($variant_paren_body:tt)*))? $({$($variant_brace_body:tt)*})?,)* + } + ) => { + $(#[$($enum_meta)*])* + $enum_vis enum $Enum { + $($(#[$($variant_meta)*])* + $VariantName $(($($variant_paren_body)*))? $({$($variant_brace_body)*})?,)* + } + + $crate::pdf::parse::pdf_parse! { + @impl + $(#[$($enum_meta)*])* + enum $Enum { + $(#[pdf $($pdf_meta)*] + $(#[$($variant_meta)*])* + $VariantName $(($($variant_paren_body)*))? $({$($variant_brace_body)*})?,)* + } + } + }; + ( + @impl + $(#[$($enum_meta:tt)*])* + enum $Enum:ident { + $(#[pdf(name = $name:expr)] + $(#[$($variant_meta:tt)*])* + $VariantName:ident,)* + $(#[pdf(other)] + $(#[$($variant_meta_other:tt)*])* + $VariantNameOther:ident($($PdfName:tt)*),)? 
+ } + ) => { + impl $crate::__std::convert::From<$Enum> for $crate::pdf::object::PdfName { + fn from(value: $Enum) -> Self { + match value { + $($Enum::$VariantName => $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ),)* + $($Enum::$VariantNameOther(v) => $crate::__std::convert::Into::into(v),)? + } + } + } + + $crate::pdf::parse::pdf_parse! { + @impl_try_from + $(#[$($enum_meta)*])* + enum $Enum { + $(#[pdf(name = $name)] + $(#[$($variant_meta)*])* + $VariantName,)* + $(#[pdf(other)] + $(#[$($variant_meta_other)*])* + $VariantNameOther($($PdfName)*),)? + } + } + + impl $crate::pdf::parse::PdfParse for $Enum { + fn type_name() -> $crate::__std::borrow::Cow<'static, $crate::__std::primitive::str> { + /* report the enum's own identifier; `$Enum` is the name metavariable bound by this arm */ $crate::__std::borrow::Cow::Borrowed($crate::__std::stringify!($Enum)) + } + fn parse(object: $crate::pdf::object::PdfObject) -> $crate::__std::result::Result { + let object = $crate::__std::convert::From::from(object); + let $crate::pdf::object::PdfObjectDirect::Name(name) = object else { + return $crate::__std::result::Result::Err($crate::pdf::parse::PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: $crate::__std::stringify!($Enum), /* expected type is the enum being parsed */ + }); + }; + $crate::__std::result::Result::Ok($crate::__std::convert::TryInto::<$Enum>::try_into(name)?) 
+ } + } + }; + ( + @impl_try_from + $(#[$($enum_meta:tt)*])* + enum $Enum:ident { + $(#[pdf(name = $name:expr)] + $(#[$($variant_meta:tt)*])* + $VariantName:ident,)* + #[pdf(other)] + $(#[$($variant_meta_other:tt)*])* + $VariantNameOther:ident(PdfName), + } + ) => { + impl $crate::__std::convert::From<$crate::pdf::object::PdfName> for $Enum { + fn from(name: $crate::pdf::object::PdfName) -> Self { + $(if name == $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ) { + $Enum::$VariantName + } else)* { + $Enum::$VariantNameOther(name) + } + } + } + }; + ( + @impl_try_from + $(#[$($enum_meta:tt)*])* + enum $Enum:ident { + $(#[pdf(name = $name:expr)] + $(#[$($variant_meta:tt)*])* + $VariantName:ident,)* + #[pdf(other)] + $(#[$($variant_meta_other:tt)*])* + $VariantNameOther:ident($PdfName:ty), + } + ) => { + impl $crate::__std::convert::TryFrom<$crate::pdf::object::PdfName> for $Enum { + type Error = $crate::pdf::parse::PdfParseError; + + fn try_from(name: $crate::pdf::object::PdfName) -> $crate::__std::result::Result { + $(if name == $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ) { + $crate::__std::result::Result::Ok($Enum::$VariantName) + } else)* { + $crate::__std::result::Result::Ok($Enum::$VariantNameOther($crate::__std::convert::TryInto::<$PdfName>::try_into(name)?)) + } + } + } + }; + ( + @impl_try_from + $(#[$($enum_meta:tt)*])* + enum $Enum:ident { + $(#[pdf(name = $name:expr)] + $(#[$($variant_meta:tt)*])* + $VariantName:ident,)* + } + ) => { + impl $crate::__std::convert::TryFrom<$crate::pdf::object::PdfName> for $Enum { + type Error = $crate::pdf::parse::PdfParseError; + + fn try_from(name: $crate::pdf::object::PdfName) -> $crate::__std::result::Result { + $(if name == $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ) { + $crate::__std::result::Result::Ok($Enum::$VariantName) + } else)* { + 
$crate::__std::result::Result::Err($crate::pdf::parse::PdfParseError::InvalidName { + pos: name.pos(), + name, + expected_ty: $crate::__std::stringify!($Enum), /* name the enum whose variants were all rejected */ + }) + } + } + } + }; +} + +pub use pdf_parse; diff --git a/src/util.rs b/src/util.rs index e234a7d..a7a4978 100644 --- a/src/util.rs +++ b/src/util.rs @@ -5,7 +5,7 @@ use std::{ sync::Arc, }; -pub(crate) enum ArcOrRef<'a, T: ?Sized> { +pub enum ArcOrRef<'a, T: ?Sized> { Arc(Arc), Ref(&'a T), }