From 5247d69ebd94e5540e54e7e833a9bce40e511555 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 23 Dec 2025 04:41:09 -0800 Subject: [PATCH 01/42] WIP rust implementation --- .gitignore | 3 +- Cargo.lock | 75 +++++ Cargo.toml | 9 + src/lib.rs | 2 + src/pdf.rs | 956 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/util.rs | 102 ++++++ 6 files changed, 1146 insertions(+), 1 deletion(-) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/lib.rs create mode 100644 src/pdf.rs create mode 100644 src/util.rs diff --git a/.gitignore b/.gitignore index 50e4eb1..e11e257 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ *.egg-info __pycache__ *.log -/powerisa-instructions.xml \ No newline at end of file +/powerisa-instructions.xml +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..2d752ff --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,75 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "parse_powerisa_pdf" +version = "0.1.0" +dependencies = [ + "serde", +] + +[[package]] +name = "proc-macro2" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d2f159a --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "parse_powerisa_pdf" +version = "0.1.0" 
+edition = "2024" +license = "LGPL-3.0-or-later" + +[dependencies] +serde = { version = "1.0.228", features = ["derive"] } + diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..938fe11 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,2 @@ +mod pdf; +mod util; diff --git a/src/pdf.rs b/src/pdf.rs new file mode 100644 index 0000000..0af9d6b --- /dev/null +++ b/src/pdf.rs @@ -0,0 +1,956 @@ +use crate::util::ArcOrRef; +use serde::{de, forward_to_deserialize_any}; +use std::{ + cell::RefCell, + collections::BTreeMap, + convert::Infallible, + fmt::{self, Write}, + iter::FusedIterator, + marker::PhantomData, + num::NonZero, + sync::{Arc, Weak}, +}; + +#[derive(Debug)] +pub(crate) enum PdfParseError { + InvalidFieldKind { + containing_ty: &'static str, + field: &'static str, + expected_kind: &'static str, + kind: &'static str, + }, + Custom(String), +} + +impl From for PdfParseError { + fn from(value: Infallible) -> Self { + match value {} + } +} + +impl fmt::Display for PdfParseError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + &PdfParseError::InvalidFieldKind { + containing_ty, + field, + expected_kind, + kind, + } => write!( + f, + "invalid field kind: {containing_ty}.{field}: expected {expected_kind}, got {kind}" + ), + PdfParseError::Custom(msg) => f.write_str(msg), + } + } +} + +impl std::error::Error for PdfParseError {} + +impl de::Error for PdfParseError { + fn custom(msg: T) -> Self + where + T: fmt::Display, + { + PdfParseError::Custom(msg.to_string()) + } +} + +impl<'de> de::IntoDeserializer<'de, PdfParseError> for PdfName { + type Deserializer = PdfObject; + + fn into_deserializer(self) -> Self::Deserializer { + self.into() + } +} + +impl<'de> de::IntoDeserializer<'de, PdfParseError> for PdfObject { + type Deserializer = Self; + + fn into_deserializer(self) -> Self::Deserializer { + self + } +} + +impl PdfObject { + const SERDE_FIELD_NAME: &str = "__PdfObject__look_in_thread_local"; + const 
SERDE_NAME_AND_FIELDS: (&str, &[&str]) = ("PdfObject", &[Self::SERDE_FIELD_NAME]); + fn with_thread_local(f: impl FnOnce(&RefCell>) -> R) -> R { + thread_local! { + static CURRENT_OBJECT: RefCell> = const { RefCell::new(None) }; + } + CURRENT_OBJECT.with(f) + } + fn set_thread_local_scoped(self, f: impl FnOnce() -> R) -> R { + Self::with_thread_local(|current_object| { + struct PutBackOnDrop<'a> { + current_object: &'a RefCell>, + old_object: Option, + } + impl Drop for PutBackOnDrop<'_> { + fn drop(&mut self) { + self.current_object.replace(self.old_object.take()); + } + } + let put_back_on_drop = PutBackOnDrop { + current_object, + old_object: current_object.replace(Some(self)), + }; + let retval = f(); + drop(put_back_on_drop); + retval + }) + } + fn take_thread_local() -> Option { + Self::with_thread_local(RefCell::take) + } +} + +trait PdfObjectDeserializeHelperTrait: Sized { + fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result; + fn from_pdf_object( + value: PdfObject, + expected: &dyn de::Expected, + ) -> Result; +} + +struct PdfObjectDeserializeHelper(T); + +impl<'de, T: PdfObjectDeserializeHelperTrait> de::Deserialize<'de> + for PdfObjectDeserializeHelper +{ + fn deserialize(deserializer: D) -> Result + where + D: de::Deserializer<'de>, + { + struct PdfObjectVisitor(PhantomData); + fn expected_pdf_object() -> E { + de::Error::invalid_type(de::Unexpected::Map, &PdfObjectVisitor::(PhantomData)) + } + impl<'de, T: PdfObjectDeserializeHelperTrait> de::Visitor<'de> for PdfObjectVisitor { + type Value = PdfObject; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + T::expecting(formatter) + } + + fn visit_map(self, mut map: A) -> Result + where + A: de::MapAccess<'de>, + { + struct Field(PhantomData); + impl<'de, T: PdfObjectDeserializeHelperTrait> de::Deserialize<'de> for Field { + fn deserialize(deserializer: D) -> Result + where + D: de::Deserializer<'de>, + { + deserializer.deserialize_identifier(Field(PhantomData)) + } + } + 
impl<'de, T: PdfObjectDeserializeHelperTrait> de::Visitor<'de> for Field { + type Value = Self; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + T::expecting(formatter) + } + + fn visit_str(self, v: &str) -> Result + where + E: de::Error, + { + if v == PdfObject::SERDE_FIELD_NAME { + Ok(self) + } else { + Err(expected_pdf_object::()) + } + } + } + let (Field::(PhantomData), ()) = map + .next_entry()? + .ok_or_else(expected_pdf_object::)?; + let None = map.next_entry::, ()>()? else { + return Err(expected_pdf_object::<_, T>()); + }; + PdfObject::take_thread_local().ok_or_else(expected_pdf_object::<_, T>) + } + } + let (name, fields) = PdfObject::SERDE_NAME_AND_FIELDS; + let pdf_object = + deserializer.deserialize_struct(name, fields, PdfObjectVisitor::(PhantomData))?; + T::from_pdf_object::(pdf_object, &PdfObjectVisitor::(PhantomData)).map(Self) + } +} + +macro_rules! forward_deserialize_to_pdf_object_helper { + ($ty:ty) => { + impl<'de> de::Deserialize<'de> for $ty { + fn deserialize(deserializer: D) -> Result + where + D: de::Deserializer<'de>, + { + let PdfObjectDeserializeHelper(v) = de::Deserialize::deserialize(deserializer)?; + Ok(v) + } + } + }; +} + +forward_deserialize_to_pdf_object_helper!(PdfObject); + +impl PdfObjectDeserializeHelperTrait for PdfObject { + fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("PdfObject") + } + + fn from_pdf_object( + value: PdfObject, + _expected: &dyn de::Expected, + ) -> Result { + Ok(value) + } +} + +forward_deserialize_to_pdf_object_helper!(PdfObjectIndirect); + +impl PdfObjectDeserializeHelperTrait for PdfObjectIndirect { + fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("PdfObjectIndirect") + } + + fn from_pdf_object( + value: PdfObject, + expected: &dyn de::Expected, + ) -> Result { + match value { + PdfObject::Indirect(v) => Ok(v), + _ => Err(E::invalid_type(value.as_unexpected(), expected)), + } + } +} + 
+forward_deserialize_to_pdf_object_helper!(PdfString); + +impl PdfObjectDeserializeHelperTrait for PdfString { + fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("PdfString") + } + + fn from_pdf_object( + value: PdfObject, + expected: &dyn de::Expected, + ) -> Result { + match value { + PdfObject::String(v) => Ok(v), + _ => Err(E::invalid_type(value.as_unexpected(), expected)), + } + } +} + +forward_deserialize_to_pdf_object_helper!(PdfName); + +impl PdfObjectDeserializeHelperTrait for PdfName { + fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("PdfName") + } + + fn from_pdf_object( + value: PdfObject, + expected: &dyn de::Expected, + ) -> Result { + match value { + PdfObject::Name(v) => Ok(v), + _ => Err(E::invalid_type(value.as_unexpected(), expected)), + } + } +} + +forward_deserialize_to_pdf_object_helper!(PdfArray); + +impl PdfObjectDeserializeHelperTrait for PdfArray { + fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("PdfArray") + } + + fn from_pdf_object( + value: PdfObject, + expected: &dyn de::Expected, + ) -> Result { + match value { + PdfObject::Array(v) => Ok(v), + _ => Err(E::invalid_type(value.as_unexpected(), expected)), + } + } +} + +forward_deserialize_to_pdf_object_helper!(PdfDictionary); + +impl PdfObjectDeserializeHelperTrait for PdfDictionary { + fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("PdfDictionary") + } + + fn from_pdf_object( + value: PdfObject, + expected: &dyn de::Expected, + ) -> Result { + match value { + PdfObject::Dictionary(v) => Ok(v), + _ => Err(E::invalid_type(value.as_unexpected(), expected)), + } + } +} + +forward_deserialize_to_pdf_object_helper!(PdfStream); + +impl PdfObjectDeserializeHelperTrait for PdfStream { + fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("PdfStream") + } + + fn from_pdf_object( + value: PdfObject, + expected: &dyn de::Expected, + ) -> Result { + match value { + 
PdfObject::Stream(v) => Ok(v), + _ => Err(E::invalid_type(value.as_unexpected(), expected)), + } + } +} + +impl<'de> de::Deserializer<'de> for PdfObject { + type Error = PdfParseError; + + fn deserialize_any(self, visitor: V) -> Result + where + V: de::Visitor<'de>, + { + match PdfObjectDirect::from(self) { + PdfObjectDirect::Boolean(v) => visitor.visit_bool(v), + PdfObjectDirect::Integer(v) => visitor.visit_i32(v), + PdfObjectDirect::Real(v) => visitor.visit_f32(v), + v @ (PdfObjectDirect::String(_) | PdfObjectDirect::Stream(_)) => { + Err(de::Error::invalid_type(v.as_unexpected(), &visitor)) + } + PdfObjectDirect::Name(v) => { + if let Ok(v) = str::from_utf8(v.as_bytes()) { + visitor.visit_str(v) + } else { + Err(de::Error::invalid_type( + PdfObject::from(v).as_unexpected(), + &visitor, + )) + } + } + PdfObjectDirect::Array(v) => { + visitor.visit_seq(de::value::SeqDeserializer::new(v.iter().cloned())) + } + PdfObjectDirect::Dictionary(v) => { + visitor.visit_map(de::value::MapDeserializer::new(v.into_iter())) + } + PdfObjectDirect::Null(PdfNull {}) => visitor.visit_unit(), + } + } + + fn deserialize_struct( + self, + name: &'static str, + fields: &'static [&'static str], + visitor: V, + ) -> Result + where + V: de::Visitor<'de>, + { + match (name, fields) { + PdfObject::SERDE_NAME_AND_FIELDS => self.set_thread_local_scoped(|| { + visitor.visit_map(de::value::MapDeserializer::new(std::iter::once(( + PdfObject::SERDE_FIELD_NAME, + (), + )))) + }), + _ => self.deserialize_any(visitor), + } + } + + fn deserialize_option(self, visitor: V) -> Result + where + V: de::Visitor<'de>, + { + let is_null = match self { + Self::Indirect(ref v) => !v.exists(), + Self::Null(_) => true, + _ => false, + }; + if is_null { + visitor.visit_none() + } else { + visitor.visit_some(self) + } + } + + fn deserialize_newtype_struct( + self, + _name: &'static str, + visitor: V, + ) -> Result + where + V: de::Visitor<'de>, + { + visitor.visit_newtype_struct(self) + } + + 
forward_to_deserialize_any! { + bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string + bytes byte_buf unit unit_struct seq tuple + tuple_struct map enum identifier ignored_any + } +} + +#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct PdfString { + bytes: ArcOrRef<'static, [u8]>, +} + +impl std::fmt::Debug for PdfString { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PdfString") + .field("bytes", &&*self.bytes) + .finish() + } +} + +impl PdfString { + pub(crate) fn new(bytes: ArcOrRef<'static, [u8]>) -> Self { + Self { bytes } + } + pub(crate) fn bytes(&self) -> &ArcOrRef<'static, [u8]> { + &self.bytes + } +} + +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub(crate) struct PdfName { + bytes: ArcOrRef<'static, [u8]>, +} + +impl PdfName { + pub(crate) fn try_new(bytes: impl Into>) -> Option { + let bytes = bytes.into(); + if bytes.contains(&0) { + None + } else { + Some(Self { bytes }) + } + } + #[track_caller] + pub(crate) const fn new_static(bytes: &'static [u8]) -> Self { + let mut i = 0; + while i < bytes.len() { + if bytes[i] == 0 { + panic!("shouldn't contain any nul bytes"); + } + i += 1; + } + Self { + bytes: ArcOrRef::Ref(bytes), + } + } + #[track_caller] + pub(crate) fn new(bytes: ArcOrRef<'static, [u8]>) -> Self { + Self::try_new(bytes).expect("shouldn't contain any nul bytes") + } + pub(crate) fn as_bytes(&self) -> &ArcOrRef<'static, [u8]> { + &self.bytes + } +} + +macro_rules! make_pdf_names { + ( + $vis:vis mod $pdf_names:ident { + $($ident:ident;)* + } + ) => { + $vis mod $pdf_names { + $(#[allow(non_upper_case_globals)] + $vis const $ident: $crate::pdf::PdfName = $crate::pdf::PdfName::new_static(stringify!($ident).as_bytes());)* + } + }; +} + +make_pdf_names! 
{ + pub(crate) mod pdf_names { + DecodeParms; + DL; + F; + FDecodeParms; + FFilter; + Filter; + Length; + } +} + +impl fmt::Debug for PdfName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "PdfName({self})") + } +} + +impl fmt::Display for PdfName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("/")?; + for &b in self.bytes.iter() { + match b { + 0x21..=0x7E if b != b'#' => f.write_char(b.into())?, + _ => write!(f, "#{b:02X}")?, + } + } + Ok(()) + } +} + +macro_rules! make_pdf_object { + ( + $( + #[from = $($from:ident)?, as_unexpected = |$as_unexpected_arg:pat_param| $as_unexpected_expr:expr] + $Variant:ident($ty:ty), + )+ + ) => { + #[derive(Clone, Debug, PartialEq)] + pub(crate) enum PdfObjectNonNull { + $($Variant($ty),)* + } + + #[derive(Clone, Debug, PartialEq)] + pub(crate) enum PdfObjectDirect { + $($Variant($ty),)* + Null(PdfNull), + } + + #[derive(Clone, Debug, PartialEq)] + pub(crate) enum PdfObject { + $($Variant($ty),)* + Null(PdfNull), + Indirect(PdfObjectIndirect), + } + + $($( + impl From<$ty> for PdfObjectNonNull { + fn $from(value: $ty) -> Self { + Self::$Variant(value) + } + } + + impl From<$ty> for PdfObjectDirect { + fn $from(value: $ty) -> Self { + Self::$Variant(value) + } + } + + impl From<$ty> for PdfObject { + fn $from(value: $ty) -> Self { + Self::$Variant(value) + } + } + + impl From> for PdfObjectDirect { + fn $from(value: Option<$ty>) -> Self { + match value { + Some(value) => Self::$Variant(value), + None => Self::Null(PdfNull), + } + } + } + + impl From> for PdfObject { + fn $from(value: Option<$ty>) -> Self { + match value { + Some(value) => Self::$Variant(value), + None => Self::Null(PdfNull), + } + } + } + )?)* + + impl From for PdfObjectDirect { + fn from(value: PdfObjectNonNull) -> Self { + match value { + $(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* + } + } + } + + impl From for PdfObject { + fn from(value: PdfObjectNonNull) -> Self { + match value { + 
$(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* + } + } + } + + impl From for PdfObject { + fn from(value: PdfObjectDirect) -> Self { + match value { + $(PdfObjectDirect::$Variant(v) => Self::$Variant(v),)* + PdfObjectDirect::Null(v) => Self::Null(v), + } + } + } + + impl From for PdfObjectDirect { + fn from(value: PdfObject) -> Self { + match value { + $(PdfObject::$Variant(v) => Self::$Variant(v),)* + PdfObject::Null(v) => Self::Null(v), + PdfObject::Indirect(v) => v.into(), + } + } + } + + impl PdfObjectNonNull { + fn as_unexpected(&self) -> de::Unexpected<'static> { + match *self { + $(PdfObjectNonNull::$Variant($as_unexpected_arg) => $as_unexpected_expr,)* + } + } + } + + impl PdfObjectDirect { + fn as_unexpected(&self) -> de::Unexpected<'static> { + match *self { + $(PdfObjectDirect::$Variant($as_unexpected_arg) => $as_unexpected_expr,)* + PdfObjectDirect::Null(_) => de::Unexpected::Option, + } + } + } + + impl PdfObject { + fn as_unexpected(&self) -> de::Unexpected<'static> { + match *self { + $(PdfObject::$Variant($as_unexpected_arg) => $as_unexpected_expr,)* + PdfObject::Null(_) => de::Unexpected::Option, + PdfObject::Indirect(ref v) => v.get().as_unexpected(), + } + } + } + + const _: () = { + fn _assert_impls_deserialize() {} + + $(let _ = _assert_impls_deserialize::<$ty>;)* + }; + }; +} + +make_pdf_object! 
{ + #[from = from, as_unexpected = |v| de::Unexpected::Bool(v)] + Boolean(bool), + #[from = from, as_unexpected = |v| de::Unexpected::Signed(v.into())] + Integer(i32), + #[from = from, as_unexpected = |v| de::Unexpected::Float(v.into())] + Real(f32), + #[from = from, as_unexpected = |_| de::Unexpected::Other("PdfString")] + String(PdfString), + #[from = from, as_unexpected = |_| de::Unexpected::Other("PdfName")] + Name(PdfName), + #[from = from, as_unexpected = |_| de::Unexpected::Seq] + Array(PdfArray), + #[from = from, as_unexpected = |_| de::Unexpected::Map] + Dictionary(PdfDictionary), + #[from = from, as_unexpected = |_| de::Unexpected::Other("PdfStream")] + Stream(PdfStream), +} + +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct PdfNull; + +impl From for PdfObjectDirect { + fn from(v: PdfNull) -> Self { + Self::Null(v) + } +} + +impl From for PdfObject { + fn from(v: PdfNull) -> Self { + Self::Null(v) + } +} + +impl From for PdfObject { + fn from(v: PdfObjectIndirect) -> Self { + Self::Indirect(v) + } +} + +#[derive(Clone, Debug)] +pub(crate) struct PdfObjectIndirect { + xref_table: Weak, + object_number: NonZero, + generation_number: u16, +} + +impl PartialEq for PdfObjectIndirect { + fn eq(&self, other: &Self) -> bool { + let Self { + xref_table, + object_number, + generation_number, + } = self; + xref_table.ptr_eq(&other.xref_table) + && *object_number == other.object_number + && *generation_number == other.generation_number + } +} + +impl PdfObjectIndirect { + pub fn exists(&self) -> bool { + todo!() + } + pub fn get(&self) -> PdfObjectDirect { + todo!() + } +} + +impl From for PdfObjectDirect { + fn from(value: PdfObjectIndirect) -> Self { + value.get() + } +} + +#[derive(Clone, PartialEq)] +pub(crate) struct PdfDictionary { + fields: Arc>, +} + +impl PdfDictionary { + pub(crate) fn fields(&self) -> &Arc> { + &self.fields + } + pub(crate) fn into_fields(self) -> Arc> { + self.fields + } + pub(crate) fn iter(&self) 
-> std::collections::btree_map::Iter<'_, PdfName, PdfObject> { + self.fields.iter() + } + pub(crate) fn contains_key(&self, key: &Q) -> bool + where + PdfName: std::borrow::Borrow + Ord, + Q: Ord, + { + self.fields.contains_key(key) + } + pub(crate) fn get(&self, key: &Q) -> Option<&PdfObject> + where + PdfName: std::borrow::Borrow + Ord, + Q: Ord, + { + self.fields.get(key) + } +} + +impl FromIterator<(PdfName, PdfObject)> for PdfDictionary { + fn from_iter>(iter: T) -> Self { + Self { + fields: Arc::new(BTreeMap::from_iter( + iter.into_iter() + .filter(|(_name, value)| !matches!(value, PdfObject::Null(_))), + )), + } + } +} + +impl IntoIterator for PdfDictionary { + type Item = (PdfName, PdfObject); + type IntoIter = std::collections::btree_map::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + Arc::unwrap_or_clone(self.fields).into_iter() + } +} + +impl<'a> IntoIterator for &'a PdfDictionary { + type Item = (&'a PdfName, &'a PdfObject); + type IntoIter = std::collections::btree_map::Iter<'a, PdfName, PdfObject>; + + fn into_iter(self) -> Self::IntoIter { + self.fields.iter() + } +} + +impl fmt::Debug for PdfDictionary { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_map().entries(self).finish() + } +} + +#[derive(Clone, Default, PartialEq)] +pub(crate) struct PdfArray { + elements: Arc<[PdfObject]>, +} + +impl PdfArray { + pub(crate) fn new() -> Self { + Self::default() + } + pub(crate) fn elements(&self) -> &Arc<[PdfObject]> { + &self.elements + } + pub(crate) fn into_elements(self) -> Arc<[PdfObject]> { + self.elements + } + pub(crate) fn iter(&self) -> std::slice::Iter<'_, PdfObject> { + self.elements.iter() + } +} + +impl FromIterator for PdfArray { + fn from_iter>(iter: T) -> Self { + Self { + elements: Arc::from_iter(iter), + } + } +} + +#[derive(Clone)] +pub(crate) struct PdfArrayIntoIter { + indexes: std::ops::Range, + elements: Arc<[PdfObject]>, +} + +impl Iterator for PdfArrayIntoIter { + type Item = PdfObject; + + fn 
next(&mut self) -> Option { + self.indexes.next().map(|i| self.elements[i].clone()) + } + + fn size_hint(&self) -> (usize, Option) { + self.indexes.size_hint() + } + + fn nth(&mut self, n: usize) -> Option { + self.indexes.nth(n).map(|i| self.elements[i].clone()) + } + + fn last(self) -> Option { + self.indexes.last().map(|i| self.elements[i].clone()) + } + + fn fold(self, init: B, mut f: F) -> B + where + F: FnMut(B, Self::Item) -> B, + { + self.indexes + .fold(init, |init, i| f(init, self.elements[i].clone())) + } +} + +impl FusedIterator for PdfArrayIntoIter {} + +impl DoubleEndedIterator for PdfArrayIntoIter { + fn next_back(&mut self) -> Option { + self.indexes.next_back().map(|i| self.elements[i].clone()) + } + fn nth_back(&mut self, n: usize) -> Option { + self.indexes.nth_back(n).map(|i| self.elements[i].clone()) + } + fn rfold(self, init: B, mut f: F) -> B + where + F: FnMut(B, Self::Item) -> B, + { + self.indexes + .rfold(init, |init, i| f(init, self.elements[i].clone())) + } +} + +impl ExactSizeIterator for PdfArrayIntoIter {} + +impl IntoIterator for PdfArray { + type Item = PdfObject; + type IntoIter = PdfArrayIntoIter; + + fn into_iter(self) -> Self::IntoIter { + PdfArrayIntoIter { + indexes: 0..self.elements.len(), + elements: self.elements, + } + } +} + +impl<'a> IntoIterator for &'a PdfArray { + type Item = &'a PdfObject; + type IntoIter = std::slice::Iter<'a, PdfObject>; + + fn into_iter(self) -> Self::IntoIter { + self.elements.iter() + } +} + +impl fmt::Debug for PdfArray { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.elements.fmt(f) + } +} + +#[derive(Clone, Debug, PartialEq)] +pub(crate) struct PdfStream { + dictionary: PdfDictionary, + data: Arc, +} + +pub(crate) enum PdfBody {} + +pub(crate) struct PdfObjects {} + +pub(crate) struct PdfXRefTable {} + +pub(crate) struct Pdf { + pub(crate) header: PdfHeader, + pub(crate) body: PdfBody, +} + +pub(crate) struct PdfHeader {} + +#[cfg(test)] + +mod tests { + use super::*; + 
+ #[test] + fn test_deserialize_dict() -> Result<(), PdfParseError> { + #[derive(serde::Deserialize, Debug, PartialEq)] + struct TestStruct { + a: i32, + c: i32, + b: i32, + #[serde(flatten)] + others: PdfDictionary, + } + + let v: TestStruct = + de::Deserialize::deserialize(PdfObject::from(PdfDictionary::from_iter([ + (PdfName::new_static(b"a"), 1.into()), + (PdfName::new_static(b"c"), 7.into()), + (PdfName::new_static(b"b"), 5.into()), + (PdfName::new_static(b"d"), false.into()), + (PdfName::new_static(b"e"), PdfNull.into()), + ( + PdfName::new_static(b"f"), + PdfString::new(ArcOrRef::Ref(b"test")).into(), + ), + ])))?; + let expected = TestStruct { + a: 1, + c: 7, + b: 5, + others: PdfDictionary::from_iter([ + (PdfName::new_static(b"d"), false.into()), + ( + PdfName::new_static(b"f"), + PdfString::new(ArcOrRef::Ref(b"test")).into(), + ), + ]), + }; + assert_eq!(v, expected); + Ok(()) + } +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..e234a7d --- /dev/null +++ b/src/util.rs @@ -0,0 +1,102 @@ +use std::{ + borrow::Borrow, + fmt, + hash::{Hash, Hasher}, + sync::Arc, +}; + +pub(crate) enum ArcOrRef<'a, T: ?Sized> { + Arc(Arc), + Ref(&'a T), +} + +impl<'a, T: ?Sized> AsRef for ArcOrRef<'a, T> { + fn as_ref(&self) -> &T { + self + } +} + +impl<'a, T: ?Sized> Borrow for ArcOrRef<'a, T> { + fn borrow(&self) -> &T { + self + } +} + +impl<'a, T: ?Sized> From> for ArcOrRef<'a, T> { + fn from(value: Arc) -> Self { + Self::Arc(value) + } +} + +impl<'a, T: ?Sized> From<&'a T> for ArcOrRef<'a, T> { + fn from(value: &'a T) -> Self { + Self::Ref(value) + } +} + +impl<'a, T: ?Sized> Default for ArcOrRef<'a, T> +where + &'a T: Default, +{ + fn default() -> Self { + Self::Ref(Default::default()) + } +} + +impl Clone for ArcOrRef<'_, T> { + fn clone(&self) -> Self { + match self { + Self::Arc(v) => Self::Arc(v.clone()), + Self::Ref(v) => Self::Ref(v), + } + } +} + +impl Hash for ArcOrRef<'_, T> { + fn hash(&self, state: &mut H) { + T::hash(self, state) 
+ } +} + +impl<'a, 'b, T: ?Sized + PartialEq, U: ?Sized> PartialEq> for ArcOrRef<'a, T> { + fn eq(&self, other: &ArcOrRef<'b, U>) -> bool { + T::eq(self, other) + } +} + +impl Eq for ArcOrRef<'_, T> {} + +impl<'a, 'b, T: ?Sized + PartialOrd, U: ?Sized> PartialOrd> for ArcOrRef<'a, T> { + fn partial_cmp(&self, other: &ArcOrRef<'b, U>) -> Option { + T::partial_cmp(self, other) + } +} + +impl Ord for ArcOrRef<'_, T> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + T::cmp(self, other) + } +} + +impl std::ops::Deref for ArcOrRef<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + match self { + ArcOrRef::Arc(v) => v, + ArcOrRef::Ref(v) => v, + } + } +} + +impl fmt::Debug for ArcOrRef<'_, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + T::fmt(self, f) + } +} + +impl fmt::Display for ArcOrRef<'_, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + T::fmt(self, f) + } +} From 5fbfaa80531275c8ec879fbe7126e91d8b8135b8 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Wed, 24 Dec 2025 07:12:48 -0800 Subject: [PATCH 02/42] WIP --- Cargo.lock | 68 -- Cargo.toml | 1 - src/lib.rs | 7 +- src/main.rs | 45 ++ src/pdf.rs | 1746 ++++++++++++++++++++++----------------------- src/pdf/object.rs | 1111 ++++++++++++++++++++++++++++ src/pdf/parse.rs | 953 +++++++++++++++++++++++++ src/util.rs | 2 +- 8 files changed, 2975 insertions(+), 958 deletions(-) create mode 100644 src/main.rs create mode 100644 src/pdf/object.rs create mode 100644 src/pdf/parse.rs diff --git a/Cargo.lock b/Cargo.lock index 2d752ff..3cb67c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,71 +5,3 @@ version = 4 [[package]] name = "parse_powerisa_pdf" version = "0.1.0" -dependencies = [ - "serde", -] - -[[package]] -name = "proc-macro2" -version = "1.0.103" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" -dependencies = [ - "unicode-ident", -] - -[[package]] -name 
= "quote" -version = "1.0.42" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", - "serde_derive", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "syn" -version = "2.0.111" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "unicode-ident" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" diff --git a/Cargo.toml b/Cargo.toml index d2f159a..125e5e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,5 +5,4 @@ edition = "2024" license = "LGPL-3.0-or-later" [dependencies] -serde = { version = "1.0.228", features = ["derive"] } diff --git a/src/lib.rs b/src/lib.rs index 938fe11..d0e7860 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,5 @@ -mod pdf; -mod util; +#[doc(hidden)] +pub use std as __std; + +pub mod pdf; +pub mod util; diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..34539a3 --- /dev/null +++ b/src/main.rs @@ 
-0,0 +1,45 @@ +use parse_powerisa_pdf::pdf::{Pdf, PdfTrailer}; +use std::{ + error::Error, + io::{IsTerminal, Read}, + process::ExitCode, +}; + +fn main() -> Result> { + let args: Vec<_> = std::env::args_os().collect(); + if args + .iter() + .skip(1) + .any(|v| v.as_encoded_bytes().starts_with(b"-") && v != "-") + || args.len() > 2 + || (args.len() == 1 && std::io::stdin().is_terminal()) + { + eprintln!( + "Usage: {} []\n\ + Reads the PDF file passed on the command line,\n\ + Reads stdin if no arguments are passed or if the file name is just a dash `-`.\n\ + If stdin is a terminal, you have to pass `-` explicitly to read from it.", + args[0].display() + ); + return Ok(ExitCode::FAILURE); + } + let file_path = args.get(1).filter(|v| *v != "-"); + let input = if let Some(file_path) = file_path { + std::fs::read(file_path)? + } else { + let mut buf = Vec::new(); + std::io::stdin().lock().read_to_end(&mut buf)?; + buf + }; + let pdf = Pdf::parse(input)?; + if let PdfTrailer::Stream { + xref_stream, + start_xref, + } = pdf.trailer + { + dbg!(xref_stream.dictionary()); + } + + todo!(); + Ok(ExitCode::SUCCESS) +} diff --git a/src/pdf.rs b/src/pdf.rs index 0af9d6b..4ba785b 100644 --- a/src/pdf.rs +++ b/src/pdf.rs @@ -1,956 +1,930 @@ -use crate::util::ArcOrRef; -use serde::{de, forward_to_deserialize_any}; +use crate::{ + pdf::{ + object::{ + MaybeArray, PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, + PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfReal, PdfStream, + PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary, + }, + parse::{PdfInputPosition, PdfParse, PdfParseError}, + }, + pdf_parse, + util::ArcOrRef, +}; use std::{ - cell::RefCell, collections::BTreeMap, convert::Infallible, - fmt::{self, Write}, - iter::FusedIterator, - marker::PhantomData, + fmt, num::NonZero, - sync::{Arc, Weak}, + str::FromStr, + sync::{Arc, OnceLock}, }; -#[derive(Debug)] -pub(crate) enum PdfParseError { - InvalidFieldKind { - containing_ty: &'static str, 
- field: &'static str, - expected_kind: &'static str, - kind: &'static str, - }, - Custom(String), +pub mod object; +pub mod parse; + +pub struct PdfObjects { + objects: OnceLock>, } -impl From for PdfParseError { - fn from(value: Infallible) -> Self { - match value {} +#[derive(Copy, Clone, Debug)] +pub struct PdfHeader { + pub major: NonZero, + pub minor: u16, +} + +impl PdfHeader { + pub const PREFIX: &str = "%PDF-"; +} + +pdf_parse! { + #[derive(Clone, Debug)] + pub struct PdfTrailerDictionary { + #[pdf(name = "Size")] + pub size: usize, + #[pdf(name = "Prev")] + pub prev: Option, + #[pdf(name = "Root")] + pub root: PdfDictionary, + #[pdf(name = "Encrypt")] + pub encrypt: Option, + #[pdf(name = "Info")] + pub info: Option, + #[pdf(name = "ID")] + pub id: Option<[PdfString; 2]>, + #[pdf(flatten)] + pub rest: PdfDictionary, } } -impl fmt::Display for PdfParseError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - &PdfParseError::InvalidFieldKind { - containing_ty, - field, - expected_kind, - kind, - } => write!( - f, - "invalid field kind: {containing_ty}.{field}: expected {expected_kind}, got {kind}" - ), - PdfParseError::Custom(msg) => f.write_str(msg), - } +pdf_parse! 
{ + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] + pub enum PdfXRefName { + #[pdf(name = "XRef")] + #[default] + XRef, } } -impl std::error::Error for PdfParseError {} - -impl de::Error for PdfParseError { - fn custom(msg: T) -> Self - where - T: fmt::Display, - { - PdfParseError::Custom(msg.to_string()) - } -} - -impl<'de> de::IntoDeserializer<'de, PdfParseError> for PdfName { - type Deserializer = PdfObject; - - fn into_deserializer(self) -> Self::Deserializer { - self.into() - } -} - -impl<'de> de::IntoDeserializer<'de, PdfParseError> for PdfObject { - type Deserializer = Self; - - fn into_deserializer(self) -> Self::Deserializer { - self - } -} - -impl PdfObject { - const SERDE_FIELD_NAME: &str = "__PdfObject__look_in_thread_local"; - const SERDE_NAME_AND_FIELDS: (&str, &[&str]) = ("PdfObject", &[Self::SERDE_FIELD_NAME]); - fn with_thread_local(f: impl FnOnce(&RefCell>) -> R) -> R { - thread_local! { - static CURRENT_OBJECT: RefCell> = const { RefCell::new(None) }; - } - CURRENT_OBJECT.with(f) - } - fn set_thread_local_scoped(self, f: impl FnOnce() -> R) -> R { - Self::with_thread_local(|current_object| { - struct PutBackOnDrop<'a> { - current_object: &'a RefCell>, - old_object: Option, - } - impl Drop for PutBackOnDrop<'_> { - fn drop(&mut self) { - self.current_object.replace(self.old_object.take()); - } - } - let put_back_on_drop = PutBackOnDrop { - current_object, - old_object: current_object.replace(Some(self)), - }; - let retval = f(); - drop(put_back_on_drop); - retval - }) - } - fn take_thread_local() -> Option { - Self::with_thread_local(RefCell::take) - } -} - -trait PdfObjectDeserializeHelperTrait: Sized { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result; - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result; -} - -struct PdfObjectDeserializeHelper(T); - -impl<'de, T: PdfObjectDeserializeHelperTrait> de::Deserialize<'de> - for PdfObjectDeserializeHelper -{ - fn 
deserialize(deserializer: D) -> Result - where - D: de::Deserializer<'de>, - { - struct PdfObjectVisitor(PhantomData); - fn expected_pdf_object() -> E { - de::Error::invalid_type(de::Unexpected::Map, &PdfObjectVisitor::(PhantomData)) - } - impl<'de, T: PdfObjectDeserializeHelperTrait> de::Visitor<'de> for PdfObjectVisitor { - type Value = PdfObject; - - fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - T::expecting(formatter) - } - - fn visit_map(self, mut map: A) -> Result - where - A: de::MapAccess<'de>, - { - struct Field(PhantomData); - impl<'de, T: PdfObjectDeserializeHelperTrait> de::Deserialize<'de> for Field { - fn deserialize(deserializer: D) -> Result - where - D: de::Deserializer<'de>, - { - deserializer.deserialize_identifier(Field(PhantomData)) - } - } - impl<'de, T: PdfObjectDeserializeHelperTrait> de::Visitor<'de> for Field { - type Value = Self; - - fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - T::expecting(formatter) - } - - fn visit_str(self, v: &str) -> Result - where - E: de::Error, - { - if v == PdfObject::SERDE_FIELD_NAME { - Ok(self) - } else { - Err(expected_pdf_object::()) - } - } - } - let (Field::(PhantomData), ()) = map - .next_entry()? - .ok_or_else(expected_pdf_object::)?; - let None = map.next_entry::, ()>()? else { - return Err(expected_pdf_object::<_, T>()); - }; - PdfObject::take_thread_local().ok_or_else(expected_pdf_object::<_, T>) - } - } - let (name, fields) = PdfObject::SERDE_NAME_AND_FIELDS; - let pdf_object = - deserializer.deserialize_struct(name, fields, PdfObjectVisitor::(PhantomData))?; - T::from_pdf_object::(pdf_object, &PdfObjectVisitor::(PhantomData)).map(Self) - } -} - -macro_rules! 
forward_deserialize_to_pdf_object_helper { - ($ty:ty) => { - impl<'de> de::Deserialize<'de> for $ty { - fn deserialize(deserializer: D) -> Result - where - D: de::Deserializer<'de>, - { - let PdfObjectDeserializeHelper(v) = de::Deserialize::deserialize(deserializer)?; - Ok(v) - } - } - }; -} - -forward_deserialize_to_pdf_object_helper!(PdfObject); - -impl PdfObjectDeserializeHelperTrait for PdfObject { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfObject") - } - - fn from_pdf_object( - value: PdfObject, - _expected: &dyn de::Expected, - ) -> Result { - Ok(value) - } -} - -forward_deserialize_to_pdf_object_helper!(PdfObjectIndirect); - -impl PdfObjectDeserializeHelperTrait for PdfObjectIndirect { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfObjectIndirect") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Indirect(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfString); - -impl PdfObjectDeserializeHelperTrait for PdfString { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfString") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::String(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfName); - -impl PdfObjectDeserializeHelperTrait for PdfName { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfName") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Name(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfArray); - -impl PdfObjectDeserializeHelperTrait for PdfArray { - fn 
expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfArray") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Array(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfDictionary); - -impl PdfObjectDeserializeHelperTrait for PdfDictionary { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfDictionary") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Dictionary(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -forward_deserialize_to_pdf_object_helper!(PdfStream); - -impl PdfObjectDeserializeHelperTrait for PdfStream { - fn expecting(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("PdfStream") - } - - fn from_pdf_object( - value: PdfObject, - expected: &dyn de::Expected, - ) -> Result { - match value { - PdfObject::Stream(v) => Ok(v), - _ => Err(E::invalid_type(value.as_unexpected(), expected)), - } - } -} - -impl<'de> de::Deserializer<'de> for PdfObject { - type Error = PdfParseError; - - fn deserialize_any(self, visitor: V) -> Result - where - V: de::Visitor<'de>, - { - match PdfObjectDirect::from(self) { - PdfObjectDirect::Boolean(v) => visitor.visit_bool(v), - PdfObjectDirect::Integer(v) => visitor.visit_i32(v), - PdfObjectDirect::Real(v) => visitor.visit_f32(v), - v @ (PdfObjectDirect::String(_) | PdfObjectDirect::Stream(_)) => { - Err(de::Error::invalid_type(v.as_unexpected(), &visitor)) - } - PdfObjectDirect::Name(v) => { - if let Ok(v) = str::from_utf8(v.as_bytes()) { - visitor.visit_str(v) - } else { - Err(de::Error::invalid_type( - PdfObject::from(v).as_unexpected(), - &visitor, - )) - } - } - PdfObjectDirect::Array(v) => { - visitor.visit_seq(de::value::SeqDeserializer::new(v.iter().cloned())) - } - PdfObjectDirect::Dictionary(v) 
=> { - visitor.visit_map(de::value::MapDeserializer::new(v.into_iter())) - } - PdfObjectDirect::Null(PdfNull {}) => visitor.visit_unit(), - } - } - - fn deserialize_struct( - self, - name: &'static str, - fields: &'static [&'static str], - visitor: V, - ) -> Result - where - V: de::Visitor<'de>, - { - match (name, fields) { - PdfObject::SERDE_NAME_AND_FIELDS => self.set_thread_local_scoped(|| { - visitor.visit_map(de::value::MapDeserializer::new(std::iter::once(( - PdfObject::SERDE_FIELD_NAME, - (), - )))) - }), - _ => self.deserialize_any(visitor), - } - } - - fn deserialize_option(self, visitor: V) -> Result - where - V: de::Visitor<'de>, - { - let is_null = match self { - Self::Indirect(ref v) => !v.exists(), - Self::Null(_) => true, - _ => false, - }; - if is_null { - visitor.visit_none() - } else { - visitor.visit_some(self) - } - } - - fn deserialize_newtype_struct( - self, - _name: &'static str, - visitor: V, - ) -> Result - where - V: de::Visitor<'de>, - { - visitor.visit_newtype_struct(self) - } - - forward_to_deserialize_any! 
{ - bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string - bytes byte_buf unit unit_struct seq tuple - tuple_struct map enum identifier ignored_any - } -} - -#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord)] -pub(crate) struct PdfString { - bytes: ArcOrRef<'static, [u8]>, -} - -impl std::fmt::Debug for PdfString { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("PdfString") - .field("bytes", &&*self.bytes) - .finish() - } -} - -impl PdfString { - pub(crate) fn new(bytes: ArcOrRef<'static, [u8]>) -> Self { - Self { bytes } - } - pub(crate) fn bytes(&self) -> &ArcOrRef<'static, [u8]> { - &self.bytes - } -} - -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub(crate) struct PdfName { - bytes: ArcOrRef<'static, [u8]>, -} - -impl PdfName { - pub(crate) fn try_new(bytes: impl Into>) -> Option { - let bytes = bytes.into(); - if bytes.contains(&0) { - None - } else { - Some(Self { bytes }) - } - } - #[track_caller] - pub(crate) const fn new_static(bytes: &'static [u8]) -> Self { - let mut i = 0; - while i < bytes.len() { - if bytes[i] == 0 { - panic!("shouldn't contain any nul bytes"); - } - i += 1; - } - Self { - bytes: ArcOrRef::Ref(bytes), - } - } - #[track_caller] - pub(crate) fn new(bytes: ArcOrRef<'static, [u8]>) -> Self { - Self::try_new(bytes).expect("shouldn't contain any nul bytes") - } - pub(crate) fn as_bytes(&self) -> &ArcOrRef<'static, [u8]> { - &self.bytes - } -} - -macro_rules! make_pdf_names { - ( - $vis:vis mod $pdf_names:ident { - $($ident:ident;)* - } - ) => { - $vis mod $pdf_names { - $(#[allow(non_upper_case_globals)] - $vis const $ident: $crate::pdf::PdfName = $crate::pdf::PdfName::new_static(stringify!($ident).as_bytes());)* - } - }; -} - -make_pdf_names! 
{ - pub(crate) mod pdf_names { - DecodeParms; - DL; - F; - FDecodeParms; - FFilter; - Filter; - Length; - } -} - -impl fmt::Debug for PdfName { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "PdfName({self})") - } -} - -impl fmt::Display for PdfName { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("/")?; - for &b in self.bytes.iter() { - match b { - 0x21..=0x7E if b != b'#' => f.write_char(b.into())?, - _ => write!(f, "#{b:02X}")?, - } - } - Ok(()) - } -} - -macro_rules! make_pdf_object { - ( - $( - #[from = $($from:ident)?, as_unexpected = |$as_unexpected_arg:pat_param| $as_unexpected_expr:expr] - $Variant:ident($ty:ty), - )+ - ) => { - #[derive(Clone, Debug, PartialEq)] - pub(crate) enum PdfObjectNonNull { - $($Variant($ty),)* - } - - #[derive(Clone, Debug, PartialEq)] - pub(crate) enum PdfObjectDirect { - $($Variant($ty),)* - Null(PdfNull), - } - - #[derive(Clone, Debug, PartialEq)] - pub(crate) enum PdfObject { - $($Variant($ty),)* - Null(PdfNull), - Indirect(PdfObjectIndirect), - } - - $($( - impl From<$ty> for PdfObjectNonNull { - fn $from(value: $ty) -> Self { - Self::$Variant(value) - } - } - - impl From<$ty> for PdfObjectDirect { - fn $from(value: $ty) -> Self { - Self::$Variant(value) - } - } - - impl From<$ty> for PdfObject { - fn $from(value: $ty) -> Self { - Self::$Variant(value) - } - } - - impl From> for PdfObjectDirect { - fn $from(value: Option<$ty>) -> Self { - match value { - Some(value) => Self::$Variant(value), - None => Self::Null(PdfNull), - } - } - } - - impl From> for PdfObject { - fn $from(value: Option<$ty>) -> Self { - match value { - Some(value) => Self::$Variant(value), - None => Self::Null(PdfNull), - } - } - } - )?)* - - impl From for PdfObjectDirect { - fn from(value: PdfObjectNonNull) -> Self { - match value { - $(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* - } - } - } - - impl From for PdfObject { - fn from(value: PdfObjectNonNull) -> Self { - match value { - 
$(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* - } - } - } - - impl From for PdfObject { - fn from(value: PdfObjectDirect) -> Self { - match value { - $(PdfObjectDirect::$Variant(v) => Self::$Variant(v),)* - PdfObjectDirect::Null(v) => Self::Null(v), - } - } - } - - impl From for PdfObjectDirect { - fn from(value: PdfObject) -> Self { - match value { - $(PdfObject::$Variant(v) => Self::$Variant(v),)* - PdfObject::Null(v) => Self::Null(v), - PdfObject::Indirect(v) => v.into(), - } - } - } - - impl PdfObjectNonNull { - fn as_unexpected(&self) -> de::Unexpected<'static> { - match *self { - $(PdfObjectNonNull::$Variant($as_unexpected_arg) => $as_unexpected_expr,)* - } - } - } - - impl PdfObjectDirect { - fn as_unexpected(&self) -> de::Unexpected<'static> { - match *self { - $(PdfObjectDirect::$Variant($as_unexpected_arg) => $as_unexpected_expr,)* - PdfObjectDirect::Null(_) => de::Unexpected::Option, - } - } - } - - impl PdfObject { - fn as_unexpected(&self) -> de::Unexpected<'static> { - match *self { - $(PdfObject::$Variant($as_unexpected_arg) => $as_unexpected_expr,)* - PdfObject::Null(_) => de::Unexpected::Option, - PdfObject::Indirect(ref v) => v.get().as_unexpected(), - } - } - } - - const _: () = { - fn _assert_impls_deserialize() {} - - $(let _ = _assert_impls_deserialize::<$ty>;)* - }; - }; -} - -make_pdf_object! 
{ - #[from = from, as_unexpected = |v| de::Unexpected::Bool(v)] - Boolean(bool), - #[from = from, as_unexpected = |v| de::Unexpected::Signed(v.into())] - Integer(i32), - #[from = from, as_unexpected = |v| de::Unexpected::Float(v.into())] - Real(f32), - #[from = from, as_unexpected = |_| de::Unexpected::Other("PdfString")] - String(PdfString), - #[from = from, as_unexpected = |_| de::Unexpected::Other("PdfName")] - Name(PdfName), - #[from = from, as_unexpected = |_| de::Unexpected::Seq] - Array(PdfArray), - #[from = from, as_unexpected = |_| de::Unexpected::Map] - Dictionary(PdfDictionary), - #[from = from, as_unexpected = |_| de::Unexpected::Other("PdfStream")] - Stream(PdfStream), -} - -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct PdfNull; - -impl From for PdfObjectDirect { - fn from(v: PdfNull) -> Self { - Self::Null(v) - } -} - -impl From for PdfObject { - fn from(v: PdfNull) -> Self { - Self::Null(v) - } -} - -impl From for PdfObject { - fn from(v: PdfObjectIndirect) -> Self { - Self::Indirect(v) +pdf_parse! 
{ + #[derive(Clone, Debug)] + pub struct PdfXRefStreamDictionaryRest { + #[pdf(name = "Type")] + pub ty: PdfXRefName, + #[pdf(name = "Size")] + pub size: usize, + #[pdf(name = "Index")] + pub index: Option>, + #[pdf(name = "Prev")] + pub prev: Option, + #[pdf(name = "W")] + pub w: Option>, + #[pdf(name = "Root")] + pub root: PdfDictionary, + #[pdf(name = "Encrypt")] + pub encrypt: Option, + #[pdf(name = "Info")] + pub info: Option, + #[pdf(name = "ID")] + pub id: Option<[PdfString; 2]>, + #[pdf(flatten)] + pub rest: PdfDictionary, } } #[derive(Clone, Debug)] -pub(crate) struct PdfObjectIndirect { - xref_table: Weak, - object_number: NonZero, - generation_number: u16, +pub enum PdfTrailer { + Trailer { + trailer_dictionary: PdfTrailerDictionary, + start_xref: usize, + }, + Stream { + xref_stream: PdfStream, + start_xref: usize, + }, } -impl PartialEq for PdfObjectIndirect { - fn eq(&self, other: &Self) -> bool { - let Self { - xref_table, - object_number, - generation_number, - } = self; - xref_table.ptr_eq(&other.xref_table) - && *object_number == other.object_number - && *generation_number == other.generation_number - } +pub struct Pdf { + pub header: PdfHeader, + pub objects: Arc, + pub trailer: PdfTrailer, } -impl PdfObjectIndirect { - pub fn exists(&self) -> bool { - todo!() - } - pub fn get(&self) -> PdfObjectDirect { - todo!() - } +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +enum PdfCharCategory { + Regular, + Whitespace, + LParen, + RParen, + LAngle, + RAngle, + LBracket, + RBracket, + LBrace, + RBrace, + FSlash, + Percent, } -impl From for PdfObjectDirect { - fn from(value: PdfObjectIndirect) -> Self { - value.get() - } -} - -#[derive(Clone, PartialEq)] -pub(crate) struct PdfDictionary { - fields: Arc>, -} - -impl PdfDictionary { - pub(crate) fn fields(&self) -> &Arc> { - &self.fields - } - pub(crate) fn into_fields(self) -> Arc> { - self.fields - } - pub(crate) fn iter(&self) -> std::collections::btree_map::Iter<'_, PdfName, PdfObject> { - 
self.fields.iter() - } - pub(crate) fn contains_key(&self, key: &Q) -> bool - where - PdfName: std::borrow::Borrow + Ord, - Q: Ord, - { - self.fields.contains_key(key) - } - pub(crate) fn get(&self, key: &Q) -> Option<&PdfObject> - where - PdfName: std::borrow::Borrow + Ord, - Q: Ord, - { - self.fields.get(key) - } -} - -impl FromIterator<(PdfName, PdfObject)> for PdfDictionary { - fn from_iter>(iter: T) -> Self { - Self { - fields: Arc::new(BTreeMap::from_iter( - iter.into_iter() - .filter(|(_name, value)| !matches!(value, PdfObject::Null(_))), - )), +impl PdfCharCategory { + fn new(b: u8) -> Self { + match b { + b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' => Self::Whitespace, + b'(' => Self::LParen, + b')' => Self::RParen, + b'<' => Self::LAngle, + b'>' => Self::RAngle, + b'[' => Self::LBracket, + b']' => Self::RBracket, + b'{' => Self::LBrace, + b'}' => Self::RBrace, + b'/' => Self::FSlash, + b'%' => Self::Percent, + _ => Self::Regular, } } } -impl IntoIterator for PdfDictionary { - type Item = (PdfName, PdfObject); - type IntoIter = std::collections::btree_map::IntoIter; - - fn into_iter(self) -> Self::IntoIter { - Arc::unwrap_or_clone(self.fields).into_iter() - } +#[derive(Clone, Copy, PartialEq)] +enum PdfToken<'a> { + Regular(&'a [u8]), + LParen, + RParen, + LAngle, + RAngle, + LBracket, + RBracket, + LBrace, + RBrace, + FSlash, + Comment(&'a [u8]), } -impl<'a> IntoIterator for &'a PdfDictionary { - type Item = (&'a PdfName, &'a PdfObject); - type IntoIter = std::collections::btree_map::Iter<'a, PdfName, PdfObject>; - - fn into_iter(self) -> Self::IntoIter { - self.fields.iter() - } -} - -impl fmt::Debug for PdfDictionary { +impl<'a> fmt::Debug for PdfToken<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_map().entries(self).finish() - } -} - -#[derive(Clone, Default, PartialEq)] -pub(crate) struct PdfArray { - elements: Arc<[PdfObject]>, -} - -impl PdfArray { - pub(crate) fn new() -> Self { - Self::default() - } - pub(crate) fn 
elements(&self) -> &Arc<[PdfObject]> { - &self.elements - } - pub(crate) fn into_elements(self) -> Arc<[PdfObject]> { - self.elements - } - pub(crate) fn iter(&self) -> std::slice::Iter<'_, PdfObject> { - self.elements.iter() - } -} - -impl FromIterator for PdfArray { - fn from_iter>(iter: T) -> Self { - Self { - elements: Arc::from_iter(iter), + match self { + Self::Regular(contents) => { + if let Ok(contents) = str::from_utf8(contents) { + write!(f, "Regular({contents:?})") + } else { + write!(f, "Regular({contents:?})") + } + } + Self::LParen => write!(f, "LParen"), + Self::RParen => write!(f, "RParen"), + Self::LAngle => write!(f, "LAngle"), + Self::RAngle => write!(f, "RAngle"), + Self::LBracket => write!(f, "LBracket"), + Self::RBracket => write!(f, "RBracket"), + Self::LBrace => write!(f, "LBrace"), + Self::RBrace => write!(f, "RBrace"), + Self::FSlash => write!(f, "FSlash"), + Self::Comment(contents) => { + if let Ok(contents) = str::from_utf8(contents) { + write!(f, "Comment({contents:?})") + } else { + write!(f, "Comment({contents:?})") + } + } } } } #[derive(Clone)] -pub(crate) struct PdfArrayIntoIter { - indexes: std::ops::Range, - elements: Arc<[PdfObject]>, +struct PdfTokenizerPeek<'a> { + token: PdfToken<'a>, + pos_after_token: usize, } -impl Iterator for PdfArrayIntoIter { - type Item = PdfObject; +#[derive(Clone)] +struct PdfTokenizer<'a> { + bytes: &'a [u8], + pos: usize, + peek_cache: Option>, +} + +impl<'a> PdfTokenizer<'a> { + fn new(bytes: &'a [u8], pos: usize) -> Self { + Self { + bytes, + pos, + peek_cache: None, + } + } + fn pos(&self) -> PdfInputPosition { + PdfInputPosition::new(self.pos) + } + fn peek_byte(&mut self) -> Option { + self.bytes.get(self.pos).copied() + } + fn next_byte(&mut self) -> Option { + let b = self.bytes.get(self.pos)?; + self.pos += 1; + self.peek_cache = None; + Some(*b) + } + fn skip_whitespace(&mut self) { + while let Some(PdfCharCategory::Whitespace) = self.peek_byte().map(PdfCharCategory::new) { + 
self.next_byte(); + } + } + fn peek(&mut self) -> Option> { + if let Some(PdfTokenizerPeek { token, .. }) = self.peek_cache { + return Some(token); + } + let mut tokenizer = self.clone(); + let token = tokenizer.next()?; + self.peek_cache = Some(PdfTokenizerPeek { + token, + pos_after_token: tokenizer.pos, + }); + Some(token) + } + fn read_bytes(&mut self, len: usize) -> Option<&'a [u8]> { + let retval = self.bytes.get(self.pos..self.pos.saturating_add(len))?; + self.peek_cache = None; + self.pos += len; + Some(retval) + } +} + +impl<'a> Iterator for PdfTokenizer<'a> { + type Item = PdfToken<'a>; fn next(&mut self) -> Option { - self.indexes.next().map(|i| self.elements[i].clone()) - } - - fn size_hint(&self) -> (usize, Option) { - self.indexes.size_hint() - } - - fn nth(&mut self, n: usize) -> Option { - self.indexes.nth(n).map(|i| self.elements[i].clone()) - } - - fn last(self) -> Option { - self.indexes.last().map(|i| self.elements[i].clone()) - } - - fn fold(self, init: B, mut f: F) -> B - where - F: FnMut(B, Self::Item) -> B, - { - self.indexes - .fold(init, |init, i| f(init, self.elements[i].clone())) - } -} - -impl FusedIterator for PdfArrayIntoIter {} - -impl DoubleEndedIterator for PdfArrayIntoIter { - fn next_back(&mut self) -> Option { - self.indexes.next_back().map(|i| self.elements[i].clone()) - } - fn nth_back(&mut self, n: usize) -> Option { - self.indexes.nth_back(n).map(|i| self.elements[i].clone()) - } - fn rfold(self, init: B, mut f: F) -> B - where - F: FnMut(B, Self::Item) -> B, - { - self.indexes - .rfold(init, |init, i| f(init, self.elements[i].clone())) - } -} - -impl ExactSizeIterator for PdfArrayIntoIter {} - -impl IntoIterator for PdfArray { - type Item = PdfObject; - type IntoIter = PdfArrayIntoIter; - - fn into_iter(self) -> Self::IntoIter { - PdfArrayIntoIter { - indexes: 0..self.elements.len(), - elements: self.elements, + if let Some(PdfTokenizerPeek { + token, + pos_after_token, + }) = self.peek_cache.take() + { + self.pos = 
pos_after_token; + return Some(token); + } + loop { + let start_pos = self.pos; + break match PdfCharCategory::new(self.next_byte()?) { + PdfCharCategory::Whitespace => continue, + PdfCharCategory::LParen => Some(PdfToken::LParen), + PdfCharCategory::RParen => Some(PdfToken::RParen), + PdfCharCategory::LAngle => Some(PdfToken::LAngle), + PdfCharCategory::RAngle => Some(PdfToken::RAngle), + PdfCharCategory::LBracket => Some(PdfToken::LBracket), + PdfCharCategory::RBracket => Some(PdfToken::RBracket), + PdfCharCategory::LBrace => Some(PdfToken::LBrace), + PdfCharCategory::RBrace => Some(PdfToken::RBrace), + PdfCharCategory::FSlash => Some(PdfToken::FSlash), + PdfCharCategory::Percent => { + loop { + match self.next_byte() { + None | Some(b'\n') => break, + Some(b'\r') => { + if let Some(b'\n') = self.peek_byte() { + self.pos += 1; + } + break; + } + Some(_) => continue, + } + } + Some(PdfToken::Comment(&self.bytes[start_pos..self.pos])) + } + PdfCharCategory::Regular => { + while let Some(PdfCharCategory::Regular) = + self.peek_byte().map(PdfCharCategory::new) + { + self.pos += 1; + } + Some(PdfToken::Regular(&self.bytes[start_pos..self.pos])) + } + }; } } } -impl<'a> IntoIterator for &'a PdfArray { - type Item = &'a PdfObject; - type IntoIter = std::slice::Iter<'a, PdfObject>; +struct PdfParser<'a> { + objects_arc: Arc, + objects_map: BTreeMap, + unparsed_stream_dictionaries: Vec>, + tokenizer: PdfTokenizer<'a>, +} - fn into_iter(self) -> Self::IntoIter { - self.elements.iter() +impl<'a> PdfParser<'a> { + fn parse_header(&mut self) -> Result { + let Some(b'%') = self.tokenizer.bytes.first() else { + return Err(PdfParseError::NotAPdfFile); + }; + let Some(PdfToken::Comment(header)) = self.tokenizer.next() else { + unreachable!() + }; + let Ok(header) = str::from_utf8(header) else { + return Err(PdfParseError::NotAPdfFile); + }; + let header = header.trim_end_matches(['\n', '\r']); + let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else { + return 
Err(PdfParseError::NotAPdfFile); + }; + let Some((major_str, minor_str)) = version.split_once('.') else { + return Err(PdfParseError::NotAPdfFile); + }; + let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else { + return Err(PdfParseError::NotAPdfFile); + }; + Ok(PdfHeader { major, minor }) + } + fn skip_comments_and_whitespace(&mut self) { + self.tokenizer.skip_whitespace(); + while let Some(PdfToken::Comment(_)) = self.tokenizer.peek() { + self.tokenizer.next(); + self.tokenizer.skip_whitespace(); + } + } + fn parse_digits( + &mut self, + on_parse_failed: impl FnOnce(PdfInputPosition) -> Result, PdfParseError>, + ) -> Result, PdfParseError> { + self.skip_comments_and_whitespace(); + let old_tokenizer = self.tokenizer.clone(); + let pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(number)) = self.tokenizer.next() else { + self.tokenizer = old_tokenizer; + return Ok(None); + }; + if !number.iter().all(|b| b.is_ascii_digit()) { + self.tokenizer = old_tokenizer; + return Ok(None); + } + let Some(number) = str::from_utf8(number).ok().and_then(|v| v.parse().ok()) else { + self.tokenizer = old_tokenizer; + return Ok(match on_parse_failed(pos)? { + None => None, + }); + }; + Ok(Some((pos, number))) + } + fn parse_object_identifier( + &mut self, + return_none_for_out_of_range: bool, + ) -> Result, PdfParseError> { + let old_tokenizer = self.tokenizer.clone(); + let Some((pos, object_number)) = self.parse_digits(|pos| { + if return_none_for_out_of_range { + Ok(None) + } else { + Err(PdfParseError::InvalidObjectNumber { pos }) + } + })? + else { + self.tokenizer = old_tokenizer; + return Ok(None); + }; + let Some((_pos, generation_number)) = self.parse_digits(|pos| { + if return_none_for_out_of_range { + Ok(None) + } else { + Err(PdfParseError::InvalidGenerationNumber { pos }) + } + })? 
+ else { + self.tokenizer = old_tokenizer; + return Ok(None); + }; + Ok(Some(PdfObjectIdentifier { + pos: pos.into(), + object_number, + generation_number, + })) + } + fn parse_indirect_object(&mut self) -> Result, PdfParseError> { + let old_tokenizer = self.tokenizer.clone(); + let Some(id) = self.parse_object_identifier(true)? else { + self.tokenizer = old_tokenizer; + return Ok(None); + }; + if let Some(PdfToken::Regular(b"R")) = self.tokenizer.next() { + Ok(Some(PdfObjectIndirect::new(&self.objects_arc, id))) + } else { + self.tokenizer = old_tokenizer; + Ok(None) + } + } + fn parse_string_after_l_paren(&mut self) -> Result { + let mut contents = Vec::new(); + let mut paren_level = NonZero::new(1usize).expect("non-zero"); + let string_pos = self.tokenizer.pos(); + while let Some(b) = self.tokenizer.next_byte() { + contents.push(match b { + b'(' => { + paren_level = paren_level.checked_add(1).expect("overflow"); + b + } + b')' => { + let Some(new_paren_level) = NonZero::new(paren_level.get() - 1) else { + return Ok(PdfString::new( + string_pos, + ArcOrRef::Arc(Arc::from(contents)), + )); + }; + paren_level = new_paren_level; + b + } + b'\r' if self.tokenizer.peek_byte() == Some(b'\n') => { + self.tokenizer.next_byte(); + b'\n' + } + b'\r' | b'\n' => b'\n', + b'\\' => { + let pos = self.tokenizer.pos(); + let Some(b) = self.tokenizer.next_byte() else { + return Err(PdfParseError::InvalidStringEscape { pos }); + }; + match b { + b'\r' if self.tokenizer.peek_byte() == Some(b'\n') => { + self.tokenizer.next_byte(); + continue; + } + b'\r' | b'\n' => continue, + b'n' => b'\n', + b'r' => b'\r', + b't' => b'\t', + b'b' => b'\x08', + b'f' => b'\x0C', + b'(' | b')' | b'\\' => b, + b'0'..=b'7' => { + const MAX_OCTAL_DIGITS: usize = 3; + let mut value = b - b'0'; + let mut len = 1; + while len < MAX_OCTAL_DIGITS { + let Some(b @ b'0'..=b'7') = self.tokenizer.peek_byte() else { + break; + }; + value <<= 3; + value |= b - b'0'; + len += 1; + self.tokenizer.next_byte(); + } + 
value + } + _ => { + return Err(PdfParseError::InvalidStringEscape { pos }); + } + } + } + _ => b, + }); + } + Err(PdfParseError::TruncatedFile { + pos: self.tokenizer.pos(), + }) + } + fn parse_string_after_l_angle(&mut self) -> Result { + let mut contents = Vec::new(); + let mut high_digit_value = None; + let mut push_digit_value = |value: u8| { + high_digit_value = match high_digit_value { + Some(high_digit_value) => { + contents.push((high_digit_value << 4) | value); + None + } + None => Some(value), + }; + }; + let string_pos = self.tokenizer.pos(); + loop { + let pos = self.tokenizer.pos(); + match self.tokenizer.next_byte() { + None => { + return Err(PdfParseError::TruncatedFile { pos }); + } + Some(b) if PdfCharCategory::new(b) == PdfCharCategory::Whitespace => {} + Some(b'>') => { + // if we have an odd trailing digit, add the final digit, otherwise doesn't modify contents + push_digit_value(0); + return Ok(PdfString::new( + string_pos, + Arc::<[u8]>::from(contents).into(), + )); + } + Some(b) => { + let Some(value) = (b as char).to_digit(0x10) else { + return Err(PdfParseError::InvalidHexStringDigit { pos }); + }; + push_digit_value(value as u8); + } + } + } + } + fn parse_name_after_f_slash(&mut self) -> Result { + let mut name = vec![]; + let name_pos = self.tokenizer.pos(); + loop { + let Some(PdfCharCategory::Regular) = + self.tokenizer.peek_byte().map(PdfCharCategory::new) + else { + return Ok(PdfName::new(name_pos, ArcOrRef::Arc(Arc::from(name)))); + }; + let pos = self.tokenizer.pos(); + match self + .tokenizer + .next_byte() + .expect("just checked that it's not None") + { + b'#' => { + let mut value = 0u8; + for _ in 0..2 { + let Some(digit) = self + .tokenizer + .next_byte() + .and_then(|b| (b as char).to_digit(0x10)) + else { + return Err(PdfParseError::InvalidNameEscape { pos }); + }; + value <<= 4; + value |= digit as u8; + } + name.push(value); + } + b => name.push(b), + } + } + } + fn parse_array_after_l_bracket(&mut self) -> Result { + let 
array_pos = self.tokenizer.pos(); + let mut contents: Vec = Vec::new(); + loop { + self.skip_comments_and_whitespace(); + if let Some(PdfToken::RBracket) = self.tokenizer.peek() { + self.tokenizer.next(); + return Ok(PdfArray::from_elements(array_pos, Arc::from(contents))); + } + contents.push(self.parse_object()?); + } + } + /// assumes `self.tokenizer.peek_byte() == Some(b'<')` + fn parse_dictionary_after_one_l_angle(&mut self) -> Result { + let l_angle = self.tokenizer.next_byte(); + assert_eq!(l_angle, Some(b'<')); + let dictionary_pos = self.tokenizer.pos(); + let mut contents: BTreeMap = BTreeMap::new(); + loop { + self.skip_comments_and_whitespace(); + if let Some(PdfToken::RAngle) = self.tokenizer.peek() { + self.tokenizer.next(); + let pos = self.tokenizer.pos(); + let b'>' = self + .tokenizer + .next_byte() + .ok_or(PdfParseError::TruncatedFile { pos })? + else { + return Err(PdfParseError::InvalidDictionaryClosingDoubleRAngle { pos }); + }; + return Ok(PdfDictionary::from_fields( + dictionary_pos, + Arc::new(contents), + )); + } + let name = PdfName::parse(self.parse_object()?.into())?; + let name_pos = name.pos(); + match contents.entry(name) { + std::collections::btree_map::Entry::Vacant(entry) => { + entry.insert(self.parse_object()?.into()); + } + std::collections::btree_map::Entry::Occupied(entry) => { + return Err(PdfParseError::DuplicateDictionaryKey { + pos: name_pos, + name: entry.key().clone(), + }); + } + } + } + } + /// assumes `self.tokenizer.peek() == Some(PdfToken::Regular(b"stream"))` + fn parse_stream_after_dictionary( + &mut self, + dictionary: PdfDictionary, + ) -> Result { + self.tokenizer.skip_whitespace(); + let stream_pos = self.tokenizer.pos(); + let stream = self.tokenizer.next(); + assert_eq!(stream, Some(PdfToken::Regular(b"stream"))); + let len = PdfStreamDictionary::parse_len_from_dictionary(&dictionary)?; + let eol_pos = self.tokenizer.pos(); + match self.tokenizer.next_byte() { + None => return 
Err(PdfParseError::TruncatedFile { pos: eol_pos }), + Some(b'\r') => { + let Some(b'\n') = self.tokenizer.next_byte() else { + return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { + pos: eol_pos, + }); + }; + } + Some(b'\n') => {} + _ => return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos }), + } + let Some(data) = self.tokenizer.read_bytes(len) else { + return Err(PdfParseError::TruncatedFile { + pos: PdfInputPosition::new(self.tokenizer.bytes.len()), + }); + }; + let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data)); + self.unparsed_stream_dictionaries.push(unparsed); + self.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + if let Some(PdfToken::Regular(b"endstream")) = self.tokenizer.next() { + Ok(stream) + } else { + Err(PdfParseError::MissingEndStreamKeyword { pos }) + } + } + fn parse_object(&mut self) -> Result { + self.skip_comments_and_whitespace(); + if let Some(indirect) = self.parse_indirect_object()? { + return Ok(indirect.into()); + } + let pos = self.tokenizer.pos(); + match self + .tokenizer + .next() + .ok_or(PdfParseError::TruncatedFile { pos })? + { + PdfToken::Regular(b"true") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, true))), + PdfToken::Regular(b"false") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, false))), + PdfToken::Regular(b"null") => Ok(PdfObject::Null(PdfNull::new(pos))), + PdfToken::Regular( + number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] 
| [b'0'..=b'9' | b'.', ..]), + ) => { + // parse number + let Ok(number) = str::from_utf8(number) else { + return Err(PdfParseError::InvalidNumber { pos }); + }; + let mut parts = number + .strip_prefix(&['+', '-']) + .unwrap_or(number) + .split('.'); + let integer_part = parts + .next() + .expect("split always returns at least one part"); + let fraction_part = parts.next(); + if parts.next().is_some() { + return Err(PdfParseError::InvalidNumber { pos }); + } + if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + if !integer_part.bytes().all(|v| v.is_ascii_digit()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + if let Some(fraction_part) = fraction_part { + if !fraction_part.bytes().all(|v| v.is_ascii_digit()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + Ok(PdfObject::Real(PdfReal::new( + pos, + number + .parse() + .map_err(|_| PdfParseError::InvalidNumber { pos })?, + ))) + } else { + Ok(PdfObject::Integer(PdfInteger::new( + pos, + number + .parse() + .map_err(|_| PdfParseError::InvalidNumber { pos })?, + ))) + } + } + PdfToken::Regular(items) => todo!("{:?}", str::from_utf8(items)), + PdfToken::LParen => self.parse_string_after_l_paren().map(PdfObject::String), + PdfToken::RParen => todo!(), + PdfToken::LAngle => { + if self.tokenizer.peek_byte() == Some(b'<') { + let dictionary = self.parse_dictionary_after_one_l_angle()?; + self.skip_comments_and_whitespace(); + if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() { + self.parse_stream_after_dictionary(dictionary) + .map(PdfObject::Stream) + } else { + Ok(dictionary.into()) + } + } else { + self.parse_string_after_l_angle().map(PdfObject::String) + } + } + PdfToken::RAngle => todo!(), + PdfToken::LBracket => self.parse_array_after_l_bracket().map(PdfObject::Array), + PdfToken::RBracket => todo!(), + PdfToken::LBrace => todo!(), + PdfToken::RBrace => todo!(), + PdfToken::FSlash => 
self.parse_name_after_f_slash().map(PdfObject::Name), + PdfToken::Comment(_) => unreachable!(), + } + } + fn parse_indirect_object_definition(&mut self) -> Result, PdfParseError> { + self.skip_comments_and_whitespace(); + let Some(id) = self.parse_object_identifier(false)? else { + return Ok(None); + }; + self.skip_comments_and_whitespace(); + let obj_pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(b"obj")) = self.tokenizer.next() else { + return Err(PdfParseError::MissingObj { pos: obj_pos }); + }; + let object = self.parse_object()?; + self.skip_comments_and_whitespace(); + let end_obj_pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(b"endobj")) = self.tokenizer.next() else { + return Err(PdfParseError::MissingEndObj { pos: end_obj_pos }); + }; + if self.objects_map.insert(id, object).is_some() { + Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }) + } else { + Ok(Some(())) + } + } + fn parse_body(&mut self) -> Result<(), PdfParseError> { + while let Some(()) = self.parse_indirect_object_definition()? {} + let Ok(()) = self + .objects_arc + .objects + .set(std::mem::take(&mut self.objects_map)) + else { + unreachable!(); + }; + self.unparsed_stream_dictionaries + .drain(..) + .try_for_each(|v| v.finish_parsing()) + } + fn parse_xref_table(&mut self) -> Result<(), PdfParseError> { + self.skip_comments_and_whitespace(); + let xref_pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(b"xref")) = self.tokenizer.peek() else { + return Ok(()); + }; + todo!("{xref_pos}") + } + fn parse_trailer(&mut self) -> Result { + self.skip_comments_and_whitespace(); + let trailer_pos = self.tokenizer.pos(); + let trailer_dictionary = match self.tokenizer.peek() { + Some(PdfToken::Regular(b"trailer")) => { + self.tokenizer.next(); + Some(PdfTrailerDictionary::parse(self.parse_object()?)?) 
+ } + Some(PdfToken::Regular(b"startxref")) => None, + _ => { + return Err(PdfParseError::MissingTrailer { pos: trailer_pos }); + } + }; + self.skip_comments_and_whitespace(); + let start_xref_kw_pos = self.tokenizer.pos(); + let Some(PdfToken::Regular(b"startxref")) = self.tokenizer.next() else { + return Err(PdfParseError::MissingStartXRefKeyword { + pos: start_xref_kw_pos, + }); + }; + let start_xref_pos = self.tokenizer.pos(); + let Some((start_xref_pos, start_xref)) = + self.parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))? + else { + return Err(PdfParseError::MissingStartXRefValue { + pos: start_xref_pos, + }); + }; + self.tokenizer.skip_whitespace(); + let eof_comment_pos = self.tokenizer.pos(); + let Some(PdfToken::Comment(b"%%EOF" | b"%%EOF\r" | b"%%EOF\r\n" | b"%%EOF\n")) = + self.tokenizer.next() + else { + return Err(PdfParseError::MissingEofComment { + pos: eof_comment_pos, + }); + }; + self.tokenizer.skip_whitespace(); + if let Some(byte) = self.tokenizer.peek_byte() { + return Err(PdfParseError::UnexpectedByte { + pos: self.tokenizer.pos(), + byte, + }); + } + if let Some(trailer_dictionary) = trailer_dictionary { + return Ok(PdfTrailer::Trailer { + trailer_dictionary, + start_xref, + }); + } + let old_tokenizer = self.tokenizer.clone(); + self.tokenizer = PdfTokenizer::new(self.tokenizer.bytes, start_xref); + let id = self.parse_object_identifier(false); + self.tokenizer = old_tokenizer; + let Some(id) = id? 
else { + return Err(PdfParseError::InvalidStartXRefValue { + pos: start_xref_pos, + start_xref, + }); + }; + let xref_stream = + PdfStream::parse(PdfObjectIndirect::new(&self.objects_arc, id).get().into())?; + Ok(PdfTrailer::Stream { + xref_stream, + start_xref, + }) + } + fn parse_file(mut self) -> Result { + let header = self.parse_header()?; + self.parse_body()?; + self.parse_xref_table()?; + let trailer = self.parse_trailer()?; + Ok(Pdf { + header, + objects: self.objects_arc, + trailer, + }) } } -impl fmt::Debug for PdfArray { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.elements.fmt(f) +impl Pdf { + pub fn parse(bytes: impl AsRef<[u8]>) -> Result { + PdfParser { + objects_arc: Arc::new(PdfObjects { + objects: OnceLock::new(), + }), + objects_map: BTreeMap::new(), + unparsed_stream_dictionaries: vec![], + tokenizer: PdfTokenizer::new(bytes.as_ref(), 0), + } + .parse_file() } } -#[derive(Clone, Debug, PartialEq)] -pub(crate) struct PdfStream { - dictionary: PdfDictionary, - data: Arc, -} - -pub(crate) enum PdfBody {} - -pub(crate) struct PdfObjects {} - -pub(crate) struct PdfXRefTable {} - -pub(crate) struct Pdf { - pub(crate) header: PdfHeader, - pub(crate) body: PdfBody, -} - -pub(crate) struct PdfHeader {} - #[cfg(test)] - mod tests { - use super::*; + use crate::{ + pdf::{ + object::{ + PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject, PdfString, + }, + parse::{PdfInputPosition, PdfParse, PdfParseError}, + }, + util::ArcOrRef, + }; #[test] fn test_deserialize_dict() -> Result<(), PdfParseError> { - #[derive(serde::Deserialize, Debug, PartialEq)] - struct TestStruct { - a: i32, - c: i32, - b: i32, - #[serde(flatten)] - others: PdfDictionary, + crate::pdf::parse::pdf_parse! 
{ + #[derive(Debug)] + #[allow(dead_code)] + struct TestStruct { + #[pdf(name = "a")] + a: i32, + #[pdf(name = "c")] + c: i32, + #[pdf(name = "b")] + b: i32, + #[pdf(flatten)] + rest: PdfDictionary, + } } - let v: TestStruct = - de::Deserialize::deserialize(PdfObject::from(PdfDictionary::from_iter([ - (PdfName::new_static(b"a"), 1.into()), - (PdfName::new_static(b"c"), 7.into()), - (PdfName::new_static(b"b"), 5.into()), - (PdfName::new_static(b"d"), false.into()), - (PdfName::new_static(b"e"), PdfNull.into()), - ( - PdfName::new_static(b"f"), - PdfString::new(ArcOrRef::Ref(b"test")).into(), - ), - ])))?; + let v: TestStruct = PdfParse::parse(PdfObject::from(PdfDictionary::from_iter([ + ( + PdfName::new_static(b"a"), + PdfInteger::new(PdfInputPosition::empty(), 1).into(), + ), + ( + PdfName::new_static(b"c"), + PdfInteger::new(PdfInputPosition::empty(), 7).into(), + ), + ( + PdfName::new_static(b"b"), + PdfInteger::new(PdfInputPosition::empty(), 5).into(), + ), + ( + PdfName::new_static(b"d"), + PdfBoolean::new(PdfInputPosition::empty(), false).into(), + ), + ( + PdfName::new_static(b"e"), + PdfNull::new(PdfInputPosition::empty()).into(), + ), + ( + PdfName::new_static(b"f"), + PdfString::new(PdfInputPosition::empty(), ArcOrRef::Ref(b"test")).into(), + ), + ])))?; let expected = TestStruct { a: 1, c: 7, b: 5, - others: PdfDictionary::from_iter([ - (PdfName::new_static(b"d"), false.into()), + rest: PdfDictionary::from_iter([ + ( + PdfName::new_static(b"d"), + PdfBoolean::new(PdfInputPosition::empty(), false).into(), + ), ( PdfName::new_static(b"f"), - PdfString::new(ArcOrRef::Ref(b"test")).into(), + PdfString::new(PdfInputPosition::empty(), ArcOrRef::Ref(b"test")).into(), ), ]), }; - assert_eq!(v, expected); + assert_eq!(format!("{v:?}"), format!("{expected:?}")); Ok(()) } } diff --git a/src/pdf/object.rs b/src/pdf/object.rs new file mode 100644 index 0000000..d3979d8 --- /dev/null +++ b/src/pdf/object.rs @@ -0,0 +1,1111 @@ +use crate::{ + pdf::{ + PdfObjects, + 
parse::{ + GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse, + PdfParseError, + }, + }, + pdf_parse, + util::ArcOrRef, +}; +use std::{ + any::TypeId, + borrow::Cow, + collections::BTreeMap, + fmt::{self, Write}, + num::NonZero, + sync::{Arc, OnceLock}, +}; + +#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord)] +pub struct PdfString { + pos: PdfInputPositionNoCompare, + bytes: ArcOrRef<'static, [u8]>, +} + +impl std::fmt::Debug for PdfString { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { pos, bytes } = self; + f.debug_struct("PdfString") + .field("pos", pos) + .field("bytes", &format_args!("b\"{}\"", bytes.escape_ascii())) + .finish() + } +} + +impl PdfString { + pub fn new(pos: impl Into, bytes: ArcOrRef<'static, [u8]>) -> Self { + Self { + pos: pos.into(), + bytes, + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn bytes(&self) -> &ArcOrRef<'static, [u8]> { + &self.bytes + } +} + +impl GetPdfInputPosition for PdfString { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct PdfName { + pos: PdfInputPositionNoCompare, + bytes: ArcOrRef<'static, [u8]>, +} + +impl PdfName { + pub fn try_new( + pos: impl Into, + bytes: impl Into>, + ) -> Option { + let bytes = bytes.into(); + if bytes.contains(&0) { + None + } else { + Some(Self { + pos: pos.into(), + bytes, + }) + } + } + #[track_caller] + pub const fn new_static(bytes: &'static [u8]) -> Self { + let mut i = 0; + while i < bytes.len() { + if bytes[i] == 0 { + panic!("shouldn't contain any nul bytes"); + } + i += 1; + } + Self { + pos: PdfInputPositionNoCompare::empty(), + bytes: ArcOrRef::Ref(bytes), + } + } + #[track_caller] + pub fn new( + pos: impl Into, + bytes: impl Into>, + ) -> Self { + Self::try_new(pos, bytes).expect("shouldn't contain any nul bytes") + } + pub fn as_bytes(&self) -> &ArcOrRef<'static, [u8]> { + &self.bytes 
+ } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl GetPdfInputPosition for PdfName { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl fmt::Debug for PdfName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "PdfName(at {}: {self})", self.pos) + } +} + +impl fmt::Display for PdfName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("/")?; + for &b in self.bytes.iter() { + match b { + 0x21..=0x7E if b != b'#' => f.write_char(b.into())?, + _ => write!(f, "#{b:02X}")?, + } + } + Ok(()) + } +} + +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Default)] +pub struct PdfBoolean { + pos: PdfInputPositionNoCompare, + value: bool, +} + +impl PdfBoolean { + pub fn new(pos: impl Into, value: bool) -> Self { + Self { + pos: pos.into(), + value, + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn value(&self) -> bool { + self.value + } +} + +impl GetPdfInputPosition for PdfBoolean { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Default)] +pub struct PdfInteger { + pos: PdfInputPositionNoCompare, + value: i128, +} + +impl PdfInteger { + pub fn new(pos: impl Into, value: i128) -> Self { + Self { + pos: pos.into(), + value, + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn value(&self) -> i128 { + self.value + } +} + +impl GetPdfInputPosition for PdfInteger { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Default)] +pub struct PdfReal { + pos: PdfInputPositionNoCompare, + value: f64, +} + +impl PdfReal { + pub fn new(pos: impl Into, value: f64) -> Self { + Self { + pos: pos.into(), + value, + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn value(&self) -> f64 { + self.value + } +} + +impl 
GetPdfInputPosition for PdfReal { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +macro_rules! make_pdf_object { + ( + $( + #[parse = $($parse:ident)?, type_name = $type_name:literal] + $Variant:ident($ty:ty), + )+ + ) => { + #[derive(Clone, Debug)] + pub enum PdfObjectNonNull { + $($Variant($ty),)* + } + + #[derive(Clone, Debug)] + pub enum PdfObjectDirect { + $($Variant($ty),)* + Null(PdfNull), + } + + #[derive(Clone, Debug)] + pub enum PdfObject { + $($Variant($ty),)* + Null(PdfNull), + Indirect(PdfObjectIndirect), + } + + $( + impl From<$ty> for PdfObjectNonNull { + fn from(value: $ty) -> Self { + Self::$Variant(value) + } + } + + impl From<$ty> for PdfObjectDirect { + fn from(value: $ty) -> Self { + Self::$Variant(value) + } + } + + impl From<$ty> for PdfObject { + fn from(value: $ty) -> Self { + Self::$Variant(value) + } + } + + impl From> for PdfObjectDirect { + fn from(value: Option<$ty>) -> Self { + match value { + Some(value) => Self::$Variant(value), + None => Self::Null(Default::default()), + } + } + } + + impl From> for PdfObject { + fn from(value: Option<$ty>) -> Self { + match value { + Some(value) => Self::$Variant(value), + None => Self::Null(Default::default()), + } + } + } + + $(impl crate::pdf::parse::PdfParse for $ty { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed($type_name) + } + fn $parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::$Variant(v) => Ok(v), + object => Err(crate::pdf::parse::PdfParseError::InvalidType { + pos: object.get_pdf_input_position(), + ty: object.type_name(), + expected_ty: $type_name, + }), + } + } + })? 
+ )* + + impl From for PdfObjectDirect { + fn from(value: PdfObjectNonNull) -> Self { + match value { + $(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* + } + } + } + + impl From for PdfObject { + fn from(value: PdfObjectNonNull) -> Self { + match value { + $(PdfObjectNonNull::$Variant(v) => Self::$Variant(v),)* + } + } + } + + impl From for PdfObject { + fn from(value: PdfObjectDirect) -> Self { + match value { + $(PdfObjectDirect::$Variant(v) => Self::$Variant(v),)* + PdfObjectDirect::Null(v) => Self::Null(v), + } + } + } + + impl From for PdfObjectDirect { + fn from(value: PdfObject) -> Self { + match value { + $(PdfObject::$Variant(v) => Self::$Variant(v),)* + PdfObject::Null(v) => Self::Null(v), + PdfObject::Indirect(v) => v.into(), + } + } + } + + impl PdfObjectNonNull { + pub fn type_name(&self) -> &'static str { + match self { + $(PdfObjectNonNull::$Variant(_) => $type_name,)* + } + } + pub fn pos(&self) -> PdfInputPosition { + self.get_pdf_input_position() + } + } + + impl GetPdfInputPosition for PdfObjectNonNull { + fn get_pdf_input_position(&self) -> PdfInputPosition { + match self { + $(PdfObjectNonNull::$Variant(v) => <$ty as GetPdfInputPosition>::get_pdf_input_position(v),)* + } + } + } + + impl From for Option { + fn from(value: PdfObjectDirect) -> Self { + match value { + $(PdfObjectDirect::$Variant(v) => Some(PdfObjectNonNull::$Variant(v)),)* + PdfObjectDirect::Null(_) => None, + } + } + } + + impl From for Option { + fn from(value: PdfObject) -> Self { + PdfObjectDirect::from(value).into() + } + } + + impl PdfObjectDirect { + pub fn is_null(&self) -> bool { + matches!(self, PdfObjectDirect::Null(_)) + } + pub fn type_name(&self) -> &'static str { + match self { + $(PdfObjectDirect::$Variant(_) => $type_name,)* + PdfObjectDirect::Null(_) => "null", + } + } + pub fn pos(&self) -> PdfInputPosition { + self.get_pdf_input_position() + } + } + + impl GetPdfInputPosition for PdfObjectDirect { + fn get_pdf_input_position(&self) -> PdfInputPosition 
{ + match self { + $(PdfObjectDirect::$Variant(v) => <$ty as GetPdfInputPosition>::get_pdf_input_position(v),)* + PdfObjectDirect::Null(v) => ::get_pdf_input_position(v), + } + } + } + + impl PdfObject { + pub fn is_null(&self) -> bool { + matches!(self, PdfObject::Null(_)) + } + pub fn type_name(&self) -> &'static str { + match self { + $(PdfObject::$Variant(_) => $type_name,)* + PdfObject::Null(_) => "null", + PdfObject::Indirect(_) => "indirect object", + } + } + pub fn pos(&self) -> PdfInputPosition { + self.get_pdf_input_position() + } + } + + impl GetPdfInputPosition for PdfObject { + fn get_pdf_input_position(&self) -> PdfInputPosition { + match self { + $(PdfObject::$Variant(v) => <$ty as GetPdfInputPosition>::get_pdf_input_position(v),)* + PdfObject::Null(v) => ::get_pdf_input_position(v), + PdfObject::Indirect(v) => ::get_pdf_input_position(v), + } + } + } + + const _: () = { + fn _assert_parsable() {} + + $(let _ = _assert_parsable::<$ty>;)* + let _ = _assert_parsable::; + let _ = _assert_parsable::; + let _ = _assert_parsable::; + let _ = _assert_parsable::; + let _ = _assert_parsable::; + }; + }; +} + +make_pdf_object! 
{ + #[parse = parse, type_name = "boolean"] + Boolean(PdfBoolean), + #[parse = parse, type_name = "integer"] + Integer(PdfInteger), + #[parse = parse, type_name = "real"] + Real(PdfReal), + #[parse = parse, type_name = "string"] + String(PdfString), + #[parse = parse, type_name = "name"] + Name(PdfName), + #[parse = parse, type_name = "array"] + Array(PdfArray), + #[parse = parse, type_name = "dictionary"] + Dictionary(PdfDictionary), + #[parse =, type_name = "stream"] + Stream(PdfStream), +} + +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct PdfNull(PdfInputPositionNoCompare); + +impl PdfNull { + pub fn new(pos: impl Into) -> Self { + Self(pos.into()) + } +} + +impl GetPdfInputPosition for PdfNull { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.0.0 + } +} + +impl From for PdfObjectDirect { + fn from(v: PdfNull) -> Self { + Self::Null(v) + } +} + +impl Default for PdfObjectDirect { + fn default() -> Self { + Self::Null(PdfNull(PdfInputPositionNoCompare::empty())) + } +} + +impl From for PdfObject { + fn from(v: PdfNull) -> Self { + Self::Null(v) + } +} + +impl Default for PdfObject { + fn default() -> Self { + Self::Null(PdfNull(PdfInputPositionNoCompare::empty())) + } +} + +impl From for PdfObject { + fn from(v: PdfObjectIndirect) -> Self { + Self::Indirect(v) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct PdfObjectIdentifier { + pub pos: PdfInputPositionNoCompare, + pub object_number: NonZero, + pub generation_number: u16, +} + +impl GetPdfInputPosition for PdfObjectIdentifier { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +#[derive(Clone)] +pub struct PdfObjectIndirect { + objects: std::sync::Weak, + id: PdfObjectIdentifier, + final_id: Arc>, +} + +impl fmt::Debug for PdfObjectIndirect { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + objects: _, + id, + final_id: _, + } = self; + 
f.debug_struct("PdfObjectIndirect") + .field("id", id) + .finish_non_exhaustive() + } +} + +impl GetPdfInputPosition for PdfObjectIndirect { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.id.get_pdf_input_position() + } +} + +impl PartialEq for PdfObjectIndirect { + fn eq(&self, other: &Self) -> bool { + let Self { + objects, + id, + final_id: _, + } = self; + objects.ptr_eq(&other.objects) && *id == other.id + } +} + +impl PdfObjectIndirect { + pub fn new(objects: &Arc, id: PdfObjectIdentifier) -> Self { + Self { + objects: Arc::downgrade(objects), + id, + final_id: Arc::new(OnceLock::new()), + } + } + pub fn get(&self) -> PdfObjectDirect { + if let Some(objects) = self.objects.upgrade() { + if let Some(objects) = objects.objects.get() { + let final_id = self.final_id.get().copied(); + let limit = if final_id.is_some() { 1 } else { 1000usize }; + let mut id = final_id.unwrap_or(self.id); + for _ in 0..limit { + if let Some(object) = objects.get(&self.id) { + let retval = match object { + PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v), + PdfObject::Integer(v) => PdfObjectDirect::Integer(*v), + PdfObject::Real(v) => PdfObjectDirect::Real(*v), + PdfObject::String(v) => PdfObjectDirect::String(v.clone()), + PdfObject::Name(v) => PdfObjectDirect::Name(v.clone()), + PdfObject::Array(v) => PdfObjectDirect::Array(v.clone()), + PdfObject::Dictionary(v) => PdfObjectDirect::Dictionary(v.clone()), + PdfObject::Stream(v) => PdfObjectDirect::Stream(v.clone()), + PdfObject::Null(v) => PdfObjectDirect::Null(*v), + PdfObject::Indirect(v) => { + id = v.id; + continue; + } + }; + // we could be racing with another thread, so set can fail but that's not a problem + let _ = self.final_id.set(id); + return retval; + } else { + return PdfObjectDirect::Null(PdfNull::new(id.pos)); + } + } + } + } + PdfObjectDirect::Null(PdfNull::new(self.pos())) + } + pub fn id(&self) -> PdfObjectIdentifier { + self.id + } + pub fn pos(&self) -> PdfInputPosition { + self.id.pos.0 + 
} +} + +impl From for PdfObjectDirect { + fn from(value: PdfObjectIndirect) -> Self { + value.get() + } +} + +#[derive(Clone)] +pub struct PdfDictionary { + pos: PdfInputPositionNoCompare, + fields: Arc>, +} + +impl PdfDictionary { + pub fn new(pos: impl Into) -> Self { + Self { + pos: pos.into(), + fields: Arc::new(BTreeMap::new()), + } + } + pub fn from_fields( + pos: impl Into, + mut fields: Arc>, + ) -> Self { + if fields.values().any(|v| matches!(v, PdfObject::Null(_))) { + Arc::make_mut(&mut fields).retain(|_k, v| !matches!(v, PdfObject::Null(_))); + } + Self { + pos: pos.into(), + fields, + } + } + pub fn fields(&self) -> &Arc> { + &self.fields + } + pub fn into_fields(self) -> Arc> { + self.fields + } + pub fn iter(&self) -> std::collections::btree_map::Iter<'_, PdfName, PdfObject> { + self.fields.iter() + } + pub fn contains_key(&self, key: &Q) -> bool + where + PdfName: std::borrow::Borrow + Ord, + Q: Ord, + { + self.fields.contains_key(key) + } + pub fn get(&self, key: &Q) -> Option<&PdfObject> + where + PdfName: std::borrow::Borrow + Ord, + Q: Ord, + { + self.fields.get(key) + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl GetPdfInputPosition for PdfDictionary { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl Default for PdfDictionary { + fn default() -> Self { + Self::new(PdfInputPosition::empty()) + } +} + +impl FromIterator<(PdfName, PdfObject)> for PdfDictionary { + fn from_iter>(iter: T) -> Self { + Self { + pos: PdfInputPositionNoCompare::empty(), + fields: Arc::new(BTreeMap::from_iter( + iter.into_iter() + .filter(|(_name, value)| !matches!(value, PdfObject::Null(_))), + )), + } + } +} + +impl IntoIterator for PdfDictionary { + type Item = (PdfName, PdfObject); + type IntoIter = std::collections::btree_map::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + Arc::unwrap_or_clone(self.fields).into_iter() + } +} + +impl<'a> IntoIterator for &'a PdfDictionary { + type Item = (&'a 
PdfName, &'a PdfObject); + type IntoIter = std::collections::btree_map::Iter<'a, PdfName, PdfObject>; + + fn into_iter(self) -> Self::IntoIter { + self.fields.iter() + } +} + +impl fmt::Debug for PdfDictionary { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_map().entries(self).finish() + } +} + +#[derive(Clone, Default)] +pub struct PdfArray { + pos: PdfInputPositionNoCompare, + elements: Arc<[PdfObject]>, +} + +impl PdfArray { + pub fn new(pos: impl Into) -> Self { + Self { + pos: pos.into(), + elements: Arc::default(), + } + } + pub fn from_elements( + pos: impl Into, + elements: Arc<[PdfObject]>, + ) -> Self { + Self { + pos: pos.into(), + elements, + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn elements(&self) -> &Arc<[PdfObject]> { + &self.elements + } + pub fn into_elements(self) -> Arc<[PdfObject]> { + self.elements + } + pub fn iter(&self) -> std::slice::Iter<'_, PdfObject> { + self.elements.iter() + } +} + +impl GetPdfInputPosition for PdfArray { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl FromIterator for PdfArray { + fn from_iter>(iter: T) -> Self { + Self { + pos: PdfInputPositionNoCompare::empty(), + elements: Arc::from_iter(iter), + } + } +} + +#[derive(Clone)] +pub struct PdfArrayIntoIter { + indexes: std::ops::Range, + elements: Arc<[PdfObject]>, +} + +impl Iterator for PdfArrayIntoIter { + type Item = PdfObject; + + fn next(&mut self) -> Option { + self.indexes.next().map(|i| self.elements[i].clone()) + } + + fn size_hint(&self) -> (usize, Option) { + self.indexes.size_hint() + } + + fn nth(&mut self, n: usize) -> Option { + self.indexes.nth(n).map(|i| self.elements[i].clone()) + } + + fn last(self) -> Option { + self.indexes.last().map(|i| self.elements[i].clone()) + } + + fn fold(self, init: B, mut f: F) -> B + where + F: FnMut(B, Self::Item) -> B, + { + self.indexes + .fold(init, |init, i| f(init, self.elements[i].clone())) + } +} + +impl 
std::iter::FusedIterator for PdfArrayIntoIter {} + +impl DoubleEndedIterator for PdfArrayIntoIter { + fn next_back(&mut self) -> Option { + self.indexes.next_back().map(|i| self.elements[i].clone()) + } + fn nth_back(&mut self, n: usize) -> Option { + self.indexes.nth_back(n).map(|i| self.elements[i].clone()) + } + fn rfold(self, init: B, mut f: F) -> B + where + F: FnMut(B, Self::Item) -> B, + { + self.indexes + .rfold(init, |init, i| f(init, self.elements[i].clone())) + } +} + +impl ExactSizeIterator for PdfArrayIntoIter {} + +impl IntoIterator for PdfArray { + type Item = PdfObject; + type IntoIter = PdfArrayIntoIter; + + fn into_iter(self) -> Self::IntoIter { + PdfArrayIntoIter { + indexes: 0..self.elements.len(), + elements: self.elements, + } + } +} + +impl<'a> IntoIterator for &'a PdfArray { + type Item = &'a PdfObject; + type IntoIter = std::slice::Iter<'a, PdfObject>; + + fn into_iter(self) -> Self::IntoIter { + self.elements.iter() + } +} + +impl fmt::Debug for PdfArray { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.elements.fmt(f) + } +} + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct MaybeArray(pub Arc<[T]>); + +impl std::ops::Deref for MaybeArray { + type Target = Arc<[T]>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::ops::DerefMut for MaybeArray { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +pdf_parse! 
{ + #[derive(Clone, Debug, PartialEq, Eq)] + #[non_exhaustive] + pub enum PdfStreamFilter { + #[pdf(name = "ASCIIHexDecode")] + AsciiHexDecode, + #[pdf(name = "ASCII85Decode")] + Ascii85Decode, + #[pdf(name = "LZWDecode")] + LzwDecode, + #[pdf(name = "FlateDecode")] + FlateDecode, + #[pdf(name = "RunLengthDecode")] + RunLengthDecode, + #[pdf(name = "CCITTFaxDecode")] + CcittFaxDecode, + #[pdf(name = "JBIG2Decode")] + Jbig2Decode, + #[pdf(name = "DCTDecode")] + DctDecode, + #[pdf(name = "JPXDecode")] + JpxDecode, + #[pdf(name = "Crypt")] + Crypt, + #[pdf(other)] + Unknown(PdfName), + } +} + +impl Default for MaybeArray { + fn default() -> Self { + Self(Arc::default()) + } +} + +impl<'a, T> IntoIterator for &'a MaybeArray { + type Item = &'a T; + type IntoIter = std::slice::Iter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +#[derive(Clone, Debug)] +pub enum PdfFileSpecification { + String(PdfString), + Dictionary(PdfDictionary), +} + +impl PdfParse for PdfFileSpecification { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("file specification") + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::String(v) => Ok(Self::String(v)), + PdfObjectDirect::Dictionary(v) => Ok(Self::Dictionary(v)), + object => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "PdfFileSpecification", + }), + } + } +} + +pdf_parse! 
{ + #[derive(Clone, Debug)] + pub struct PdfStreamDictionary { + #[pdf(name = PdfStreamDictionary::LENGTH_NAME)] + pub len: usize, + #[pdf(name = "Filter")] + pub filters: MaybeArray, + #[pdf(name = "DecodeParms")] + pub decode_parms: MaybeArray>, + #[pdf(name = "F")] + pub file: Option, + #[pdf(name = "FFilter")] + pub file_filters: MaybeArray, + #[pdf(name = "FDecodeParms")] + pub file_decode_parms: MaybeArray>, + #[pdf(name = "DL")] + pub decoded_len: Option, + #[pdf(flatten)] + pub rest: Rest, + } +} + +impl PdfStreamDictionary { + pub const LENGTH_NAME: &str = "Length"; + pub(crate) fn parse_len_from_dictionary( + dictionary: &PdfDictionary, + ) -> Result { + PdfParse::parse( + dictionary + .get(&PdfName::new_static(Self::LENGTH_NAME.as_bytes())) + .cloned() + .unwrap_or_default(), + ) + } +} + +impl PdfStreamDictionary { + pub fn filters_and_parms( + &self, + ) -> impl Clone + ExactSizeIterator + DoubleEndedIterator + { + self.filters.iter().enumerate().map(|(index, filter)| { + ( + filter.clone(), + self.decode_parms + .0 + .get(index) + .cloned() + .flatten() + .unwrap_or_default(), + ) + }) + } + pub fn file_filters_and_parms( + &self, + ) -> impl Clone + ExactSizeIterator + DoubleEndedIterator + { + self.file_filters.iter().enumerate().map(|(index, filter)| { + ( + filter.clone(), + self.file_decode_parms + .0 + .get(index) + .cloned() + .flatten() + .unwrap_or_default(), + ) + }) + } +} + +pub(crate) struct UnparsedPdfStreamDictionary { + unparsed_dictionary: PdfDictionary, + dictionary: Arc>>, +} + +impl UnparsedPdfStreamDictionary { + pub(crate) fn finish_parsing(self) -> Result<(), PdfParseError> { + let Ok(()) = self + .dictionary + .set(PdfParse::parse(self.unparsed_dictionary.into())?) 
+ else { + unreachable!(); + }; + Ok(()) + } +} + +#[derive(Clone)] +pub struct PdfStream { + pos: PdfInputPositionNoCompare, + dictionary: Arc>>, + data: Arc<[u8]>, +} + +impl fmt::Debug for PdfStream { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PdfStream") + .field("pos", &self.pos) + .field("dictionary", &self.dictionary) + .field("data", &format_args!("{:02x?}", self.data)) + .finish() + } +} + +impl PdfStream { + pub fn new( + pos: impl Into, + dictionary: PdfStreamDictionary, + data: Arc<[u8]>, + ) -> Self { + Self { + pos: pos.into(), + dictionary: Arc::new(OnceLock::from(dictionary)), + data, + } + } + pub(crate) fn new_unparsed( + pos: impl Into, + unparsed_dictionary: PdfDictionary, + data: Arc<[u8]>, + ) -> (Self, UnparsedPdfStreamDictionary) { + let dictionary = Arc::new(OnceLock::new()); + ( + Self { + pos: pos.into(), + dictionary: dictionary.clone(), + data, + }, + UnparsedPdfStreamDictionary { + unparsed_dictionary, + dictionary, + }, + ) + } + pub fn dictionary(&self) -> &PdfStreamDictionary { + self.dictionary + .get() + .expect("haven't finished parsing all pdf object definitions yet") + } + pub fn data(&self) -> &Arc<[u8]> { + &self.data + } +} + +impl GetPdfInputPosition for PdfStream { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl PdfParse for PdfStream { + fn type_name() -> Cow<'static, str> { + if TypeId::of::() == TypeId::of::() { + Cow::Borrowed("stream") + } else { + Cow::Owned(format!("PdfStream<{}>", Rest::type_name())) + } + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Stream(stream) => Ok(PdfStream { + pos: stream.pos, + dictionary: if let Some(dictionary) = ::downcast_ref::< + Arc>>, + >(&stream.dictionary) + { + dictionary.clone() + } else { + let PdfStreamDictionary { + len, + filters, + decode_parms, + file, + file_filters, + file_decode_parms, + decoded_len, + rest, + } = stream.dictionary(); + 
Arc::new(OnceLock::from(PdfStreamDictionary { + len: *len, + filters: filters.clone(), + decode_parms: decode_parms.clone(), + file: file.clone(), + file_filters: file_filters.clone(), + file_decode_parms: file_decode_parms.clone(), + decoded_len: *decoded_len, + rest: Rest::parse(rest.clone().into())?, + })) + }, + data: stream.data, + }), + object => Err(PdfParseError::InvalidType { + pos: object.get_pdf_input_position(), + ty: object.type_name(), + expected_ty: "stream", + }), + } + } +} diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs new file mode 100644 index 0000000..aa5bc3d --- /dev/null +++ b/src/pdf/parse.rs @@ -0,0 +1,953 @@ +use crate::pdf::object::{ + MaybeArray, PdfInteger, PdfName, PdfNull, PdfObject, PdfObjectDirect, PdfObjectIdentifier, + PdfObjectIndirect, PdfObjectNonNull, PdfReal, +}; +use std::{any::Any, borrow::Cow, fmt, mem, num::NonZero, sync::Arc}; + +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] +pub struct PdfInputPosition(Option); + +impl fmt::Debug for PdfInputPosition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("PdfInputPosition") + .field(&format_args!("{self}")) + .finish() + } +} + +impl fmt::Display for PdfInputPosition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(pos) = self.0 { + write!(f, "{pos:#x}") + } else { + f.write_str("") + } + } +} + +impl PdfInputPosition { + pub const fn new(pos: usize) -> Self { + Self(Some(pos)) + } + pub const fn empty() -> PdfInputPosition { + Self(None) + } +} + +pub trait GetPdfInputPosition { + fn get_pdf_input_position(&self) -> PdfInputPosition; +} + +impl GetPdfInputPosition for &'_ T { + fn get_pdf_input_position(&self) -> PdfInputPosition { + T::get_pdf_input_position(self) + } +} + +impl GetPdfInputPosition for &'_ mut T { + fn get_pdf_input_position(&self) -> PdfInputPosition { + T::get_pdf_input_position(self) + } +} + +impl GetPdfInputPosition for Box { + fn get_pdf_input_position(&self) -> 
PdfInputPosition { + T::get_pdf_input_position(self) + } +} + +impl GetPdfInputPosition for PdfInputPosition { + fn get_pdf_input_position(&self) -> PdfInputPosition { + *self + } +} + +impl GetPdfInputPosition for bool { + fn get_pdf_input_position(&self) -> PdfInputPosition { + PdfInputPosition::empty() + } +} + +impl GetPdfInputPosition for i128 { + fn get_pdf_input_position(&self) -> PdfInputPosition { + PdfInputPosition::empty() + } +} + +impl GetPdfInputPosition for f64 { + fn get_pdf_input_position(&self) -> PdfInputPosition { + PdfInputPosition::empty() + } +} + +#[derive(Clone, Copy, Default)] +pub struct PdfInputPositionNoCompare(pub PdfInputPosition); + +impl PdfInputPositionNoCompare { + pub const fn empty() -> Self { + Self(PdfInputPosition::empty()) + } + pub const fn new(pos: usize) -> Self { + Self(PdfInputPosition::new(pos)) + } +} + +impl GetPdfInputPosition for PdfInputPositionNoCompare { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.0 + } +} + +impl From for PdfInputPositionNoCompare { + fn from(value: PdfInputPosition) -> Self { + Self(value) + } +} + +impl fmt::Debug for PdfInputPositionNoCompare { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("PdfInputPositionNoCompare") + .field(&format_args!("{self}")) + .finish() + } +} + +impl fmt::Display for PdfInputPositionNoCompare { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +impl Ord for PdfInputPositionNoCompare { + fn cmp(&self, _other: &Self) -> std::cmp::Ordering { + std::cmp::Ordering::Equal + } +} + +impl PartialOrd for PdfInputPositionNoCompare { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl std::hash::Hash for PdfInputPositionNoCompare { + fn hash(&self, _state: &mut H) { + // don't hash anything since Self always compares equal + } +} + +impl Eq for PdfInputPositionNoCompare {} + +impl PartialEq for PdfInputPositionNoCompare { + fn eq(&self, _other: &Self) 
-> bool { + true + } +} + +#[derive(Debug)] +#[non_exhaustive] +pub enum PdfParseError { + Custom(String), + InvalidType { + pos: PdfInputPosition, + ty: &'static str, + expected_ty: &'static str, + }, + InvalidName { + pos: PdfInputPosition, + name: PdfName, + expected_ty: &'static str, + }, + NotAPdfFile, + TruncatedFile { + pos: PdfInputPosition, + }, + InvalidObjectNumber { + pos: PdfInputPosition, + }, + InvalidGenerationNumber { + pos: PdfInputPosition, + }, + InvalidNumber { + pos: PdfInputPosition, + }, + InvalidStringEscape { + pos: PdfInputPosition, + }, + InvalidHexStringDigit { + pos: PdfInputPosition, + }, + DuplicateIndirectObjectDefinition { + pos: PdfInputPosition, + id: PdfObjectIdentifier, + }, + MissingObj { + pos: PdfInputPosition, + }, + MissingEndObj { + pos: PdfInputPosition, + }, + InvalidDictionaryClosingDoubleRAngle { + pos: PdfInputPosition, + }, + DuplicateDictionaryKey { + pos: PdfInputPosition, + name: PdfName, + }, + InvalidNameEscape { + pos: PdfInputPosition, + }, + InvalidOrMissingEolAfterStreamKeyword { + pos: PdfInputPosition, + }, + MissingEndStreamKeyword { + pos: PdfInputPosition, + }, + IntegerOutOfRange { + pos: PdfInputPosition, + }, + MissingTrailer { + pos: PdfInputPosition, + }, + WrongArrayLength { + pos: PdfInputPosition, + len: usize, + expected_len: usize, + }, + MissingStartXRefKeyword { + pos: PdfInputPosition, + }, + MissingStartXRefValue { + pos: PdfInputPosition, + }, + MissingEofComment { + pos: PdfInputPosition, + }, + UnexpectedByte { + pos: PdfInputPosition, + byte: u8, + }, + InvalidStartXRefValue { + pos: PdfInputPosition, + start_xref: usize, + }, +} + +impl From for PdfParseError { + fn from(value: std::convert::Infallible) -> Self { + match value {} + } +} + +impl GetPdfInputPosition for PdfParseError { + fn get_pdf_input_position(&self) -> PdfInputPosition { + match *self { + PdfParseError::Custom(_) | PdfParseError::NotAPdfFile => PdfInputPosition::empty(), + PdfParseError::InvalidType { pos, .. 
} + | PdfParseError::InvalidName { pos, .. } + | PdfParseError::TruncatedFile { pos } + | PdfParseError::InvalidObjectNumber { pos } + | PdfParseError::InvalidGenerationNumber { pos } + | PdfParseError::InvalidNumber { pos } + | PdfParseError::InvalidStringEscape { pos } + | PdfParseError::InvalidHexStringDigit { pos } + | PdfParseError::DuplicateIndirectObjectDefinition { pos, .. } + | PdfParseError::MissingObj { pos } + | PdfParseError::MissingEndObj { pos } + | PdfParseError::InvalidDictionaryClosingDoubleRAngle { pos } + | PdfParseError::DuplicateDictionaryKey { pos, .. } + | PdfParseError::InvalidNameEscape { pos } + | PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos } + | PdfParseError::MissingEndStreamKeyword { pos } + | PdfParseError::IntegerOutOfRange { pos } + | PdfParseError::MissingTrailer { pos } + | PdfParseError::WrongArrayLength { pos, .. } + | PdfParseError::MissingStartXRefKeyword { pos } + | PdfParseError::MissingStartXRefValue { pos } + | PdfParseError::MissingEofComment { pos } + | PdfParseError::UnexpectedByte { pos, .. } + | PdfParseError::InvalidStartXRefValue { pos, .. 
} => pos, + } + } +} + +impl fmt::Display for PdfParseError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + PdfParseError::Custom(ref v) => f.write_str(v), + PdfParseError::InvalidType { + pos, + ty, + expected_ty, + } => { + write!( + f, + "at {pos}: invalid type: expected {expected_ty}, got {ty}" + ) + } + PdfParseError::InvalidName { + pos, + ref name, + expected_ty, + } => { + write!( + f, + "at {pos}: invalid name: expected a(n) {expected_ty}, got {name}" + ) + } + PdfParseError::NotAPdfFile => f.write_str("Not a PDF file"), + PdfParseError::TruncatedFile { pos } => { + write!(f, "at {pos}: PDF file is truncated too early") + } + PdfParseError::InvalidObjectNumber { pos } => { + write!(f, "at {pos}: PDF object number is invalid") + } + PdfParseError::InvalidGenerationNumber { pos } => { + write!( + f, + "at {pos}: PDF object identifier's generation number is invalid" + ) + } + PdfParseError::InvalidNumber { pos } => { + write!(f, "at {pos}: invalid number") + } + PdfParseError::InvalidStringEscape { pos } => { + write!(f, "at {pos}: invalid string escape") + } + PdfParseError::InvalidHexStringDigit { pos } => { + write!(f, "at {pos}: invalid hex string digit") + } + PdfParseError::DuplicateIndirectObjectDefinition { pos, id } => { + write!(f, "at {pos}: duplicate indirect object definition: {id:?}") + } + PdfParseError::MissingObj { pos } => { + write!( + f, + "at {pos}: indirect object definition is missing `obj` keyword" + ) + } + PdfParseError::MissingEndObj { pos } => { + write!( + f, + "at {pos}: indirect object definition is missing `endobj` keyword" + ) + } + PdfParseError::InvalidDictionaryClosingDoubleRAngle { pos } => { + write!(f, "at {pos}: dictionary has an invalid closing `>>` symbol") + } + PdfParseError::DuplicateDictionaryKey { pos, ref name } => { + write!(f, "at {pos}: duplicate dictionary key: {name}") + } + PdfParseError::InvalidNameEscape { pos } => { + write!(f, "at {pos}: invalid name escape") + } + 
PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos } => { + write!( + f, + "at {pos}: invalid or missing end-of-line after `stream` keyword" + ) + } + PdfParseError::MissingEndStreamKeyword { pos } => { + write!(f, "at {pos}: missing `endstream` keyword") + } + PdfParseError::IntegerOutOfRange { pos } => { + write!(f, "at {pos}: integer out of range") + } + PdfParseError::MissingTrailer { pos } => { + write!(f, "at {pos}: missing `trailer` keyword") + } + PdfParseError::WrongArrayLength { + pos, + len, + expected_len, + } => { + write!( + f, + "at {pos}: wrong array length: expected {expected_len}, got {len}" + ) + } + PdfParseError::MissingStartXRefKeyword { pos } => { + write!(f, "at {pos}: missing `startxref` keyword") + } + PdfParseError::MissingStartXRefValue { pos } => { + write!(f, "at {pos}: missing `startxref` value") + } + PdfParseError::MissingEofComment { pos } => { + write!(f, "at {pos}: missing `%%EOF` comment") + } + PdfParseError::UnexpectedByte { pos, byte } => { + write!(f, "at {pos}: unexpected byte {}", byte.escape_ascii()) + } + PdfParseError::InvalidStartXRefValue { pos, start_xref } => { + write!( + f, + "at {pos}: invalid `startxref` value: {start_xref} ({start_xref:#x})" + ) + } + } + } +} + +impl std::error::Error for PdfParseError {} + +pub trait PdfParse: Sized + 'static { + fn type_name() -> Cow<'static, str>; + fn parse(object: PdfObject) -> Result; + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + match object { + PdfObject::Null(_) => Ok(None), + PdfObject::Indirect(ref v) if v.get().is_null() => Ok(None), + PdfObject::Boolean(_) + | PdfObject::Integer(_) + | PdfObject::Real(_) + | PdfObject::String(_) + | PdfObject::Name(_) + | PdfObject::Array(_) + | PdfObject::Dictionary(_) + | PdfObject::Stream(_) + | PdfObject::Indirect(_) => Self::parse(object).map(Some), + } + } +} + +impl PdfParse for Option { + fn type_name() -> Cow<'static, str> { + T::type_name() + } + fn parse(object: PdfObject) -> Result { + 
T::parse_option(object) + } + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + if matches!(object, PdfObject::Null(_)) { + Ok(None) + } else { + Self::parse(object).map(Some) + } + } +} + +macro_rules! impl_pdf_parse_prim_int { + ($ty:ident) => { + impl PdfParse for $ty { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed(stringify!($ty)) + } + fn parse(object: PdfObject) -> Result { + let v: PdfInteger = PdfParse::parse(object)?; + v.value() + .try_into() + .map_err(|_| PdfParseError::IntegerOutOfRange { pos: v.pos() }) + } + } + impl PdfParse for NonZero<$ty> { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed(concat!("NonZero<", stringify!($ty), ">")) + } + fn parse(object: PdfObject) -> Result { + let v: PdfInteger = PdfParse::parse(object)?; + v.value() + .try_into() + .ok() + .and_then(NonZero::new) + .ok_or(PdfParseError::IntegerOutOfRange { pos: v.pos() }) + } + } + }; +} + +impl_pdf_parse_prim_int!(u8); +impl_pdf_parse_prim_int!(i8); +impl_pdf_parse_prim_int!(u16); +impl_pdf_parse_prim_int!(i16); +impl_pdf_parse_prim_int!(u32); +impl_pdf_parse_prim_int!(i32); +impl_pdf_parse_prim_int!(u64); +impl_pdf_parse_prim_int!(i64); +impl_pdf_parse_prim_int!(u128); +impl_pdf_parse_prim_int!(usize); +impl_pdf_parse_prim_int!(isize); + +impl PdfParse for i128 { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("i128") + } + fn parse(object: PdfObject) -> Result { + let v: PdfInteger = PdfParse::parse(object)?; + Ok(v.value().into()) + } +} + +impl PdfParse for NonZero { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("NonZero") + } + fn parse(object: PdfObject) -> Result { + let v: PdfInteger = PdfParse::parse(object)?; + NonZero::new(v.value().into()).ok_or(PdfParseError::IntegerOutOfRange { pos: v.pos() }) + } +} + +impl PdfParse for f64 { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("f64") + } + fn parse(object: PdfObject) -> Result { + Ok(::parse(object)?.value()) + } +} + +impl PdfParse for f32 { + fn type_name() 
-> Cow<'static, str> { + Cow::Borrowed("f32") + } + fn parse(object: PdfObject) -> Result { + Ok(::parse(object)? as f32) + } +} + +impl PdfParse for PdfNull { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("null") + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Null(v) => Ok(v), + object => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "null", + }), + } + } + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + Self::parse(object).map(Some) + } +} + +impl PdfParse for PdfObjectNonNull { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("non-null") + } + fn parse(object: PdfObject) -> Result { + Option::::from(object).ok_or(PdfParseError::InvalidType { + pos: PdfInputPosition::empty(), + ty: "null", + expected_ty: "non-null", + }) + } + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + Ok(object.into()) + } +} + +impl PdfParse for PdfObjectDirect { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("direct object") + } + fn parse(object: PdfObject) -> Result { + Ok(object.into()) + } +} + +impl PdfParse for PdfObject { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("object") + } + fn parse(object: PdfObject) -> Result { + Ok(object) + } +} + +impl PdfParse for PdfObjectIndirect { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("indirect object") + } + fn parse(object: PdfObject) -> Result { + match object { + PdfObject::Indirect(v) => Ok(v), + _ => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "indirect object", + }), + } + } + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + match object { + PdfObject::Indirect(v) => Ok(Some(v)), + PdfObject::Null(_) => Ok(None), + _ => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "indirect object", + }), + } + } +} + +impl PdfParse for [T; N] { + fn 
type_name() -> Cow<'static, str> { + Cow::Owned(format!("[{}; {N}]", T::type_name())) + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Array(array) => { + let array_pos = array.pos(); + let elements = array.into_elements(); + let mut elements: Arc<[PdfObject; N]> = + elements.try_into().map_err(|elements: Arc<[PdfObject]>| { + PdfParseError::WrongArrayLength { + pos: array_pos, + len: elements.len(), + expected_len: N, + } + })?; + let elements: Box<[T]> = if let Some(elements) = Arc::get_mut(&mut elements) { + Result::from_iter(elements.iter_mut().map(|v| T::parse(mem::take(v))))? + } else { + Result::from_iter(elements.iter().map(|v| T::parse(v.clone())))? + }; + Ok(*Box::<[T; N]>::try_from(elements) + .ok() + .expect("already checked length")) + } + object => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "array", + }), + } + } +} + +impl PdfParse for Arc<[T]> { + fn type_name() -> Cow<'static, str> { + Cow::Owned(format!("Arc<[{}]>", T::type_name())) + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Array(array) => { + let mut elements = array.into_elements(); + if let Some(retval) = ::downcast_ref::(&elements) { + return Ok(retval.clone()); + } + if let Some(elements) = Arc::get_mut(&mut elements) { + Result::from_iter(elements.iter_mut().map(|v| T::parse(mem::take(v)))) + } else { + Result::from_iter(elements.iter().map(|v| T::parse(v.clone()))) + } + } + PdfObjectDirect::Null(_) => Ok(Self::default()), + object => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "array", + }), + } + } +} + +impl PdfParse for MaybeArray { + fn type_name() -> Cow<'static, str> { + Cow::Owned(format!("MaybeArray<{}>", T::type_name())) + } + fn parse(object: PdfObject) -> Result { + match PdfObjectDirect::from(object) { + PdfObjectDirect::Null(_) => Ok(Self::default()), + 
PdfObjectDirect::Array(object) => Ok(Self(PdfParse::parse(object.into())?)), + object => Ok(Self(Arc::new([PdfParse::parse(object.into())?]))), + } + } +} + +#[macro_export] +macro_rules! pdf_parse { + ( + $(#[$($struct_meta:tt)*])* + $struct_vis:vis struct $Struct:ident$(<$($StructParam:ident $(: $StructBound:tt)? $(= $StructParamDefault:ty)?),* $(,)?>)? { + $(#[pdf $($pdf_meta:tt)*] + $(#[$($field_meta:tt)*])* + $field_vis:vis $field_name:ident: $field_ty:ty,)* + } + ) => { + $(#[$($struct_meta)*])* + $struct_vis struct $Struct$(<$($StructParam $(: $StructBound)? $(= $StructParamDefault)?),*>)? { + $($(#[$($field_meta)*])* + $field_vis $field_name: $field_ty,)* + } + + $crate::pdf::parse::pdf_parse! { + @impl + struct $Struct$(<$($StructParam $(: $StructBound)?),*>)? { + $(#[pdf $($pdf_meta)*] + $(#[$($field_meta)*])* + $field_name: $field_ty,)* + } + } + }; + ( + @impl + struct $Struct:ident$(<$($StructParam:ident $(: $StructBound:tt)?),* $(,)?>)? { + $($(#[$($field_meta:tt)*])* + $field_name:ident: $field_ty:ty,)* + } + ) => { + impl$(<$($StructParam: $crate::pdf::parse::PdfParse $(+ $StructBound)?),*>)? $crate::pdf::parse::PdfParse for $Struct$(<$($StructParam),*>)? { + fn type_name() -> $crate::__std::borrow::Cow<'static, $crate::__std::primitive::str> { + let args: &[$crate::__std::borrow::Cow<'static, $crate::__std::primitive::str>] = &[ + $($(<$StructParam as $crate::pdf::parse::PdfParse>::type_name()),*)? 
+ ]; + if args.is_empty() { + $crate::__std::borrow::Cow::Borrowed($crate::__std::stringify!($Struct)) + } else { + let mut retval = $crate::__std::string::String::new(); + retval.push_str($crate::__std::stringify!($Struct)); + retval.push_str("<"); + let mut first = true; + for arg in args { + if first { + first = false; + } else { + retval.push_str(", "); + } + retval.push_str(arg); + } + retval.push_str(">"); + $crate::__std::borrow::Cow::Owned(retval) + } + } + fn parse(object: $crate::pdf::object::PdfObject) -> $crate::__std::result::Result { + let object = $crate::__std::convert::From::from(object); + let $crate::pdf::object::PdfObjectDirect::Dictionary(object) = object else { + return $crate::__std::result::Result::Err($crate::pdf::parse::PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: $crate::__std::stringify!($Struct), + }); + }; + let pos = object.pos(); + let mut object = object.into_fields(); + let object_mut = $crate::__std::sync::Arc::make_mut(&mut object); + let _ = object_mut; + $($crate::pdf::parse::pdf_parse! { + @impl_struct_field(pos, object, object_mut) + [] + $(#[$($field_meta)*])* + $field_name: $field_ty + })* + $crate::__std::result::Result::Ok(Self { + $($field_name,)* + }) + } + } + }; + ( + @impl_struct_field($pos:ident, $object:ident, $object_mut:ident) + [$(#[$($prev_field_meta:tt)*])*] + #[pdf $pdf_meta:tt] + $(#[$($field_meta:tt)*])* + $field_name:ident: $field_ty:ty + ) => { + $crate::pdf::parse::pdf_parse! { + @impl_struct_field($pos, $object, $object_mut, pdf $pdf_meta) + [$(#[$($prev_field_meta)*])*] + $(#[$($field_meta)*])* + $field_name: $field_ty + } + }; + ( + @impl_struct_field($pos:ident, $object:ident, $object_mut:ident $($pdf_meta:tt)*) + [$(#[$($prev_field_meta:tt)*])*] + #[$($next_field_meta:tt)*] + $(#[$($field_meta:tt)*])* + $field_name:ident: $field_ty:ty + ) => { + $crate::pdf::parse::pdf_parse! 
{ + @impl_struct_field($pos, $object, $object_mut $($pdf_meta)*) + [$(#[$($prev_field_meta)*])* #[$($next_field_meta)*]] + $(#[$($field_meta)*])* + $field_name: $field_ty + } + }; + ( + @impl_struct_field($pos:ident, $object:ident, $object_mut:ident, pdf(flatten)) + [$(#[$($field_meta:tt)*])*] + $field_name:ident: $field_ty:ty + ) => { + let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( + $crate::pdf::object::PdfObject::Dictionary( + $crate::pdf::object::PdfDictionary::from_fields($pos, $object), + ), + )?; + }; + ( + @impl_struct_field($pos:ident, $object:ident, $object_mut:ident, pdf(name = $name:expr)) + [$(#[$($field_meta:tt)*])*] + $field_name:ident: $field_ty:ty + ) => { + let $field_name = $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ); + let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( + $object_mut + .remove(&$field_name) + .unwrap_or($crate::pdf::object::PdfObject::Null($crate::pdf::object::PdfNull::new($pos))), + )?; + }; + ( + $(#[$($enum_meta:tt)*])* + $enum_vis:vis enum $Enum:ident { + $(#[pdf $($pdf_meta:tt)*] + $(#[$($variant_meta:tt)*])* + $VariantName:ident $(($($variant_paren_body:tt)*))? $({$($variant_brace_body:tt)*})?,)* + } + ) => { + $(#[$($enum_meta)*])* + $enum_vis enum $Enum { + $($(#[$($variant_meta)*])* + $VariantName $(($($variant_paren_body)*))? $({$($variant_brace_body)*})?,)* + } + + $crate::pdf::parse::pdf_parse! { + @impl + $(#[$($enum_meta)*])* + enum $Enum { + $(#[pdf $($pdf_meta)*] + $(#[$($variant_meta)*])* + $VariantName $(($($variant_paren_body)*))? $({$($variant_brace_body)*})?,)* + } + } + }; + ( + @impl + $(#[$($enum_meta:tt)*])* + enum $Enum:ident { + $(#[pdf(name = $name:expr)] + $(#[$($variant_meta:tt)*])* + $VariantName:ident,)* + $(#[pdf(other)] + $(#[$($variant_meta_other:tt)*])* + $VariantNameOther:ident($($PdfName:tt)*),)? 
+ } + ) => { + impl $crate::__std::convert::From<$Enum> for $crate::pdf::object::PdfName { + fn from(value: $Enum) -> Self { + match value { + $($Enum::$VariantName => $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ),)* + $($Enum::$VariantNameOther(v) => $crate::__std::convert::Into::into(v),)? + } + } + } + + $crate::pdf::parse::pdf_parse! { + @impl_try_from + $(#[$($enum_meta)*])* + enum $Enum { + $(#[pdf(name = $name)] + $(#[$($variant_meta)*])* + $VariantName,)* + $(#[pdf(other)] + $(#[$($variant_meta_other)*])* + $VariantNameOther($($PdfName)*),)? + } + } + + impl $crate::pdf::parse::PdfParse for $Enum { + fn type_name() -> $crate::__std::borrow::Cow<'static, $crate::__std::primitive::str> { + $crate::__std::borrow::Cow::Borrowed($crate::__std::stringify!($Struct)) + } + fn parse(object: $crate::pdf::object::PdfObject) -> $crate::__std::result::Result { + let object = $crate::__std::convert::From::from(object); + let $crate::pdf::object::PdfObjectDirect::Name(name) = object else { + return $crate::__std::result::Result::Err($crate::pdf::parse::PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: $crate::__std::stringify!($Struct), + }); + }; + $crate::__std::result::Result::Ok($crate::__std::convert::TryInto::<$Enum>::try_into(name)?) 
+ } + } + }; + ( + @impl_try_from + $(#[$($enum_meta:tt)*])* + enum $Enum:ident { + $(#[pdf(name = $name:expr)] + $(#[$($variant_meta:tt)*])* + $VariantName:ident,)* + #[pdf(other)] + $(#[$($variant_meta_other:tt)*])* + $VariantNameOther:ident(PdfName), + } + ) => { + impl $crate::__std::convert::From<$crate::pdf::object::PdfName> for $Enum { + fn from(name: $crate::pdf::object::PdfName) -> Self { + $(if name == $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ) { + $Enum::$VariantName + } else)* { + $Enum::$VariantNameOther(name) + } + } + } + }; + ( + @impl_try_from + $(#[$($enum_meta:tt)*])* + enum $Enum:ident { + $(#[pdf(name = $name:expr)] + $(#[$($variant_meta:tt)*])* + $VariantName:ident,)* + #[pdf(other)] + $(#[$($variant_meta_other:tt)*])* + $VariantNameOther:ident($PdfName:ty), + } + ) => { + impl $crate::__std::convert::TryFrom<$crate::pdf::object::PdfName> for $Enum { + type Error = $crate::pdf::parse::PdfParseError; + + fn try_from(name: $crate::pdf::object::PdfName) -> $crate::__std::result::Result { + $(if name == $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ) { + $crate::__std::result::Result::Ok($Enum::$VariantName) + } else)* { + $crate::__std::result::Result::Ok($Enum::$VariantNameOther($crate::__std::convert::TryInto::<$PdfName>::try_into(name)?)) + } + } + } + }; + ( + @impl_try_from + $(#[$($enum_meta:tt)*])* + enum $Enum:ident { + $(#[pdf(name = $name:expr)] + $(#[$($variant_meta:tt)*])* + $VariantName:ident,)* + } + ) => { + impl $crate::__std::convert::TryFrom<$crate::pdf::object::PdfName> for $Enum { + type Error = $crate::pdf::parse::PdfParseError; + + fn try_from(name: $crate::pdf::object::PdfName) -> $crate::__std::result::Result { + $(if name == $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($name), + ) { + $crate::__std::result::Result::Ok($Enum::$VariantName) + } else)* { + 
$crate::__std::result::Result::Err($crate::pdf::parse::PdfParseError::InvalidName { + pos: name.pos(), + name, + expected_ty: $crate::__std::stringify!($Struct), + }) + } + } + } + }; +} + +pub use pdf_parse; diff --git a/src/util.rs b/src/util.rs index e234a7d..a7a4978 100644 --- a/src/util.rs +++ b/src/util.rs @@ -5,7 +5,7 @@ use std::{ sync::Arc, }; -pub(crate) enum ArcOrRef<'a, T: ?Sized> { +pub enum ArcOrRef<'a, T: ?Sized> { Arc(Arc), Ref(&'a T), } From 83631cc4c6f315ffe3cdb949c143ae174b0069a4 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Wed, 24 Dec 2025 21:49:57 -0800 Subject: [PATCH 03/42] parses root successfully --- Cargo.lock | 50 +++++ Cargo.toml | 1 + src/pdf.rs | 154 +++++++++++--- src/pdf/object.rs | 345 ++++++++++++++++++++++++-------- src/pdf/parse.rs | 53 ++++- src/pdf/stream_filters.rs | 65 ++++++ src/pdf/stream_filters/flate.rs | 73 +++++++ 7 files changed, 623 insertions(+), 118 deletions(-) create mode 100644 src/pdf/stream_filters.rs create mode 100644 src/pdf/stream_filters/flate.rs diff --git a/Cargo.lock b/Cargo.lock index 3cb67c2..07f112f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,56 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "flate2" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + [[package]] name = "parse_powerisa_pdf" version = "0.1.0" +dependencies = [ + "flate2", +] + +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" diff --git a/Cargo.toml b/Cargo.toml index 125e5e2..c5d18eb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,4 +5,5 @@ edition = "2024" license = "LGPL-3.0-or-later" [dependencies] +flate2 = "1.1.5" diff --git a/src/pdf.rs b/src/pdf.rs index 4ba785b..d4a4922 100644 --- a/src/pdf.rs +++ b/src/pdf.rs @@ -1,11 +1,11 @@ use crate::{ pdf::{ object::{ - MaybeArray, PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, - PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfReal, PdfStream, + PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, 
PdfObject, + PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream, PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary, }, - parse::{PdfInputPosition, PdfParse, PdfParseError}, + parse::{GetPdfInputPosition, PdfInputPosition, PdfParse, PdfParseError}, }, pdf_parse, util::ArcOrRef, @@ -21,9 +21,15 @@ use std::{ pub mod object; pub mod parse; +pub mod stream_filters; + +struct PdfObjectsInner { + objects: BTreeMap, + object_streams: Vec>, +} pub struct PdfObjects { - objects: OnceLock>, + inner: OnceLock, } #[derive(Copy, Clone, Debug)] @@ -70,24 +76,12 @@ pdf_parse! { pub struct PdfXRefStreamDictionaryRest { #[pdf(name = "Type")] pub ty: PdfXRefName, - #[pdf(name = "Size")] - pub size: usize, #[pdf(name = "Index")] pub index: Option>, - #[pdf(name = "Prev")] - pub prev: Option, #[pdf(name = "W")] pub w: Option>, - #[pdf(name = "Root")] - pub root: PdfDictionary, - #[pdf(name = "Encrypt")] - pub encrypt: Option, - #[pdf(name = "Info")] - pub info: Option, - #[pdf(name = "ID")] - pub id: Option<[PdfString; 2]>, #[pdf(flatten)] - pub rest: PdfDictionary, + pub rest: PdfTrailerDictionary, } } @@ -308,6 +302,38 @@ struct PdfParser<'a> { } impl<'a> PdfParser<'a> { + fn with_tokenizer<'b, R>( + &mut self, + tokenizer: PdfTokenizer<'b>, + f: impl FnOnce(&mut PdfParser<'b>) -> R, + ) -> R { + let PdfParser { + objects_arc, + objects_map, + unparsed_stream_dictionaries, + tokenizer: _, + } = self; + let objects_arc = objects_arc.clone(); + let objects_map = std::mem::take(objects_map); + let unparsed_stream_dictionaries = std::mem::take(unparsed_stream_dictionaries); + let mut new_parser = PdfParser { + objects_arc, + objects_map, + unparsed_stream_dictionaries, + tokenizer, + }; + let retval = f(&mut new_parser); + let PdfParser { + objects_arc, + objects_map, + unparsed_stream_dictionaries, + tokenizer: _, + } = new_parser; + self.objects_arc = objects_arc; + self.objects_map = objects_map; + self.unparsed_stream_dictionaries = 
unparsed_stream_dictionaries; + retval + } fn parse_header(&mut self) -> Result { let Some(b'%') = self.tokenizer.bytes.first() else { return Err(PdfParseError::NotAPdfFile); @@ -739,18 +765,94 @@ impl<'a> PdfParser<'a> { Ok(Some(())) } } + fn parse_object_stream_inner( + &mut self, + object_stream: &PdfStream, + ) -> Result<(), PdfParseError> { + let mut object_ids_and_byte_positions = + Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n); + for _ in 0..object_stream.dictionary().rest.n { + self.skip_comments_and_whitespace(); + let Some((pos, object_number)) = + self.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))? + else { + return Err(PdfParseError::InvalidObjectNumber { + pos: self.tokenizer.pos(), + }); + }; + self.skip_comments_and_whitespace(); + let Some((_, byte_position)) = + self.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))? + else { + return Err(PdfParseError::InvalidNumber { + pos: self.tokenizer.pos(), + }); + }; + object_ids_and_byte_positions.push(( + PdfObjectIdentifier { + pos: pos.into(), + object_number, + generation_number: 0, + }, + byte_position, + )); + } + for (id, _byte_position) in object_ids_and_byte_positions { + let object = self.parse_object()?; + if self.objects_map.insert(id, object).is_some() { + return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }); + } + } + Ok(()) + } + fn parse_object_stream( + &mut self, + object_stream: &PdfStream, + ) -> Result<(), PdfParseError> { + let data = object_stream.decoded_data().as_ref()?; + self.with_tokenizer(PdfTokenizer::new(data, 0), |parser| { + parser.parse_object_stream_inner(object_stream) + }) + .map_err(|e| PdfParseError::ObjectStreamParseError { + stream_pos: object_stream.get_pdf_input_position(), + error: Arc::new(e), + }) + } fn parse_body(&mut self) -> Result<(), PdfParseError> { while let Some(()) = self.parse_indirect_object_definition()? 
{} - let Ok(()) = self - .objects_arc - .objects - .set(std::mem::take(&mut self.objects_map)) - else { - unreachable!(); - }; self.unparsed_stream_dictionaries .drain(..) - .try_for_each(|v| v.finish_parsing()) + .try_for_each(|v| v.finish_parsing())?; + let mut object_streams: Vec> = Vec::new(); + for object in self.objects_map.values_mut() { + let stream = match object { + PdfObject::Stream(stream) => stream, + PdfObject::Boolean(_) + | PdfObject::Integer(_) + | PdfObject::Real(_) + | PdfObject::String(_) + | PdfObject::Name(_) + | PdfObject::Array(_) + | PdfObject::Dictionary(_) + | PdfObject::Null(_) + | PdfObject::Indirect(_) => continue, + }; + if PdfObjectStreamDictionary::parse_type_from_dictionary(&stream.dictionary().rest) + .is_ok() + { + object_streams.push(PdfStream::parse(object.clone())?); + } + } + for object_stream in &object_streams { + self.parse_object_stream(object_stream)?; + } + let Ok(()) = self.objects_arc.inner.set(PdfObjectsInner { + objects: std::mem::take(&mut self.objects_map), + object_streams, + }) else { + unreachable!(); + }; + Ok(()) } fn parse_xref_table(&mut self) -> Result<(), PdfParseError> { self.skip_comments_and_whitespace(); @@ -844,7 +946,7 @@ impl Pdf { pub fn parse(bytes: impl AsRef<[u8]>) -> Result { PdfParser { objects_arc: Arc::new(PdfObjects { - objects: OnceLock::new(), + inner: OnceLock::new(), }), objects_map: BTreeMap::new(), unparsed_stream_dictionaries: vec![], diff --git a/src/pdf/object.rs b/src/pdf/object.rs index d3979d8..2a17df5 100644 --- a/src/pdf/object.rs +++ b/src/pdf/object.rs @@ -5,15 +5,17 @@ use crate::{ GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse, PdfParseError, }, + stream_filters::PdfStreamFilter, }, pdf_parse, util::ArcOrRef, }; use std::{ any::TypeId, - borrow::Cow, + borrow::{Borrow, Cow}, collections::BTreeMap, fmt::{self, Write}, + iter::FusedIterator, num::NonZero, sync::{Arc, OnceLock}, }; @@ -61,6 +63,12 @@ pub struct PdfName { bytes: ArcOrRef<'static, 
[u8]>, } +impl Borrow<[u8]> for PdfName { + fn borrow(&self) -> &[u8] { + &self.bytes + } +} + impl PdfName { pub fn try_new( pos: impl Into, @@ -218,24 +226,51 @@ macro_rules! make_pdf_object { $Variant:ident($ty:ty), )+ ) => { - #[derive(Clone, Debug)] + #[derive(Clone)] pub enum PdfObjectNonNull { $($Variant($ty),)* } - #[derive(Clone, Debug)] + impl fmt::Debug for PdfObjectNonNull { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + $(Self::$Variant(v) => v.fmt(f),)* + } + } + } + + #[derive(Clone)] pub enum PdfObjectDirect { $($Variant($ty),)* Null(PdfNull), } - #[derive(Clone, Debug)] + impl fmt::Debug for PdfObjectDirect { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + $(Self::$Variant(v) => v.fmt(f),)* + Self::Null(v) => v.fmt(f), + } + } + } + + #[derive(Clone)] pub enum PdfObject { $($Variant($ty),)* Null(PdfNull), Indirect(PdfObjectIndirect), } + impl fmt::Debug for PdfObject { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + $(Self::$Variant(v) => v.fmt(f),)* + Self::Null(v) => v.fmt(f), + Self::Indirect(v) => v.fmt(f), + } + } + } + $( impl From<$ty> for PdfObjectNonNull { fn from(value: $ty) -> Self { @@ -546,12 +581,12 @@ impl PdfObjectIndirect { } pub fn get(&self) -> PdfObjectDirect { if let Some(objects) = self.objects.upgrade() { - if let Some(objects) = objects.objects.get() { + if let Some(objects) = objects.inner.get() { let final_id = self.final_id.get().copied(); let limit = if final_id.is_some() { 1 } else { 1000usize }; let mut id = final_id.unwrap_or(self.id); for _ in 0..limit { - if let Some(object) = objects.get(&self.id) { + if let Some(object) = objects.objects.get(&self.id) { let retval = match object { PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v), PdfObject::Integer(v) => PdfObjectDirect::Integer(*v), @@ -628,18 +663,27 @@ impl PdfDictionary { } pub fn contains_key(&self, key: &Q) -> bool where - PdfName: std::borrow::Borrow + Ord, + 
PdfName: std::borrow::Borrow, Q: Ord, { self.fields.contains_key(key) } pub fn get(&self, key: &Q) -> Option<&PdfObject> where - PdfName: std::borrow::Borrow + Ord, + PdfName: std::borrow::Borrow, Q: Ord, { self.fields.get(key) } + pub fn get_or_null(&self, key: &Q) -> PdfObject + where + PdfName: std::borrow::Borrow, + Q: Ord, + { + self.get(key) + .cloned() + .unwrap_or(PdfObject::Null(PdfNull(self.pos))) + } pub fn pos(&self) -> PdfInputPosition { self.pos.0 } @@ -842,35 +886,6 @@ impl std::ops::DerefMut for MaybeArray { } } -pdf_parse! { - #[derive(Clone, Debug, PartialEq, Eq)] - #[non_exhaustive] - pub enum PdfStreamFilter { - #[pdf(name = "ASCIIHexDecode")] - AsciiHexDecode, - #[pdf(name = "ASCII85Decode")] - Ascii85Decode, - #[pdf(name = "LZWDecode")] - LzwDecode, - #[pdf(name = "FlateDecode")] - FlateDecode, - #[pdf(name = "RunLengthDecode")] - RunLengthDecode, - #[pdf(name = "CCITTFaxDecode")] - CcittFaxDecode, - #[pdf(name = "JBIG2Decode")] - Jbig2Decode, - #[pdf(name = "DCTDecode")] - DctDecode, - #[pdf(name = "JPXDecode")] - JpxDecode, - #[pdf(name = "Crypt")] - Crypt, - #[pdf(other)] - Unknown(PdfName), - } -} - impl Default for MaybeArray { fn default() -> Self { Self(Arc::default()) @@ -936,47 +951,101 @@ impl PdfStreamDictionary { pub(crate) fn parse_len_from_dictionary( dictionary: &PdfDictionary, ) -> Result { - PdfParse::parse( - dictionary - .get(&PdfName::new_static(Self::LENGTH_NAME.as_bytes())) - .cloned() - .unwrap_or_default(), + PdfParse::parse(dictionary.get_or_null(Self::LENGTH_NAME.as_bytes())) + } +} + +#[derive(Debug, Clone, Default)] +pub struct PdfStreamDictionaryFiltersAndParms<'a> { + filters: std::iter::Enumerate>, + decode_parms: &'a [Option], +} + +impl<'a> PdfStreamDictionaryFiltersAndParms<'a> { + fn item_helper( + filter: (usize, &'a PdfStreamFilter), + decode_parms: &'a [Option], + ) -> (&'a PdfStreamFilter, &'a PdfDictionary) { + static EMPTY_DICTIONARY: OnceLock = OnceLock::new(); + let (index, filter) = filter; + ( + 
filter, + match decode_parms.get(index) { + Some(Some(v)) => v, + _ => EMPTY_DICTIONARY.get_or_init(PdfDictionary::default), + }, ) } } -impl PdfStreamDictionary { - pub fn filters_and_parms( - &self, - ) -> impl Clone + ExactSizeIterator + DoubleEndedIterator - { - self.filters.iter().enumerate().map(|(index, filter)| { - ( - filter.clone(), - self.decode_parms - .0 - .get(index) - .cloned() - .flatten() - .unwrap_or_default(), - ) - }) +impl<'a> Iterator for PdfStreamDictionaryFiltersAndParms<'a> { + type Item = (&'a PdfStreamFilter, &'a PdfDictionary); + + fn next(&mut self) -> Option { + self.filters + .next() + .map(|filter| Self::item_helper(filter, self.decode_parms)) } - pub fn file_filters_and_parms( - &self, - ) -> impl Clone + ExactSizeIterator + DoubleEndedIterator + + fn size_hint(&self) -> (usize, Option) { + self.filters.size_hint() + } + + fn nth(&mut self, n: usize) -> Option { + self.filters + .nth(n) + .map(|filter| Self::item_helper(filter, self.decode_parms)) + } + + fn fold(self, init: B, f: F) -> B + where + F: FnMut(B, Self::Item) -> B, { - self.file_filters.iter().enumerate().map(|(index, filter)| { - ( - filter.clone(), - self.file_decode_parms - .0 - .get(index) - .cloned() - .flatten() - .unwrap_or_default(), - ) - }) + self.filters + .map(|filter| Self::item_helper(filter, self.decode_parms)) + .fold(init, f) + } +} + +impl<'a> FusedIterator for PdfStreamDictionaryFiltersAndParms<'a> {} + +impl<'a> ExactSizeIterator for PdfStreamDictionaryFiltersAndParms<'a> {} + +impl<'a> DoubleEndedIterator for PdfStreamDictionaryFiltersAndParms<'a> { + fn next_back(&mut self) -> Option { + self.filters + .next_back() + .map(|filter| Self::item_helper(filter, self.decode_parms)) + } + + fn nth_back(&mut self, n: usize) -> Option { + self.filters + .nth_back(n) + .map(|filter| Self::item_helper(filter, self.decode_parms)) + } + + fn rfold(self, init: B, f: F) -> B + where + F: FnMut(B, Self::Item) -> B, + { + self.filters + .map(|filter| 
Self::item_helper(filter, self.decode_parms)) + .rfold(init, f) + } +} + +impl PdfStreamDictionary { + pub fn filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> { + PdfStreamDictionaryFiltersAndParms { + filters: self.filters.iter().enumerate(), + decode_parms: &self.decode_parms, + } + } + pub fn file_filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> { + PdfStreamDictionaryFiltersAndParms { + filters: self.file_filters.iter().enumerate(), + decode_parms: &self.file_decode_parms, + } } } @@ -1001,16 +1070,64 @@ impl UnparsedPdfStreamDictionary { pub struct PdfStream { pos: PdfInputPositionNoCompare, dictionary: Arc>>, - data: Arc<[u8]>, + encoded_data: Arc<[u8]>, + decoded_data: Arc, PdfParseError>>>, +} + +struct DumpBytes<'a>(&'a [u8]); + +impl<'a> fmt::Debug for DumpBytes<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +impl fmt::Display for DumpBytes<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut first = true; + let mut fmt_chunk = |chunk: &[u8]| -> fmt::Result { + if first { + first = false; + } else { + f.write_str("\n")?; + } + write!(f, "\"{}\"", chunk.escape_ascii()) + }; + if self.0.is_empty() { + return fmt_chunk(self.0); + } + for chunk in self.0.chunks(32) { + fmt_chunk(chunk)?; + } + Ok(()) + } } impl fmt::Debug for PdfStream { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("PdfStream") - .field("pos", &self.pos) - .field("dictionary", &self.dictionary) - .field("data", &format_args!("{:02x?}", self.data)) - .finish() + let Self { + pos, + dictionary, + encoded_data, + decoded_data, + } = self; + let mut debug_struct = f.debug_struct("PdfStream"); + debug_struct.field("pos", pos); + if let Some(dictionary) = dictionary.get() { + debug_struct.field("dictionary", dictionary); + } else { + debug_struct.field("dictionary", &format_args!("")); + } + debug_struct.field("encoded_data", 
&DumpBytes(encoded_data)); + if let Some(decoded_data) = decoded_data.get() { + match decoded_data { + Ok(decoded_data) => debug_struct.field("decoded_data", &DumpBytes(decoded_data)), + Err(e) => debug_struct.field("decoded_data", &Err::<(), _>(e)), + }; + } else { + debug_struct.field("decoded_data", &format_args!("")); + } + debug_struct.finish() } } @@ -1018,25 +1135,27 @@ impl PdfStream { pub fn new( pos: impl Into, dictionary: PdfStreamDictionary, - data: Arc<[u8]>, + encoded_data: Arc<[u8]>, ) -> Self { Self { pos: pos.into(), dictionary: Arc::new(OnceLock::from(dictionary)), - data, + encoded_data, + decoded_data: Arc::new(OnceLock::new()), } } pub(crate) fn new_unparsed( pos: impl Into, unparsed_dictionary: PdfDictionary, - data: Arc<[u8]>, + encoded_data: Arc<[u8]>, ) -> (Self, UnparsedPdfStreamDictionary) { let dictionary = Arc::new(OnceLock::new()); ( Self { pos: pos.into(), dictionary: dictionary.clone(), - data, + encoded_data, + decoded_data: Arc::new(OnceLock::new()), }, UnparsedPdfStreamDictionary { unparsed_dictionary, @@ -1049,8 +1168,29 @@ impl PdfStream { .get() .expect("haven't finished parsing all pdf object definitions yet") } - pub fn data(&self) -> &Arc<[u8]> { - &self.data + pub fn encoded_data(&self) -> &Arc<[u8]> { + &self.encoded_data + } + fn try_decode_data(&self) -> Result, PdfParseError> { + let dictionary = self.dictionary(); + let (data, filters) = if let Some(file) = &dictionary.file { + todo!() + } else { + (&self.encoded_data, dictionary.filters_and_parms()) + }; + if filters.len() == 0 { + return Ok(data.clone()); + } + let mut data: &[u8] = data; + let mut buffer; + for (filter, filter_parms) in filters { + buffer = filter.decode_stream_data(filter_parms.clone(), self.pos.0, &data)?; + data = &buffer; + } + Ok(Arc::from(data)) + } + pub fn decoded_data(&self) -> &Result, PdfParseError> { + self.decoded_data.get_or_init(|| self.try_decode_data()) } } @@ -1099,7 +1239,8 @@ impl PdfParse for PdfStream { rest: 
Rest::parse(rest.clone().into())?, })) }, - data: stream.data, + encoded_data: stream.encoded_data, + decoded_data: stream.decoded_data, }), object => Err(PdfParseError::InvalidType { pos: object.get_pdf_input_position(), @@ -1109,3 +1250,37 @@ impl PdfParse for PdfStream { } } } + +pdf_parse! { + #[derive(Clone, Copy, Debug, Hash, Default, PartialEq, Eq, PartialOrd, Ord)] + pub enum PdfObjectStreamType { + #[pdf(name = "ObjStm")] + #[default] + ObjStm, + } +} + +pdf_parse! { + #[derive(Clone, Debug)] + pub struct PdfObjectStreamDictionary { + #[pdf(name = Self::TYPE_NAME)] + pub ty: PdfObjectStreamType, + #[pdf(name = "N")] + pub n: usize, + #[pdf(name = "First")] + pub first: usize, + #[pdf(name = "Extends")] + pub extends: Option, + #[pdf(flatten)] + pub rest: PdfDictionary, + } +} + +impl PdfObjectStreamDictionary { + pub const TYPE_NAME: &str = "Type"; + pub(crate) fn parse_type_from_dictionary( + dictionary: &PdfDictionary, + ) -> Result { + PdfParse::parse(dictionary.get_or_null(Self::TYPE_NAME.as_bytes())) + } +} diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs index aa5bc3d..2287dbf 100644 --- a/src/pdf/parse.rs +++ b/src/pdf/parse.rs @@ -144,7 +144,7 @@ impl PartialEq for PdfInputPositionNoCompare { } } -#[derive(Debug)] +#[derive(Debug, Clone)] #[non_exhaustive] pub enum PdfParseError { Custom(String), @@ -231,6 +231,19 @@ pub enum PdfParseError { pos: PdfInputPosition, start_xref: usize, }, + UnknownStreamFilter { + pos: PdfInputPosition, + filter: PdfName, + }, + StreamFilterError { + pos: PdfInputPosition, + filter: PdfName, + error: String, + }, + ObjectStreamParseError { + stream_pos: PdfInputPosition, + error: Arc, + }, } impl From for PdfParseError { @@ -239,6 +252,12 @@ impl From for PdfParseError { } } +impl<'a> From<&'a Self> for PdfParseError { + fn from(value: &'a Self) -> Self { + value.clone() + } +} + impl GetPdfInputPosition for PdfParseError { fn get_pdf_input_position(&self) -> PdfInputPosition { match *self { @@ -266,7 +285,12 @@ impl 
GetPdfInputPosition for PdfParseError { | PdfParseError::MissingStartXRefValue { pos } | PdfParseError::MissingEofComment { pos } | PdfParseError::UnexpectedByte { pos, .. } - | PdfParseError::InvalidStartXRefValue { pos, .. } => pos, + | PdfParseError::InvalidStartXRefValue { pos, .. } + | PdfParseError::UnknownStreamFilter { pos, .. } + | PdfParseError::StreamFilterError { pos, .. } + | PdfParseError::ObjectStreamParseError { + stream_pos: pos, .. + } => pos, } } } @@ -376,7 +400,7 @@ impl fmt::Display for PdfParseError { write!(f, "at {pos}: missing `%%EOF` comment") } PdfParseError::UnexpectedByte { pos, byte } => { - write!(f, "at {pos}: unexpected byte {}", byte.escape_ascii()) + write!(f, "at {pos}: unexpected byte '{}'", byte.escape_ascii()) } PdfParseError::InvalidStartXRefValue { pos, start_xref } => { write!( @@ -384,6 +408,23 @@ impl fmt::Display for PdfParseError { "at {pos}: invalid `startxref` value: {start_xref} ({start_xref:#x})" ) } + PdfParseError::UnknownStreamFilter { pos, ref filter } => { + write!(f, "at {pos}: unknown stream filter: {filter}") + } + PdfParseError::StreamFilterError { + pos, + ref filter, + ref error, + } => { + write!(f, "at {pos}: stream filter {filter} error: {error}") + } + PdfParseError::ObjectStreamParseError { + stream_pos, + ref error, + } => { + write!(f, "at {stream_pos}: object stream error: ")?; + error.fmt(f) + } } } } @@ -785,12 +826,10 @@ macro_rules! 
pdf_parse { [$(#[$($field_meta:tt)*])*] $field_name:ident: $field_ty:ty ) => { - let $field_name = $crate::pdf::object::PdfName::new_static( - $crate::__std::convert::AsRef::<[u8]>::as_ref($name), - ); + let $field_name = $crate::__std::convert::AsRef::<[u8]>::as_ref($name); let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( $object_mut - .remove(&$field_name) + .remove($field_name) .unwrap_or($crate::pdf::object::PdfObject::Null($crate::pdf::object::PdfNull::new($pos))), )?; }; diff --git a/src/pdf/stream_filters.rs b/src/pdf/stream_filters.rs new file mode 100644 index 0000000..51a3884 --- /dev/null +++ b/src/pdf/stream_filters.rs @@ -0,0 +1,65 @@ +use crate::pdf::{ + object::{PdfDictionary, PdfName}, + parse::{PdfInputPosition, PdfParse, PdfParseError}, + pdf_parse, +}; + +pub mod flate; + +pdf_parse! { + #[derive(Clone, Debug, PartialEq, Eq)] + #[non_exhaustive] + pub enum PdfStreamFilter { + #[pdf(name = "ASCIIHexDecode")] + AsciiHexDecode, + #[pdf(name = "ASCII85Decode")] + Ascii85Decode, + #[pdf(name = "LZWDecode")] + LzwDecode, + #[pdf(name = "FlateDecode")] + FlateDecode, + #[pdf(name = "RunLengthDecode")] + RunLengthDecode, + #[pdf(name = "CCITTFaxDecode")] + CcittFaxDecode, + #[pdf(name = "JBIG2Decode")] + Jbig2Decode, + #[pdf(name = "DCTDecode")] + DctDecode, + #[pdf(name = "JPXDecode")] + JpxDecode, + #[pdf(name = "Crypt")] + Crypt, + #[pdf(other)] + Unknown(PdfName), + } +} + +impl PdfStreamFilter { + pub fn decode_stream_data( + &self, + filter_parms: PdfDictionary, + stream_pos: PdfInputPosition, + encoded_data: &[u8], + ) -> Result, PdfParseError> { + match self { + PdfStreamFilter::AsciiHexDecode => todo!(), + PdfStreamFilter::Ascii85Decode => todo!(), + PdfStreamFilter::LzwDecode => todo!(), + PdfStreamFilter::FlateDecode => { + flate::PdfFilterParmsFlateDecode::parse(filter_parms.into())? 
+ .decode_stream_data(stream_pos, encoded_data) + } + PdfStreamFilter::RunLengthDecode => todo!(), + PdfStreamFilter::CcittFaxDecode => todo!(), + PdfStreamFilter::Jbig2Decode => todo!(), + PdfStreamFilter::DctDecode => todo!(), + PdfStreamFilter::JpxDecode => todo!(), + PdfStreamFilter::Crypt => todo!(), + PdfStreamFilter::Unknown(filter) => Err(PdfParseError::UnknownStreamFilter { + pos: stream_pos, + filter: filter.clone(), + }), + } + } +} diff --git a/src/pdf/stream_filters/flate.rs b/src/pdf/stream_filters/flate.rs new file mode 100644 index 0000000..46d01a8 --- /dev/null +++ b/src/pdf/stream_filters/flate.rs @@ -0,0 +1,73 @@ +use crate::pdf::{ + object::PdfDictionary, + parse::{PdfInputPosition, PdfParseError}, + pdf_parse, + stream_filters::PdfStreamFilter, +}; +use std::{io::Read, num::NonZero}; + +pdf_parse! { + #[derive(Clone, Debug, Default)] + pub struct PdfFilterParmsFlateDecode { + #[pdf(name = "Predictor")] + pub predictor: Option>, + #[pdf(name = "Colors")] + pub colors: Option>, + #[pdf(name = "BitsPerComponent")] + pub bits_per_component: Option>, + #[pdf(name = "Columns")] + pub columns: Option>, + #[pdf(flatten)] + pub rest: PdfDictionary, + } +} + +impl PdfFilterParmsFlateDecode { + pub const FILTER: PdfStreamFilter = PdfStreamFilter::FlateDecode; + pub const DEFAULT_PREDICTOR: NonZero = const { NonZero::new(1).unwrap() }; + pub const DEFAULT_COLORS: NonZero = const { NonZero::new(1).unwrap() }; + pub const DEFAULT_BITS_PER_COMPONENT: NonZero = const { NonZero::new(8).unwrap() }; + pub const DEFAULT_COLUMNS: NonZero = const { NonZero::new(1).unwrap() }; + pub fn predictor(&self) -> NonZero { + self.predictor.unwrap_or(Self::DEFAULT_PREDICTOR) + } + pub fn colors(&self) -> NonZero { + self.colors.unwrap_or(Self::DEFAULT_COLORS) + } + pub fn bits_per_component(&self) -> NonZero { + self.bits_per_component + .unwrap_or(Self::DEFAULT_BITS_PER_COMPONENT) + } + pub fn columns(&self) -> NonZero { + self.columns.unwrap_or(Self::DEFAULT_COLUMNS) + } + 
pub fn decode_stream_data( + &self, + stream_pos: PdfInputPosition, + encoded_data: &[u8], + ) -> Result, PdfParseError> { + let Self { + predictor: _, + colors: _, + bits_per_component: _, + columns: _, + rest: _, + } = self; + let mut decoded_data = vec![]; + flate2::bufread::ZlibDecoder::new(encoded_data) + .read_to_end(&mut decoded_data) + .map_err(|e| PdfParseError::StreamFilterError { + pos: stream_pos, + filter: Self::FILTER.into(), + error: e.to_string(), + })?; + let predictor = self.predictor(); + let colors = self.colors(); + let bits_per_component = self.bits_per_component(); + let columns = self.columns(); + match predictor { + Self::DEFAULT_PREDICTOR => Ok(decoded_data), + _ => todo!("{predictor}"), + } + } +} From e0993fdb4a609120e0c37bc7e00cce7d218e2c26 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Fri, 26 Dec 2025 01:13:52 -0800 Subject: [PATCH 04/42] parsing more of the pdf structure --- src/main.rs | 11 +- src/pdf.rs | 98 ++++-- src/pdf/content_stream.rs | 6 + src/pdf/document_structure.rs | 501 +++++++++++++++++++++++++++++ src/pdf/font.rs | 236 ++++++++++++++ src/pdf/object.rs | 545 +++++++++++++++++++++++++++++--- src/pdf/parse.rs | 250 +++++++++++++-- src/pdf/stream_filters.rs | 1 + src/pdf/stream_filters/flate.rs | 1 + 9 files changed, 1549 insertions(+), 100 deletions(-) create mode 100644 src/pdf/content_stream.rs create mode 100644 src/pdf/document_structure.rs create mode 100644 src/pdf/font.rs diff --git a/src/main.rs b/src/main.rs index 34539a3..533a408 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,4 @@ -use parse_powerisa_pdf::pdf::{Pdf, PdfTrailer}; +use parse_powerisa_pdf::pdf::Pdf; use std::{ error::Error, io::{IsTerminal, Read}, @@ -32,14 +32,7 @@ fn main() -> Result> { buf }; let pdf = Pdf::parse(input)?; - if let PdfTrailer::Stream { - xref_stream, - start_xref, - } = pdf.trailer - { - dbg!(xref_stream.dictionary()); - } - + println!("{:#?}", pdf.trailer.trailer_dictionary()); todo!(); Ok(ExitCode::SUCCESS) } diff 
--git a/src/pdf.rs b/src/pdf.rs index d4a4922..0ccd293 100644 --- a/src/pdf.rs +++ b/src/pdf.rs @@ -1,11 +1,14 @@ use crate::{ pdf::{ + document_structure::PdfDocumentCatalog, object::{ PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream, PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary, }, - parse::{GetPdfInputPosition, PdfInputPosition, PdfParse, PdfParseError}, + parse::{ + GetPdfInputPosition, PdfInputPosition, PdfInputPositionKnown, PdfParse, PdfParseError, + }, }, pdf_parse, util::ArcOrRef, @@ -19,12 +22,16 @@ use std::{ sync::{Arc, OnceLock}, }; +pub mod content_stream; +pub mod document_structure; +pub mod font; pub mod object; pub mod parse; pub mod stream_filters; struct PdfObjectsInner { objects: BTreeMap, + #[allow(dead_code)] object_streams: Vec>, } @@ -43,6 +50,7 @@ impl PdfHeader { } pdf_parse! { + #[pdf] #[derive(Clone, Debug)] pub struct PdfTrailerDictionary { #[pdf(name = "Size")] @@ -50,7 +58,7 @@ pdf_parse! { #[pdf(name = "Prev")] pub prev: Option, #[pdf(name = "Root")] - pub root: PdfDictionary, + pub root: PdfDocumentCatalog, #[pdf(name = "Encrypt")] pub encrypt: Option, #[pdf(name = "Info")] @@ -63,6 +71,7 @@ pdf_parse! { } pdf_parse! { + #[pdf(name)] #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] pub enum PdfXRefName { #[pdf(name = "XRef")] @@ -72,6 +81,7 @@ pdf_parse! { } pdf_parse! { + #[pdf] #[derive(Clone, Debug)] pub struct PdfXRefStreamDictionaryRest { #[pdf(name = "Type")] @@ -97,6 +107,17 @@ pub enum PdfTrailer { }, } +impl PdfTrailer { + pub fn trailer_dictionary(&self) -> &PdfTrailerDictionary { + match self { + PdfTrailer::Trailer { + trailer_dictionary, .. + } => trailer_dictionary, + PdfTrailer::Stream { xref_stream, .. 
} => &xref_stream.dictionary().rest.rest, + } + } +} + pub struct Pdf { pub header: PdfHeader, pub objects: Arc, @@ -192,12 +213,12 @@ struct PdfTokenizerPeek<'a> { #[derive(Clone)] struct PdfTokenizer<'a> { bytes: &'a [u8], - pos: usize, + pos: PdfInputPositionKnown, peek_cache: Option>, } impl<'a> PdfTokenizer<'a> { - fn new(bytes: &'a [u8], pos: usize) -> Self { + fn new(bytes: &'a [u8], pos: PdfInputPositionKnown) -> Self { Self { bytes, pos, @@ -205,14 +226,14 @@ impl<'a> PdfTokenizer<'a> { } } fn pos(&self) -> PdfInputPosition { - PdfInputPosition::new(self.pos) + PdfInputPosition::new(Some(self.pos)) } fn peek_byte(&mut self) -> Option { - self.bytes.get(self.pos).copied() + self.bytes.get(self.pos.pos).copied() } fn next_byte(&mut self) -> Option { - let b = self.bytes.get(self.pos)?; - self.pos += 1; + let b = self.bytes.get(self.pos.pos)?; + self.pos.pos += 1; self.peek_cache = None; Some(*b) } @@ -229,14 +250,16 @@ impl<'a> PdfTokenizer<'a> { let token = tokenizer.next()?; self.peek_cache = Some(PdfTokenizerPeek { token, - pos_after_token: tokenizer.pos, + pos_after_token: tokenizer.pos.pos, }); Some(token) } fn read_bytes(&mut self, len: usize) -> Option<&'a [u8]> { - let retval = self.bytes.get(self.pos..self.pos.saturating_add(len))?; + let retval = self + .bytes + .get(self.pos.pos..self.pos.pos.saturating_add(len))?; self.peek_cache = None; - self.pos += len; + self.pos.pos += len; Some(retval) } } @@ -250,11 +273,11 @@ impl<'a> Iterator for PdfTokenizer<'a> { pos_after_token, }) = self.peek_cache.take() { - self.pos = pos_after_token; + self.pos.pos = pos_after_token; return Some(token); } loop { - let start_pos = self.pos; + let start_pos = self.pos.pos; break match PdfCharCategory::new(self.next_byte()?) 
{ PdfCharCategory::Whitespace => continue, PdfCharCategory::LParen => Some(PdfToken::LParen), @@ -272,22 +295,22 @@ impl<'a> Iterator for PdfTokenizer<'a> { None | Some(b'\n') => break, Some(b'\r') => { if let Some(b'\n') = self.peek_byte() { - self.pos += 1; + self.pos.pos += 1; } break; } Some(_) => continue, } } - Some(PdfToken::Comment(&self.bytes[start_pos..self.pos])) + Some(PdfToken::Comment(&self.bytes[start_pos..self.pos.pos])) } PdfCharCategory::Regular => { while let Some(PdfCharCategory::Regular) = self.peek_byte().map(PdfCharCategory::new) { - self.pos += 1; + self.pos.pos += 1; } - Some(PdfToken::Regular(&self.bytes[start_pos..self.pos])) + Some(PdfToken::Regular(&self.bytes[start_pos..self.pos.pos])) } }; } @@ -647,7 +670,10 @@ impl<'a> PdfParser<'a> { } let Some(data) = self.tokenizer.read_bytes(len) else { return Err(PdfParseError::TruncatedFile { - pos: PdfInputPosition::new(self.tokenizer.bytes.len()), + pos: PdfInputPosition::new(Some(PdfInputPositionKnown { + pos: self.tokenizer.bytes.len(), + ..self.tokenizer.pos + })), }); }; let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data)); @@ -810,9 +836,22 @@ impl<'a> PdfParser<'a> { object_stream: &PdfStream, ) -> Result<(), PdfParseError> { let data = object_stream.decoded_data().as_ref()?; - self.with_tokenizer(PdfTokenizer::new(data, 0), |parser| { - parser.parse_object_stream_inner(object_stream) - }) + self.with_tokenizer( + PdfTokenizer::new( + data, + PdfInputPositionKnown { + pos: 0, + containing_streams_pos: Some( + object_stream + .get_pdf_input_position() + .get() + .expect("known to be set") + .pos, + ), + }, + ), + |parser| parser.parse_object_stream_inner(object_stream), + ) .map_err(|e| PdfParseError::ObjectStreamParseError { stream_pos: object_stream.get_pdf_input_position(), error: Arc::new(e), @@ -913,7 +952,13 @@ impl<'a> PdfParser<'a> { }); } let old_tokenizer = self.tokenizer.clone(); - self.tokenizer = PdfTokenizer::new(self.tokenizer.bytes, 
start_xref); + self.tokenizer = PdfTokenizer::new( + self.tokenizer.bytes, + PdfInputPositionKnown { + pos: start_xref, + containing_streams_pos: None, + }, + ); let id = self.parse_object_identifier(false); self.tokenizer = old_tokenizer; let Some(id) = id? else { @@ -950,7 +995,13 @@ impl Pdf { }), objects_map: BTreeMap::new(), unparsed_stream_dictionaries: vec![], - tokenizer: PdfTokenizer::new(bytes.as_ref(), 0), + tokenizer: PdfTokenizer::new( + bytes.as_ref(), + PdfInputPositionKnown { + pos: 0, + containing_streams_pos: None, + }, + ), } .parse_file() } @@ -971,6 +1022,7 @@ mod tests { #[test] fn test_deserialize_dict() -> Result<(), PdfParseError> { crate::pdf::parse::pdf_parse! { + #[pdf] #[derive(Debug)] #[allow(dead_code)] struct TestStruct { diff --git a/src/pdf/content_stream.rs b/src/pdf/content_stream.rs new file mode 100644 index 0000000..f58737e --- /dev/null +++ b/src/pdf/content_stream.rs @@ -0,0 +1,6 @@ +use crate::pdf::object::PdfStream; + +pub struct PdfContentStream { + stream: PdfStream, + // TODO +} diff --git a/src/pdf/document_structure.rs b/src/pdf/document_structure.rs new file mode 100644 index 0000000..265182c --- /dev/null +++ b/src/pdf/document_structure.rs @@ -0,0 +1,501 @@ +use core::fmt; +use std::{borrow::Cow, sync::Arc}; + +use crate::pdf::{ + font::PdfFont, + object::{ + IsPdfNull, MaybeArray, PdfDate, PdfDictionary, PdfInteger, PdfName, PdfObject, + PdfObjectDirect, PdfObjectIndirect, PdfRectangle, PdfStream, PdfString, + }, + parse::{PdfParse, PdfParseError}, + pdf_parse, +}; + +pdf_parse! { + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] + pub enum PdfDocumentCatalogType { + #[pdf(name = "Catalog")] + #[default] + Catalog, + } +} + +pdf_parse! 
{ + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfDocumentCatalog { + #[pdf(name = "Type")] + pub ty: PdfDocumentCatalogType, + #[pdf(name = "Version")] + pub version: Option, + #[pdf(name = "Extensions")] + pub extensions: Option, + #[pdf(name = "Pages")] + pub pages: PdfPageTree, + // TODO + #[pdf(flatten)] + pub rest: PdfDictionary, + } +} + +pdf_parse! { + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfResourcesDictionary { + #[pdf(name = "Font")] + pub fonts: PdfDictionary, + #[pdf(flatten)] + pub rest: PdfDictionary, + } +} + +#[derive(Clone)] +pub struct PdfPageTree { + page_tree: PdfPageTreeNode, + pages: Arc<[PdfPage]>, +} + +impl PdfPageTree { + fn parse_pages(node: &PdfPageTreeNode, pages: &mut Vec) -> Result<(), PdfParseError> { + for kid in node.kids.iter() { + match kid { + PdfPageTreeNodeOrLeaf::Node(node) => Self::parse_pages(node, pages)?, + PdfPageTreeNodeOrLeaf::Leaf(leaf) => { + pages.push(PdfPage::parse_after_propagating_inheritable_data( + leaf.clone(), + )?); + } + PdfPageTreeNodeOrLeaf::Other(v) => { + return Err(PdfParseError::InvalidType { + pos: v.pos(), + ty: "dictionary", + expected_ty: "PdfPageTreeNodeOrLeaf", + }); + } + } + } + Ok(()) + } + pub fn try_from_page_tree_root(mut page_tree: PdfPageTreeNode) -> Result { + page_tree.propagate_inheritable_data_to_leaves(); + let mut pages = Vec::new(); + Self::parse_pages(&page_tree, &mut pages)?; + Ok(Self { + page_tree, + pages: Arc::from(pages), + }) + } + pub fn page_tree(&self) -> &PdfPageTreeNode { + &self.page_tree + } + pub fn pages(&self) -> &Arc<[PdfPage]> { + &self.pages + } +} + +impl fmt::Debug for PdfPageTree { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PdfPageTree") + .field("pages", &self.pages) + .finish_non_exhaustive() + } +} + +impl IsPdfNull for PdfPageTree { + fn is_pdf_null(&self) -> bool { + self.page_tree.is_pdf_null() + } +} + +impl PdfParse for PdfPageTree { + fn type_name() -> Cow<'static, str> { + 
Cow::Borrowed("PdfPageTree") + } + fn parse(object: PdfObject) -> Result { + Self::try_from_page_tree_root(PdfParse::parse(object)?) + } +} + +pdf_parse! { + #[pdf] + #[derive(Clone, Debug, Default)] + pub struct PdfPageInheritableData { + #[pdf(name = "Resources")] + pub resources: Option, + #[pdf(name = "MediaBox")] + pub media_box: Option, + #[pdf(name = "CropBox")] + pub crop_box: Option, + #[pdf(name = "Rotate")] + pub rotate: Option, + #[pdf(flatten)] + pub rest: PdfDictionary, + } +} + +impl PdfPageInheritableData { + pub fn propagate_to(&self, target: &mut Self) { + let Self { + resources, + media_box, + crop_box, + rotate, + rest: _, + } = self; + fn propagate_to(this: &Option, target: &mut Option) { + if let (Some(this), target @ None) = (this, target) { + *target = Some(this.clone()); + } + } + propagate_to(resources, &mut target.resources); + propagate_to(media_box, &mut target.media_box); + propagate_to(crop_box, &mut target.crop_box); + propagate_to(rotate, &mut target.rotate); + } +} + +pdf_parse! { + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] + pub enum PdfPageTreeNodeType { + #[pdf(name = "Pages")] + #[default] + Pages, + } +} + +pdf_parse! { + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfPageTreeNode { + #[pdf(name = "Type")] + pub ty: PdfPageTreeNodeType, + #[pdf(name = "Parent")] + pub parent: Option, + #[pdf(name = "Kids")] + pub kids: Arc<[PdfPageTreeNodeOrLeaf]>, + #[pdf(name = "Count")] + pub count: usize, + // TODO + #[pdf(flatten)] + pub inheritable: PdfPageInheritableData, + } +} + +impl PdfPageTreeNode { + pub fn propagate_inheritable_data_to_leaves(&mut self) { + for kid in Arc::make_mut(&mut self.kids) { + if let Some(target) = kid.inheritable_data_mut() { + self.inheritable.propagate_to(target); + } + kid.propagate_inheritable_data_to_leaves(); + } + } +} + +pdf_parse! 
{ + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] + pub enum PdfPageType { + #[pdf(name = "Page")] + #[default] + Page, + } +} + +pdf_parse! { + #[pdf(name)] + #[derive(Clone, PartialEq, Eq, Hash, Debug)] + pub enum PdfPageAnnotationsTabOrder { + #[pdf(name = "R")] + RowOrder, + #[pdf(name = "C")] + ColumnOrder, + #[pdf(name = "S")] + StructureOrder, + #[pdf(other)] + Other(PdfName), + } +} + +pdf_parse! { + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfPageTreeLeaf { + #[pdf(name = "Type")] + pub ty: PdfPageType, + #[pdf(name = "Parent")] + pub parent: PdfObjectIndirect, + #[pdf(name = "LastModified")] + pub last_modified: Option, + #[pdf(name = "BleedBox")] + pub bleed_box: Option, + #[pdf(name = "TrimBox")] + pub trim_box: Option, + #[pdf(name = "ArtBox")] + pub art_box: Option, + #[pdf(name = "BoxColorInfo")] + pub box_color_info: Option, + #[pdf(name = "Contents")] + pub contents: MaybeArray, + #[pdf(name = "Group")] + pub group: Option, + #[pdf(name = "Thumb")] + pub thumbnail: Option, + #[pdf(name = "B")] + pub beads: Option>, + #[pdf(name = "Dur")] + pub duration: Option, + #[pdf(name = "Trans")] + pub transition: Option, + #[pdf(name = "Annots")] + pub annotations: Option>, + #[pdf(name = "AA")] + pub additional_actions: Option, + #[pdf(name = "Metadata")] + pub metadata: Option, + #[pdf(name = "PieceInfo")] + pub piece_info: Option, + #[pdf(name = "StructParents")] + pub structural_parents: Option, + #[pdf(name = "ID")] + pub parent_web_capture_content_set_id: Option, + #[pdf(name = "PZ")] + pub preferred_zoom_factor: Option, + #[pdf(name = "SeparationInfo")] + pub separation_info: Option, + #[pdf(name = "Tabs")] + pub annotations_tab_order: Option, + #[pdf(name = "TemplateInstantiated")] + pub template_instantiated: Option, + #[pdf(name = "PresSteps")] + pub pres_steps: Option, + #[pdf(name = "UserUnit")] + pub user_unit: Option, + #[pdf(name = "VP")] + pub viewports: Option>, + #[pdf(flatten)] + pub 
inheritable: PdfPageInheritableData, + } +} + +pdf_parse! { + #[pdf(tag = "Type")] + #[derive(Clone)] + pub enum PdfPageTreeNodeOrLeaf { + #[pdf(tag_value = "Pages")] + Node(PdfPageTreeNode), + #[pdf(tag_value = "Page")] + Leaf(PdfPageTreeLeaf), + #[pdf(other)] + Other(PdfDictionary), + } +} + +impl PdfPageTreeNodeOrLeaf { + pub fn propagate_inheritable_data_to_leaves(&mut self) { + match self { + PdfPageTreeNodeOrLeaf::Node(v) => v.propagate_inheritable_data_to_leaves(), + PdfPageTreeNodeOrLeaf::Leaf(_) | PdfPageTreeNodeOrLeaf::Other(_) => {} + } + } + pub fn inheritable_data_mut(&mut self) -> Option<&mut PdfPageInheritableData> { + match self { + PdfPageTreeNodeOrLeaf::Node(v) => Some(&mut v.inheritable), + PdfPageTreeNodeOrLeaf::Leaf(v) => Some(&mut v.inheritable), + PdfPageTreeNodeOrLeaf::Other(_) => None, + } + } + pub fn inheritable_data(&self) -> Option<&PdfPageInheritableData> { + match self { + PdfPageTreeNodeOrLeaf::Node(v) => Some(&v.inheritable), + PdfPageTreeNodeOrLeaf::Leaf(v) => Some(&v.inheritable), + PdfPageTreeNodeOrLeaf::Other(_) => None, + } + } +} + +impl fmt::Debug for PdfPageTreeNodeOrLeaf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Node(v) => v.fmt(f), + Self::Leaf(v) => v.fmt(f), + Self::Other(v) => v.fmt(f), + } + } +} + +/// The amount by which the page is rotated clockwise when displayed or printed; always a multiple of 90 degrees.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] +pub enum PdfPageRotation { + #[default] + NoRotation = 0, + ClockwiseBy90Degrees = 90, + By180Degrees = 180, + ClockwiseBy270Degrees = 270, +} + +impl PdfPageRotation { + pub fn from_clockwise_angle_in_degrees(angle: i32) -> Option { + match angle.rem_euclid(360) { + 0 => Some(Self::NoRotation), + 90 => Some(Self::ClockwiseBy90Degrees), + 180 => Some(Self::By180Degrees), + 270 => Some(Self::ClockwiseBy270Degrees), + _ => None, + } + } + pub fn from_clockwise_angle_in_degrees_i128(angle: i128) -> Option { + Self::from_clockwise_angle_in_degrees((angle % 360) as i32) + } +} + +impl From for i32 { + fn from(value: PdfPageRotation) -> Self { + value as i32 + } +} + +impl IsPdfNull for PdfPageRotation { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfPageRotation { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("page rotation") + } + fn parse(object: PdfObject) -> Result { + let object = PdfObjectDirect::from(object); + let pos = object.pos(); + let angle = PdfInteger::parse(object.into())?; + Self::from_clockwise_angle_in_degrees_i128(angle.value()) + .ok_or(PdfParseError::IntegerOutOfRange { pos }) + } +} + +#[derive(Clone, Debug)] +pub struct PdfPage { + pub ty: PdfPageType, + pub parent: PdfObjectIndirect, + pub last_modified: Option, + pub resources: PdfResourcesDictionary, + pub media_box: PdfRectangle, + pub crop_box: PdfRectangle, + pub bleed_box: PdfRectangle, + pub trim_box: PdfRectangle, + pub art_box: PdfRectangle, + pub box_color_info: Option, + pub contents: Arc<[PdfStream]>, + pub rotate: PdfPageRotation, + pub group: Option, + pub thumbnail: Option, + pub beads: Option>, + pub duration: Option, + pub transition: Option, + pub annotations: Option>, + pub additional_actions: Option, + pub metadata: Option, + pub piece_info: Option, + pub structural_parents: Option, + pub parent_web_capture_content_set_id: Option, + pub preferred_zoom_factor: 
Option, + pub separation_info: Option, + pub annotations_tab_order: Option, + pub template_instantiated: Option, + pub pres_steps: Option, + pub user_unit: f32, + pub viewports: Option>, + pub rest: PdfDictionary, +} + +impl PdfPage { + pub fn parse_after_propagating_inheritable_data( + leaf: PdfPageTreeLeaf, + ) -> Result { + let PdfPageTreeLeaf { + ty, + parent, + last_modified, + bleed_box, + trim_box, + art_box, + box_color_info, + contents, + group, + thumbnail, + beads, + duration, + transition, + annotations, + additional_actions, + metadata, + piece_info, + structural_parents, + parent_web_capture_content_set_id, + preferred_zoom_factor, + separation_info, + annotations_tab_order, + template_instantiated, + pres_steps, + user_unit, + viewports, + inheritable: + PdfPageInheritableData { + resources, + media_box, + crop_box, + rotate, + rest, + }, + } = leaf; + let pos = rest.pos(); + let resources = resources.ok_or(PdfParseError::InvalidType { + pos, + ty: "null", + expected_ty: "page resources dictionary", + })?; + let media_box = media_box.ok_or(PdfParseError::InvalidType { + pos, + ty: "null", + expected_ty: "page MediaBox rectangle", + })?; + let crop_box = crop_box.unwrap_or(media_box); + let rotate = rotate.unwrap_or(PdfPageRotation::NoRotation); + Ok(Self { + ty, + parent, + last_modified, + resources, + media_box, + crop_box, + bleed_box: bleed_box.unwrap_or(crop_box), + trim_box: trim_box.unwrap_or(crop_box), + art_box: art_box.unwrap_or(crop_box), + box_color_info, + contents: contents.0, + rotate, + group, + thumbnail, + beads, + duration, + transition, + annotations, + additional_actions, + metadata, + piece_info, + structural_parents, + parent_web_capture_content_set_id, + preferred_zoom_factor, + separation_info, + annotations_tab_order, + template_instantiated, + pres_steps, + user_unit: user_unit.unwrap_or(1.0), + viewports, + rest, + }) + } +} diff --git a/src/pdf/font.rs b/src/pdf/font.rs new file mode 100644 index 0000000..23d479e --- 
/dev/null +++ b/src/pdf/font.rs @@ -0,0 +1,236 @@ +use std::{borrow::Cow, sync::Arc}; + +use crate::pdf::{ + object::{IsPdfNull, PdfDictionary, PdfName, PdfObject, PdfObjectDirect, PdfStream}, + parse::{PdfParse, PdfParseError}, + pdf_parse, +}; + +pdf_parse! { + #[pdf(transparent)] + #[derive(Clone, Debug)] + // TODO: actually parse the stream + pub struct PdfFontToUnicode { + #[pdf] + stream: PdfStream, + } +} + +pdf_parse! { + #[pdf(transparent)] + #[derive(Clone, Debug)] + // TODO: actually parse the dictionary + pub struct PdfFontDescriptor { + #[pdf] + dictionary: PdfDictionary, + } +} + +pdf_parse! { + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] + pub enum PdfFontType { + #[pdf(name = "Font")] + #[default] + Font, + } +} + +#[derive(Clone, Debug)] +pub enum PdfTodo {} + +impl IsPdfNull for PdfTodo { + fn is_pdf_null(&self) -> bool { + match *self {} + } +} + +impl PdfParse for PdfTodo { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("PdfTodo") + } + #[track_caller] + fn parse(object: PdfObject) -> Result { + todo!("{object:?}") + } +} + +pdf_parse! { + #[pdf(tag = "Subtype")] + #[derive(Clone, Debug)] + pub enum PdfFont { + #[pdf(tag_value = "Type0")] + Type0(PdfFontType0), + #[pdf(tag_value = "Type1")] + Type1(PdfFontType1), + #[pdf(other)] + Other(PdfTodo), + } +} + +pdf_parse! { + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] + pub enum PdfFontType0Subtype { + #[pdf(name = "Type0")] + #[default] + Type0, + } +} + +pdf_parse! 
{ + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfFontType0 { + #[pdf(name = "Type")] + pub ty: PdfFontType, + #[pdf(name = "Subtype")] + pub subtype: PdfFontType0Subtype, + #[pdf(name = "BaseFont")] + pub base_font: PdfName, + #[pdf(name = "Encoding")] + // TODO + pub encoding: PdfObjectDirect, + #[pdf(name = "DescendantFonts")] + // TODO + pub descendent_fonts: [PdfDictionary; 1], + #[pdf(name = "ToUnicode")] + pub to_unicode: Option, + #[pdf(flatten)] + pub rest: PdfDictionary, + } +} + +pdf_parse! { + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] + pub enum PdfFontType1Subtype { + #[pdf(name = "Type1")] + #[default] + Type1, + } +} + +pdf_parse! { + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] + pub enum PdfStandardFontName { + #[pdf(name = "Times-Roman")] + TimesRoman, + #[pdf(name = "Helvetica")] + Helvetica, + #[pdf(name = "Courier")] + Courier, + #[pdf(name = "Symbol")] + Symbol, + #[pdf(name = "Times-Bold")] + TimesBold, + #[pdf(name = "Helvetica-Bold")] + HelveticaBold, + #[pdf(name = "Courier-Bold")] + CourierBold, + #[pdf(name = "ZapfDingbats")] + ZapfDingbats, + #[pdf(name = "Times-Italic")] + TimesItalic, + #[pdf(name = "Helvetica-Oblique")] + HelveticaOblique, + #[pdf(name = "Courier-Oblique")] + CourierOblique, + #[pdf(name = "Times-BoldItalic")] + TimesBoldItalic, + #[pdf(name = "Helvetica-BoldOblique")] + HelveticaBoldOblique, + #[pdf(name = "Courier-BoldOblique")] + CourierBoldOblique, + } +} + +#[derive(Clone, Debug)] +pub enum PdfFontType1 { + Standard(PdfFontType1Standard), + Other(PdfFontType1Other), +} + +impl IsPdfNull for PdfFontType1 { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfFontType1 { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("PdfFontType1") + } + fn parse(object: PdfObject) -> Result { + let object = object.into(); + let PdfObjectDirect::Dictionary(object) = object else { + return
PdfFontType1Other::parse(object.into()).map(Self::Other); + }; + if let Ok(_) = PdfStandardFontName::parse(object.get_or_null(b"BaseFont".as_slice())) { + PdfFontType1Standard::parse(object.into()).map(Self::Standard) + } else { + PdfFontType1Other::parse(object.into()).map(Self::Other) + } + } +} + +pdf_parse! { + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfFontType1Standard { + #[pdf(name = "Type")] + pub ty: PdfFontType, + #[pdf(name = "Subtype")] + pub subtype: PdfFontType1Subtype, + #[pdf(name = "Name")] + pub name: Option, + #[pdf(name = "BaseFont")] + pub base_font: PdfStandardFontName, + #[pdf(name = "FirstChar")] + pub first_char: Option, + #[pdf(name = "LastChar")] + pub last_char: Option, + #[pdf(name = "Widths")] + pub widths: Option>, + #[pdf(name = "FontDescriptor")] + pub font_descriptor: Option, + #[pdf(name = "Encoding")] + // TODO + pub encoding: PdfObjectDirect, + #[pdf(name = "ToUnicode")] + pub to_unicode: Option, + #[pdf(flatten)] + pub rest: PdfDictionary, + } +} + +pdf_parse! 
{ + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfFontType1Other { + #[pdf(name = "Type")] + pub ty: PdfFontType, + #[pdf(name = "Subtype")] + pub subtype: PdfFontType1Subtype, + #[pdf(name = "Name")] + pub name: Option, + #[pdf(name = "BaseFont")] + pub base_font: PdfName, + #[pdf(name = "FirstChar")] + pub first_char: u32, + #[pdf(name = "LastChar")] + pub last_char: u32, + #[pdf(name = "Widths")] + pub widths: Arc<[f32]>, + #[pdf(name = "FontDescriptor")] + pub font_descriptor: PdfFontDescriptor, + #[pdf(name = "Encoding")] + // TODO + pub encoding: PdfObjectDirect, + #[pdf(name = "ToUnicode")] + pub to_unicode: Option, + #[pdf(flatten)] + pub rest: PdfDictionary, + } +} diff --git a/src/pdf/object.rs b/src/pdf/object.rs index 2a17df5..dad6e49 100644 --- a/src/pdf/object.rs +++ b/src/pdf/object.rs @@ -28,11 +28,23 @@ pub struct PdfString { impl std::fmt::Debug for PdfString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let Self { pos, bytes } = self; - f.debug_struct("PdfString") - .field("pos", pos) - .field("bytes", &format_args!("b\"{}\"", bytes.escape_ascii())) - .finish() + let Self { pos, bytes: _ } = self; + write!(f, "PdfString(at {pos}, {})", self.bytes_debug()) + } +} + +#[derive(Clone, Copy)] +pub struct PdfStringBytesDebug<'a>(&'a [u8]); + +impl<'a> fmt::Display for PdfStringBytesDebug<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "b\"{}\"", self.0.escape_ascii()) + } +} + +impl<'a> fmt::Debug for PdfStringBytesDebug<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) } } @@ -49,6 +61,9 @@ impl PdfString { pub fn bytes(&self) -> &ArcOrRef<'static, [u8]> { &self.bytes } + pub fn bytes_debug(&self) -> PdfStringBytesDebug<'_> { + PdfStringBytesDebug(&self.bytes) + } } impl GetPdfInputPosition for PdfString { @@ -57,6 +72,50 @@ impl GetPdfInputPosition for PdfString { } } +#[derive(Clone, PartialEq, Eq)] +pub struct PdfDate { + text: PdfString, +} + 
+impl fmt::Debug for PdfDate { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { text } = self; + let pos = text.pos(); + write!(f, "PdfDate(at {pos}, {})", text.bytes_debug()) + } +} + +impl IsPdfNull for PdfDate { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfDate { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("date") + } + fn parse(object: PdfObject) -> Result { + Self::try_new(PdfString::parse(object)?) + } +} + +impl PdfDate { + pub fn try_new(text: PdfString) -> Result { + // TODO: check syntax + Ok(Self { text }) + } + pub fn text(&self) -> &PdfString { + &self.text + } +} + +impl GetPdfInputPosition for PdfDate { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.text.pos() + } +} + #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct PdfName { pos: PdfInputPositionNoCompare, @@ -138,12 +197,19 @@ impl fmt::Display for PdfName { } } -#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Default)] +#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord, Default)] pub struct PdfBoolean { pos: PdfInputPositionNoCompare, value: bool, } +impl fmt::Debug for PdfBoolean { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { pos, value } = *self; + write!(f, "PdfBoolean(at {pos}, {value})") + } +} + impl PdfBoolean { pub fn new(pos: impl Into, value: bool) -> Self { Self { @@ -165,12 +231,19 @@ impl GetPdfInputPosition for PdfBoolean { } } -#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Default)] +#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord, Default)] pub struct PdfInteger { pos: PdfInputPositionNoCompare, value: i128, } +impl fmt::Debug for PdfInteger { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { pos, value } = *self; + write!(f, "PdfInteger(at {pos}, {value})") + } +} + impl PdfInteger { pub fn new(pos: impl Into, value: i128) -> Self { Self { @@ -192,12 +265,19 @@ impl 
GetPdfInputPosition for PdfInteger { } } -#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Default)] +#[derive(Clone, Copy, PartialEq, PartialOrd, Default)] pub struct PdfReal { pos: PdfInputPositionNoCompare, value: f64, } +impl fmt::Debug for PdfReal { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { pos, value } = *self; + write!(f, "PdfReal(at {pos}, {value})") + } +} + impl PdfReal { pub fn new(pos: impl Into, value: f64) -> Self { Self { @@ -219,6 +299,114 @@ impl GetPdfInputPosition for PdfReal { } } +#[derive(Clone, Copy)] +pub enum PdfNumber { + Integer(PdfInteger), + Real(PdfReal), +} + +impl fmt::Debug for PdfNumber { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Integer(v) => v.fmt(f), + Self::Real(v) => v.fmt(f), + } + } +} + +impl PdfNumber { + pub fn pos(self) -> PdfInputPosition { + match self { + Self::Integer(v) => v.pos(), + Self::Real(v) => v.pos(), + } + } + pub fn as_f64(self) -> f64 { + match self { + Self::Integer(v) => v.value as f64, + Self::Real(v) => v.value, + } + } + pub fn as_f32(self) -> f32 { + match self { + Self::Integer(v) => v.value as f32, + Self::Real(v) => v.value as f32, + } + } +} + +impl PartialOrd for PdfNumber { + fn partial_cmp(&self, other: &Self) -> Option { + match (self, other) { + (Self::Integer(this), Self::Integer(other)) => Some(this.cmp(other)), + _ => self.as_f64().partial_cmp(&other.as_f64()), + } + } +} + +impl PartialEq for PdfNumber { + fn eq(&self, other: &Self) -> bool { + self.partial_cmp(other).is_some_and(|v| v.is_eq()) + } +} + +impl Default for PdfNumber { + fn default() -> Self { + PdfNumber::Integer(PdfInteger::default()) + } +} + +impl PdfObjectDirect { + pub fn number(&self) -> Option { + match *self { + PdfObjectDirect::Integer(v) => Some(PdfNumber::Integer(v)), + PdfObjectDirect::Real(v) => Some(PdfNumber::Real(v)), + PdfObjectDirect::Boolean(_) + | PdfObjectDirect::String(_) + | PdfObjectDirect::Name(_) + | 
PdfObjectDirect::Array(_) + | PdfObjectDirect::Dictionary(_) + | PdfObjectDirect::Stream(_) + | PdfObjectDirect::Null(_) => None, + } + } +} + +impl PdfObjectNonNull { + pub fn number(&self) -> Option { + match *self { + PdfObjectNonNull::Integer(v) => Some(PdfNumber::Integer(v)), + PdfObjectNonNull::Real(v) => Some(PdfNumber::Real(v)), + PdfObjectNonNull::Boolean(_) + | PdfObjectNonNull::String(_) + | PdfObjectNonNull::Name(_) + | PdfObjectNonNull::Array(_) + | PdfObjectNonNull::Dictionary(_) + | PdfObjectNonNull::Stream(_) => None, + } + } +} + +impl IsPdfNull for PdfNumber { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfNumber { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("number") + } + fn parse(object: PdfObject) -> Result { + let object = PdfObjectDirect::from(object); + object.number().ok_or(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "number", + }) + } +} + macro_rules! make_pdf_object { ( $( @@ -239,12 +427,24 @@ macro_rules! make_pdf_object { } } + impl IsPdfNull for PdfObjectNonNull { + fn is_pdf_null(&self) -> bool { + false + } + } + #[derive(Clone)] pub enum PdfObjectDirect { $($Variant($ty),)* Null(PdfNull), } + impl IsPdfNull for PdfObjectDirect { + fn is_pdf_null(&self) -> bool { + self.is_null() + } + } + impl fmt::Debug for PdfObjectDirect { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -261,6 +461,12 @@ macro_rules! make_pdf_object { Indirect(PdfObjectIndirect), } + impl IsPdfNull for PdfObject { + fn is_pdf_null(&self) -> bool { + self.is_null() + } + } + impl fmt::Debug for PdfObject { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -308,14 +514,20 @@ macro_rules! 
make_pdf_object { } } - $(impl crate::pdf::parse::PdfParse for $ty { + $(impl IsPdfNull for $ty { + fn is_pdf_null(&self) -> bool { + false + } + } + + impl PdfParse for $ty { fn type_name() -> Cow<'static, str> { Cow::Borrowed($type_name) } - fn $parse(object: PdfObject) -> Result { + fn $parse(object: PdfObject) -> Result { match PdfObjectDirect::from(object) { PdfObjectDirect::$Variant(v) => Ok(v), - object => Err(crate::pdf::parse::PdfParseError::InvalidType { + object => Err(PdfParseError::InvalidType { pos: object.get_pdf_input_position(), ty: object.type_name(), expected_ty: $type_name, @@ -445,7 +657,7 @@ macro_rules! make_pdf_object { } const _: () = { - fn _assert_parsable() {} + fn _assert_parsable() {} $(let _ = _assert_parsable::<$ty>;)* let _ = _assert_parsable::; @@ -470,15 +682,21 @@ make_pdf_object! { Name(PdfName), #[parse = parse, type_name = "array"] Array(PdfArray), - #[parse = parse, type_name = "dictionary"] + #[parse =, type_name = "dictionary"] Dictionary(PdfDictionary), #[parse =, type_name = "stream"] Stream(PdfStream), } -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct PdfNull(PdfInputPositionNoCompare); +impl fmt::Debug for PdfNull { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "PdfNull(at {})", self.0) + } +} + impl PdfNull { pub fn new(pos: impl Into) -> Self { Self(pos.into()) @@ -521,13 +739,27 @@ impl From for PdfObject { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct PdfObjectIdentifier { pub pos: PdfInputPositionNoCompare, pub object_number: NonZero, pub generation_number: u16, } +impl fmt::Debug for PdfObjectIdentifier { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + pos, + object_number, + generation_number, + } = *self; + write!( + f, + "PdfObjectIdentifier(at 
{pos}, {object_number}, {generation_number})" + ) + } +} + impl GetPdfInputPosition for PdfObjectIdentifier { fn get_pdf_input_position(&self) -> PdfInputPosition { self.pos.0 @@ -545,12 +777,18 @@ impl fmt::Debug for PdfObjectIndirect { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let Self { objects: _, - id, + id: + PdfObjectIdentifier { + pos, + object_number, + generation_number, + }, final_id: _, - } = self; - f.debug_struct("PdfObjectIndirect") - .field("id", id) - .finish_non_exhaustive() + } = *self; + write!( + f, + "PdfObjectIndirect(at {pos}, {object_number}, {generation_number})" + ) } } @@ -627,13 +865,31 @@ impl From for PdfObjectDirect { } } -#[derive(Clone)] -pub struct PdfDictionary { - pos: PdfInputPositionNoCompare, - fields: Arc>, +pub trait IsPdfNull { + fn is_pdf_null(&self) -> bool; } -impl PdfDictionary { +impl IsPdfNull for Option { + fn is_pdf_null(&self) -> bool { + self.as_ref().is_none_or(IsPdfNull::is_pdf_null) + } +} + +pub struct PdfDictionary { + pos: PdfInputPositionNoCompare, + fields: Arc>, +} + +impl Clone for PdfDictionary { + fn clone(&self) -> Self { + Self { + pos: self.pos, + fields: self.fields.clone(), + } + } +} + +impl PdfDictionary { pub fn new(pos: impl Into) -> Self { Self { pos: pos.into(), @@ -642,23 +898,26 @@ impl PdfDictionary { } pub fn from_fields( pos: impl Into, - mut fields: Arc>, - ) -> Self { - if fields.values().any(|v| matches!(v, PdfObject::Null(_))) { - Arc::make_mut(&mut fields).retain(|_k, v| !matches!(v, PdfObject::Null(_))); + mut fields: Arc>, + ) -> Self + where + T: IsPdfNull + Clone, + { + if fields.values().any(T::is_pdf_null) { + Arc::make_mut(&mut fields).retain(|_k, v| !v.is_pdf_null()); } Self { pos: pos.into(), fields, } } - pub fn fields(&self) -> &Arc> { + pub fn fields(&self) -> &Arc> { &self.fields } - pub fn into_fields(self) -> Arc> { + pub fn into_fields(self) -> Arc> { self.fields } - pub fn iter(&self) -> std::collections::btree_map::Iter<'_, PdfName, PdfObject> { 
+ pub fn iter(&self) -> std::collections::btree_map::Iter<'_, PdfName, T> { self.fields.iter() } pub fn contains_key(&self, key: &Q) -> bool @@ -668,75 +927,122 @@ impl PdfDictionary { { self.fields.contains_key(key) } - pub fn get(&self, key: &Q) -> Option<&PdfObject> + pub fn get(&self, key: &Q) -> Option<&T> where PdfName: std::borrow::Borrow, Q: Ord, { self.fields.get(key) } - pub fn get_or_null(&self, key: &Q) -> PdfObject + pub fn get_or_null(&self, key: &Q) -> T where PdfName: std::borrow::Borrow, Q: Ord, + T: Clone + From, { self.get(key) .cloned() - .unwrap_or(PdfObject::Null(PdfNull(self.pos))) + .unwrap_or_else(|| PdfNull(self.pos).into()) } pub fn pos(&self) -> PdfInputPosition { self.pos.0 } } -impl GetPdfInputPosition for PdfDictionary { +impl GetPdfInputPosition for PdfDictionary { fn get_pdf_input_position(&self) -> PdfInputPosition { self.pos.0 } } -impl Default for PdfDictionary { +impl Default for PdfDictionary { fn default() -> Self { Self::new(PdfInputPosition::empty()) } } -impl FromIterator<(PdfName, PdfObject)> for PdfDictionary { - fn from_iter>(iter: T) -> Self { +impl FromIterator<(PdfName, T)> for PdfDictionary { + fn from_iter>(iter: I) -> Self { Self { pos: PdfInputPositionNoCompare::empty(), fields: Arc::new(BTreeMap::from_iter( iter.into_iter() - .filter(|(_name, value)| !matches!(value, PdfObject::Null(_))), + .filter(|(_name, value)| !value.is_pdf_null()), )), } } } -impl IntoIterator for PdfDictionary { - type Item = (PdfName, PdfObject); - type IntoIter = std::collections::btree_map::IntoIter; +impl IntoIterator for PdfDictionary { + type Item = (PdfName, T); + type IntoIter = std::collections::btree_map::IntoIter; fn into_iter(self) -> Self::IntoIter { Arc::unwrap_or_clone(self.fields).into_iter() } } -impl<'a> IntoIterator for &'a PdfDictionary { - type Item = (&'a PdfName, &'a PdfObject); - type IntoIter = std::collections::btree_map::Iter<'a, PdfName, PdfObject>; +impl<'a, T> IntoIterator for &'a PdfDictionary { + type Item = 
(&'a PdfName, &'a T); + type IntoIter = std::collections::btree_map::Iter<'a, PdfName, T>; fn into_iter(self) -> Self::IntoIter { self.fields.iter() } } -impl fmt::Debug for PdfDictionary { +impl fmt::Debug for PdfDictionary { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_map().entries(self).finish() } } +impl IsPdfNull for PdfDictionary { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfDictionary { + fn type_name() -> Cow<'static, str> { + if TypeId::of::() == TypeId::of::() { + Cow::Borrowed("dictionary") + } else { + Cow::Owned(format!("PdfDictionary<{}>", T::type_name())) + } + } + fn parse(object: PdfObject) -> Result { + let object = PdfObjectDirect::from(object); + let PdfObjectDirect::Dictionary(object) = object else { + return Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "dictionary", + }); + }; + if let Some(retval) = ::downcast_ref::(&object) { + return Ok(retval.clone()); + } + let pos = object.pos; + let fields = Result::from_iter(object.fields.iter().filter_map(|(name, value)| { + match T::parse(value.clone()) { + Ok(value) => { + if value.is_pdf_null() { + None + } else { + Some(Ok((name.clone(), value))) + } + } + Err(e) => Some(Err(e)), + } + }))?; + Ok(Self { + pos, + fields: Arc::new(fields), + }) + } +} + #[derive(Clone, Default)] pub struct PdfArray { pos: PdfInputPositionNoCompare, @@ -869,9 +1175,15 @@ impl fmt::Debug for PdfArray { } } -#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct MaybeArray(pub Arc<[T]>); +impl fmt::Debug for MaybeArray { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + impl std::ops::Deref for MaybeArray { type Target = Arc<[T]>; @@ -901,12 +1213,138 @@ impl<'a, T> IntoIterator for &'a MaybeArray { } } -#[derive(Clone, Debug)] +#[derive(Copy, Clone, PartialEq)] +pub struct PdfPoint { + pub pos: 
PdfInputPositionNoCompare, + pub x: f32, + pub y: f32, +} + +impl fmt::Debug for PdfPoint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { pos, x, y } = *self; + write!(f, "PdfPoint(at {pos}, {x}, {y})") + } +} + +impl PdfPoint { + pub fn parse(x: PdfObject, y: PdfObject) -> Result { + Ok(Self { + pos: x.pos().into(), + x: PdfParse::parse(x)?, + y: PdfParse::parse(y)?, + }) + } +} + +impl GetPdfInputPosition for PdfPoint { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +#[derive(Copy, Clone, Debug)] +pub struct PdfRectangle { + /// the corner with the smaller x and y coordinates + smaller: PdfPoint, + /// the corner with the larger x and y coordinates + larger: PdfPoint, +} + +impl PdfRectangle { + pub fn new(mut smaller: PdfPoint, mut larger: PdfPoint) -> Self { + // `pos` follows the `x` coordinate + if smaller.x.is_nan() { + smaller.pos = larger.pos; + } else if larger.x.is_nan() { + larger.pos = smaller.pos; + } else if larger.x < smaller.x { + std::mem::swap(&mut smaller.pos, &mut larger.pos); + } + Self { + smaller: PdfPoint { + pos: smaller.pos, + x: smaller.x.min(larger.x), + y: smaller.y.min(larger.y), + }, + larger: PdfPoint { + pos: larger.pos, + x: smaller.x.max(larger.x), + y: smaller.y.max(larger.y), + }, + } + } + /// return the corner with the smaller x and y coordinates + pub fn smaller(&self) -> PdfPoint { + self.smaller + } + /// return the corner with the larger x and y coordinates + pub fn larger(&self) -> PdfPoint { + self.larger + } +} + +impl GetPdfInputPosition for PdfRectangle { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.smaller.get_pdf_input_position() + } +} + +impl IsPdfNull for PdfRectangle { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfRectangle { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("rectangle") + } + + fn parse(object: PdfObject) -> Result { + let object = object.into(); + let PdfObjectDirect::Array(array) = 
&object else { + return Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "rectangle", + }); + }; + let [lower_left_x, lower_left_y, upper_right_x, upper_right_y] = &**array.elements() else { + return Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "rectangle", + }); + }; + Ok(Self::new( + PdfPoint::parse(lower_left_x.clone(), lower_left_y.clone())?, + PdfPoint::parse(upper_right_x.clone(), upper_right_y.clone())?, + )) + } +} + +#[derive(Clone)] pub enum PdfFileSpecification { String(PdfString), Dictionary(PdfDictionary), } +impl fmt::Debug for PdfFileSpecification { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::String(v) => v.fmt(f), + Self::Dictionary(v) => v.fmt(f), + } + } +} + +impl IsPdfNull for PdfFileSpecification { + fn is_pdf_null(&self) -> bool { + false + } +} + impl PdfParse for PdfFileSpecification { fn type_name() -> Cow<'static, str> { Cow::Borrowed("file specification") @@ -925,6 +1363,7 @@ impl PdfParse for PdfFileSpecification { } pdf_parse! { + #[pdf] #[derive(Clone, Debug)] pub struct PdfStreamDictionary { #[pdf(name = PdfStreamDictionary::LENGTH_NAME)] @@ -1200,6 +1639,12 @@ impl GetPdfInputPosition for PdfStream { } } +impl IsPdfNull for PdfStream { + fn is_pdf_null(&self) -> bool { + false + } +} + impl PdfParse for PdfStream { fn type_name() -> Cow<'static, str> { if TypeId::of::() == TypeId::of::() { @@ -1252,6 +1697,7 @@ impl PdfParse for PdfStream { } pdf_parse! { + #[pdf(name)] #[derive(Clone, Copy, Debug, Hash, Default, PartialEq, Eq, PartialOrd, Ord)] pub enum PdfObjectStreamType { #[pdf(name = "ObjStm")] @@ -1261,6 +1707,7 @@ pdf_parse! { } pdf_parse! 
{ + #[pdf] #[derive(Clone, Debug)] pub struct PdfObjectStreamDictionary { #[pdf(name = Self::TYPE_NAME)] diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs index 2287dbf..8e5a7fc 100644 --- a/src/pdf/parse.rs +++ b/src/pdf/parse.rs @@ -1,24 +1,48 @@ use crate::pdf::object::{ - MaybeArray, PdfInteger, PdfName, PdfNull, PdfObject, PdfObjectDirect, PdfObjectIdentifier, - PdfObjectIndirect, PdfObjectNonNull, PdfReal, + IsPdfNull, MaybeArray, PdfInteger, PdfName, PdfNull, PdfNumber, PdfObject, PdfObjectDirect, + PdfObjectIdentifier, PdfObjectIndirect, PdfObjectNonNull, }; use std::{any::Any, borrow::Cow, fmt, mem, num::NonZero, sync::Arc}; +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct PdfInputPositionKnown { + pub pos: usize, + pub containing_streams_pos: Option, +} + +impl fmt::Debug for PdfInputPositionKnown { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +impl fmt::Display for PdfInputPositionKnown { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { + pos, + containing_streams_pos, + } = *self; + if let Some(containing_streams_pos) = containing_streams_pos { + write!(f, "{pos:#x} in stream at {containing_streams_pos:#x}") + } else { + write!(f, "{pos:#x}") + } + } +} + #[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] -pub struct PdfInputPosition(Option); +pub struct PdfInputPosition(Option); impl fmt::Debug for PdfInputPosition { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple("PdfInputPosition") - .field(&format_args!("{self}")) - .finish() + write!(f, "at {self}") } } impl fmt::Display for PdfInputPosition { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(pos) = self.0 { - write!(f, "{pos:#x}") + pos.fmt(f) } else { f.write_str("") } @@ -26,12 +50,15 @@ impl fmt::Display for PdfInputPosition { } impl PdfInputPosition { - pub const fn new(pos: usize) -> Self { - Self(Some(pos)) + pub const fn 
new(pos: Option) -> Self { + Self(pos) } pub const fn empty() -> PdfInputPosition { Self(None) } + pub const fn get(self) -> Option { + self.0 + } } pub trait GetPdfInputPosition { @@ -87,7 +114,7 @@ impl PdfInputPositionNoCompare { pub const fn empty() -> Self { Self(PdfInputPosition::empty()) } - pub const fn new(pos: usize) -> Self { + pub const fn new(pos: Option) -> Self { Self(PdfInputPosition::new(pos)) } } @@ -106,9 +133,7 @@ impl From for PdfInputPositionNoCompare { impl fmt::Debug for PdfInputPositionNoCompare { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple("PdfInputPositionNoCompare") - .field(&format_args!("{self}")) - .finish() + self.0.fmt(f) } } @@ -431,7 +456,7 @@ impl fmt::Display for PdfParseError { impl std::error::Error for PdfParseError {} -pub trait PdfParse: Sized + 'static { +pub trait PdfParse: Sized + 'static + IsPdfNull { fn type_name() -> Cow<'static, str>; fn parse(object: PdfObject) -> Result; fn parse_option(object: PdfObject) -> Result, PdfParseError> { @@ -469,6 +494,11 @@ impl PdfParse for Option { macro_rules! impl_pdf_parse_prim_int { ($ty:ident) => { + impl IsPdfNull for $ty { + fn is_pdf_null(&self) -> bool { + false + } + } impl PdfParse for $ty { fn type_name() -> Cow<'static, str> { Cow::Borrowed(stringify!($ty)) @@ -480,6 +510,11 @@ macro_rules! 
impl_pdf_parse_prim_int { .map_err(|_| PdfParseError::IntegerOutOfRange { pos: v.pos() }) } } + impl IsPdfNull for NonZero<$ty> { + fn is_pdf_null(&self) -> bool { + false + } + } impl PdfParse for NonZero<$ty> { fn type_name() -> Cow<'static, str> { Cow::Borrowed(concat!("NonZero<", stringify!($ty), ">")) @@ -508,6 +543,12 @@ impl_pdf_parse_prim_int!(u128); impl_pdf_parse_prim_int!(usize); impl_pdf_parse_prim_int!(isize); +impl IsPdfNull for i128 { + fn is_pdf_null(&self) -> bool { + false + } +} + impl PdfParse for i128 { fn type_name() -> Cow<'static, str> { Cow::Borrowed("i128") @@ -518,6 +559,12 @@ impl PdfParse for i128 { } } +impl IsPdfNull for NonZero { + fn is_pdf_null(&self) -> bool { + false + } +} + impl PdfParse for NonZero { fn type_name() -> Cow<'static, str> { Cow::Borrowed("NonZero") @@ -528,12 +575,24 @@ impl PdfParse for NonZero { } } +impl IsPdfNull for f64 { + fn is_pdf_null(&self) -> bool { + false + } +} + impl PdfParse for f64 { fn type_name() -> Cow<'static, str> { Cow::Borrowed("f64") } fn parse(object: PdfObject) -> Result { - Ok(::parse(object)?.value()) + Ok(::parse(object)?.as_f64()) + } +} + +impl IsPdfNull for f32 { + fn is_pdf_null(&self) -> bool { + false } } @@ -542,7 +601,13 @@ impl PdfParse for f32 { Cow::Borrowed("f32") } fn parse(object: PdfObject) -> Result { - Ok(::parse(object)? 
as f32) + Ok(::parse(object)?.as_f32()) + } +} + +impl IsPdfNull for PdfNull { + fn is_pdf_null(&self) -> bool { + true } } @@ -599,6 +664,12 @@ impl PdfParse for PdfObject { } } +impl IsPdfNull for PdfObjectIndirect { + fn is_pdf_null(&self) -> bool { + self.get().is_pdf_null() + } +} + impl PdfParse for PdfObjectIndirect { fn type_name() -> Cow<'static, str> { Cow::Borrowed("indirect object") @@ -626,6 +697,12 @@ impl PdfParse for PdfObjectIndirect { } } +impl IsPdfNull for [T; N] { + fn is_pdf_null(&self) -> bool { + false + } +} + impl PdfParse for [T; N] { fn type_name() -> Cow<'static, str> { Cow::Owned(format!("[{}; {N}]", T::type_name())) @@ -661,6 +738,12 @@ impl PdfParse for [T; N] { } } +impl IsPdfNull for Arc<[T]> { + fn is_pdf_null(&self) -> bool { + false + } +} + impl PdfParse for Arc<[T]> { fn type_name() -> Cow<'static, str> { Cow::Owned(format!("Arc<[{}]>", T::type_name())) @@ -688,6 +771,12 @@ impl PdfParse for Arc<[T]> { } } +impl IsPdfNull for MaybeArray { + fn is_pdf_null(&self) -> bool { + false + } +} + impl PdfParse for MaybeArray { fn type_name() -> Cow<'static, str> { Cow::Owned(format!("MaybeArray<{}>", T::type_name())) @@ -704,6 +793,7 @@ impl PdfParse for MaybeArray { #[macro_export] macro_rules! pdf_parse { ( + #[pdf $($struct_pdf_meta:tt)*] $(#[$($struct_meta:tt)*])* $struct_vis:vis struct $Struct:ident$(<$($StructParam:ident $(: $StructBound:tt)? $(= $StructParamDefault:ty)?),* $(,)?>)? { $(#[pdf $($pdf_meta:tt)*] @@ -719,6 +809,7 @@ macro_rules! pdf_parse { $crate::pdf::parse::pdf_parse! { @impl + #[pdf $($struct_pdf_meta)*] struct $Struct$(<$($StructParam $(: $StructBound)?),*>)? { $(#[pdf $($pdf_meta)*] $(#[$($field_meta)*])* @@ -728,11 +819,66 @@ macro_rules! pdf_parse { }; ( @impl + #[pdf(transparent)] + struct $Struct:ident$(<$($StructParam:ident $(: $StructBound:tt)?),* $(,)?>)? 
{ + #[pdf] + $(#[$($field_meta:tt)*])* + $field_name:ident: $field_ty:ty, + $(#[pdf] + $(#[$($phantom_meta:tt)*])* + $phantom_name:ident: PhantomData<$phantom_ty:ty>,)? + } + ) => { + impl$(<$($StructParam: $crate::pdf::parse::IsPdfNull $(+ $StructBound)?),*>)? $crate::pdf::object::IsPdfNull for $Struct$(<$($StructParam),*>)? { + fn is_pdf_null(&self) -> $crate::__std::primitive::bool { + <$field_ty as $crate::pdf::object::IsPdfNull>::is_pdf_null(&self.$field_name) + } + } + impl$(<$($StructParam: $crate::pdf::parse::PdfParse $(+ $StructBound)?),*>)? $crate::pdf::parse::PdfParse for $Struct$(<$($StructParam),*>)? { + fn type_name() -> $crate::__std::borrow::Cow<'static, $crate::__std::primitive::str> { + let args: &[$crate::__std::borrow::Cow<'static, $crate::__std::primitive::str>] = &[ + $($(<$StructParam as $crate::pdf::parse::PdfParse>::type_name()),*)? + ]; + if args.is_empty() { + $crate::__std::borrow::Cow::Borrowed($crate::__std::stringify!($Struct)) + } else { + let mut retval = $crate::__std::string::String::new(); + retval.push_str($crate::__std::stringify!($Struct)); + retval.push_str("<"); + let mut first = true; + for arg in args { + if first { + first = false; + } else { + retval.push_str(", "); + } + retval.push_str(arg); + } + retval.push_str(">"); + $crate::__std::borrow::Cow::Owned(retval) + } + } + fn parse(object: $crate::pdf::object::PdfObject) -> $crate::__std::result::Result { + $crate::__std::result::Result::Ok(Self { + $field_name: <$field_ty as $crate::pdf::parse::PdfParse>::parse(object)?, + $($phantom_name: $crate::__std::marker::PhantomData,)? + }) + } + } + }; + ( + @impl + #[pdf] struct $Struct:ident$(<$($StructParam:ident $(: $StructBound:tt)?),* $(,)?>)? { $($(#[$($field_meta:tt)*])* $field_name:ident: $field_ty:ty,)* } ) => { + impl$(<$($StructParam $(: $StructBound)?),*>)? $crate::pdf::object::IsPdfNull for $Struct$(<$($StructParam),*>)? 
{ + fn is_pdf_null(&self) -> $crate::__std::primitive::bool { + false + } + } impl$(<$($StructParam: $crate::pdf::parse::PdfParse $(+ $StructBound)?),*>)? $crate::pdf::parse::PdfParse for $Struct$(<$($StructParam),*>)? { fn type_name() -> $crate::__std::borrow::Cow<'static, $crate::__std::primitive::str> { let args: &[$crate::__std::borrow::Cow<'static, $crate::__std::primitive::str>] = &[ @@ -826,7 +972,7 @@ macro_rules! pdf_parse { [$(#[$($field_meta:tt)*])*] $field_name:ident: $field_ty:ty ) => { - let $field_name = $crate::__std::convert::AsRef::<[u8]>::as_ref($name); + let $field_name = $crate::__std::convert::AsRef::<[$crate::__std::primitive::u8]>::as_ref($name); let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( $object_mut .remove($field_name) @@ -834,6 +980,7 @@ macro_rules! pdf_parse { )?; }; ( + #[pdf $($enum_pdf_meta:tt)*] $(#[$($enum_meta:tt)*])* $enum_vis:vis enum $Enum:ident { $(#[pdf $($pdf_meta:tt)*] @@ -849,6 +996,7 @@ macro_rules! pdf_parse { $crate::pdf::parse::pdf_parse! { @impl + #[pdf $($enum_pdf_meta)*] $(#[$($enum_meta)*])* enum $Enum { $(#[pdf $($pdf_meta)*] @@ -859,6 +1007,64 @@ macro_rules! 
pdf_parse { }; ( @impl + #[pdf(tag = $tag_name:expr)] + $(#[$($enum_meta:tt)*])* + enum $Enum:ident { + $(#[pdf(tag_value = $tag_value:expr)] + $(#[$($variant_meta:tt)*])* + $VariantName:ident($Body:ty),)* + #[pdf(other)] + $(#[$($variant_meta_other:tt)*])* + $VariantNameOther:ident($Other:ty), + } + ) => { + impl $crate::pdf::object::IsPdfNull for $Enum { + fn is_pdf_null(&self) -> $crate::__std::primitive::bool { + if let Self::$VariantNameOther(other) = self { + $crate::pdf::object::IsPdfNull::is_pdf_null(other) + } else { + false + } + } + } + impl $crate::pdf::parse::PdfParse for $Enum { + fn type_name() -> $crate::__std::borrow::Cow<'static, $crate::__std::primitive::str> { + $crate::__std::borrow::Cow::Borrowed($crate::__std::stringify!($Enum)) + } + fn parse(object: $crate::pdf::object::PdfObject) -> $crate::__std::result::Result { + let object = $crate::__std::convert::From::from(object); + let $crate::pdf::object::PdfObjectDirect::Dictionary(object) = object else { + return <$Other as $crate::pdf::parse::PdfParse>::parse( + $crate::__std::convert::From::from(object), + ).map($Enum::$VariantNameOther); + }; + 'a: { + let tag_name = $crate::__std::convert::AsRef::<[$crate::__std::primitive::u8]>::as_ref($tag_name); + let $crate::__std::option::Option::Some(tag_value) = object.get(tag_name).cloned() else { + break 'a; + }; + let tag_value = $crate::__std::convert::From::from(tag_value); + let $crate::pdf::object::PdfObjectDirect::Name(tag_value) = tag_value else { + break 'a; + }; + let _ = tag_value; + $(if tag_value == $crate::pdf::object::PdfName::new_static( + $crate::__std::convert::AsRef::<[u8]>::as_ref($tag_value), + ) { + return <$Body as $crate::pdf::parse::PdfParse>::parse( + $crate::pdf::object::PdfObject::Dictionary(object), + ).map($Enum::$VariantName); + })* + } + <$Other as $crate::pdf::parse::PdfParse>::parse( + $crate::pdf::object::PdfObject::Dictionary(object), + ).map($Enum::$VariantNameOther) + } + } + }; + ( + @impl + #[pdf(name)] 
$(#[$($enum_meta:tt)*])* enum $Enum:ident { $(#[pdf(name = $name:expr)] @@ -893,9 +1099,15 @@ macro_rules! pdf_parse { } } + impl $crate::pdf::object::IsPdfNull for $Enum { + fn is_pdf_null(&self) -> $crate::__std::primitive::bool { + false + } + } + impl $crate::pdf::parse::PdfParse for $Enum { fn type_name() -> $crate::__std::borrow::Cow<'static, $crate::__std::primitive::str> { - $crate::__std::borrow::Cow::Borrowed($crate::__std::stringify!($Struct)) + $crate::__std::borrow::Cow::Borrowed($crate::__std::stringify!($Enum)) } fn parse(object: $crate::pdf::object::PdfObject) -> $crate::__std::result::Result { let object = $crate::__std::convert::From::from(object); @@ -903,7 +1115,7 @@ macro_rules! pdf_parse { return $crate::__std::result::Result::Err($crate::pdf::parse::PdfParseError::InvalidType { pos: object.pos(), ty: object.type_name(), - expected_ty: $crate::__std::stringify!($Struct), + expected_ty: $crate::__std::stringify!($Enum), }); }; $crate::__std::result::Result::Ok($crate::__std::convert::TryInto::<$Enum>::try_into(name)?) diff --git a/src/pdf/stream_filters.rs b/src/pdf/stream_filters.rs index 51a3884..baf6670 100644 --- a/src/pdf/stream_filters.rs +++ b/src/pdf/stream_filters.rs @@ -7,6 +7,7 @@ use crate::pdf::{ pub mod flate; pdf_parse! { + #[pdf(name)] #[derive(Clone, Debug, PartialEq, Eq)] #[non_exhaustive] pub enum PdfStreamFilter { diff --git a/src/pdf/stream_filters/flate.rs b/src/pdf/stream_filters/flate.rs index 46d01a8..b5d49e8 100644 --- a/src/pdf/stream_filters/flate.rs +++ b/src/pdf/stream_filters/flate.rs @@ -7,6 +7,7 @@ use crate::pdf::{ use std::{io::Read, num::NonZero}; pdf_parse! 
{ + #[pdf] #[derive(Clone, Debug, Default)] pub struct PdfFilterParmsFlateDecode { #[pdf(name = "Predictor")] From 13dcea1dabc93c0f46d83c279eb2140810bcaaf4 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Fri, 26 Dec 2025 01:42:18 -0800 Subject: [PATCH 05/42] add more font structures --- src/pdf/font.rs | 231 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 226 insertions(+), 5 deletions(-) diff --git a/src/pdf/font.rs b/src/pdf/font.rs index 23d479e..bfc52b7 100644 --- a/src/pdf/font.rs +++ b/src/pdf/font.rs @@ -1,7 +1,10 @@ use std::{borrow::Cow, sync::Arc}; use crate::pdf::{ - object::{IsPdfNull, PdfDictionary, PdfName, PdfObject, PdfObjectDirect, PdfStream}, + object::{ + IsPdfNull, PdfDictionary, PdfName, PdfObject, PdfObjectDirect, PdfRectangle, PdfStream, + PdfString, + }, parse::{PdfParse, PdfParseError}, pdf_parse, }; @@ -17,12 +20,90 @@ pdf_parse! { } pdf_parse! { - #[pdf(transparent)] + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] + pub enum PdfFontDescriptorType { + #[pdf(name = "FontDescriptor")] + #[default] + FontDescriptor, + } +} + +pdf_parse! { + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] + pub enum PdfFontStretch { + #[pdf(name = "UltraCondensed")] + UltraCondensed, + #[pdf(name = "ExtraCondensed")] + ExtraCondensed, + #[pdf(name = "Condensed")] + Condensed, + #[pdf(name = "SemiCondensed")] + SemiCondensed, + #[pdf(name = "Normal")] + Normal, + #[pdf(name = "SemiExpanded")] + SemiExpanded, + #[pdf(name = "Expanded")] + Expanded, + #[pdf(name = "ExtraExpanded")] + ExtraExpanded, + #[pdf(name = "UltraExpanded")] + UltraExpanded, + } +} + +pdf_parse! 
{ + #[pdf] #[derive(Clone, Debug)] - // TODO: actually parse the dictionary pub struct PdfFontDescriptor { - #[pdf] - dictionary: PdfDictionary, + #[pdf(name = "Type")] + pub ty: PdfFontDescriptorType, + #[pdf(name = "FontName")] + pub font_name: PdfName, + #[pdf(name = "FontFamily")] + pub font_family: Option, + #[pdf(name = "FontStretch")] + pub font_stretch: Option, + #[pdf(name = "FontWeight")] + pub font_weight: Option, + #[pdf(name = "Flags")] + pub flags: u32, + #[pdf(name = "FontBBox")] + pub font_bounding_box: Option, + #[pdf(name = "ItalicAngle")] + pub italic_angle: f32, + #[pdf(name = "Ascent")] + pub ascent: Option, + #[pdf(name = "Descent")] + pub descent: Option, + #[pdf(name = "Leading")] + pub leading: Option, + #[pdf(name = "CapHeight")] + pub cap_height: Option, + #[pdf(name = "XHeight")] + pub x_height: Option, + #[pdf(name = "StemV")] + pub stem_v: Option, + #[pdf(name = "StemH")] + pub stem_h: Option, + #[pdf(name = "AvgWidth")] + pub avg_width: Option, + #[pdf(name = "MaxWidth")] + pub max_width: Option, + #[pdf(name = "MissingWidth")] + pub missing_width: Option, + #[pdf(name = "FontFile")] + pub font_file: Option, + #[pdf(name = "FontFile2")] + pub font_file2: Option, + #[pdf(name = "FontFile3")] + pub font_file3: Option, + #[pdf(name = "CharSet")] + pub char_set: Option, + #[pdf(flatten)] + pub rest: PdfDictionary, } } @@ -152,6 +233,69 @@ pub enum PdfFontType1 { Other(PdfFontType1Other), } +impl PdfFontType1 { + pub fn common(&self) -> PdfFontType1Common { + match self { + PdfFontType1::Standard(v) => v.common(), + PdfFontType1::Other(v) => v.common(), + } + } + pub fn name(&self) -> &Option { + match self { + Self::Standard(v) => &v.name, + Self::Other(v) => &v.name, + } + } + pub fn base_font(&self) -> PdfName { + match self { + Self::Standard(v) => v.base_font.into(), + Self::Other(v) => v.base_font.clone(), + } + } + pub fn first_char(&self) -> Option { + match self { + Self::Standard(v) => v.first_char, + Self::Other(v) => 
Some(v.first_char), + } + } + pub fn last_char(&self) -> Option { + match self { + Self::Standard(v) => v.last_char, + Self::Other(v) => Some(v.last_char), + } + } + pub fn widths(&self) -> Option<&Arc<[f32]>> { + match self { + Self::Standard(v) => v.widths.as_ref(), + Self::Other(v) => Some(&v.widths), + } + } + pub fn font_descriptor(&self) -> Option<&PdfFontDescriptor> { + match self { + Self::Standard(v) => v.font_descriptor.as_ref(), + Self::Other(v) => Some(&v.font_descriptor), + } + } + pub fn encoding(&self) -> &PdfObjectDirect { + match self { + Self::Standard(v) => &v.encoding, + Self::Other(v) => &v.encoding, + } + } + pub fn to_unicode(&self) -> &Option { + match self { + Self::Standard(v) => &v.to_unicode, + Self::Other(v) => &v.to_unicode, + } + } + pub fn rest(&self) -> &PdfDictionary { + match self { + Self::Standard(v) => &v.rest, + Self::Other(v) => &v.rest, + } + } +} + impl IsPdfNull for PdfFontType1 { fn is_pdf_null(&self) -> bool { false @@ -175,6 +319,21 @@ impl PdfParse for PdfFontType1 { } } +#[derive(Clone, Debug)] +pub struct PdfFontType1Common { + pub ty: PdfFontType, + pub subtype: PdfFontType1Subtype, + pub name: Option, + pub base_font: PdfName, + pub first_char: Option, + pub last_char: Option, + pub widths: Option>, + pub font_descriptor: Option, + pub encoding: PdfObjectDirect, + pub to_unicode: Option, + pub rest: PdfDictionary, +} + pdf_parse! { #[pdf] #[derive(Clone, Debug)] @@ -205,6 +364,37 @@ pdf_parse! 
{ } } +impl PdfFontType1Standard { + pub fn common(&self) -> PdfFontType1Common { + let Self { + ty, + subtype, + ref name, + base_font, + first_char, + last_char, + ref widths, + ref font_descriptor, + ref encoding, + ref to_unicode, + ref rest, + } = *self; + PdfFontType1Common { + ty, + subtype, + name: name.clone(), + base_font: base_font.into(), + first_char, + last_char, + widths: widths.clone(), + font_descriptor: font_descriptor.clone(), + encoding: encoding.clone(), + to_unicode: to_unicode.clone(), + rest: rest.clone(), + } + } +} + pdf_parse! { #[pdf] #[derive(Clone, Debug)] @@ -234,3 +424,34 @@ pdf_parse! { pub rest: PdfDictionary, } } + +impl PdfFontType1Other { + pub fn common(&self) -> PdfFontType1Common { + let Self { + ty, + subtype, + ref name, + ref base_font, + first_char, + last_char, + ref widths, + ref font_descriptor, + ref encoding, + ref to_unicode, + ref rest, + } = *self; + PdfFontType1Common { + ty, + subtype, + name: name.clone(), + base_font: base_font.clone(), + first_char: Some(first_char), + last_char: Some(last_char), + widths: Some(widths.clone()), + font_descriptor: Some(font_descriptor.clone()), + encoding: encoding.clone(), + to_unicode: to_unicode.clone(), + rest: rest.clone(), + } + } +} From aba63689484c588f559f60bc671e011e3b96dcb5 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Sun, 28 Dec 2025 00:08:39 -0800 Subject: [PATCH 06/42] parse content streams into a list of operators --- src/pdf.rs | 557 ++++++++++++----------- src/pdf/content_stream.rs | 815 +++++++++++++++++++++++++++++++++- src/pdf/document_structure.rs | 5 +- src/pdf/object.rs | 505 +++++++++++++++------ src/pdf/parse.rs | 67 ++- 5 files changed, 1541 insertions(+), 408 deletions(-) diff --git a/src/pdf.rs b/src/pdf.rs index 0ccd293..1933489 100644 --- a/src/pdf.rs +++ b/src/pdf.rs @@ -1,5 +1,6 @@ use crate::{ pdf::{ + content_stream::PdfOperatorUnparsed, document_structure::PdfDocumentCatalog, object::{ PdfArray, PdfBoolean, PdfDictionary, PdfInteger, 
PdfName, PdfNull, PdfObject, @@ -318,67 +319,35 @@ impl<'a> Iterator for PdfTokenizer<'a> { } struct PdfParser<'a> { - objects_arc: Arc, - objects_map: BTreeMap, - unparsed_stream_dictionaries: Vec>, + objects: Arc, tokenizer: PdfTokenizer<'a>, } +enum PdfObjectOrStreamDictionaryOrOperator { + StreamDictionary { + dictionary: PdfDictionary, + stream_kw_pos: PdfInputPosition, + }, + Object(PdfObject), + Operator(PdfOperatorUnparsed), +} + +impl PdfObjectOrStreamDictionaryOrOperator { + fn error_on_stream_or_operator(self) -> Result { + match self { + PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { + dictionary: _, + stream_kw_pos, + } => Err(PdfParseError::StreamNotAllowedHere { pos: stream_kw_pos }), + PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object), + PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => { + Err(PdfParseError::OperatorNotAllowedHere { operator }) + } + } + } +} + impl<'a> PdfParser<'a> { - fn with_tokenizer<'b, R>( - &mut self, - tokenizer: PdfTokenizer<'b>, - f: impl FnOnce(&mut PdfParser<'b>) -> R, - ) -> R { - let PdfParser { - objects_arc, - objects_map, - unparsed_stream_dictionaries, - tokenizer: _, - } = self; - let objects_arc = objects_arc.clone(); - let objects_map = std::mem::take(objects_map); - let unparsed_stream_dictionaries = std::mem::take(unparsed_stream_dictionaries); - let mut new_parser = PdfParser { - objects_arc, - objects_map, - unparsed_stream_dictionaries, - tokenizer, - }; - let retval = f(&mut new_parser); - let PdfParser { - objects_arc, - objects_map, - unparsed_stream_dictionaries, - tokenizer: _, - } = new_parser; - self.objects_arc = objects_arc; - self.objects_map = objects_map; - self.unparsed_stream_dictionaries = unparsed_stream_dictionaries; - retval - } - fn parse_header(&mut self) -> Result { - let Some(b'%') = self.tokenizer.bytes.first() else { - return Err(PdfParseError::NotAPdfFile); - }; - let Some(PdfToken::Comment(header)) = self.tokenizer.next() else { - 
unreachable!() - }; - let Ok(header) = str::from_utf8(header) else { - return Err(PdfParseError::NotAPdfFile); - }; - let header = header.trim_end_matches(['\n', '\r']); - let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else { - return Err(PdfParseError::NotAPdfFile); - }; - let Some((major_str, minor_str)) = version.split_once('.') else { - return Err(PdfParseError::NotAPdfFile); - }; - let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else { - return Err(PdfParseError::NotAPdfFile); - }; - Ok(PdfHeader { major, minor }) - } fn skip_comments_and_whitespace(&mut self) { self.tokenizer.skip_whitespace(); while let Some(PdfToken::Comment(_)) = self.tokenizer.peek() { @@ -449,7 +418,7 @@ impl<'a> PdfParser<'a> { return Ok(None); }; if let Some(PdfToken::Regular(b"R")) = self.tokenizer.next() { - Ok(Some(PdfObjectIndirect::new(&self.objects_arc, id))) + Ok(Some(PdfObjectIndirect::new(&self.objects, id))) } else { self.tokenizer = old_tokenizer; Ok(None) @@ -604,7 +573,10 @@ impl<'a> PdfParser<'a> { self.tokenizer.next(); return Ok(PdfArray::from_elements(array_pos, Arc::from(contents))); } - contents.push(self.parse_object()?); + contents.push( + self.parse_object_or_operator()? + .error_on_stream_or_operator()?, + ); } } /// assumes `self.tokenizer.peek_byte() == Some(b'<')` @@ -630,11 +602,17 @@ impl<'a> PdfParser<'a> { Arc::new(contents), )); } - let name = PdfName::parse(self.parse_object()?.into())?; + let name = PdfName::parse( + self.parse_object_or_operator()? + .error_on_stream_or_operator()?, + )?; let name_pos = name.pos(); match contents.entry(name) { std::collections::btree_map::Entry::Vacant(entry) => { - entry.insert(self.parse_object()?.into()); + entry.insert( + self.parse_object_or_operator()? 
+ .error_on_stream_or_operator()?, + ); } std::collections::btree_map::Entry::Occupied(entry) => { return Err(PdfParseError::DuplicateDictionaryKey { @@ -645,21 +623,146 @@ impl<'a> PdfParser<'a> { } } } + fn parse_object_or_operator( + &mut self, + ) -> Result { + self.skip_comments_and_whitespace(); + if let Some(indirect) = self.parse_indirect_object()? { + return Ok(PdfObjectOrStreamDictionaryOrOperator::Object( + indirect.into(), + )); + } + let pos = self.tokenizer.pos(); + Ok(PdfObjectOrStreamDictionaryOrOperator::Object( + match self + .tokenizer + .next() + .ok_or(PdfParseError::TruncatedFile { pos })? + { + PdfToken::Regular(b"true") => PdfObject::Boolean(PdfBoolean::new(pos, true)), + PdfToken::Regular(b"false") => PdfObject::Boolean(PdfBoolean::new(pos, false)), + PdfToken::Regular(b"null") => PdfObject::Null(PdfNull::new(pos)), + PdfToken::Regular( + number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] | [b'0'..=b'9' | b'.', ..]), + ) => { + // parse number + let Ok(number) = str::from_utf8(number) else { + return Err(PdfParseError::InvalidNumber { pos }); + }; + let mut parts = number + .strip_prefix(&['+', '-']) + .unwrap_or(number) + .split('.'); + let integer_part = parts + .next() + .expect("split always returns at least one part"); + let fraction_part = parts.next(); + if parts.next().is_some() { + return Err(PdfParseError::InvalidNumber { pos }); + } + if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + if !integer_part.bytes().all(|v| v.is_ascii_digit()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + if let Some(fraction_part) = fraction_part { + if !fraction_part.bytes().all(|v| v.is_ascii_digit()) { + return Err(PdfParseError::InvalidNumber { pos }); + } + PdfObject::Real(PdfReal::new( + pos, + number + .parse() + .map_err(|_| PdfParseError::InvalidNumber { pos })?, + )) + } else { + PdfObject::Integer(PdfInteger::new( + pos, + number + .parse() + 
.map_err(|_| PdfParseError::InvalidNumber { pos })?, + )) + } + } + PdfToken::Regular(name) => { + return Ok(PdfObjectOrStreamDictionaryOrOperator::Operator( + PdfOperatorUnparsed::new(pos, ArcOrRef::Arc(name.into())), + )); + } + PdfToken::LParen => PdfObject::String(self.parse_string_after_l_paren()?), + PdfToken::RParen => todo!(), + PdfToken::LAngle => { + if self.tokenizer.peek_byte() == Some(b'<') { + let dictionary = self.parse_dictionary_after_one_l_angle()?; + self.skip_comments_and_whitespace(); + if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() { + return Ok(PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { + dictionary, + stream_kw_pos: self.tokenizer.pos(), + }); + } else { + dictionary.into() + } + } else { + self.parse_string_after_l_angle()?.into() + } + } + PdfToken::RAngle => todo!(), + PdfToken::LBracket => self.parse_array_after_l_bracket()?.into(), + PdfToken::RBracket => todo!(), + PdfToken::LBrace => todo!(), + PdfToken::RBrace => todo!(), + PdfToken::FSlash => self.parse_name_after_f_slash()?.into(), + PdfToken::Comment(_) => unreachable!(), + }, + )) + } +} + +struct PdfFileParser<'a> { + parser: PdfParser<'a>, + objects_map: BTreeMap, +} + +impl<'a> PdfFileParser<'a> { + fn parse_header(&mut self) -> Result { + let Some(b'%') = self.parser.tokenizer.bytes.first() else { + return Err(PdfParseError::NotAPdfFile); + }; + let Some(PdfToken::Comment(header)) = self.parser.tokenizer.next() else { + unreachable!() + }; + let Ok(header) = str::from_utf8(header) else { + return Err(PdfParseError::NotAPdfFile); + }; + let header = header.trim_end_matches(['\n', '\r']); + let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else { + return Err(PdfParseError::NotAPdfFile); + }; + let Some((major_str, minor_str)) = version.split_once('.') else { + return Err(PdfParseError::NotAPdfFile); + }; + let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else { + return Err(PdfParseError::NotAPdfFile); + }; + 
Ok(PdfHeader { major, minor }) + } /// assumes `self.tokenizer.peek() == Some(PdfToken::Regular(b"stream"))` fn parse_stream_after_dictionary( &mut self, dictionary: PdfDictionary, ) -> Result { - self.tokenizer.skip_whitespace(); - let stream_pos = self.tokenizer.pos(); - let stream = self.tokenizer.next(); + self.parser.tokenizer.skip_whitespace(); + let stream_pos = self.parser.tokenizer.pos(); + let stream = self.parser.tokenizer.next(); assert_eq!(stream, Some(PdfToken::Regular(b"stream"))); - let len = PdfStreamDictionary::parse_len_from_dictionary(&dictionary)?; - let eol_pos = self.tokenizer.pos(); - match self.tokenizer.next_byte() { + let dictionary = PdfStreamDictionary::parse(dictionary.into())?; + let eol_pos = self.parser.tokenizer.pos(); + match self.parser.tokenizer.next_byte() { None => return Err(PdfParseError::TruncatedFile { pos: eol_pos }), Some(b'\r') => { - let Some(b'\n') = self.tokenizer.next_byte() else { + let Some(b'\n') = self.parser.tokenizer.next_byte() else { return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos, }); @@ -668,121 +771,56 @@ impl<'a> PdfParser<'a> { Some(b'\n') => {} _ => return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos }), } - let Some(data) = self.tokenizer.read_bytes(len) else { + let Some(data) = self.parser.tokenizer.read_bytes(dictionary.len) else { return Err(PdfParseError::TruncatedFile { pos: PdfInputPosition::new(Some(PdfInputPositionKnown { - pos: self.tokenizer.bytes.len(), - ..self.tokenizer.pos + pos: self.parser.tokenizer.bytes.len(), + ..self.parser.tokenizer.pos })), }); }; - let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data)); - self.unparsed_stream_dictionaries.push(unparsed); - self.skip_comments_and_whitespace(); - let pos = self.tokenizer.pos(); - if let Some(PdfToken::Regular(b"endstream")) = self.tokenizer.next() { + let stream = PdfStream::new( + stream_pos, + &self.parser.objects, + dictionary, + 
Arc::from(data), + ); + self.parser.skip_comments_and_whitespace(); + let pos = self.parser.tokenizer.pos(); + if let Some(PdfToken::Regular(b"endstream")) = self.parser.tokenizer.next() { Ok(stream) } else { Err(PdfParseError::MissingEndStreamKeyword { pos }) } } fn parse_object(&mut self) -> Result { - self.skip_comments_and_whitespace(); - if let Some(indirect) = self.parse_indirect_object()? { - return Ok(indirect.into()); - } - let pos = self.tokenizer.pos(); - match self - .tokenizer - .next() - .ok_or(PdfParseError::TruncatedFile { pos })? - { - PdfToken::Regular(b"true") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, true))), - PdfToken::Regular(b"false") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, false))), - PdfToken::Regular(b"null") => Ok(PdfObject::Null(PdfNull::new(pos))), - PdfToken::Regular( - number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] | [b'0'..=b'9' | b'.', ..]), - ) => { - // parse number - let Ok(number) = str::from_utf8(number) else { - return Err(PdfParseError::InvalidNumber { pos }); - }; - let mut parts = number - .strip_prefix(&['+', '-']) - .unwrap_or(number) - .split('.'); - let integer_part = parts - .next() - .expect("split always returns at least one part"); - let fraction_part = parts.next(); - if parts.next().is_some() { - return Err(PdfParseError::InvalidNumber { pos }); - } - if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) { - return Err(PdfParseError::InvalidNumber { pos }); - } - if !integer_part.bytes().all(|v| v.is_ascii_digit()) { - return Err(PdfParseError::InvalidNumber { pos }); - } - if let Some(fraction_part) = fraction_part { - if !fraction_part.bytes().all(|v| v.is_ascii_digit()) { - return Err(PdfParseError::InvalidNumber { pos }); - } - Ok(PdfObject::Real(PdfReal::new( - pos, - number - .parse() - .map_err(|_| PdfParseError::InvalidNumber { pos })?, - ))) - } else { - Ok(PdfObject::Integer(PdfInteger::new( - pos, - number - .parse() - .map_err(|_| PdfParseError::InvalidNumber { pos })?, - 
))) - } + match self.parser.parse_object_or_operator()? { + PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { + dictionary, + stream_kw_pos: _, + } => Ok(PdfObject::Stream( + self.parse_stream_after_dictionary(dictionary)?, + )), + PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object), + PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => { + Err(PdfParseError::OperatorNotAllowedHere { operator }) } - PdfToken::Regular(items) => todo!("{:?}", str::from_utf8(items)), - PdfToken::LParen => self.parse_string_after_l_paren().map(PdfObject::String), - PdfToken::RParen => todo!(), - PdfToken::LAngle => { - if self.tokenizer.peek_byte() == Some(b'<') { - let dictionary = self.parse_dictionary_after_one_l_angle()?; - self.skip_comments_and_whitespace(); - if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() { - self.parse_stream_after_dictionary(dictionary) - .map(PdfObject::Stream) - } else { - Ok(dictionary.into()) - } - } else { - self.parse_string_after_l_angle().map(PdfObject::String) - } - } - PdfToken::RAngle => todo!(), - PdfToken::LBracket => self.parse_array_after_l_bracket().map(PdfObject::Array), - PdfToken::RBracket => todo!(), - PdfToken::LBrace => todo!(), - PdfToken::RBrace => todo!(), - PdfToken::FSlash => self.parse_name_after_f_slash().map(PdfObject::Name), - PdfToken::Comment(_) => unreachable!(), } } fn parse_indirect_object_definition(&mut self) -> Result, PdfParseError> { - self.skip_comments_and_whitespace(); - let Some(id) = self.parse_object_identifier(false)? else { + self.parser.skip_comments_and_whitespace(); + let Some(id) = self.parser.parse_object_identifier(false)? 
else { return Ok(None); }; - self.skip_comments_and_whitespace(); - let obj_pos = self.tokenizer.pos(); - let Some(PdfToken::Regular(b"obj")) = self.tokenizer.next() else { + self.parser.skip_comments_and_whitespace(); + let obj_pos = self.parser.tokenizer.pos(); + let Some(PdfToken::Regular(b"obj")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingObj { pos: obj_pos }); }; let object = self.parse_object()?; - self.skip_comments_and_whitespace(); - let end_obj_pos = self.tokenizer.pos(); - let Some(PdfToken::Regular(b"endobj")) = self.tokenizer.next() else { + self.parser.skip_comments_and_whitespace(); + let end_obj_pos = self.parser.tokenizer.pos(); + let Some(PdfToken::Regular(b"endobj")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingEndObj { pos: end_obj_pos }); }; if self.objects_map.insert(id, object).is_some() { @@ -791,53 +829,13 @@ impl<'a> PdfParser<'a> { Ok(Some(())) } } - fn parse_object_stream_inner( - &mut self, - object_stream: &PdfStream, - ) -> Result<(), PdfParseError> { - let mut object_ids_and_byte_positions = - Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n); - for _ in 0..object_stream.dictionary().rest.n { - self.skip_comments_and_whitespace(); - let Some((pos, object_number)) = - self.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))? - else { - return Err(PdfParseError::InvalidObjectNumber { - pos: self.tokenizer.pos(), - }); - }; - self.skip_comments_and_whitespace(); - let Some((_, byte_position)) = - self.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))? 
- else { - return Err(PdfParseError::InvalidNumber { - pos: self.tokenizer.pos(), - }); - }; - object_ids_and_byte_positions.push(( - PdfObjectIdentifier { - pos: pos.into(), - object_number, - generation_number: 0, - }, - byte_position, - )); - } - for (id, _byte_position) in object_ids_and_byte_positions { - let object = self.parse_object()?; - if self.objects_map.insert(id, object).is_some() { - return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }); - } - } - Ok(()) - } fn parse_object_stream( &mut self, object_stream: &PdfStream, ) -> Result<(), PdfParseError> { let data = object_stream.decoded_data().as_ref()?; - self.with_tokenizer( - PdfTokenizer::new( + let mut parser = PdfParser { + tokenizer: PdfTokenizer::new( data, PdfInputPositionKnown { pos: 0, @@ -850,18 +848,48 @@ impl<'a> PdfParser<'a> { ), }, ), - |parser| parser.parse_object_stream_inner(object_stream), - ) - .map_err(|e| PdfParseError::ObjectStreamParseError { - stream_pos: object_stream.get_pdf_input_position(), - error: Arc::new(e), - }) + objects: self.parser.objects.clone(), + }; + let mut object_ids_and_byte_positions = + Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n); + for _ in 0..object_stream.dictionary().rest.n { + parser.skip_comments_and_whitespace(); + let Some((pos, object_number)) = + parser.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))? + else { + return Err(PdfParseError::InvalidObjectNumber { + pos: parser.tokenizer.pos(), + }); + }; + parser.skip_comments_and_whitespace(); + let Some((_, byte_position)) = + parser.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))? 
+ else { + return Err(PdfParseError::InvalidNumber { + pos: parser.tokenizer.pos(), + }); + }; + object_ids_and_byte_positions.push(( + PdfObjectIdentifier { + pos: pos.into(), + object_number, + generation_number: 0, + }, + byte_position, + )); + } + for (id, _byte_position) in object_ids_and_byte_positions { + let object = parser + .parse_object_or_operator()? + .error_on_stream_or_operator()?; + if self.objects_map.insert(id, object).is_some() { + return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }); + } + } + Ok(()) } fn parse_body(&mut self) -> Result<(), PdfParseError> { while let Some(()) = self.parse_indirect_object_definition()? {} - self.unparsed_stream_dictionaries - .drain(..) - .try_for_each(|v| v.finish_parsing())?; let mut object_streams: Vec> = Vec::new(); for object in self.objects_map.values_mut() { let stream = match object { @@ -885,7 +913,7 @@ impl<'a> PdfParser<'a> { for object_stream in &object_streams { self.parse_object_stream(object_stream)?; } - let Ok(()) = self.objects_arc.inner.set(PdfObjectsInner { + let Ok(()) = self.parser.objects.inner.set(PdfObjectsInner { objects: std::mem::take(&mut self.objects_map), object_streams, }) else { @@ -894,19 +922,19 @@ impl<'a> PdfParser<'a> { Ok(()) } fn parse_xref_table(&mut self) -> Result<(), PdfParseError> { - self.skip_comments_and_whitespace(); - let xref_pos = self.tokenizer.pos(); - let Some(PdfToken::Regular(b"xref")) = self.tokenizer.peek() else { + self.parser.skip_comments_and_whitespace(); + let xref_pos = self.parser.tokenizer.pos(); + let Some(PdfToken::Regular(b"xref")) = self.parser.tokenizer.peek() else { return Ok(()); }; todo!("{xref_pos}") } fn parse_trailer(&mut self) -> Result { - self.skip_comments_and_whitespace(); - let trailer_pos = self.tokenizer.pos(); - let trailer_dictionary = match self.tokenizer.peek() { + self.parser.skip_comments_and_whitespace(); + let trailer_pos = self.parser.tokenizer.pos(); + let trailer_dictionary = match 
self.parser.tokenizer.peek() { Some(PdfToken::Regular(b"trailer")) => { - self.tokenizer.next(); + self.parser.tokenizer.next(); Some(PdfTrailerDictionary::parse(self.parse_object()?)?) } Some(PdfToken::Regular(b"startxref")) => None, @@ -914,34 +942,35 @@ impl<'a> PdfParser<'a> { return Err(PdfParseError::MissingTrailer { pos: trailer_pos }); } }; - self.skip_comments_and_whitespace(); - let start_xref_kw_pos = self.tokenizer.pos(); - let Some(PdfToken::Regular(b"startxref")) = self.tokenizer.next() else { + self.parser.skip_comments_and_whitespace(); + let start_xref_kw_pos = self.parser.tokenizer.pos(); + let Some(PdfToken::Regular(b"startxref")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingStartXRefKeyword { pos: start_xref_kw_pos, }); }; - let start_xref_pos = self.tokenizer.pos(); - let Some((start_xref_pos, start_xref)) = - self.parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))? + let start_xref_pos = self.parser.tokenizer.pos(); + let Some((start_xref_pos, start_xref)) = self + .parser + .parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))? 
else { return Err(PdfParseError::MissingStartXRefValue { pos: start_xref_pos, }); }; - self.tokenizer.skip_whitespace(); - let eof_comment_pos = self.tokenizer.pos(); + self.parser.tokenizer.skip_whitespace(); + let eof_comment_pos = self.parser.tokenizer.pos(); let Some(PdfToken::Comment(b"%%EOF" | b"%%EOF\r" | b"%%EOF\r\n" | b"%%EOF\n")) = - self.tokenizer.next() + self.parser.tokenizer.next() else { return Err(PdfParseError::MissingEofComment { pos: eof_comment_pos, }); }; - self.tokenizer.skip_whitespace(); - if let Some(byte) = self.tokenizer.peek_byte() { + self.parser.tokenizer.skip_whitespace(); + if let Some(byte) = self.parser.tokenizer.peek_byte() { return Err(PdfParseError::UnexpectedByte { - pos: self.tokenizer.pos(), + pos: self.parser.tokenizer.pos(), byte, }); } @@ -951,24 +980,28 @@ impl<'a> PdfParser<'a> { start_xref, }); } - let old_tokenizer = self.tokenizer.clone(); - self.tokenizer = PdfTokenizer::new( - self.tokenizer.bytes, - PdfInputPositionKnown { - pos: start_xref, - containing_streams_pos: None, - }, - ); - let id = self.parse_object_identifier(false); - self.tokenizer = old_tokenizer; + let id = PdfParser { + tokenizer: PdfTokenizer::new( + self.parser.tokenizer.bytes, + PdfInputPositionKnown { + pos: start_xref, + containing_streams_pos: None, + }, + ), + objects: self.parser.objects.clone(), + } + .parse_object_identifier(false); let Some(id) = id? 
else { return Err(PdfParseError::InvalidStartXRefValue { pos: start_xref_pos, start_xref, }); }; - let xref_stream = - PdfStream::parse(PdfObjectIndirect::new(&self.objects_arc, id).get().into())?; + let xref_stream = PdfStream::parse( + PdfObjectIndirect::new(&self.parser.objects, id) + .get() + .into(), + )?; Ok(PdfTrailer::Stream { xref_stream, start_xref, @@ -979,9 +1012,14 @@ impl<'a> PdfParser<'a> { self.parse_body()?; self.parse_xref_table()?; let trailer = self.parse_trailer()?; + for page in trailer.trailer_dictionary().root.pages.pages().iter() { + for content in page.contents.iter() { + content.decoded_data().as_ref()?; + } + } Ok(Pdf { header, - objects: self.objects_arc, + objects: self.parser.objects, trailer, }) } @@ -989,19 +1027,20 @@ impl<'a> PdfParser<'a> { impl Pdf { pub fn parse(bytes: impl AsRef<[u8]>) -> Result { - PdfParser { - objects_arc: Arc::new(PdfObjects { - inner: OnceLock::new(), - }), + PdfFileParser { + parser: PdfParser { + objects: Arc::new(PdfObjects { + inner: OnceLock::new(), + }), + tokenizer: PdfTokenizer::new( + bytes.as_ref(), + PdfInputPositionKnown { + pos: 0, + containing_streams_pos: None, + }, + ), + }, objects_map: BTreeMap::new(), - unparsed_stream_dictionaries: vec![], - tokenizer: PdfTokenizer::new( - bytes.as_ref(), - PdfInputPositionKnown { - pos: 0, - containing_streams_pos: None, - }, - ), } .parse_file() } diff --git a/src/pdf/content_stream.rs b/src/pdf/content_stream.rs index f58737e..2552df7 100644 --- a/src/pdf/content_stream.rs +++ b/src/pdf/content_stream.rs @@ -1,6 +1,813 @@ -use crate::pdf::object::PdfStream; +use crate::{ + pdf::{ + PdfObjectOrStreamDictionaryOrOperator, PdfObjects, PdfParser, PdfTokenizer, + object::{ + NameOr, PdfDictionary, PdfMatrix, PdfName, PdfObject, PdfObjectDirect, PdfRectangle, + PdfStream, PdfStreamContents, PdfString, PdfStringBytesDebug, PdfStringOrNumber, + PdfVec2D, + }, + parse::{ + GetPdfInputPosition, PdfInputPosition, PdfInputPositionKnown, + 
PdfInputPositionNoCompare, PdfParse, PdfParseError, + }, + }, + util::ArcOrRef, +}; +use std::{fmt, sync::Arc}; -pub struct PdfContentStream { - stream: PdfStream, - // TODO +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct PdfOperatorUnparsed { + pos: PdfInputPositionNoCompare, + bytes: ArcOrRef<'static, [u8]>, } + +impl GetPdfInputPosition for PdfOperatorUnparsed { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos() + } +} + +impl fmt::Debug for PdfOperatorUnparsed { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + Self::debug_with_name("PdfOperatorUnparsed", &self.bytes, self.pos.0, f) + } +} + +trait PdfParseIter: Sized { + fn parse_iter(iter: impl IntoIterator) -> Result; +} + +impl PdfParseIter for Arc<[T]> { + fn parse_iter(iter: impl IntoIterator) -> Result { + FromIterator::from_iter(iter.into_iter().map(T::parse)) + } +} + +impl PdfOperatorUnparsed { + pub fn new( + pos: impl Into, + bytes: impl Into>, + ) -> Self { + Self { + pos: pos.into(), + bytes: bytes.into(), + } + } + pub const fn new_static(bytes: &'static [u8]) -> Self { + Self { + pos: PdfInputPositionNoCompare::empty(), + bytes: ArcOrRef::Ref(bytes), + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn bytes(&self) -> &ArcOrRef<'static, [u8]> { + &self.bytes + } + fn debug_with_name( + name: &str, + pdf_name: &[u8], + pos: PdfInputPosition, + f: &mut fmt::Formatter<'_>, + ) -> fmt::Result { + write!(f, "{name}(at {pos}, {})", PdfStringBytesDebug(pdf_name)) + } + pub fn bytes_debug(&self) -> PdfStringBytesDebug<'_> { + PdfStringBytesDebug(&self.bytes) + } +} + +macro_rules! 
make_pdf_operator_enum { + ( + $(#[$($operator_meta:tt)*])* + $operator_enum_vis:vis enum $PdfOperator:ident; + + $(#[$($operator_and_operands_meta:tt)*])* + $enum_vis:vis enum $PdfOperatorAndOperands:ident { + $(#[$($unknown_variant_meta:tt)*])* + $Unknown:ident { + $(#[$($unknown_operands_meta:tt)*])* + $unknown_operands:ident: $unknown_operands_ty:ty, + $(#[$($unknown_operator_meta:tt)*])* + $unknown_operator:ident: $unknown_operator_ty:ty, + }, + $( + #[kw = $kw:literal] + $(#[$($variant_meta:tt)*])* + $Variant:ident($VariantStruct:ident { + $pos:ident: PdfInputPositionNoCompare, + $( + #[$field_parse:ident($($parse_args:tt)*)] + $(#[$($field_meta:tt)*])* + $field:ident: $field_ty:ty, + )* + }), + )* + } + ) => { + $(#[$($operator_meta)*])* + $operator_enum_vis enum $PdfOperator { + $(#[$($unknown_variant_meta)*])* + $Unknown($unknown_operator_ty), + $( + $(#[$($variant_meta)*])* + $Variant(PdfInputPositionNoCompare), + )* + } + + impl $PdfOperator { + $operator_enum_vis fn parse(self, operands: impl IntoIterator) -> Result<$PdfOperatorAndOperands, PdfParseError> { + let operands = operands.into_iter(); + Ok(match self { + Self::$Unknown(operator) => $PdfOperatorAndOperands::$Unknown { + operands: FromIterator::from_iter(operands.map(Into::into)), + operator, + }, + $(Self::$Variant(pos) => $VariantStruct::parse(pos, operands)?.into(),)* + }) + } + $operator_enum_vis fn pos(&self) -> PdfInputPosition { + match *self { + Self::$Unknown(ref operator) => operator.pos(), + $(Self::$Variant(pos) => pos.0,)* + } + } + } + + impl fmt::Debug for $PdfOperator { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::$Unknown(operator) => PdfOperatorUnparsed::debug_with_name("Unknown", &operator.bytes, operator.pos.0, f), + $(Self::$Variant(pos) => PdfOperatorUnparsed::debug_with_name(stringify!($Variant), $kw, pos.0, f),)* + } + } + } + + impl From<$PdfOperator> for PdfOperatorUnparsed { + fn from(v: $PdfOperator) -> PdfOperatorUnparsed { + 
match v { + $PdfOperator::$Unknown(operator) => operator, + $($PdfOperator::$Variant(pos) => PdfOperatorUnparsed { pos, bytes: ArcOrRef::Ref($kw) },)* + } + } + } + + impl From for $PdfOperator { + fn from(v: PdfOperatorUnparsed) -> $PdfOperator { + match &**v.bytes() { + $($kw => Self::$Variant(v.pos),)* + _ => Self::$Unknown(v), + } + } + } + + $(#[derive(Clone)] + $(#[$($variant_meta)*])* + $enum_vis struct $VariantStruct { + $enum_vis $pos: PdfInputPositionNoCompare, + $( + $(#[$($field_meta)*])* + $enum_vis $field: $field_ty, + )* + } + + impl fmt::Debug for $VariantStruct { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct(stringify!($VariantStruct)).field("pos", &self.pos)$(.field(stringify!($field), &self.$field))*.finish() + } + } + + impl GetPdfInputPosition for $VariantStruct { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos() + } + } + + impl From<$VariantStruct> for $PdfOperatorAndOperands { + fn from(v: $VariantStruct) -> Self { + Self::$Variant(v) + } + } + + impl $VariantStruct { + $enum_vis fn operator_from_pos(pos: impl Into) -> $PdfOperator { + $PdfOperator::$Variant(pos.into()) + } + $enum_vis fn operator(&self) -> $PdfOperator { + $PdfOperator::$Variant(self.pos) + } + $enum_vis fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + } + + make_pdf_operator_enum! 
{ + @impl_variant_parse + $enum_vis enum; + struct $VariantStruct { + $pos: PdfInputPositionNoCompare, + $( + #[$field_parse($($parse_args)*)] + $(#[$($field_meta)*])* + $field: $field_ty, + )* + } + })* + + $(#[$($operator_and_operands_meta)*])* + $enum_vis enum $PdfOperatorAndOperands { + $(#[$($unknown_variant_meta)*])* + $Unknown { + $(#[$($unknown_operands_meta)*])* + $unknown_operands: $unknown_operands_ty, + $(#[$($unknown_operator_meta)*])* + $unknown_operator: $unknown_operator_ty, + }, + $( + $(#[$($variant_meta)*])* + $Variant($VariantStruct), + )* + } + + impl $PdfOperatorAndOperands { + $enum_vis fn operator(&self) -> $PdfOperator { + match self { + Self::Unknown { operator, .. } => $PdfOperator::Unknown(operator.clone()), + $(Self::$Variant(v) => v.operator(),)* + } + } + $enum_vis fn pos(&self) -> PdfInputPosition { + match self { + Self::$Unknown { operator, .. } => operator.pos(), + $(Self::$Variant(v) => v.pos(),)* + } + } + } + + impl fmt::Debug for $PdfOperatorAndOperands { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::$Unknown { + operands, + operator, + } => f.debug_struct("Unknown").field("operator", operator).field("operands", operands).finish(), + $(Self::$Variant($VariantStruct { + $pos, + $($field,)* + }) => f.debug_struct(stringify!($Variant)).field("pos", $pos)$(.field(stringify!($field), $field))*.finish(),)* + } + } + } + }; + ( + @impl_variant_parse + $enum_vis:vis enum; + struct $VariantStruct:ident { + $pos:ident: PdfInputPositionNoCompare, + $( + #[$field_parse:ident($($parse_args:ident),* $(,)?)] + $(#[$($field_meta:tt)*])* + $field:ident: $field_ty:ty, + )* + } + ) => { + impl $VariantStruct { + $enum_vis fn parse(pos: impl Into, operands: impl IntoIterator) -> Result { + let pos = pos.into(); + let mut operands = operands.into_iter(); + $($(let Some($parse_args) = operands.next() else { + return Err(PdfParseError::OperatorHasTooFewOperands { operator: Self::operator_from_pos(pos) }); + 
};)*)* + if operands.next().is_some() { + return Err(PdfParseError::OperatorHasTooManyOperands { operator: Self::operator_from_pos(pos) }); + } + Ok(Self { + pos, + $($field: <$field_ty>::$field_parse($($parse_args),*)?,)* + }) + } + } + }; + ( + @impl_variant_parse + $enum_vis:vis enum; + struct $VariantStruct:ident { + $pos:ident: PdfInputPositionNoCompare, + #[$field_parse:ident(...)] + $(#[$($field_meta:tt)*])* + $field:ident: $field_ty:ty, + } + ) => { + impl $VariantStruct { + $enum_vis fn parse(pos: impl Into, operands: impl IntoIterator) -> Result { + let pos = pos.into(); + let operands = operands.into_iter(); + Ok(Self { + pos, + $field: <$field_ty>::$field_parse(operands)?, + }) + } + } + }; +} + +make_pdf_operator_enum! { + #[derive(Clone)] + pub enum PdfOperator; + #[derive(Clone)] + pub enum PdfOperatorAndOperands { + Unknown { + operands: Arc<[PdfObjectDirect]>, + operator: PdfOperatorUnparsed, + }, + #[kw = b"b"] + CloseFillAndStrokePath(PdfOperatorCloseFillAndStrokePath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"B"] + FillAndStrokePath(PdfOperatorFillAndStrokePath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"b*"] + CloseFillAndStrokePathEvenOdd(PdfOperatorCloseFillAndStrokePathEvenOdd { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"B*"] + FillAndStrokePathEvenOdd(PdfOperatorFillAndStrokePathEvenOdd { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"BDC"] + BeginMarkedContentWithProperties(PdfOperatorBeginMarkedContentWithProperties { + pos: PdfInputPositionNoCompare, + #[parse(tag)] + tag: PdfName, + #[parse(properties)] + properties: NameOr, + }), + #[kw = b"BI"] + BeginInlineImage(PdfOperatorBeginInlineImage { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"BMC"] + BeginMarkedContent(PdfOperatorBeginMarkedContent { + pos: PdfInputPositionNoCompare, + #[parse(tag)] + tag: PdfName, + }), + #[kw = b"BT"] + BeginText(PdfOperatorBeginText { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"BX"] + 
BeginCompatibilitySection(PdfOperatorBeginCompatibilitySection { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"c"] + CurveTo(PdfOperatorCurveTo { + pos: PdfInputPositionNoCompare, + #[parse(x1, y1)] + p1: PdfVec2D, + #[parse(x2, y2)] + p2: PdfVec2D, + #[parse(x3, y3)] + p3: PdfVec2D, + }), + #[kw = b"cm"] + ConcatMatrix(PdfOperatorConcatMatrix { + pos: PdfInputPositionNoCompare, + #[parse_flat(a, b, c, d, e, f)] + matrix: PdfMatrix, + }), + #[kw = b"CS"] + SetStrokeColorSpace(PdfOperatorSetStrokeColorSpace { + pos: PdfInputPositionNoCompare, + #[parse(name)] + name: PdfName, + }), + #[kw = b"cs"] + SetNonStrokeColorSpace(PdfOperatorSetNonStrokeColorSpace { + pos: PdfInputPositionNoCompare, + #[parse(name)] + name: PdfName, + }), + #[kw = b"d"] + SetLineDashPattern(PdfOperatorSetLineDashPattern { + pos: PdfInputPositionNoCompare, + #[parse(dash_array)] + dash_array: PdfObject, // TODO: actually parse + #[parse(dash_phase)] + dash_phase: PdfObject, // TODO: actually parse + }), + #[kw = b"d0"] + FontType3SetWidth(PdfOperatorFontType3SetWidth { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + width: PdfVec2D, + }), + #[kw = b"d1"] + FontType3SetWidthAndBBox(PdfOperatorFontType3SetWidthAndBBox { + pos: PdfInputPositionNoCompare, + #[parse(width_x, width_y)] + width: PdfVec2D, + #[parse_flat(lower_left_x, lower_left_y, upper_right_x, upper_right_y)] + bbox: PdfRectangle, + }), + #[kw = b"Do"] + PaintXObject(PdfOperatorPaintXObject { + pos: PdfInputPositionNoCompare, + #[parse(name)] + name: PdfName, + }), + #[kw = b"DP"] + DesignateMarkedContentPointWithProperties(PdfOperatorDesignateMarkedContentPointWithProperties { + pos: PdfInputPositionNoCompare, + #[parse(tag)] + tag: PdfName, + #[parse(properties)] + properties: NameOr, + }), + #[kw = b"EI"] + EndInlineImage(PdfOperatorEndInlineImage { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"EMC"] + EndMarkedContent(PdfOperatorEndMarkedContent { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"ET"] + 
EndText(PdfOperatorEndText { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"EX"] + EndCompatibilitySection(PdfOperatorEndCompatibilitySection { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"f"] + FillPath(PdfOperatorFillPath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"F"] + FillPathObsolete(PdfOperatorFillPathObsolete { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"f*"] + FillPathEvenOdd(PdfOperatorFillPathEvenOdd { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"G"] + SetStrokeGray(PdfOperatorSetStrokeGray { + pos: PdfInputPositionNoCompare, + #[parse(gray)] + gray: f32, + }), + #[kw = b"g"] + SetNonStrokeGray(PdfOperatorSetNonStrokeGray { + pos: PdfInputPositionNoCompare, + #[parse(gray)] + gray: f32, + }), + #[kw = b"gs"] + SetGraphicsState(PdfOperatorSetGraphicsState { + pos: PdfInputPositionNoCompare, + #[parse(dictionary_name)] + dictionary_name: PdfName, + }), + #[kw = b"h"] + CloseSubpath(PdfOperatorCloseSubpath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"i"] + SetFlatnessTolerance(PdfOperatorSetFlatnessTolerance { + pos: PdfInputPositionNoCompare, + #[parse(flatness)] + flatness: f32, + }), + #[kw = b"ID"] + BeginInlineImageData(PdfOperatorBeginInlineImageData { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"j"] + SetLineJoinStyle(PdfOperatorSetLineJoinStyle { + pos: PdfInputPositionNoCompare, + #[parse(line_join_style)] + line_join_style: u8, // TODO parse + }), + #[kw = b"J"] + SetLineCapStyle(PdfOperatorSetLineCapStyle { + pos: PdfInputPositionNoCompare, + #[parse(line_cap_style)] + line_cap_style: u8, // TODO parse + }), + #[kw = b"K"] + SetStrokeCmyk(PdfOperatorSetStrokeCmyk { + pos: PdfInputPositionNoCompare, + #[parse(c)] + c: f32, + #[parse(m)] + m: f32, + #[parse(y)] + y: f32, + #[parse(k)] + k: f32, + }), + #[kw = b"k"] + SetNonStrokeCmyk(PdfOperatorSetNonStrokeCmyk { + pos: PdfInputPositionNoCompare, + #[parse(c)] + c: f32, + #[parse(m)] + m: f32, + #[parse(y)] + y: f32, + #[parse(k)] + k: f32, + }), + #[kw 
= b"l"] + LineTo(PdfOperatorLineTo { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + to: PdfVec2D, + }), + #[kw = b"m"] + MoveTo(PdfOperatorMoveTo { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + to: PdfVec2D, + }), + #[kw = b"M"] + SetMiterLimit(PdfOperatorSetMiterLimit { + pos: PdfInputPositionNoCompare, + #[parse(limit)] + limit: f32, + }), + #[kw = b"MP"] + DesignateMarkedContentPoint(PdfOperatorDesignateMarkedContentPoint { + pos: PdfInputPositionNoCompare, + #[parse(tag)] + tag: PdfName, + }), + #[kw = b"n"] + EndPath(PdfOperatorEndPath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"q"] + SaveGraphicsState(PdfOperatorSaveGraphicsState { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"Q"] + RestoreGraphicsState(PdfOperatorRestoreGraphicsState { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"re"] + Rectangle(PdfOperatorRectangle { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + p: PdfVec2D, + #[parse(width, height)] + size: PdfVec2D, + }), + #[kw = b"RG"] + SetStrokeRgb(PdfOperatorSetStrokeRgb { + pos: PdfInputPositionNoCompare, + #[parse(r)] + r: f32, + #[parse(g)] + g: f32, + #[parse(b)] + b: f32, + }), + #[kw = b"rg"] + SetNonStrokeRgb(PdfOperatorSetNonStrokeRgb { + pos: PdfInputPositionNoCompare, + #[parse(r)] + r: f32, + #[parse(g)] + g: f32, + #[parse(b)] + b: f32, + }), + #[kw = b"ri"] + SetColorRenderingIntent(PdfOperatorSetColorRenderingIntent { + pos: PdfInputPositionNoCompare, + #[parse(intent)] + intent: PdfName, + }), + #[kw = b"s"] + CloseAndStrokePath(PdfOperatorCloseAndStrokePath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"S"] + StrokePath(PdfOperatorStrokePath { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"SC"] + SetStrokeColor(PdfOperatorSetStrokeColor { + pos: PdfInputPositionNoCompare, + #[parse_iter(...)] + color: Arc<[f32]>, + }), + #[kw = b"sc"] + SetNonStrokeColor(PdfOperatorSetNonStrokeColor { + pos: PdfInputPositionNoCompare, + #[parse_iter(...)] + color: Arc<[f32]>, + }), + #[kw = b"SCN"] + 
SetStrokeColorWithName(PdfOperatorSetStrokeColorWithName { + pos: PdfInputPositionNoCompare, + #[parse_iter(...)] + color_and_name: Arc<[NameOr]>, + }), + #[kw = b"scn"] + SetNonStrokeColorWithName(PdfOperatorSetNonStrokeColorWithName { + pos: PdfInputPositionNoCompare, + #[parse_iter(...)] + color_and_name: Arc<[NameOr]>, + }), + #[kw = b"sh"] + Shade(PdfOperatorShade { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"T*"] + TextNextLine(PdfOperatorTextNextLine { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"Tc"] + SetCharacterSpacing(PdfOperatorSetCharacterSpacing { + pos: PdfInputPositionNoCompare, + #[parse(char_space)] + char_space: f32, + }), + #[kw = b"Td"] + TextNextLineWithOffset(PdfOperatorTextNextLineWithOffset { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + offset: PdfVec2D, + }), + #[kw = b"TD"] + TextNextLineWithOffsetAndLeading(PdfOperatorTextNextLineWithOffsetAndLeading { + pos: PdfInputPositionNoCompare, + #[parse(x, y)] + offset: PdfVec2D, + }), + #[kw = b"Tf"] + SetFontAndSize(PdfOperatorSetFontAndSize { + pos: PdfInputPositionNoCompare, + #[parse(font)] + font: PdfName, + #[parse(size)] + size: f32, + }), + #[kw = b"Tj"] + ShowText(PdfOperatorShowText { + pos: PdfInputPositionNoCompare, + #[parse(text)] + text: PdfString, + }), + #[kw = b"TJ"] + ShowTextWithGlyphPositioning(PdfOperatorShowTextWithGlyphPositioning { + pos: PdfInputPositionNoCompare, + #[parse(text_and_positioning)] + text_and_positioning: Arc<[PdfStringOrNumber]>, + }), + #[kw = b"TL"] + SetTextLeading(PdfOperatorSetTextLeading { + pos: PdfInputPositionNoCompare, + #[parse(leading)] + leading: f32, + }), + #[kw = b"Tm"] + SetTextMatrix(PdfOperatorSetTextMatrix { + pos: PdfInputPositionNoCompare, + #[parse_flat(a, b, c, d, e, f)] + matrix: PdfMatrix, + }), + #[kw = b"Tr"] + SetTextRenderingMode(PdfOperatorSetTextRenderingMode { + pos: PdfInputPositionNoCompare, + #[parse(rendering_mode)] + rendering_mode: u8, // TODO: parse + }), + #[kw = b"Ts"] + 
SetTextRise(PdfOperatorSetTextRise { + pos: PdfInputPositionNoCompare, + #[parse(rise)] + rise: f32, + }), + #[kw = b"Tw"] + SetWordSpacing(PdfOperatorSetWordSpacing { + pos: PdfInputPositionNoCompare, + #[parse(word_space)] + word_space: f32, + }), + #[kw = b"Tz"] + SetTextHorizontalScaling(PdfOperatorSetTextHorizontalScaling { + pos: PdfInputPositionNoCompare, + #[parse(scale_percent)] + scale_percent: f32, + }), + #[kw = b"v"] + CurveTo23(PdfOperatorCurveTo23 { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"w"] + SetLineWidth(PdfOperatorSetLineWidth { + pos: PdfInputPositionNoCompare, + #[parse(line_width)] + line_width: f32, + }), + #[kw = b"W"] + Clip(PdfOperatorClip { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"W*"] + ClipEvenOdd(PdfOperatorClipEvenOdd { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"y"] + CurveTo13(PdfOperatorCurveTo13 { + pos: PdfInputPositionNoCompare, + }), + #[kw = b"'"] + TextNextLineAndShow(PdfOperatorTextNextLineAndShow { + pos: PdfInputPositionNoCompare, + #[parse(text)] + text: PdfString, + }), + #[kw = b"\""] + SetSpacingThenTextNextLineAndShow(PdfOperatorSetSpacingThenTextNextLineAndShow { + pos: PdfInputPositionNoCompare, + #[parse(word_space)] + word_space: f32, + #[parse(char_space)] + char_space: f32, + #[parse(text)] + text: PdfString, + }), + } +} + +impl GetPdfInputPosition for PdfOperator { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos() + } +} + +impl GetPdfInputPosition for PdfOperatorAndOperands { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos() + } +} + +#[derive(Debug, Clone)] +pub struct PdfContentStreamData { + pub operators: Arc<[PdfOperatorAndOperands]>, +} + +impl PdfStreamContents for PdfContentStreamData { + fn parse( + data: &[u8], + stream_pos: PdfInputPosition, + objects: Arc, + ) -> Result { + let mut parser = PdfParser { + objects, + tokenizer: PdfTokenizer::new( + data, + PdfInputPositionKnown { + pos: 0, + containing_streams_pos: 
stream_pos.get().map(|v| v.pos), + }, + ), + }; + let mut operands = Vec::new(); + let mut operators = Vec::new(); + loop { + parser.skip_comments_and_whitespace(); + if parser.tokenizer.peek().is_none() { + break; + } + match parser.parse_object_or_operator()? { + PdfObjectOrStreamDictionaryOrOperator::StreamDictionary { + stream_kw_pos, .. + } => return Err(PdfParseError::StreamNotAllowedHere { pos: stream_kw_pos }), + PdfObjectOrStreamDictionaryOrOperator::Object(object) => operands.push(object), + PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => { + operators.push(PdfOperator::from(operator).parse(operands.drain(..))?); + } + } + } + if operands.is_empty() { + Ok(Self { + operators: operators.into(), + }) + } else { + Err(PdfParseError::MissingOperator { + pos: parser.tokenizer.pos(), + }) + } + } +} + +pub type PdfContentStream = PdfStream; diff --git a/src/pdf/document_structure.rs b/src/pdf/document_structure.rs index 265182c..13c0de3 100644 --- a/src/pdf/document_structure.rs +++ b/src/pdf/document_structure.rs @@ -2,6 +2,7 @@ use core::fmt; use std::{borrow::Cow, sync::Arc}; use crate::pdf::{ + content_stream::PdfContentStream, font::PdfFont, object::{ IsPdfNull, MaybeArray, PdfDate, PdfDictionary, PdfInteger, PdfName, PdfObject, @@ -238,7 +239,7 @@ pdf_parse! 
{ #[pdf(name = "BoxColorInfo")] pub box_color_info: Option, #[pdf(name = "Contents")] - pub contents: MaybeArray, + pub contents: MaybeArray, #[pdf(name = "Group")] pub group: Option, #[pdf(name = "Thumb")] @@ -388,7 +389,7 @@ pub struct PdfPage { pub trim_box: PdfRectangle, pub art_box: PdfRectangle, pub box_color_info: Option, - pub contents: Arc<[PdfStream]>, + pub contents: Arc<[PdfContentStream]>, pub rotate: PdfPageRotation, pub group: Option, pub thumbnail: Option, diff --git a/src/pdf/object.rs b/src/pdf/object.rs index dad6e49..de3b6da 100644 --- a/src/pdf/object.rs +++ b/src/pdf/object.rs @@ -34,7 +34,7 @@ impl std::fmt::Debug for PdfString { } #[derive(Clone, Copy)] -pub struct PdfStringBytesDebug<'a>(&'a [u8]); +pub struct PdfStringBytesDebug<'a>(pub &'a [u8]); impl<'a> fmt::Display for PdfStringBytesDebug<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -407,6 +407,81 @@ impl PdfParse for PdfNumber { } } +#[derive(Clone)] +pub enum PdfStringOrNumber { + String(PdfString), + Number(PdfNumber), +} + +impl fmt::Debug for PdfStringOrNumber { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::String(v) => v.fmt(f), + Self::Number(v) => v.fmt(f), + } + } +} + +impl PdfStringOrNumber { + pub fn pos(self) -> PdfInputPosition { + match self { + Self::String(v) => v.pos(), + Self::Number(v) => v.pos(), + } + } +} + +impl PdfObjectDirect { + pub fn string_or_number(&self) -> Option { + match *self { + PdfObjectDirect::Integer(v) => Some(PdfStringOrNumber::Number(PdfNumber::Integer(v))), + PdfObjectDirect::Real(v) => Some(PdfStringOrNumber::Number(PdfNumber::Real(v))), + PdfObjectDirect::String(ref v) => Some(PdfStringOrNumber::String(v.clone())), + PdfObjectDirect::Boolean(_) + | PdfObjectDirect::Name(_) + | PdfObjectDirect::Array(_) + | PdfObjectDirect::Dictionary(_) + | PdfObjectDirect::Stream(_) + | PdfObjectDirect::Null(_) => None, + } + } +} + +impl PdfObjectNonNull { + pub fn string_or_number(&self) -> 
Option { + match *self { + PdfObjectNonNull::Integer(v) => Some(PdfStringOrNumber::Number(PdfNumber::Integer(v))), + PdfObjectNonNull::Real(v) => Some(PdfStringOrNumber::Number(PdfNumber::Real(v))), + PdfObjectNonNull::String(ref v) => Some(PdfStringOrNumber::String(v.clone())), + PdfObjectNonNull::Boolean(_) + | PdfObjectNonNull::Name(_) + | PdfObjectNonNull::Array(_) + | PdfObjectNonNull::Dictionary(_) + | PdfObjectNonNull::Stream(_) => None, + } + } +} + +impl IsPdfNull for PdfStringOrNumber { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfStringOrNumber { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("string or number") + } + fn parse(object: PdfObject) -> Result { + let object = PdfObjectDirect::from(object); + object.string_or_number().ok_or(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "string or number", + }) + } +} + macro_rules! make_pdf_object { ( $( @@ -818,34 +893,35 @@ impl PdfObjectIndirect { } } pub fn get(&self) -> PdfObjectDirect { - if let Some(objects) = self.objects.upgrade() { - if let Some(objects) = objects.inner.get() { - let final_id = self.final_id.get().copied(); - let limit = if final_id.is_some() { 1 } else { 1000usize }; - let mut id = final_id.unwrap_or(self.id); - for _ in 0..limit { - if let Some(object) = objects.objects.get(&self.id) { - let retval = match object { - PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v), - PdfObject::Integer(v) => PdfObjectDirect::Integer(*v), - PdfObject::Real(v) => PdfObjectDirect::Real(*v), - PdfObject::String(v) => PdfObjectDirect::String(v.clone()), - PdfObject::Name(v) => PdfObjectDirect::Name(v.clone()), - PdfObject::Array(v) => PdfObjectDirect::Array(v.clone()), - PdfObject::Dictionary(v) => PdfObjectDirect::Dictionary(v.clone()), - PdfObject::Stream(v) => PdfObjectDirect::Stream(v.clone()), - PdfObject::Null(v) => PdfObjectDirect::Null(*v), - PdfObject::Indirect(v) => { - id = v.id; - continue; - } - }; - // 
we could be racing with another thread, so set can fail but that's not a problem - let _ = self.final_id.set(id); - return retval; - } else { - return PdfObjectDirect::Null(PdfNull::new(id.pos)); - } + let Some(objects) = self.objects.upgrade() else { + panic!("PdfObjects is no longer available"); + }; + if let Some(objects) = objects.inner.get() { + let final_id = self.final_id.get().copied(); + let limit = if final_id.is_some() { 1 } else { 1000usize }; + let mut id = final_id.unwrap_or(self.id); + for _ in 0..limit { + if let Some(object) = objects.objects.get(&self.id) { + let retval = match object { + PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v), + PdfObject::Integer(v) => PdfObjectDirect::Integer(*v), + PdfObject::Real(v) => PdfObjectDirect::Real(*v), + PdfObject::String(v) => PdfObjectDirect::String(v.clone()), + PdfObject::Name(v) => PdfObjectDirect::Name(v.clone()), + PdfObject::Array(v) => PdfObjectDirect::Array(v.clone()), + PdfObject::Dictionary(v) => PdfObjectDirect::Dictionary(v.clone()), + PdfObject::Stream(v) => PdfObjectDirect::Stream(v.clone()), + PdfObject::Null(v) => PdfObjectDirect::Null(*v), + PdfObject::Indirect(v) => { + id = v.id; + continue; + } + }; + // we could be racing with another thread, so set can fail but that's not a problem + let _ = self.final_id.set(id); + return retval; + } else { + return PdfObjectDirect::Null(PdfNull::new(id.pos)); } } } @@ -1213,21 +1289,150 @@ impl<'a, T> IntoIterator for &'a MaybeArray { } } +#[derive(Clone)] +pub enum NameOr { + Name(PdfName), + Value(T), +} + +impl NameOr { + pub fn into_resolved(self, resolve: impl FnOnce(PdfName) -> Result) -> Result { + match self { + Self::Name(name) => resolve(name), + Self::Value(v) => Ok(v), + } + } + pub fn replace_with_resolved( + &mut self, + resolve: impl FnOnce(&PdfName) -> Result, + ) -> Result<&mut T, E> { + match self { + Self::Name(name) => { + *self = Self::Value(resolve(name)?); + let Self::Value(v) = self else { + unreachable!(); + }; + 
Ok(v) + } + Self::Value(v) => Ok(v), + } + } +} + +impl fmt::Debug for NameOr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Name(v) => v.fmt(f), + Self::Value(v) => v.fmt(f), + } + } +} + +impl GetPdfInputPosition for NameOr { + fn get_pdf_input_position(&self) -> PdfInputPosition { + match self { + Self::Name(v) => v.pos(), + Self::Value(v) => v.get_pdf_input_position(), + } + } +} + +impl IsPdfNull for NameOr { + fn is_pdf_null(&self) -> bool { + match self { + Self::Name(_) => false, + Self::Value(v) => v.is_pdf_null(), + } + } +} + +impl PdfParse for NameOr { + fn type_name() -> Cow<'static, str> { + Cow::Owned(format!("NameOr<{}>", T::type_name())) + } + fn parse(object: PdfObject) -> Result { + Ok(match PdfObjectDirect::from(object) { + PdfObjectDirect::Name(name) => Self::Name(name), + object => Self::Value(T::parse(object.into())?), + }) + } +} + #[derive(Copy, Clone, PartialEq)] -pub struct PdfPoint { +pub struct PdfMatrix { + pub pos: PdfInputPositionNoCompare, + pub elements: [f32; 6], +} + +impl fmt::Debug for PdfMatrix { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { pos, elements } = *self; + write!(f, "PdfMatrix(at {pos}, {elements:?})") + } +} + +impl IsPdfNull for PdfMatrix { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfMatrix { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("matrix") + } + fn parse(object: PdfObject) -> Result { + Ok(Self { + pos: object.pos().into(), + elements: PdfParse::parse(object)?, + }) + } +} + +impl PdfMatrix { + pub fn parse_flat( + a: PdfObject, + b: PdfObject, + c: PdfObject, + d: PdfObject, + e: PdfObject, + f: PdfObject, + ) -> Result { + Ok(Self { + pos: a.pos().into(), + elements: [ + PdfParse::parse(a)?, + PdfParse::parse(b)?, + PdfParse::parse(c)?, + PdfParse::parse(d)?, + PdfParse::parse(e)?, + PdfParse::parse(f)?, + ], + }) + } +} + +impl GetPdfInputPosition for PdfMatrix { + fn get_pdf_input_position(&self) 
-> PdfInputPosition { + self.pos.0 + } +} + +#[derive(Copy, Clone, PartialEq)] +pub struct PdfVec2D { pub pos: PdfInputPositionNoCompare, pub x: f32, pub y: f32, } -impl fmt::Debug for PdfPoint { +impl fmt::Debug for PdfVec2D { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let Self { pos, x, y } = *self; - write!(f, "PdfPoint(at {pos}, {x}, {y})") + write!(f, "PdfVec2D(at {pos}, {x}, {y})") } } -impl PdfPoint { +impl PdfVec2D { pub fn parse(x: PdfObject, y: PdfObject) -> Result { Ok(Self { pos: x.pos().into(), @@ -1237,7 +1442,7 @@ impl PdfPoint { } } -impl GetPdfInputPosition for PdfPoint { +impl GetPdfInputPosition for PdfVec2D { fn get_pdf_input_position(&self) -> PdfInputPosition { self.pos.0 } @@ -1246,13 +1451,13 @@ impl GetPdfInputPosition for PdfPoint { #[derive(Copy, Clone, Debug)] pub struct PdfRectangle { /// the corner with the smaller x and y coordinates - smaller: PdfPoint, + smaller: PdfVec2D, /// the corner with the larger x and y coordinates - larger: PdfPoint, + larger: PdfVec2D, } impl PdfRectangle { - pub fn new(mut smaller: PdfPoint, mut larger: PdfPoint) -> Self { + pub fn new(mut smaller: PdfVec2D, mut larger: PdfVec2D) -> Self { // `pos` follows the `x` coordinate if smaller.x.is_nan() { smaller.pos = larger.pos; @@ -1262,12 +1467,12 @@ impl PdfRectangle { std::mem::swap(&mut smaller.pos, &mut larger.pos); } Self { - smaller: PdfPoint { + smaller: PdfVec2D { pos: smaller.pos, x: smaller.x.min(larger.x), y: smaller.y.min(larger.y), }, - larger: PdfPoint { + larger: PdfVec2D { pos: larger.pos, x: smaller.x.max(larger.x), y: smaller.y.max(larger.y), @@ -1275,13 +1480,24 @@ impl PdfRectangle { } } /// return the corner with the smaller x and y coordinates - pub fn smaller(&self) -> PdfPoint { + pub fn smaller(&self) -> PdfVec2D { self.smaller } /// return the corner with the larger x and y coordinates - pub fn larger(&self) -> PdfPoint { + pub fn larger(&self) -> PdfVec2D { self.larger } + pub fn parse_flat( + lower_left_x: 
PdfObject, + lower_left_y: PdfObject, + upper_right_x: PdfObject, + upper_right_y: PdfObject, + ) -> Result { + Ok(Self::new( + PdfVec2D::parse(lower_left_x, lower_left_y)?, + PdfVec2D::parse(upper_right_x, upper_right_y)?, + )) + } } impl GetPdfInputPosition for PdfRectangle { @@ -1317,10 +1533,12 @@ impl PdfParse for PdfRectangle { expected_ty: "rectangle", }); }; - Ok(Self::new( - PdfPoint::parse(lower_left_x.clone(), lower_left_y.clone())?, - PdfPoint::parse(upper_right_x.clone(), upper_right_y.clone())?, - )) + Self::parse_flat( + lower_left_x.clone(), + lower_left_y.clone(), + upper_right_x.clone(), + upper_right_y.clone(), + ) } } @@ -1366,7 +1584,7 @@ pdf_parse! { #[pdf] #[derive(Clone, Debug)] pub struct PdfStreamDictionary { - #[pdf(name = PdfStreamDictionary::LENGTH_NAME)] + #[pdf(name = "Length")] pub len: usize, #[pdf(name = "Filter")] pub filters: MaybeArray, @@ -1385,15 +1603,6 @@ pdf_parse! { } } -impl PdfStreamDictionary { - pub const LENGTH_NAME: &str = "Length"; - pub(crate) fn parse_len_from_dictionary( - dictionary: &PdfDictionary, - ) -> Result { - PdfParse::parse(dictionary.get_or_null(Self::LENGTH_NAME.as_bytes())) - } -} - #[derive(Debug, Clone, Default)] pub struct PdfStreamDictionaryFiltersAndParms<'a> { filters: std::iter::Enumerate>, @@ -1505,12 +1714,45 @@ impl UnparsedPdfStreamDictionary { } } +pub trait PdfStreamContents: Sized + fmt::Debug + 'static { + fn parse( + data: &[u8], + stream_pos: PdfInputPosition, + objects: Arc, + ) -> Result; + fn parse_arc( + data: Arc<[u8]>, + stream_pos: PdfInputPosition, + objects: Arc, + ) -> Result { + Self::parse(&*data, stream_pos, objects) + } +} + +impl PdfStreamContents for Arc<[u8]> { + fn parse( + data: &[u8], + _stream_pos: PdfInputPosition, + _objects: Arc, + ) -> Result { + Ok(Arc::from(data)) + } + fn parse_arc( + data: Arc<[u8]>, + _stream_pos: PdfInputPosition, + _objects: Arc, + ) -> Result { + Ok(data.clone()) + } +} + #[derive(Clone)] -pub struct PdfStream { +pub struct PdfStream> 
{ pos: PdfInputPositionNoCompare, - dictionary: Arc>>, + objects: std::sync::Weak, + dictionary: PdfStreamDictionary, encoded_data: Arc<[u8]>, - decoded_data: Arc, PdfParseError>>>, + decoded_data: Arc>>, } struct DumpBytes<'a>(&'a [u8]); @@ -1542,25 +1784,30 @@ impl fmt::Display for DumpBytes<'_> { } } -impl fmt::Debug for PdfStream { +impl fmt::Debug for PdfStream { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let Self { pos, + objects: _, dictionary, encoded_data, decoded_data, } = self; let mut debug_struct = f.debug_struct("PdfStream"); debug_struct.field("pos", pos); - if let Some(dictionary) = dictionary.get() { - debug_struct.field("dictionary", dictionary); - } else { - debug_struct.field("dictionary", &format_args!("")); - } + debug_struct.field("dictionary", dictionary); debug_struct.field("encoded_data", &DumpBytes(encoded_data)); if let Some(decoded_data) = decoded_data.get() { match decoded_data { - Ok(decoded_data) => debug_struct.field("decoded_data", &DumpBytes(decoded_data)), + Ok(decoded_data) => { + if let Some(decoded_data) = + ::downcast_ref::>(decoded_data) + { + debug_struct.field("decoded_data", &DumpBytes(&**decoded_data)) + } else { + debug_struct.field("decoded_data", decoded_data) + } + } Err(e) => debug_struct.field("decoded_data", &Err::<(), _>(e)), }; } else { @@ -1570,47 +1817,31 @@ impl fmt::Debug for PdfStream { } } -impl PdfStream { +impl PdfStream { pub fn new( pos: impl Into, + objects: &Arc, dictionary: PdfStreamDictionary, encoded_data: Arc<[u8]>, ) -> Self { Self { pos: pos.into(), - dictionary: Arc::new(OnceLock::from(dictionary)), + objects: Arc::downgrade(objects), + dictionary, encoded_data, decoded_data: Arc::new(OnceLock::new()), } } - pub(crate) fn new_unparsed( - pos: impl Into, - unparsed_dictionary: PdfDictionary, - encoded_data: Arc<[u8]>, - ) -> (Self, UnparsedPdfStreamDictionary) { - let dictionary = Arc::new(OnceLock::new()); - ( - Self { - pos: pos.into(), - dictionary: dictionary.clone(), - 
encoded_data, - decoded_data: Arc::new(OnceLock::new()), - }, - UnparsedPdfStreamDictionary { - unparsed_dictionary, - dictionary, - }, - ) - } pub fn dictionary(&self) -> &PdfStreamDictionary { - self.dictionary - .get() - .expect("haven't finished parsing all pdf object definitions yet") + &self.dictionary } pub fn encoded_data(&self) -> &Arc<[u8]> { &self.encoded_data } - fn try_decode_data(&self) -> Result, PdfParseError> { + fn try_decode_data(&self) -> Result { + let Some(objects) = self.objects.upgrade() else { + panic!("PdfObjects is no longer available"); + }; let dictionary = self.dictionary(); let (data, filters) = if let Some(file) = &dictionary.file { todo!() @@ -1618,7 +1849,7 @@ impl PdfStream { (&self.encoded_data, dictionary.filters_and_parms()) }; if filters.len() == 0 { - return Ok(data.clone()); + return Data::parse_arc(data.clone(), self.pos.0, objects); } let mut data: &[u8] = data; let mut buffer; @@ -1626,26 +1857,26 @@ impl PdfStream { buffer = filter.decode_stream_data(filter_parms.clone(), self.pos.0, &data)?; data = &buffer; } - Ok(Arc::from(data)) + Data::parse(data, self.pos.0, objects) } - pub fn decoded_data(&self) -> &Result, PdfParseError> { + pub fn decoded_data(&self) -> &Result { self.decoded_data.get_or_init(|| self.try_decode_data()) } } -impl GetPdfInputPosition for PdfStream { +impl GetPdfInputPosition for PdfStream { fn get_pdf_input_position(&self) -> PdfInputPosition { self.pos.0 } } -impl IsPdfNull for PdfStream { +impl IsPdfNull for PdfStream { fn is_pdf_null(&self) -> bool { false } } -impl PdfParse for PdfStream { +impl PdfParse for PdfStream { fn type_name() -> Cow<'static, str> { if TypeId::of::() == TypeId::of::() { Cow::Borrowed("stream") @@ -1655,38 +1886,56 @@ impl PdfParse for PdfStream { } fn parse(object: PdfObject) -> Result { match PdfObjectDirect::from(object) { - PdfObjectDirect::Stream(stream) => Ok(PdfStream { - pos: stream.pos, - dictionary: if let Some(dictionary) = ::downcast_ref::< - Arc>>, - 
>(&stream.dictionary) - { - dictionary.clone() - } else { - let PdfStreamDictionary { - len, - filters, - decode_parms, - file, - file_filters, - file_decode_parms, - decoded_len, - rest, - } = stream.dictionary(); - Arc::new(OnceLock::from(PdfStreamDictionary { - len: *len, - filters: filters.clone(), - decode_parms: decode_parms.clone(), - file: file.clone(), - file_filters: file_filters.clone(), - file_decode_parms: file_decode_parms.clone(), - decoded_len: *decoded_len, - rest: Rest::parse(rest.clone().into())?, - })) - }, - encoded_data: stream.encoded_data, - decoded_data: stream.decoded_data, - }), + PdfObjectDirect::Stream(stream) => { + Ok(PdfStream { + pos: stream.pos, + dictionary: { + let PdfStreamDictionary { + len, + filters, + decode_parms, + file, + file_filters, + file_decode_parms, + decoded_len, + rest, + } = stream.dictionary; + PdfStreamDictionary { + len, + filters, + decode_parms, + file, + file_filters, + file_decode_parms, + decoded_len, + rest: Rest::parse(rest.into())?, + } + }, + encoded_data: stream.encoded_data, + decoded_data: if let Some(decoded_data) = + ::downcast_ref(&stream.decoded_data) + { + Arc::clone(decoded_data) + } else { + let Some(objects) = stream.objects.upgrade() else { + panic!("PdfObjects is no longer available"); + }; + Arc::new( + stream + .decoded_data + .get() + .cloned() + .map(|data| { + OnceLock::from(data.and_then(|data| { + Data::parse_arc(data, stream.pos.0, objects) + })) + }) + .unwrap_or_default(), + ) + }, + objects: stream.objects, + }) + } object => Err(PdfParseError::InvalidType { pos: object.get_pdf_input_position(), ty: object.type_name(), diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs index 8e5a7fc..95e58ac 100644 --- a/src/pdf/parse.rs +++ b/src/pdf/parse.rs @@ -1,6 +1,9 @@ -use crate::pdf::object::{ - IsPdfNull, MaybeArray, PdfInteger, PdfName, PdfNull, PdfNumber, PdfObject, PdfObjectDirect, - PdfObjectIdentifier, PdfObjectIndirect, PdfObjectNonNull, +use crate::pdf::{ + 
content_stream::{PdfOperator, PdfOperatorUnparsed}, + object::{ + IsPdfNull, MaybeArray, PdfInteger, PdfName, PdfNull, PdfNumber, PdfObject, PdfObjectDirect, + PdfObjectIdentifier, PdfObjectIndirect, PdfObjectNonNull, + }, }; use std::{any::Any, borrow::Cow, fmt, mem, num::NonZero, sync::Arc}; @@ -265,9 +268,20 @@ pub enum PdfParseError { filter: PdfName, error: String, }, - ObjectStreamParseError { - stream_pos: PdfInputPosition, - error: Arc, + StreamNotAllowedHere { + pos: PdfInputPosition, + }, + OperatorNotAllowedHere { + operator: PdfOperatorUnparsed, + }, + MissingOperator { + pos: PdfInputPosition, + }, + OperatorHasTooFewOperands { + operator: PdfOperator, + }, + OperatorHasTooManyOperands { + operator: PdfOperator, }, } @@ -313,9 +327,11 @@ impl GetPdfInputPosition for PdfParseError { | PdfParseError::InvalidStartXRefValue { pos, .. } | PdfParseError::UnknownStreamFilter { pos, .. } | PdfParseError::StreamFilterError { pos, .. } - | PdfParseError::ObjectStreamParseError { - stream_pos: pos, .. 
- } => pos, + | PdfParseError::StreamNotAllowedHere { pos } + | PdfParseError::MissingOperator { pos } => pos, + PdfParseError::OperatorNotAllowedHere { ref operator } => operator.pos(), + PdfParseError::OperatorHasTooFewOperands { ref operator } + | PdfParseError::OperatorHasTooManyOperands { ref operator } => operator.pos(), } } } @@ -443,12 +459,33 @@ impl fmt::Display for PdfParseError { } => { write!(f, "at {pos}: stream filter {filter} error: {error}") } - PdfParseError::ObjectStreamParseError { - stream_pos, - ref error, - } => { - write!(f, "at {stream_pos}: object stream error: ")?; - error.fmt(f) + PdfParseError::StreamNotAllowedHere { pos } => { + write!(f, "at {pos}: stream not allowed here") + } + PdfParseError::OperatorNotAllowedHere { ref operator } => { + write!( + f, + "at {}: operator not allowed here: {}", + operator.pos(), + operator.bytes_debug() + ) + } + PdfParseError::MissingOperator { pos } => { + write!(f, "at {pos}: stream not allowed here") + } + PdfParseError::OperatorHasTooFewOperands { ref operator } => { + write!( + f, + "at {}: operator has too few operands: {operator:?}", + operator.pos(), + ) + } + PdfParseError::OperatorHasTooManyOperands { ref operator } => { + write!( + f, + "at {}: operator has too many operands: {operator:?}", + operator.pos(), + ) } } } From 9445599850b5458c0ad9642a42d60f022462bfda Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Mon, 29 Dec 2025 03:18:28 -0800 Subject: [PATCH 07/42] WIP adding rendering --- Cargo.lock | 52 ++ Cargo.toml | 1 + src/pdf.rs | 192 +++++- src/pdf/content_stream.rs | 48 +- src/pdf/document_structure.rs | 297 +++++++++- src/pdf/font.rs | 274 ++++++++- src/pdf/object.rs | 205 +++++-- src/pdf/parse.rs | 74 ++- src/pdf/render.rs | 1054 +++++++++++++++++++++++++++++++++ src/util.rs | 221 +++++++ 10 files changed, 2271 insertions(+), 147 deletions(-) create mode 100644 src/pdf/render.rs diff --git a/Cargo.lock b/Cargo.lock index 07f112f..10be7b1 100644 --- a/Cargo.lock +++ 
b/Cargo.lock @@ -23,6 +23,37 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "flate2" version = "1.1.5" @@ -48,6 +79,27 @@ name = "parse_powerisa_pdf" version = "0.1.0" dependencies = [ "flate2", + "rayon", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index c5d18eb..20ecf46 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,4 +6,5 @@ license = "LGPL-3.0-or-later" [dependencies] flate2 = "1.1.5" +rayon = "1.11.0" diff --git a/src/pdf.rs b/src/pdf.rs index 1933489..be57b31 100644 --- a/src/pdf.rs +++ b/src/pdf.rs @@ -5,22 +5,23 @@ use crate::{ object::{ 
PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream, - PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary, + PdfStreamDictionary, PdfString, }, parse::{ GetPdfInputPosition, PdfInputPosition, PdfInputPositionKnown, PdfParse, PdfParseError, }, }, pdf_parse, - util::ArcOrRef, + util::{ArcOrRef, DagDebugState}, }; use std::{ - collections::BTreeMap, + any::{Any, TypeId}, + collections::{BTreeMap, HashMap}, convert::Infallible, fmt, num::NonZero, str::FromStr, - sync::{Arc, OnceLock}, + sync::{Arc, Mutex, OnceLock}, }; pub mod content_stream; @@ -28,10 +29,100 @@ pub mod document_structure; pub mod font; pub mod object; pub mod parse; +pub mod render; pub mod stream_filters; +struct ParseCache { + parse_results: HashMap>, + steps_till_next_gc: usize, +} + +impl Default for ParseCache { + fn default() -> Self { + Self { + parse_results: HashMap::new(), + steps_till_next_gc: 1, + } + } +} + +impl ParseCache { + fn gc(&mut self) { + if self.steps_till_next_gc == 0 { + self.parse_results.retain(|_k, v| v.strong_count() > 0); + let mut adjusted_len = self.parse_results.len(); + if adjusted_len < 10 { + adjusted_len = 10; + } + self.steps_till_next_gc = adjusted_len.saturating_mul(20); + } else { + self.steps_till_next_gc -= 1; + } + } + fn get(&mut self) -> Option> { + self.gc(); + let Ok(retval) = self + .parse_results + .get(&TypeId::of::())? + .upgrade()? 
+ .downcast() + else { + unreachable!(); + }; + Some(retval) + } + fn get_or_insert( + &mut self, + value: Arc, + ) -> (Arc, impl Sized + use) { + use std::collections::hash_map::Entry; + self.gc(); + match self.parse_results.entry(TypeId::of::()) { + Entry::Occupied(mut entry) => { + if let Some(retval) = entry.get().upgrade() { + let Ok(retval) = retval.downcast::() else { + unreachable!(); + }; + (retval, Some(value)) + } else { + entry.insert(Arc::::downgrade(&value)); + (value, None) + } + } + Entry::Vacant(entry) => { + entry.insert(Arc::::downgrade(&value)); + (value, None) + } + } + } +} + +struct PdfObjectAndParseCache { + object: PdfObject, + parse_cache: Mutex, +} + +impl PdfObjectAndParseCache { + fn new(object: PdfObject) -> Self { + Self { + object, + parse_cache: Mutex::default(), + } + } + fn parse_cache_get(&self) -> Option> { + self.parse_cache.lock().expect("not poisoned").get() + } + fn parse_cache_get_or_insert(&self, value: Arc) -> Arc { + let mut parse_cache = self.parse_cache.lock().expect("not poisoned"); + let (retval, to_drop_after_unlock) = parse_cache.get_or_insert(value); + drop(parse_cache); + drop(to_drop_after_unlock); + retval + } +} + struct PdfObjectsInner { - objects: BTreeMap, + objects: BTreeMap, #[allow(dead_code)] object_streams: Vec>, } @@ -52,7 +143,7 @@ impl PdfHeader { pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfTrailerDictionary { #[pdf(name = "Size")] pub size: usize, @@ -71,6 +162,31 @@ pdf_parse! { } } +impl fmt::Debug for PdfTrailerDictionary { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { + size, + prev, + root, + encrypt, + info, + id, + rest, + } = self; + f.debug_struct("PdfTrailerDictionary") + .field("size", size) + .field("prev", prev) + .field("root", root) + .field("encrypt", encrypt) + .field("info", info) + .field("id", id) + .field("rest", rest) + .finish() + }) + } +} + pdf_parse! 
{ #[pdf(name)] #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] @@ -83,7 +199,7 @@ pdf_parse! { pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfXRefStreamDictionaryRest { #[pdf(name = "Type")] pub ty: PdfXRefName, @@ -96,7 +212,21 @@ pdf_parse! { } } -#[derive(Clone, Debug)] +impl fmt::Debug for PdfXRefStreamDictionaryRest { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { ty, index, w, rest } = self; + f.debug_struct("PdfXRefStreamDictionaryRest") + .field("ty", ty) + .field("index", index) + .field("w", w) + .field("rest", rest) + .finish() + }) + } +} + +#[derive(Clone)] pub enum PdfTrailer { Trailer { trailer_dictionary: PdfTrailerDictionary, @@ -108,6 +238,29 @@ pub enum PdfTrailer { }, } +impl fmt::Debug for PdfTrailer { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| match self { + Self::Trailer { + trailer_dictionary, + start_xref, + } => f + .debug_struct("Trailer") + .field("trailer_dictionary", trailer_dictionary) + .field("start_xref", start_xref) + .finish(), + Self::Stream { + xref_stream, + start_xref, + } => f + .debug_struct("Stream") + .field("xref_stream", xref_stream) + .field("start_xref", start_xref) + .finish(), + }) + } +} + impl PdfTrailer { pub fn trailer_dictionary(&self) -> &PdfTrailerDictionary { match self { @@ -722,7 +875,7 @@ impl<'a> PdfParser<'a> { struct PdfFileParser<'a> { parser: PdfParser<'a>, - objects_map: BTreeMap, + objects_map: BTreeMap, } impl<'a> PdfFileParser<'a> { @@ -823,7 +976,11 @@ impl<'a> PdfFileParser<'a> { let Some(PdfToken::Regular(b"endobj")) = self.parser.tokenizer.next() else { return Err(PdfParseError::MissingEndObj { pos: end_obj_pos }); }; - if self.objects_map.insert(id, object).is_some() { + if self + .objects_map + .insert(id, PdfObjectAndParseCache::new(object)) + .is_some() + { Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: 
id.pos.0, id }) } else { Ok(Some(())) @@ -882,7 +1039,11 @@ impl<'a> PdfFileParser<'a> { let object = parser .parse_object_or_operator()? .error_on_stream_or_operator()?; - if self.objects_map.insert(id, object).is_some() { + if self + .objects_map + .insert(id, PdfObjectAndParseCache::new(object)) + .is_some() + { return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id }); } } @@ -892,7 +1053,7 @@ impl<'a> PdfFileParser<'a> { while let Some(()) = self.parse_indirect_object_definition()? {} let mut object_streams: Vec> = Vec::new(); for object in self.objects_map.values_mut() { - let stream = match object { + let stream = match &object.object { PdfObject::Stream(stream) => stream, PdfObject::Boolean(_) | PdfObject::Integer(_) @@ -907,7 +1068,7 @@ impl<'a> PdfFileParser<'a> { if PdfObjectStreamDictionary::parse_type_from_dictionary(&stream.dictionary().rest) .is_ok() { - object_streams.push(PdfStream::parse(object.clone())?); + object_streams.push(PdfStream::parse(object.object.clone())?); } } for object_stream in &object_streams { @@ -1012,11 +1173,6 @@ impl<'a> PdfFileParser<'a> { self.parse_body()?; self.parse_xref_table()?; let trailer = self.parse_trailer()?; - for page in trailer.trailer_dictionary().root.pages.pages().iter() { - for content in page.contents.iter() { - content.decoded_data().as_ref()?; - } - } Ok(Pdf { header, objects: self.parser.objects, diff --git a/src/pdf/content_stream.rs b/src/pdf/content_stream.rs index 2552df7..79764c2 100644 --- a/src/pdf/content_stream.rs +++ b/src/pdf/content_stream.rs @@ -10,6 +10,10 @@ use crate::{ GetPdfInputPosition, PdfInputPosition, PdfInputPositionKnown, PdfInputPositionNoCompare, PdfParse, PdfParseError, }, + render::{ + PdfColorDeviceGray, PdfColorDeviceRgb, PdfRenderOperator, PdfRenderState, + PdfRenderingIntent, + }, }, util::ArcOrRef, }; @@ -259,6 +263,18 @@ macro_rules! 
make_pdf_operator_enum { } } } + + impl PdfRenderOperator for $PdfOperatorAndOperands { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + match self { + Self::$Unknown { + operands, + operator, + } => state.handle_unknown_operator(operator, operands), + $(Self::$Variant(v) => <$VariantStruct as PdfRenderOperator>::render(v, state),)* + } + } + } }; ( @impl_variant_parse @@ -459,13 +475,13 @@ make_pdf_operator_enum! { SetStrokeGray(PdfOperatorSetStrokeGray { pos: PdfInputPositionNoCompare, #[parse(gray)] - gray: f32, + gray: PdfColorDeviceGray, }), #[kw = b"g"] SetNonStrokeGray(PdfOperatorSetNonStrokeGray { pos: PdfInputPositionNoCompare, #[parse(gray)] - gray: f32, + gray: PdfColorDeviceGray, }), #[kw = b"gs"] SetGraphicsState(PdfOperatorSetGraphicsState { @@ -570,28 +586,20 @@ make_pdf_operator_enum! { #[kw = b"RG"] SetStrokeRgb(PdfOperatorSetStrokeRgb { pos: PdfInputPositionNoCompare, - #[parse(r)] - r: f32, - #[parse(g)] - g: f32, - #[parse(b)] - b: f32, + #[parse_flat(r, g, b)] + color: PdfColorDeviceRgb, }), #[kw = b"rg"] SetNonStrokeRgb(PdfOperatorSetNonStrokeRgb { pos: PdfInputPositionNoCompare, - #[parse(r)] - r: f32, - #[parse(g)] - g: f32, - #[parse(b)] - b: f32, + #[parse_flat(r, g, b)] + color: PdfColorDeviceRgb, }), #[kw = b"ri"] SetColorRenderingIntent(PdfOperatorSetColorRenderingIntent { pos: PdfInputPositionNoCompare, #[parse(intent)] - intent: PdfName, + intent: PdfRenderingIntent, }), #[kw = b"s"] CloseAndStrokePath(PdfOperatorCloseAndStrokePath { @@ -760,11 +768,19 @@ impl GetPdfInputPosition for PdfOperatorAndOperands { } } -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct PdfContentStreamData { pub operators: Arc<[PdfOperatorAndOperands]>, } +impl fmt::Debug for PdfContentStreamData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PdfContentStreamData") + .field("operators", &self.operators) + .finish() + } +} + impl PdfStreamContents for PdfContentStreamData { fn parse( data: 
&[u8], diff --git a/src/pdf/document_structure.rs b/src/pdf/document_structure.rs index 13c0de3..268d503 100644 --- a/src/pdf/document_structure.rs +++ b/src/pdf/document_structure.rs @@ -1,16 +1,19 @@ -use core::fmt; -use std::{borrow::Cow, sync::Arc}; - -use crate::pdf::{ - content_stream::PdfContentStream, - font::PdfFont, - object::{ - IsPdfNull, MaybeArray, PdfDate, PdfDictionary, PdfInteger, PdfName, PdfObject, - PdfObjectDirect, PdfObjectIndirect, PdfRectangle, PdfStream, PdfString, +use crate::{ + pdf::{ + content_stream::PdfContentStream, + font::PdfFont, + object::{ + IsPdfNull, MaybeArray, PdfDate, PdfDictionary, PdfInteger, PdfName, PdfObject, + PdfObjectDirect, PdfObjectIndirect, PdfRectangle, PdfStream, PdfString, + }, + parse::{PdfParse, PdfParseError}, + pdf_parse, + render::{PdfRenderOperator, PdfRenderState}, }, - parse::{PdfParse, PdfParseError}, - pdf_parse, + util::DagDebugState, }; +use rayon::iter::{FromParallelIterator, IntoParallelIterator, ParallelIterator}; +use std::{borrow::Cow, fmt, sync::Arc}; pdf_parse! { #[pdf(name)] @@ -24,7 +27,7 @@ pdf_parse! { pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfDocumentCatalog { #[pdf(name = "Type")] pub ty: PdfDocumentCatalogType, @@ -40,6 +43,27 @@ pdf_parse! { } } +impl fmt::Debug for PdfDocumentCatalog { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { + ty, + version, + extensions, + pages, + rest, + } = self; + f.debug_struct("PdfDocumentCatalog") + .field("ty", ty) + .field("version", version) + .field("extensions", extensions) + .field("pages", pages) + .field("rest", rest) + .finish() + }) + } +} + pdf_parse! 
{ #[pdf] #[derive(Clone, Debug)] @@ -58,14 +82,15 @@ pub struct PdfPageTree { } impl PdfPageTree { - fn parse_pages(node: &PdfPageTreeNode, pages: &mut Vec) -> Result<(), PdfParseError> { + fn collect_leaves( + node: &PdfPageTreeNode, + leaves: &mut Vec, + ) -> Result<(), PdfParseError> { for kid in node.kids.iter() { match kid { - PdfPageTreeNodeOrLeaf::Node(node) => Self::parse_pages(node, pages)?, + PdfPageTreeNodeOrLeaf::Node(node) => Self::collect_leaves(node, leaves)?, PdfPageTreeNodeOrLeaf::Leaf(leaf) => { - pages.push(PdfPage::parse_after_propagating_inheritable_data( - leaf.clone(), - )?); + leaves.push(leaf.clone()); } PdfPageTreeNodeOrLeaf::Other(v) => { return Err(PdfParseError::InvalidType { @@ -80,11 +105,16 @@ impl PdfPageTree { } pub fn try_from_page_tree_root(mut page_tree: PdfPageTreeNode) -> Result { page_tree.propagate_inheritable_data_to_leaves(); - let mut pages = Vec::new(); - Self::parse_pages(&page_tree, &mut pages)?; + let mut leaves = Vec::new(); + Self::collect_leaves(&page_tree, &mut leaves)?; Ok(Self { page_tree, - pages: Arc::from(pages), + pages: Result::from_par_iter( + leaves + .into_par_iter() + .map(PdfPage::parse_after_propagating_inheritable_data) + .panic_fuse(), + )?, }) } pub fn page_tree(&self) -> &PdfPageTreeNode { @@ -97,9 +127,15 @@ impl PdfPageTree { impl fmt::Debug for PdfPageTree { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("PdfPageTree") - .field("pages", &self.pages) - .finish_non_exhaustive() + DagDebugState::scope(|_state| { + let Self { + page_tree: _, + pages, + } = self; + f.debug_struct("PdfPageTree") + .field("pages", pages) + .finish_non_exhaustive() + }) } } @@ -120,7 +156,7 @@ impl PdfParse for PdfPageTree { pdf_parse! { #[pdf] - #[derive(Clone, Debug, Default)] + #[derive(Clone, Default, Debug)] pub struct PdfPageInheritableData { #[pdf(name = "Resources")] pub resources: Option, @@ -168,7 +204,7 @@ pdf_parse! { pdf_parse! 
{ #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfPageTreeNode { #[pdf(name = "Type")] pub ty: PdfPageTreeNodeType, @@ -184,6 +220,27 @@ pdf_parse! { } } +impl fmt::Debug for PdfPageTreeNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + DagDebugState::scope(|_state| { + let Self { + ty, + parent, + kids, + count, + inheritable, + } = self; + f.debug_struct("PdfPageTreeNode") + .field("ty", ty) + .field("parent", parent) + .field("kids", kids) + .field("count", count) + .field("inheritable", inheritable) + .finish() + }) + } +} + impl PdfPageTreeNode { pub fn propagate_inheritable_data_to_leaves(&mut self) { for kid in Arc::make_mut(&mut self.kids) { @@ -222,7 +279,7 @@ pdf_parse! { pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfPageTreeLeaf { #[pdf(name = "Type")] pub ty: PdfPageType, @@ -281,6 +338,74 @@ pdf_parse! { } } +impl fmt::Debug for PdfPageTreeLeaf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + DagDebugState::scope(|_state| { + let Self { + ty, + parent, + last_modified, + bleed_box, + trim_box, + art_box, + box_color_info, + contents, + group, + thumbnail, + beads, + duration, + transition, + annotations, + additional_actions, + metadata, + piece_info, + structural_parents, + parent_web_capture_content_set_id, + preferred_zoom_factor, + separation_info, + annotations_tab_order, + template_instantiated, + pres_steps, + user_unit, + viewports, + inheritable, + } = self; + f.debug_struct("PdfPageTreeLeaf") + .field("ty", ty) + .field("parent", parent) + .field("last_modified", last_modified) + .field("bleed_box", bleed_box) + .field("trim_box", trim_box) + .field("art_box", art_box) + .field("box_color_info", box_color_info) + .field("contents", contents) + .field("group", group) + .field("thumbnail", thumbnail) + .field("beads", beads) + .field("duration", duration) + .field("transition", transition) + .field("annotations", annotations) + 
.field("additional_actions", additional_actions) + .field("metadata", metadata) + .field("piece_info", piece_info) + .field("structural_parents", structural_parents) + .field( + "parent_web_capture_content_set_id", + parent_web_capture_content_set_id, + ) + .field("preferred_zoom_factor", preferred_zoom_factor) + .field("separation_info", separation_info) + .field("annotations_tab_order", annotations_tab_order) + .field("template_instantiated", template_instantiated) + .field("pres_steps", pres_steps) + .field("user_unit", user_unit) + .field("viewports", viewports) + .field("inheritable", inheritable) + .finish() + }) + } +} + pdf_parse! { #[pdf(tag = "Type")] #[derive(Clone)] @@ -377,7 +502,7 @@ impl PdfParse for PdfPageRotation { } } -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct PdfPage { pub ty: PdfPageType, pub parent: PdfObjectIndirect, @@ -410,9 +535,16 @@ pub struct PdfPage { pub user_unit: f32, pub viewports: Option>, pub rest: PdfDictionary, + rendered_objects: Option, } impl PdfPage { + pub fn rendered_objects(&self) -> &PdfPageRenderedObjects { + let Some(retval) = &self.rendered_objects else { + unreachable!(); + }; + retval + } pub fn parse_after_propagating_inheritable_data( leaf: PdfPageTreeLeaf, ) -> Result { @@ -465,7 +597,7 @@ impl PdfPage { })?; let crop_box = crop_box.unwrap_or(media_box); let rotate = rotate.unwrap_or(PdfPageRotation::NoRotation); - Ok(Self { + let mut retval = Self { ty, parent, last_modified, @@ -497,6 +629,115 @@ impl PdfPage { user_unit: user_unit.unwrap_or(1.0), viewports, rest, + rendered_objects: None, + }; + retval.rendered_objects = Some(PdfPageRenderedObjects::render_page(&retval)?); + Ok(retval) + } +} + +impl fmt::Debug for PdfPage { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { + ty, + parent, + last_modified, + resources, + media_box, + crop_box, + bleed_box, + trim_box, + art_box, + box_color_info, + contents, + rotate, + group, + thumbnail, + 
beads, + duration, + transition, + annotations, + additional_actions, + metadata, + piece_info, + structural_parents, + parent_web_capture_content_set_id, + preferred_zoom_factor, + separation_info, + annotations_tab_order, + template_instantiated, + pres_steps, + user_unit, + viewports, + rest, + rendered_objects, + } = self; + struct Unparsed; + impl fmt::Debug for Unparsed { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("") + } + } + f.debug_struct("PdfPage") + .field("ty", ty) + .field("parent", parent) + .field("last_modified", last_modified) + .field("resources", resources) + .field("media_box", media_box) + .field("crop_box", crop_box) + .field("bleed_box", bleed_box) + .field("trim_box", trim_box) + .field("art_box", art_box) + .field("box_color_info", box_color_info) + .field("contents", contents) + .field("rotate", rotate) + .field("group", group) + .field("thumbnail", thumbnail) + .field("beads", beads) + .field("duration", duration) + .field("transition", transition) + .field("annotations", annotations) + .field("additional_actions", additional_actions) + .field("metadata", metadata) + .field("piece_info", piece_info) + .field("structural_parents", structural_parents) + .field( + "parent_web_capture_content_set_id", + parent_web_capture_content_set_id, + ) + .field("preferred_zoom_factor", preferred_zoom_factor) + .field("separation_info", separation_info) + .field("annotations_tab_order", annotations_tab_order) + .field("template_instantiated", template_instantiated) + .field("pres_steps", pres_steps) + .field("user_unit", user_unit) + .field("viewports", viewports) + .field("rest", rest) + .field( + "rendered_objects", + if let Some(rendered_objects) = rendered_objects { + rendered_objects + } else { + &Unparsed + }, + ) + .finish() }) } } + +#[derive(Clone, Debug)] +pub struct PdfPageRenderedObjects {} + +impl PdfPageRenderedObjects { + fn render_page(page: &PdfPage) -> Result { + let mut state = 
PdfRenderState::new(page); + for content_stream in page.contents.iter() { + for op in content_stream.decoded_data().as_ref()?.operators.iter() { + op.render(&mut state)?; + } + } + Ok(Self {}) + } +} diff --git a/src/pdf/font.rs b/src/pdf/font.rs index bfc52b7..14086f1 100644 --- a/src/pdf/font.rs +++ b/src/pdf/font.rs @@ -1,17 +1,19 @@ -use std::{borrow::Cow, sync::Arc}; - -use crate::pdf::{ - object::{ - IsPdfNull, PdfDictionary, PdfName, PdfObject, PdfObjectDirect, PdfRectangle, PdfStream, - PdfString, +use crate::{ + pdf::{ + object::{ + IsPdfNull, PdfDictionary, PdfName, PdfObject, PdfObjectDirect, PdfRectangle, PdfStream, + PdfString, + }, + parse::{PdfParse, PdfParseError}, + pdf_parse, }, - parse::{PdfParse, PdfParseError}, - pdf_parse, + util::DagDebugState, }; +use std::{borrow::Cow, fmt, sync::Arc}; pdf_parse! { #[pdf(transparent)] - #[derive(Clone, Debug)] + #[derive(Clone)] // TODO: actually parse the stream pub struct PdfFontToUnicode { #[pdf] @@ -19,6 +21,17 @@ pdf_parse! { } } +impl fmt::Debug for PdfFontToUnicode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { stream } = self; + f.debug_struct("PdfFontToUnicode") + .field("stream", stream) + .finish() + }) + } +} + pdf_parse! { #[pdf(name)] #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] @@ -56,7 +69,7 @@ pdf_parse! { pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfFontDescriptor { #[pdf(name = "Type")] pub ty: PdfFontDescriptorType, @@ -107,6 +120,63 @@ pdf_parse! 
{ } } +impl fmt::Debug for PdfFontDescriptor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { + ty, + font_name, + font_family, + font_stretch, + font_weight, + flags, + font_bounding_box, + italic_angle, + ascent, + descent, + leading, + cap_height, + x_height, + stem_v, + stem_h, + avg_width, + max_width, + missing_width, + font_file, + font_file2, + font_file3, + char_set, + rest, + } = self; + f.debug_struct("PdfFontDescriptor") + .field("ty", ty) + .field("font_name", font_name) + .field("font_family", font_family) + .field("font_stretch", font_stretch) + .field("font_weight", font_weight) + .field("flags", flags) + .field("font_bounding_box", font_bounding_box) + .field("italic_angle", italic_angle) + .field("ascent", ascent) + .field("descent", descent) + .field("leading", leading) + .field("cap_height", cap_height) + .field("x_height", x_height) + .field("stem_v", stem_v) + .field("stem_h", stem_h) + .field("avg_width", avg_width) + .field("max_width", max_width) + .field("missing_width", missing_width) + .field("font_file", font_file) + .field("font_file2", font_file2) + .field("font_file3", font_file3) + .field("char_set", char_set) + .field("rest", rest) + .finish() + }) + } +} + pdf_parse! { #[pdf(name)] #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] @@ -117,9 +187,15 @@ pdf_parse! { } } -#[derive(Clone, Debug)] +#[derive(Clone)] pub enum PdfTodo {} +impl fmt::Debug for PdfTodo { + fn fmt(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self {} + } +} + impl IsPdfNull for PdfTodo { fn is_pdf_null(&self) -> bool { match *self {} @@ -138,14 +214,31 @@ impl PdfParse for PdfTodo { pdf_parse! 
{ #[pdf(tag = "Subtype")] - #[derive(Clone, Debug)] + #[derive(Clone)] pub enum PdfFont { #[pdf(tag_value = "Type0")] - Type0(PdfFontType0), + Type0(Arc), #[pdf(tag_value = "Type1")] Type1(PdfFontType1), #[pdf(other)] - Other(PdfTodo), + Other(Arc), + } +} + +impl fmt::Debug for PdfFont { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|state| match self { + PdfFont::Type0(v) => state.debug_or_id(v, "PdfFontType0(...)").fmt(f), + PdfFont::Type1(v) => v.fmt(f), + PdfFont::Other(v) => match **v {}, + }) + } +} + +impl PdfFont { + pub(crate) fn is_vertical_writing_mode(&self) -> bool { + // TODO: + false } } @@ -161,7 +254,7 @@ pdf_parse! { pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfFontType0 { #[pdf(name = "Type")] pub ty: PdfFontType, @@ -182,6 +275,31 @@ pdf_parse! { } } +impl fmt::Debug for PdfFontType0 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { + ty, + subtype, + base_font, + encoding, + descendent_fonts, + to_unicode, + rest, + } = self; + f.debug_struct("PdfFontType0") + .field("ty", ty) + .field("subtype", subtype) + .field("base_font", base_font) + .field("encoding", encoding) + .field("descendent_fonts", descendent_fonts) + .field("to_unicode", to_unicode) + .field("rest", rest) + .finish() + }) + } +} + pdf_parse! { #[pdf(name)] #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] @@ -227,10 +345,19 @@ pdf_parse! 
{ } } -#[derive(Clone, Debug)] +#[derive(Clone)] pub enum PdfFontType1 { - Standard(PdfFontType1Standard), - Other(PdfFontType1Other), + Standard(Arc), + Other(Arc), +} + +impl fmt::Debug for PdfFontType1 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|state| match self { + PdfFontType1::Standard(v) => state.debug_or_id(v, "PdfFontType1Standard(...)").fmt(f), + PdfFontType1::Other(v) => state.debug_or_id(v, "PdfFontType1Other(...)").fmt(f), + }) + } } impl PdfFontType1 { @@ -309,17 +436,17 @@ impl PdfParse for PdfFontType1 { fn parse(object: PdfObject) -> Result { let object = object.into(); let PdfObjectDirect::Dictionary(object) = object else { - return PdfFontType1Other::parse(object.into()).map(Self::Other); + return Arc::::parse(object.into()).map(Self::Other); }; if let Ok(_) = PdfStandardFontName::parse(object.get_or_null(b"BaseFont".as_slice())) { - PdfFontType1Standard::parse(object.into()).map(Self::Standard) + Arc::::parse(object.into()).map(Self::Standard) } else { - PdfFontType1Other::parse(object.into()).map(Self::Other) + Arc::::parse(object.into()).map(Self::Other) } } } -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct PdfFontType1Common { pub ty: PdfFontType, pub subtype: PdfFontType1Subtype, @@ -334,9 +461,42 @@ pub struct PdfFontType1Common { pub rest: PdfDictionary, } +impl fmt::Debug for PdfFontType1Common { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { + ty, + subtype, + name, + base_font, + first_char, + last_char, + widths, + font_descriptor, + encoding, + to_unicode, + rest, + } = self; + f.debug_struct("PdfFontType1Common") + .field("ty", ty) + .field("subtype", subtype) + .field("name", name) + .field("base_font", base_font) + .field("first_char", first_char) + .field("last_char", last_char) + .field("widths", widths) + .field("font_descriptor", font_descriptor) + .field("encoding", encoding) + .field("to_unicode", to_unicode) + 
.field("rest", rest) + .finish() + }) + } +} + pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfFontType1Standard { #[pdf(name = "Type")] pub ty: PdfFontType, @@ -364,6 +524,39 @@ pdf_parse! { } } +impl fmt::Debug for PdfFontType1Standard { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { + ty, + subtype, + name, + base_font, + first_char, + last_char, + widths, + font_descriptor, + encoding, + to_unicode, + rest, + } = self; + f.debug_struct("PdfFontType1Standard") + .field("ty", ty) + .field("subtype", subtype) + .field("name", name) + .field("base_font", base_font) + .field("first_char", first_char) + .field("last_char", last_char) + .field("widths", widths) + .field("font_descriptor", font_descriptor) + .field("encoding", encoding) + .field("to_unicode", to_unicode) + .field("rest", rest) + .finish() + }) + } +} + impl PdfFontType1Standard { pub fn common(&self) -> PdfFontType1Common { let Self { @@ -397,7 +590,7 @@ impl PdfFontType1Standard { pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfFontType1Other { #[pdf(name = "Type")] pub ty: PdfFontType, @@ -425,6 +618,39 @@ pdf_parse! 
{ } } +impl fmt::Debug for PdfFontType1Other { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { + ty, + subtype, + name, + base_font, + first_char, + last_char, + widths, + font_descriptor, + encoding, + to_unicode, + rest, + } = self; + f.debug_struct("PdfFontType1Other") + .field("ty", ty) + .field("subtype", subtype) + .field("name", name) + .field("base_font", base_font) + .field("first_char", first_char) + .field("last_char", last_char) + .field("widths", widths) + .field("font_descriptor", font_descriptor) + .field("encoding", encoding) + .field("to_unicode", to_unicode) + .field("rest", rest) + .finish() + }) + } +} + impl PdfFontType1Other { pub fn common(&self) -> PdfFontType1Common { let Self { diff --git a/src/pdf/object.rs b/src/pdf/object.rs index de3b6da..0931286 100644 --- a/src/pdf/object.rs +++ b/src/pdf/object.rs @@ -1,6 +1,6 @@ use crate::{ pdf::{ - PdfObjects, + PdfObjectAndParseCache, PdfObjects, parse::{ GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse, PdfParseError, @@ -8,7 +8,7 @@ use crate::{ stream_filters::PdfStreamFilter, }, pdf_parse, - util::ArcOrRef, + util::{ArcOrRef, DagDebugState}, }; use std::{ any::TypeId, @@ -892,17 +892,35 @@ impl PdfObjectIndirect { final_id: Arc::new(OnceLock::new()), } } - pub fn get(&self) -> PdfObjectDirect { - let Some(objects) = self.objects.upgrade() else { - panic!("PdfObjects is no longer available"); - }; + pub(crate) fn cache_parse( + &self, + parse_inner: impl FnOnce(PdfObjectDirect) -> Result, E>, + ) -> Result, E> { + self.get_object_and_parse_cache(|object, object_and_parse_cache| { + match object_and_parse_cache { + Some(object_and_parse_cache) => { + if let Some(retval) = object_and_parse_cache.parse_cache_get::() { + println!("cache reused for {object:?}"); + return Ok(retval); + } + parse_inner(object) + .map(|retval| object_and_parse_cache.parse_cache_get_or_insert::(retval)) + } + None => 
parse_inner(object), + } + }) + } + fn get_object_and_parse_cache_inner<'a>( + &self, + objects: &'a PdfObjects, + ) -> (PdfObjectDirect, Option<&'a PdfObjectAndParseCache>) { if let Some(objects) = objects.inner.get() { let final_id = self.final_id.get().copied(); let limit = if final_id.is_some() { 1 } else { 1000usize }; let mut id = final_id.unwrap_or(self.id); for _ in 0..limit { - if let Some(object) = objects.objects.get(&self.id) { - let retval = match object { + if let Some(object_and_parse_cache) = objects.objects.get(&self.id) { + let object = match &object_and_parse_cache.object { PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v), PdfObject::Integer(v) => PdfObjectDirect::Integer(*v), PdfObject::Real(v) => PdfObjectDirect::Real(*v), @@ -919,13 +937,26 @@ impl PdfObjectIndirect { }; // we could be racing with another thread, so set can fail but that's not a problem let _ = self.final_id.set(id); - return retval; + return (object, Some(object_and_parse_cache)); } else { - return PdfObjectDirect::Null(PdfNull::new(id.pos)); + return (PdfNull::new(id.pos).into(), None); } } } - PdfObjectDirect::Null(PdfNull::new(self.pos())) + (PdfNull::new(self.pos()).into(), None) + } + fn get_object_and_parse_cache( + &self, + f: impl FnOnce(PdfObjectDirect, Option<&PdfObjectAndParseCache>) -> R, + ) -> R { + let Some(objects) = self.objects.upgrade() else { + panic!("PdfObjects is no longer available"); + }; + let (object, object_and_parse_cache) = self.get_object_and_parse_cache_inner(&objects); + f(object, object_and_parse_cache) + } + pub fn get(&self) -> PdfObjectDirect { + self.get_object_and_parse_cache(|object, _object_and_parse_cache| object) } pub fn id(&self) -> PdfObjectIdentifier { self.id @@ -1067,9 +1098,17 @@ impl<'a, T> IntoIterator for &'a PdfDictionary { } } -impl fmt::Debug for PdfDictionary { +impl fmt::Debug for PdfDictionary { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_map().entries(self).finish() + 
DagDebugState::scope(|state| { + state + .debug_or_id_with( + &self.fields, + |_, f| f.debug_map().entries(self).finish(), + |f| f.write_str("{...}"), + ) + .fmt(f) + }) } } @@ -1364,6 +1403,31 @@ pub struct PdfMatrix { pub elements: [f32; 6], } +impl PdfMatrix { + pub fn identity(pos: impl Into) -> Self { + Self { + pos: pos.into(), + elements: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0], + } + } + #[must_use] + pub fn mul(self, other: PdfMatrix, new_pos: impl Into) -> Self { + let [la, lb, lc, ld, le, lf] = self.elements; + let [ra, rb, rc, rd, re, rf] = other.elements; + Self { + pos: new_pos.into(), + elements: [ + lb * rc + la * ra, + lb * rd + la * rb, + ld * rc + lc * ra, + ld * rd + lc * rb, + re + lf * rc + le * ra, + rf + lf * rd + le * rb, + ], + } + } +} + impl fmt::Debug for PdfMatrix { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let Self { pos, elements } = *self; @@ -1582,7 +1646,7 @@ impl PdfParse for PdfFileSpecification { pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfStreamDictionary { #[pdf(name = "Length")] pub len: usize, @@ -1603,6 +1667,33 @@ pdf_parse! 
{ } } +impl fmt::Debug for PdfStreamDictionary { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + DagDebugState::scope(|_state| { + let Self { + len, + filters, + decode_parms, + file, + file_filters, + file_decode_parms, + decoded_len, + rest, + } = self; + f.debug_struct("PdfStreamDictionary") + .field("len", len) + .field("filters", filters) + .field("decode_parms", decode_parms) + .field("file", file) + .field("file_filters", file_filters) + .field("file_decode_parms", file_decode_parms) + .field("decoded_len", decoded_len) + .field("rest", rest) + .finish() + }) + } +} + #[derive(Debug, Clone, Default)] pub struct PdfStreamDictionaryFiltersAndParms<'a> { filters: std::iter::Enumerate>, @@ -1697,23 +1788,6 @@ impl PdfStreamDictionary { } } -pub(crate) struct UnparsedPdfStreamDictionary { - unparsed_dictionary: PdfDictionary, - dictionary: Arc>>, -} - -impl UnparsedPdfStreamDictionary { - pub(crate) fn finish_parsing(self) -> Result<(), PdfParseError> { - let Ok(()) = self - .dictionary - .set(PdfParse::parse(self.unparsed_dictionary.into())?) 
- else { - unreachable!(); - }; - Ok(()) - } -} - pub trait PdfStreamContents: Sized + fmt::Debug + 'static { fn parse( data: &[u8], @@ -1786,34 +1860,45 @@ impl fmt::Display for DumpBytes<'_> { impl fmt::Debug for PdfStream { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Self { - pos, - objects: _, - dictionary, - encoded_data, - decoded_data, - } = self; - let mut debug_struct = f.debug_struct("PdfStream"); - debug_struct.field("pos", pos); - debug_struct.field("dictionary", dictionary); - debug_struct.field("encoded_data", &DumpBytes(encoded_data)); - if let Some(decoded_data) = decoded_data.get() { - match decoded_data { - Ok(decoded_data) => { - if let Some(decoded_data) = - ::downcast_ref::>(decoded_data) - { - debug_struct.field("decoded_data", &DumpBytes(&**decoded_data)) - } else { - debug_struct.field("decoded_data", decoded_data) - } - } - Err(e) => debug_struct.field("decoded_data", &Err::<(), _>(e)), - }; - } else { - debug_struct.field("decoded_data", &format_args!("")); - } - debug_struct.finish() + DagDebugState::scope(|state| { + state + .debug_or_id_with( + &self.decoded_data, + |_, f| { + let Self { + pos, + objects: _, + dictionary, + encoded_data, + decoded_data, + } = self; + let mut debug_struct = f.debug_struct("PdfStream"); + debug_struct.field("pos", pos); + debug_struct.field("dictionary", dictionary); + debug_struct.field("encoded_data", &DumpBytes(encoded_data)); + if let Some(decoded_data) = decoded_data.get() { + match decoded_data { + Ok(decoded_data) => { + if let Some(decoded_data) = + ::downcast_ref::>(decoded_data) + { + debug_struct + .field("decoded_data", &DumpBytes(&**decoded_data)) + } else { + debug_struct.field("decoded_data", decoded_data) + } + } + Err(e) => debug_struct.field("decoded_data", &Err::<(), _>(e)), + }; + } else { + debug_struct.field("decoded_data", &format_args!("")); + } + debug_struct.finish() + }, + |f| f.write_str("PdfStream(...)"), + ) + .fmt(f) + }) } } @@ -1957,7 +2042,7 @@ 
pdf_parse! { pdf_parse! { #[pdf] - #[derive(Clone, Debug)] + #[derive(Clone)] pub struct PdfObjectStreamDictionary { #[pdf(name = Self::TYPE_NAME)] pub ty: PdfObjectStreamType, diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs index 95e58ac..4ec885e 100644 --- a/src/pdf/parse.rs +++ b/src/pdf/parse.rs @@ -283,6 +283,19 @@ pub enum PdfParseError { OperatorHasTooManyOperands { operator: PdfOperator, }, + CantRestoreGraphicsStateWithEmptyStack { + pos: PdfInputPosition, + }, + FontResourceNotFound { + pos: PdfInputPosition, + font: PdfName, + }, + MissingBeginTextOperator { + pos: PdfInputPosition, + }, + MissingSetFontOperator { + pos: PdfInputPosition, + }, } impl From for PdfParseError { @@ -328,7 +341,11 @@ impl GetPdfInputPosition for PdfParseError { | PdfParseError::UnknownStreamFilter { pos, .. } | PdfParseError::StreamFilterError { pos, .. } | PdfParseError::StreamNotAllowedHere { pos } - | PdfParseError::MissingOperator { pos } => pos, + | PdfParseError::MissingOperator { pos } + | PdfParseError::CantRestoreGraphicsStateWithEmptyStack { pos } + | PdfParseError::FontResourceNotFound { pos, .. 
} + | PdfParseError::MissingBeginTextOperator { pos } + | PdfParseError::MissingSetFontOperator { pos } => pos, PdfParseError::OperatorNotAllowedHere { ref operator } => operator.pos(), PdfParseError::OperatorHasTooFewOperands { ref operator } | PdfParseError::OperatorHasTooManyOperands { ref operator } => operator.pos(), @@ -487,6 +504,27 @@ impl fmt::Display for PdfParseError { operator.pos(), ) } + PdfParseError::CantRestoreGraphicsStateWithEmptyStack { pos } => { + write!( + f, + "at {pos}: can't restore graphics state when the graphics state stack is empty" + ) + } + PdfParseError::FontResourceNotFound { pos, ref font } => { + write!(f, "at {pos}: font resource not found: {font:?}") + } + PdfParseError::MissingBeginTextOperator { pos } => { + write!( + f, + "at {pos}: missing begin text `BT` operator before this text operator" + ) + } + PdfParseError::MissingSetFontOperator { pos } => { + write!( + f, + "at {pos}: missing set font `Tf` operator before this text showing operator" + ) + } } } } @@ -808,6 +846,40 @@ impl PdfParse for Arc<[T]> { } } +impl IsPdfNull for Arc { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for Arc { + fn type_name() -> Cow<'static, str> { + T::type_name() + } + fn parse(object: PdfObject) -> Result { + if let PdfObject::Indirect(indirect) = object { + indirect.cache_parse(|object| T::parse(object.into()).map(Arc::new)) + } else { + T::parse(object).map(Arc::new) + } + } + fn parse_option(object: PdfObject) -> Result, PdfParseError> { + if let PdfObject::Indirect(indirect) = object { + match indirect.cache_parse(|object| match T::parse_option(object.into()) { + Ok(Some(v)) => Ok(Arc::new(v)), + Ok(None) => Err(None), + Err(e) => Err(Some(e)), + }) { + Ok(v) => Ok(Some(v)), + Err(None) => Ok(None), + Err(Some(e)) => Err(e), + } + } else { + Ok(T::parse_option(object)?.map(Arc::new)) + } + } +} + impl IsPdfNull for MaybeArray { fn is_pdf_null(&self) -> bool { false diff --git a/src/pdf/render.rs b/src/pdf/render.rs 
new file mode 100644 index 0000000..4fb56eb --- /dev/null +++ b/src/pdf/render.rs @@ -0,0 +1,1054 @@ +use crate::{ + pdf::{ + content_stream::{ + PdfOperatorAndOperands, PdfOperatorBeginCompatibilitySection, + PdfOperatorBeginInlineImage, PdfOperatorBeginInlineImageData, + PdfOperatorBeginMarkedContent, PdfOperatorBeginMarkedContentWithProperties, + PdfOperatorBeginText, PdfOperatorClip, PdfOperatorClipEvenOdd, + PdfOperatorCloseAndStrokePath, PdfOperatorCloseFillAndStrokePath, + PdfOperatorCloseFillAndStrokePathEvenOdd, PdfOperatorCloseSubpath, + PdfOperatorConcatMatrix, PdfOperatorCurveTo, PdfOperatorCurveTo13, + PdfOperatorCurveTo23, PdfOperatorDesignateMarkedContentPoint, + PdfOperatorDesignateMarkedContentPointWithProperties, + PdfOperatorEndCompatibilitySection, PdfOperatorEndInlineImage, + PdfOperatorEndMarkedContent, PdfOperatorEndPath, PdfOperatorEndText, + PdfOperatorFillAndStrokePath, PdfOperatorFillAndStrokePathEvenOdd, PdfOperatorFillPath, + PdfOperatorFillPathEvenOdd, PdfOperatorFillPathObsolete, PdfOperatorFontType3SetWidth, + PdfOperatorFontType3SetWidthAndBBox, PdfOperatorLineTo, PdfOperatorMoveTo, + PdfOperatorPaintXObject, PdfOperatorRectangle, PdfOperatorRestoreGraphicsState, + PdfOperatorSaveGraphicsState, PdfOperatorSetCharacterSpacing, + PdfOperatorSetColorRenderingIntent, PdfOperatorSetFlatnessTolerance, + PdfOperatorSetFontAndSize, PdfOperatorSetGraphicsState, PdfOperatorSetLineCapStyle, + PdfOperatorSetLineDashPattern, PdfOperatorSetLineJoinStyle, PdfOperatorSetLineWidth, + PdfOperatorSetMiterLimit, PdfOperatorSetNonStrokeCmyk, PdfOperatorSetNonStrokeColor, + PdfOperatorSetNonStrokeColorSpace, PdfOperatorSetNonStrokeColorWithName, + PdfOperatorSetNonStrokeGray, PdfOperatorSetNonStrokeRgb, + PdfOperatorSetSpacingThenTextNextLineAndShow, PdfOperatorSetStrokeCmyk, + PdfOperatorSetStrokeColor, PdfOperatorSetStrokeColorSpace, + PdfOperatorSetStrokeColorWithName, PdfOperatorSetStrokeGray, PdfOperatorSetStrokeRgb, + 
PdfOperatorSetTextHorizontalScaling, PdfOperatorSetTextLeading, + PdfOperatorSetTextMatrix, PdfOperatorSetTextRenderingMode, PdfOperatorSetTextRise, + PdfOperatorSetWordSpacing, PdfOperatorShade, PdfOperatorShowText, + PdfOperatorShowTextWithGlyphPositioning, PdfOperatorStrokePath, + PdfOperatorTextNextLine, PdfOperatorTextNextLineAndShow, + PdfOperatorTextNextLineWithOffset, PdfOperatorTextNextLineWithOffsetAndLeading, + PdfOperatorUnparsed, + }, + document_structure::{PdfPage, PdfResourcesDictionary}, + font::{PdfFont, PdfTodo}, + object::{ + IsPdfNull, PdfMatrix, PdfName, PdfNumber, PdfObject, PdfObjectDirect, + PdfStringOrNumber, PdfVec2D, + }, + parse::{ + GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse, + PdfParseError, + }, + }, + pdf_parse, +}; +use std::borrow::Cow; + +#[derive(Clone, Debug)] +#[non_exhaustive] +pub struct PdfPath {} + +pdf_parse! { + #[pdf(name)] + #[derive(Clone, Debug)] + pub enum PdfColorSpace { + #[pdf(name = "DeviceGray")] + DeviceGray, + #[pdf(name = "DeviceRGB")] + DeviceRgb, + // TODO: add others + #[pdf(other)] + Unknown(PdfName), + } +} + +pdf_parse! { + #[pdf(name)] + #[derive(Clone, Debug)] + pub enum PdfRenderingIntent { + #[pdf(name = "RelativeColorimetric")] + RelativeColorimetric, + // TODO: add others + #[pdf(other)] + Unknown(PdfName), + } +} + +pdf_parse! 
{ + #[pdf(name)] + #[derive(Clone, Debug)] + pub enum PdfBlendMode { + #[pdf(name = "Normal")] + Normal, + // TODO: add others + #[pdf(other)] + Unknown(PdfName), + } +} + +#[derive(Clone, Copy, PartialEq, PartialOrd)] +pub struct PdfColorDeviceGray { + pos: PdfInputPositionNoCompare, + level: f32, +} + +impl PdfColorDeviceGray { + pub fn pos(self) -> PdfInputPosition { + self.pos.0 + } + pub fn level(self) -> f32 { + self.level + } +} + +impl IsPdfNull for PdfColorDeviceGray { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfColorDeviceGray { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("PdfColorDeviceGray") + } + fn parse(object: PdfObject) -> Result { + let number = PdfNumber::parse(object)?; + Ok(Self { + pos: number.pos().into(), + level: number.as_f32(), + }) + } +} + +impl std::fmt::Debug for PdfColorDeviceGray { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { pos, level } = self; + write!(f, "PdfColorDeviceGray(at {pos}, {level})") + } +} + +#[derive(Clone, Copy, PartialEq, PartialOrd)] +pub struct PdfColorDeviceRgb { + pos: PdfInputPositionNoCompare, + r: f32, + g: f32, + b: f32, +} + +impl PdfColorDeviceRgb { + pub fn pos(self) -> PdfInputPosition { + self.pos.0 + } + pub fn r(self) -> f32 { + self.r + } + pub fn g(self) -> f32 { + self.g + } + pub fn b(self) -> f32 { + self.b + } + pub fn parse_flat(r: PdfObject, g: PdfObject, b: PdfObject) -> Result { + let r = PdfNumber::parse(r)?; + let g = f32::parse(g)?; + let b = f32::parse(b)?; + Ok(Self { + pos: r.pos().into(), + r: r.as_f32(), + g, + b, + }) + } +} + +impl IsPdfNull for PdfColorDeviceRgb { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl std::fmt::Debug for PdfColorDeviceRgb { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { pos, r, g, b } = self; + write!(f, "PdfColorDeviceRgb(at {pos}, {r}, {g}, {b})") + } +} + +#[derive(Clone, Debug)] +pub enum PdfColor { + 
DeviceGray(PdfColorDeviceGray), + DeviceRgb(PdfColorDeviceRgb), + // TODO +} + +#[derive(Clone, Debug)] +pub struct PdfTextObjectState { + pub text_matrix: PdfMatrix, + pub text_line_matrix: PdfMatrix, +} + +impl PdfTextObjectState { + fn require>( + v: Option, + pos: PdfInputPosition, + ) -> Result { + v.ok_or(PdfParseError::MissingBeginTextOperator { pos }) + } +} + +#[derive(Clone, Debug)] +pub struct PdfTextState { + pub char_spacing: f32, + pub word_spacing: f32, + pub horizontal_scaling_percent: f32, + pub leading: f32, + pub font: Option, + pub font_size: f32, + pub rendering_mode: u8, // TODO: replace with enum + pub rise: f32, + pub knockout: bool, + pub text_object: Option, +} + +#[derive(Clone, Debug)] +#[non_exhaustive] +pub struct PdfLineDashPattern { + // TODO +} + +impl PdfLineDashPattern { + pub fn solid() -> Self { + Self {} + } +} + +#[derive(Clone, Debug)] +pub struct PdfGraphicsState { + pub current_transformation_matrix: PdfMatrix, + pub clipping_path: PdfPath, + pub stroking_color: PdfColor, + pub non_stroking_color: PdfColor, + pub text_state: PdfTextState, + pub line_width: f32, + pub line_cap_style: u8, // TODO: replace with enum + pub line_join_style: u8, // TODO: replace with enum + pub miter_limit: f32, + pub line_dash_pattern: PdfLineDashPattern, + pub rendering_intent: PdfRenderingIntent, + pub automatic_stroke_adjustment: bool, + pub current_blend_mode: PdfBlendMode, + pub soft_mask: Option, // TODO: replace with struct + pub stroking_alpha_constant: f32, + pub non_stroking_alpha_constant: f32, + pub alpha_source: bool, +} + +impl PdfGraphicsState { + pub fn text_rendering_matrix(&self, pos: PdfInputPosition) -> Result { + let text_object = PdfTextObjectState::require(self.text_state.text_object.as_ref(), pos)?; + Ok(PdfMatrix { + pos: PdfInputPositionNoCompare::empty(), + elements: [ + self.text_state.font_size * self.text_state.horizontal_scaling_percent * 1e-2, + 0.0, + 0.0, + self.text_state.font_size, + 0.0, + 
self.text_state.rise, + ], + } + .mul(text_object.text_matrix, PdfInputPositionNoCompare::empty()) + .mul( + self.current_transformation_matrix, + text_object.text_matrix.pos, + )) + } + pub fn advance_text_matrix( + &mut self, + pos: PdfInputPosition, + glyph_displacement: PdfVec2D, + position_adjustment: f32, + has_char_spacing: bool, + has_word_spacing: bool, + ) -> Result<(), PdfParseError> { + let text_object = PdfTextObjectState::require(self.text_state.text_object.as_mut(), pos)?; + let (tx, ty) = if self + .text_state + .font + .as_ref() + .ok_or(PdfParseError::MissingSetFontOperator { pos })? + .is_vertical_writing_mode() + { + let mut ty = + (glyph_displacement.y - position_adjustment * 1e-3) * self.text_state.font_size; + if has_char_spacing { + ty += self.text_state.char_spacing; + } + if has_word_spacing { + ty += self.text_state.word_spacing; + } + (0.0, ty) + } else { + let mut tx = + (glyph_displacement.x - position_adjustment * 1e-3) * self.text_state.font_size; + if has_char_spacing { + tx += self.text_state.char_spacing; + } + if has_word_spacing { + tx += self.text_state.word_spacing; + } + (tx * self.text_state.horizontal_scaling_percent * 1e-2, 0.0) + }; + text_object.text_matrix = PdfMatrix { + pos: pos.into(), + elements: [1.0, 0.0, 0.0, 1.0, tx, ty], + } + .mul(text_object.text_matrix, pos); + Ok(()) + } +} + +#[derive(Debug)] +pub struct PdfRenderState<'a> { + pub graphics_state: PdfGraphicsState, + pub graphics_state_stack: Vec, + pub resources: &'a PdfResourcesDictionary, +} + +impl<'a> PdfRenderState<'a> { + pub fn new(page: &'a PdfPage) -> Self { + let pos = page.rest.pos().into(); + Self { + graphics_state: PdfGraphicsState { + current_transformation_matrix: PdfMatrix::identity(pos), + clipping_path: PdfPath {}, + stroking_color: PdfColor::DeviceGray(PdfColorDeviceGray { pos, level: 0.0 }), + non_stroking_color: PdfColor::DeviceGray(PdfColorDeviceGray { pos, level: 0.0 }), + text_state: PdfTextState { + char_spacing: 0.0, + 
word_spacing: 0.0, + horizontal_scaling_percent: 100.0, + leading: 0.0, + font: None, + font_size: 0.0, + rendering_mode: 0, + rise: 0.0, + knockout: true, + text_object: None, + }, + line_width: 1.0, + line_cap_style: 0, + line_join_style: 0, + miter_limit: 10.0, + line_dash_pattern: PdfLineDashPattern::solid(), + rendering_intent: PdfRenderingIntent::RelativeColorimetric, + automatic_stroke_adjustment: false, + current_blend_mode: PdfBlendMode::Normal, + soft_mask: None, + stroking_alpha_constant: 1.0, + non_stroking_alpha_constant: 1.0, + alpha_source: false, + }, + graphics_state_stack: Vec::with_capacity(3), + resources: &page.resources, + } + } + pub fn handle_unknown_operator( + &mut self, + operator: &PdfOperatorUnparsed, + operands: &[PdfObjectDirect], + ) -> Result<(), PdfParseError> { + todo!() + } +} + +pub trait PdfRenderOperator: Into { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError>; +} + +impl PdfRenderOperator for PdfOperatorCloseFillAndStrokePath { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorFillAndStrokePath { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorCloseFillAndStrokePathEvenOdd { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorFillAndStrokePathEvenOdd { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorBeginMarkedContentWithProperties { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + tag, + properties, + } = 
self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorBeginInlineImage { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorBeginMarkedContent { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, tag } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorBeginText { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos } = *self; + state.graphics_state.text_state.text_object = Some(PdfTextObjectState { + text_matrix: PdfMatrix::identity(pos), + text_line_matrix: PdfMatrix::identity(pos), + }); + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorBeginCompatibilitySection { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorCurveTo { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, p1, p2, p3 } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorConcatMatrix { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos, matrix } = *self; + state.graphics_state.current_transformation_matrix = state + .graphics_state + .current_transformation_matrix + .mul(matrix, pos); + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorSetStrokeColorSpace { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, name } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetNonStrokeColorSpace { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, name } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator 
for PdfOperatorSetLineDashPattern { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + dash_array, + dash_phase, + } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorFontType3SetWidth { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, width } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorFontType3SetWidthAndBBox { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, width, bbox } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorPaintXObject { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, name } = self; */ + let _ = state; + // TODO: implement + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorDesignateMarkedContentPointWithProperties { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + tag, + properties, + } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorEndInlineImage { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorEndMarkedContent { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorEndText { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos: _ } = self; + state.graphics_state.text_state.text_object = None; + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorEndCompatibilitySection { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl 
PdfRenderOperator for PdfOperatorFillPath { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorFillPathObsolete { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorFillPathEvenOdd { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetStrokeGray { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos: _, gray } = *self; + state.graphics_state.stroking_color = PdfColor::DeviceGray(gray); + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorSetNonStrokeGray { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos: _, gray } = *self; + state.graphics_state.non_stroking_color = PdfColor::DeviceGray(gray); + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorSetGraphicsState { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + dictionary_name, + } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorCloseSubpath { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetFlatnessTolerance { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, flatness } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorBeginInlineImageData { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl 
PdfRenderOperator for PdfOperatorSetLineJoinStyle { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + line_join_style, + } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetLineCapStyle { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + line_cap_style, + } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetStrokeCmyk { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, c, m, y, k } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetNonStrokeCmyk { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, c, m, y, k } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorLineTo { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, to } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorMoveTo { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, to } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetMiterLimit { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, limit } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorDesignateMarkedContentPoint { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, tag } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorEndPath { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSaveGraphicsState { + fn render(&self, state: &mut 
PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos: _ } = self; + state + .graphics_state_stack + .push(state.graphics_state.clone()); + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorRestoreGraphicsState { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos } = self; + state.graphics_state = state + .graphics_state_stack + .pop() + .ok_or(PdfParseError::CantRestoreGraphicsStateWithEmptyStack { pos: pos.0 })?; + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorRectangle { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, p, size } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetStrokeRgb { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos: _, color } = *self; + state.graphics_state.stroking_color = PdfColor::DeviceRgb(color); + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorSetNonStrokeRgb { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos: _, color } = *self; + state.graphics_state.non_stroking_color = PdfColor::DeviceRgb(color); + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorSetColorRenderingIntent { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, intent } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorCloseAndStrokePath { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorStrokePath { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetStrokeColor { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { 
pos, color } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetNonStrokeColor { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, color } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetStrokeColorWithName { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + color_and_name, + } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetNonStrokeColorWithName { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + color_and_name, + } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorShade { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorTextNextLine { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetCharacterSpacing { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, char_space } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorTextNextLineWithOffset { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { pos, offset } = *self; + let PdfTextObjectState { + text_matrix, + text_line_matrix, + } = PdfTextObjectState::require( + state.graphics_state.text_state.text_object.as_mut(), + pos.0, + )?; + let matrix = PdfMatrix { + pos, + elements: [1.0, 0.0, 0.0, 1.0, offset.x, offset.y], + } + .mul(*text_line_matrix, pos); + *text_line_matrix = matrix; + *text_matrix = matrix; + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorTextNextLineWithOffsetAndLeading { + fn render(&self, 
state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, offset } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetFontAndSize { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { + pos, + ref font, + size, + } = *self; + state.graphics_state.text_state.font = Some( + state + .resources + .fonts + .get(font) + .ok_or_else(|| PdfParseError::FontResourceNotFound { + pos: pos.0, + font: font.clone(), + })? + .clone(), + ); + state.graphics_state.text_state.font_size = size; + Ok(()) + } +} + +impl PdfRenderOperator for PdfOperatorShowText { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, text } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorShowTextWithGlyphPositioning { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + let Self { + pos, + ref text_and_positioning, + } = *self; + let font = state + .graphics_state + .text_state + .font + .as_ref() + .ok_or(PdfParseError::MissingSetFontOperator { pos: pos.0 })?; + let PdfFont::Type1(font) = font else { todo!() }; + let mut positioning = 0.0; + for text_or_positioning in text_and_positioning.iter() { + match text_or_positioning { + PdfStringOrNumber::String(s) => { + for glyph in s.bytes().iter() { + let positioning = std::mem::replace(&mut positioning, 0.0); + let encoding = font.encoding(); + todo!("{encoding:?}"); + } + } + PdfStringOrNumber::Number(number) => positioning = number.as_f32(), + } + } + let _ = state; + todo!("{text_and_positioning:?}") + } +} + +impl PdfRenderOperator for PdfOperatorSetTextLeading { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, leading } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetTextMatrix { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { 
+ /* let Self { pos, matrix } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetTextRenderingMode { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + rendering_mode, + } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetTextRise { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, rise } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetWordSpacing { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, word_space } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetTextHorizontalScaling { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, scale_percent } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorCurveTo23 { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetLineWidth { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, line_width } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorClip { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorClipEvenOdd { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorCurveTo13 { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for 
PdfOperatorTextNextLineAndShow { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { pos, text } = self; */ + let _ = state; + todo!() + } +} + +impl PdfRenderOperator for PdfOperatorSetSpacingThenTextNextLineAndShow { + fn render(&self, state: &mut PdfRenderState) -> Result<(), PdfParseError> { + /* let Self { + pos, + word_space, + char_space, + text, + } = self; */ + let _ = state; + todo!() + } +} diff --git a/src/util.rs b/src/util.rs index a7a4978..1a4440c 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,5 +1,8 @@ use std::{ + any::{Any, TypeId}, borrow::Borrow, + cell::Cell, + collections::HashMap, fmt, hash::{Hash, Hasher}, sync::Arc, @@ -100,3 +103,221 @@ impl fmt::Display for ArcOrRef<'_, T> { T::fmt(self, f) } } + +trait DagDebugStateSealed {} + +#[expect(private_bounds)] +pub trait SupportsDagDebugState: DagDebugStateSealed + 'static + Clone { + type Key: Clone + Hash + Eq + 'static; + fn key(this: &Self) -> Self::Key; +} + +impl DagDebugStateSealed for Arc {} + +impl SupportsDagDebugState for Arc { + type Key = *const T; + + fn key(this: &Self) -> Self::Key { + Arc::as_ptr(this) + } +} + +impl DagDebugStateSealed for Arc<[T]> {} + +impl SupportsDagDebugState for Arc<[T]> { + type Key = *const [T]; + + fn key(this: &Self) -> Self::Key { + Arc::as_ptr(this) + } +} + +impl DagDebugStateSealed for Arc {} + +impl SupportsDagDebugState for Arc { + type Key = *const str; + + fn key(this: &Self) -> Self::Key { + Arc::as_ptr(this) + } +} + +trait DagDebugStatePartTrait: 'static { + fn reset(&mut self); + fn as_any_mut(&mut self) -> &mut dyn Any; +} + +struct DagDebugStatePart { + table: HashMap, + next_id: u64, +} + +impl DagDebugStatePartTrait for DagDebugStatePart { + fn reset(&mut self) { + let Self { table, next_id } = self; + table.clear(); + *next_id = 0; + } + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +impl DagDebugStatePart { + fn insert(&mut self, value: &T) -> DagDebugStateInsertResult { + use 
std::collections::hash_map::Entry; + match self.table.entry(T::key(value)) { + Entry::Occupied(entry) => DagDebugStateInsertResult::Old { id: entry.get().0 }, + Entry::Vacant(entry) => { + let value = T::clone(value); + let id = self.next_id; + self.next_id += 1; + entry.insert((id, value)); + DagDebugStateInsertResult::New { id } + } + } + } +} + +impl Default for DagDebugStatePart { + fn default() -> Self { + Self { + table: HashMap::default(), + next_id: 0, + } + } +} + +pub struct DagDebugState { + parts: std::cell::RefCell>>, + ref_count: Cell, +} + +#[derive(Clone, Copy, Debug)] +pub enum DagDebugStateInsertResult { + New { id: u64 }, + Old { id: u64 }, +} + +impl DagDebugStateInsertResult { + pub fn id(self) -> u64 { + match self { + Self::New { id } | Self::Old { id } => id, + } + } +} + +impl DagDebugState { + fn with_part( + &self, + f: impl FnOnce(&mut DagDebugStatePart) -> R, + ) -> R { + let mut parts = self.parts.borrow_mut(); + let Some(part) = parts + .entry(TypeId::of::>()) + .or_insert_with(|| Box::new(DagDebugStatePart::::default())) + .as_any_mut() + .downcast_mut::>() + else { + unreachable!() + }; + f(part) + } + pub fn insert(&self, value: &T) -> DagDebugStateInsertResult { + self.with_part(|part: &mut DagDebugStatePart| part.insert(value)) + } + pub fn debug_or_id<'a, T: SupportsDagDebugState + fmt::Debug, Abbreviated: fmt::Display>( + &self, + value: &'a T, + abbreviated: Abbreviated, + ) -> impl fmt::Debug + fmt::Display + use<'a, T, Abbreviated> { + self.debug_or_id_with(value, fmt::Debug::fmt, move |f| abbreviated.fmt(f)) + } + pub fn debug_or_id_with< + 'a, + T: SupportsDagDebugState, + DebugValue: Fn(&'a T, &mut fmt::Formatter<'_>) -> fmt::Result, + DebugAbbreviated: Fn(&mut fmt::Formatter<'_>) -> fmt::Result, + >( + &self, + value: &'a T, + debug_value: DebugValue, + debug_abbreviated: DebugAbbreviated, + ) -> impl fmt::Debug + fmt::Display + use<'a, T, DebugValue, DebugAbbreviated> { + struct DebugOrIdWith< + 'a, + T: 
SupportsDagDebugState, + DebugValue: Fn(&'a T, &mut fmt::Formatter<'_>) -> fmt::Result, + DebugAbbreviated: Fn(&mut fmt::Formatter<'_>) -> fmt::Result, + > { + insert_result: DagDebugStateInsertResult, + value: &'a T, + debug_value: DebugValue, + debug_abbreviated: DebugAbbreviated, + } + impl< + 'a, + T: SupportsDagDebugState, + DebugValue: Fn(&'a T, &mut fmt::Formatter<'_>) -> fmt::Result, + DebugAbbreviated: Fn(&mut fmt::Formatter<'_>) -> fmt::Result, + > fmt::Debug for DebugOrIdWith<'a, T, DebugValue, DebugAbbreviated> + { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } + } + impl< + 'a, + T: SupportsDagDebugState, + DebugValue: Fn(&'a T, &mut fmt::Formatter<'_>) -> fmt::Result, + DebugAbbreviated: Fn(&mut fmt::Formatter<'_>) -> fmt::Result, + > fmt::Display for DebugOrIdWith<'a, T, DebugValue, DebugAbbreviated> + { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "#{} ", self.insert_result.id())?; + match self.insert_result { + DagDebugStateInsertResult::New { id: _ } => (self.debug_value)(self.value, f), + DagDebugStateInsertResult::Old { id: _ } => (self.debug_abbreviated)(f), + } + } + } + DebugOrIdWith { + insert_result: self.insert(value), + value, + debug_value, + debug_abbreviated, + } + } + #[must_use] + fn inc_ref_count_scope(&self) -> impl Sized { + struct DecRefCountOnDrop<'a>(&'a DagDebugState); + impl Drop for DecRefCountOnDrop<'_> { + fn drop(&mut self) { + self.0.ref_count.set(self.0.ref_count.get() - 1); + if self.0.ref_count.get() == 0 { + self.0 + .parts + .borrow_mut() + .values_mut() + .for_each(|v| v.reset()); + } + } + } + self.ref_count.set( + self.ref_count + .get() + .checked_add(1) + .expect("too many nested calls"), + ); + DecRefCountOnDrop(self) + } + pub fn scope(f: impl FnOnce(&Self) -> R) -> R { + thread_local! 
{ + static STATE: DagDebugState = DagDebugState { parts: Default::default(), ref_count: Cell::new(0) }; + } + STATE.with(|state| { + let _scope = state.inc_ref_count_scope(); + f(state) + }) + } +} From d7727289ebe2eb366a005100b9d528965b878283 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 30 Dec 2025 07:01:16 -0800 Subject: [PATCH 08/42] parse info from type 1 fonts --- src/pdf/font.rs | 277 ++++++- src/pdf/font/tables.rs | 1067 +++++++++++++++++++++++++ src/pdf/font/type_1_parse.rs | 1423 ++++++++++++++++++++++++++++++++++ src/pdf/object.rs | 75 ++ src/pdf/parse.rs | 2 +- src/pdf/render.rs | 22 +- src/util.rs | 59 ++ 7 files changed, 2900 insertions(+), 25 deletions(-) create mode 100644 src/pdf/font/tables.rs create mode 100644 src/pdf/font/type_1_parse.rs diff --git a/src/pdf/font.rs b/src/pdf/font.rs index 14086f1..04b62f3 100644 --- a/src/pdf/font.rs +++ b/src/pdf/font.rs @@ -1,15 +1,21 @@ use crate::{ pdf::{ object::{ - IsPdfNull, PdfDictionary, PdfName, PdfObject, PdfObjectDirect, PdfRectangle, PdfStream, - PdfString, + IsPdfNull, PdfArray, PdfDictionary, PdfMatrix, PdfName, PdfNameOrInteger, PdfObject, + PdfObjectDirect, PdfRectangle, PdfStream, PdfString, + }, + parse::{ + GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse, + PdfParseError, }, - parse::{PdfParse, PdfParseError}, pdf_parse, }, - util::DagDebugState, + util::{ArcOrRef, DagDebugState}, }; -use std::{borrow::Cow, fmt, sync::Arc}; +use std::{borrow::Cow, collections::BTreeMap, fmt, sync::Arc}; + +mod tables; +mod type_1_parse; pdf_parse! { #[pdf(transparent)] @@ -108,7 +114,7 @@ pdf_parse! 
{ #[pdf(name = "MissingWidth")] pub missing_width: Option, #[pdf(name = "FontFile")] - pub font_file: Option, + pub font_file: Option>, #[pdf(name = "FontFile2")] pub font_file2: Option, #[pdf(name = "FontFile3")] @@ -403,7 +409,7 @@ impl PdfFontType1 { Self::Other(v) => Some(&v.font_descriptor), } } - pub fn encoding(&self) -> &PdfObjectDirect { + pub fn encoding(&self) -> &Option { match self { Self::Standard(v) => &v.encoding, Self::Other(v) => &v.encoding, @@ -435,14 +441,19 @@ impl PdfParse for PdfFontType1 { } fn parse(object: PdfObject) -> Result { let object = object.into(); - let PdfObjectDirect::Dictionary(object) = object else { - return Arc::::parse(object.into()).map(Self::Other); - }; - if let Ok(_) = PdfStandardFontName::parse(object.get_or_null(b"BaseFont".as_slice())) { - Arc::::parse(object.into()).map(Self::Standard) + let font = if let PdfObjectDirect::Dictionary(object) = object { + if let Ok(_) = PdfStandardFontName::parse(object.get_or_null(b"BaseFont".as_slice())) { + Self::Standard(PdfParse::parse(object.into())?) + } else { + Self::Other(PdfParse::parse(object.into())?) + } } else { - Arc::::parse(object.into()).map(Self::Other) + Self::Other(PdfParse::parse(object.into())?) + }; + if let Some(font_file) = font.font_descriptor().and_then(|v| v.font_file.as_ref()) { + font_file.decoded_data().as_ref()?; } + Ok(font) } } @@ -456,7 +467,7 @@ pub struct PdfFontType1Common { pub last_char: Option, pub widths: Option>, pub font_descriptor: Option, - pub encoding: PdfObjectDirect, + pub encoding: Option, pub to_unicode: Option, pub rest: PdfDictionary, } @@ -515,8 +526,7 @@ pdf_parse! { #[pdf(name = "FontDescriptor")] pub font_descriptor: Option, #[pdf(name = "Encoding")] - // TODO - pub encoding: PdfObjectDirect, + pub encoding: Option, #[pdf(name = "ToUnicode")] pub to_unicode: Option, #[pdf(flatten)] @@ -609,8 +619,7 @@ pdf_parse! 
{ #[pdf(name = "FontDescriptor")] pub font_descriptor: PdfFontDescriptor, #[pdf(name = "Encoding")] - // TODO - pub encoding: PdfObjectDirect, + pub encoding: Option, #[pdf(name = "ToUnicode")] pub to_unicode: Option, #[pdf(flatten)] @@ -681,3 +690,235 @@ impl PdfFontType1Other { } } } + +pdf_parse! { + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] + pub enum PdfSimpleFontEncodingPredefined { + #[pdf(name = "MacRomanEncoding")] + MacRomanEncoding, + #[pdf(name = "MacExpertEncoding")] + MacExpertEncoding, + #[pdf(name = "WinAnsiEncoding")] + WinAnsiEncoding, + } +} + +impl PdfSimpleFontEncodingPredefined { + pub const fn table(self) -> PdfSimpleFontEncodingTable { + match self { + Self::MacRomanEncoding => PdfSimpleFontEncodingTable::MAC_ROMAN, + Self::MacExpertEncoding => PdfSimpleFontEncodingTable::MAC_EXPERT, + Self::WinAnsiEncoding => PdfSimpleFontEncodingTable::WIN_ANSI, + } + } +} + +pdf_parse! { + #[pdf(name)] + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Default)] + pub enum PdfSimpleFontEncodingDictionaryType { + #[pdf(name = "Encoding")] + #[default] + Encoding, + } +} + +pdf_parse! 
{ + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfSimpleFontEncodingDictionary { + #[pdf(name = "Type")] + pub ty: Option, + #[pdf(name = "BaseEncoding")] + pub base_encoding: Option, + #[pdf(name = "Differences")] + pub differences: Option, + #[pdf(flatten)] + pub rest: PdfDictionary, + } +} + +impl PdfSimpleFontEncodingDictionary { + pub fn table( + &self, + default_table: impl FnOnce() -> PdfSimpleFontEncodingTable, + ) -> PdfSimpleFontEncodingTable { + let Self { + ty: _, + base_encoding, + differences, + rest: _, + } = self; + let mut retval = base_encoding + .map(|v| v.table()) + .unwrap_or_else(default_table); + if let Some(differences) = differences { + retval = differences.table(retval); + } + retval + } +} + +#[derive(Clone, Debug)] +pub struct PdfSimpleFontEncodingDifferences { + pos: PdfInputPositionNoCompare, + map: Arc>, +} + +impl PdfSimpleFontEncodingDifferences { + pub fn new(pos: impl Into, map: Arc>) -> Self { + Self { + pos: pos.into(), + map, + } + } + pub fn pos(&self) -> PdfInputPosition { + self.pos.0 + } + pub fn map(&self) -> &Arc> { + &self.map + } + pub fn table(&self, base_table: PdfSimpleFontEncodingTable) -> PdfSimpleFontEncodingTable { + let mut retval = base_table; + let table: &mut [_; 0x100] = ArcOrRef::make_mut(&mut retval.table); + for (&byte, name) in self.map.iter() { + table[usize::from(byte)] = PdfSimpleFontEncodingTableEntry { + name: Some(name.clone()), + presumed_unicode: None, + }; + } + retval + } +} + +impl GetPdfInputPosition for PdfSimpleFontEncodingDifferences { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.pos.0 + } +} + +impl IsPdfNull for PdfSimpleFontEncodingDifferences { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfSimpleFontEncodingDifferences { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("PdfSimpleFontEncodingDifferences") + } + fn parse(object: PdfObject) -> Result { + let array = PdfArray::parse(object)?; + let pos = array.pos(); + let mut map = 
BTreeMap::new(); + let mut next_byte = None::; + for i in array.iter() { + let i = PdfNameOrInteger::parse(i.clone())?; + match i { + PdfNameOrInteger::Name(name) => { + let pos = name.pos(); + let byte = next_byte.ok_or(PdfParseError::IntegerOutOfRange { pos })?; + next_byte = byte.checked_add(1); + map.insert(byte, name); + } + PdfNameOrInteger::Integer(v) => next_byte = Some(u8::parse(v.into())?), + } + } + Ok(Self { + pos: pos.into(), + map: Arc::new(map), + }) + } +} + +#[derive(Clone, Default, Debug)] +pub struct PdfSimpleFontEncodingTableEntry { + pub name: Option, + pub presumed_unicode: Option<&'static str>, +} + +impl PdfSimpleFontEncodingTableEntry { + pub const fn new_static( + name: Option<&'static [u8]>, + presumed_unicode: Option<&'static str>, + ) -> Self { + Self { + name: match name { + Some(name) => Some(PdfName::new_static(name)), + None => None, + }, + presumed_unicode, + } + } +} + +#[derive(Clone, Debug)] +pub struct PdfSimpleFontEncodingTable { + pub table: ArcOrRef<'static, [PdfSimpleFontEncodingTableEntry; 0x100]>, +} + +#[derive(Clone, Debug)] +pub enum PdfSimpleFontEncoding { + Predefined(PdfSimpleFontEncodingPredefined), + Dictionary(PdfSimpleFontEncodingDictionary), +} + +impl PdfSimpleFontEncoding { + pub fn table( + &self, + default_table: impl FnOnce() -> PdfSimpleFontEncodingTable, + ) -> PdfSimpleFontEncodingTable { + match self { + PdfSimpleFontEncoding::Predefined(v) => v.table(), + PdfSimpleFontEncoding::Dictionary(v) => v.table(default_table), + } + } +} + +impl IsPdfNull for PdfSimpleFontEncoding { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfSimpleFontEncoding { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("PdfSimpleFontEncoding") + } + fn parse(object: PdfObject) -> Result { + let object = PdfObjectDirect::from(object); + match object { + PdfObjectDirect::Name(v) => Ok(Self::Predefined(PdfParse::parse(v.into())?)), + PdfObjectDirect::Dictionary(v) => 
Ok(Self::Dictionary(PdfParse::parse(v.into())?)), + _ => Err(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "PdfSimpleFontEncoding", + }), + } + } +} + +#[derive(Clone, Debug)] +#[non_exhaustive] +pub struct PdfFontType1Program { + pub encoding: Option]>>, + pub font_bbox: Option, + pub font_info: Option, + pub font_matrix: Option, + pub font_name: Option, +} + +#[derive(Clone, Debug)] +pub struct PdfFontType1FontInfo { + pub family_name: Option, + pub full_name: Option, + pub notice: Option, + pub weight: Option, + pub version: Option, + pub italic_angle: Option, + pub is_fixed_pitch: Option, + pub underline_position: Option, + pub underline_thickness: Option, +} diff --git a/src/pdf/font/tables.rs b/src/pdf/font/tables.rs new file mode 100644 index 0000000..fcb8218 --- /dev/null +++ b/src/pdf/font/tables.rs @@ -0,0 +1,1067 @@ +use crate::{ + pdf::font::{PdfSimpleFontEncodingTable, PdfSimpleFontEncodingTableEntry}, + util::ArcOrRef, +}; + +macro_rules! opt_lit { + (None) => { + None + }; + ($v:literal) => { + Some($v) + }; +} + +macro_rules! array_from_fn_0x100 { + (|$i:ident| $value:expr) => { + array_from_fn_0x100!(@step1 |$i| $value; + [ + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + ]; + [ + 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xA0, 0xB0, 0xC0, 0xD0, 0xE0, 0xF0, + ] + ) + }; + (@step1 |$i:ident| $value:expr; [$($low_digits:literal,)*]; $high_digits:tt) => { + array_from_fn_0x100!(@step2 |$i| $value; [$($low_digits + $high_digits,)*]) + }; + (@step2 |$i:ident| $value:expr; [$($low_digits:literal + [$($high_digits:literal,)*],)*]) => { + [$($({ + let $i = $low_digits + $high_digits; + $value + },)*)*] + }; +} + +macro_rules! builtin_simple_font_encoding_table { + ( + $vis:vis const $const_name:ident; + $(#[default = ($default_name:literal, $default_presumed_unicode:literal)])? 
+ [ + $(($byte:literal, $name:tt, $presumed_unicode:tt),)* + ] + ) => { + impl PdfSimpleFontEncodingTable { + $vis const $const_name: Self = { + const TABLE: [PdfSimpleFontEncodingTableEntry; 0x100] = { + #[allow(unused_mut, unused_assignments)] + const DEFAULT: (Option<&[u8]>, Option<&str>) = { + let mut value: (Option<&[u8]>, Option<&str>) = (None, None); + $(value = (Some($default_name), Some($default_presumed_unicode));)? + value + }; + let mut encoding = [DEFAULT; 0x100]; + $(encoding[$byte] = (opt_lit!($name), opt_lit!($presumed_unicode));)* + array_from_fn_0x100!(|i| PdfSimpleFontEncodingTableEntry::new_static(encoding[i].0, encoding[i].1)) + }; + Self { table: ArcOrRef::Ref(&TABLE)} + }; + } + }; +} + +builtin_simple_font_encoding_table! { + pub const MAC_ROMAN; + [ + (0o040, b"space", " "), + (0o041, b"exclam", "!"), + (0o042, b"quotedbl", "\""), + (0o043, b"numbersign", "#"), + (0o044, b"dollar", "$"), + (0o045, b"percent", "%"), + (0o046, b"ampersand", "&"), + (0o047, b"quotesingle", "\'"), + (0o050, b"parenleft", "("), + (0o051, b"parenright", ")"), + (0o052, b"asterisk", "*"), + (0o053, b"plus", "+"), + (0o054, b"comma", ","), + (0o055, b"hyphen", "-"), + (0o056, b"period", "."), + (0o057, b"slash", "/"), + (0o060, b"zero", "0"), + (0o061, b"one", "1"), + (0o062, b"two", "2"), + (0o063, b"three", "3"), + (0o064, b"four", "4"), + (0o065, b"five", "5"), + (0o066, b"six", "6"), + (0o067, b"seven", "7"), + (0o070, b"eight", "8"), + (0o071, b"nine", "9"), + (0o072, b"colon", ":"), + (0o073, b"semicolon", ";"), + (0o074, b"less", "<"), + (0o075, b"equal", "="), + (0o076, b"greater", ">"), + (0o077, b"question", "?"), + (0o100, b"at", "@"), + (0o101, b"A", "A"), + (0o102, b"B", "B"), + (0o103, b"C", "C"), + (0o104, b"D", "D"), + (0o105, b"E", "E"), + (0o106, b"F", "F"), + (0o107, b"G", "G"), + (0o110, b"H", "H"), + (0o111, b"I", "I"), + (0o112, b"J", "J"), + (0o113, b"K", "K"), + (0o114, b"L", "L"), + (0o115, b"M", "M"), + (0o116, b"N", "N"), + (0o117, b"O", 
"O"), + (0o120, b"P", "P"), + (0o121, b"Q", "Q"), + (0o122, b"R", "R"), + (0o123, b"S", "S"), + (0o124, b"T", "T"), + (0o125, b"U", "U"), + (0o126, b"V", "V"), + (0o127, b"W", "W"), + (0o130, b"X", "X"), + (0o131, b"Y", "Y"), + (0o132, b"Z", "Z"), + (0o133, b"bracketleft", "["), + (0o134, b"backslash", "\\"), + (0o135, b"bracketright", "]"), + (0o136, b"asciicircum", "^"), + (0o137, b"underscore", "_"), + (0o140, b"grave", "`"), + (0o141, b"a", "a"), + (0o142, b"b", "b"), + (0o143, b"c", "c"), + (0o144, b"d", "d"), + (0o145, b"e", "e"), + (0o146, b"f", "f"), + (0o147, b"g", "g"), + (0o150, b"h", "h"), + (0o151, b"i", "i"), + (0o152, b"j", "j"), + (0o153, b"k", "k"), + (0o154, b"l", "l"), + (0o155, b"m", "m"), + (0o156, b"n", "n"), + (0o157, b"o", "o"), + (0o160, b"p", "p"), + (0o161, b"q", "q"), + (0o162, b"r", "r"), + (0o163, b"s", "s"), + (0o164, b"t", "t"), + (0o165, b"u", "u"), + (0o166, b"v", "v"), + (0o167, b"w", "w"), + (0o170, b"x", "x"), + (0o171, b"y", "y"), + (0o172, b"z", "z"), + (0o173, b"braceleft", "{"), + (0o174, b"bar", "|"), + (0o175, b"braceright", "}"), + (0o176, b"asciitilde", "~"), + (0o200, b"Adieresis", "\u{c4}"), + (0o201, b"Aring", "\u{c5}"), + (0o202, b"Ccedilla", "\u{c7}"), + (0o203, b"Eacute", "\u{c9}"), + (0o204, b"Ntilde", "\u{d1}"), + (0o205, b"Odieresis", "\u{d6}"), + (0o206, b"Udieresis", "\u{dc}"), + (0o207, b"aacute", "\u{e1}"), + (0o210, b"agrave", "\u{e0}"), + (0o211, b"acircumflex", "\u{e2}"), + (0o212, b"adieresis", "\u{e4}"), + (0o213, b"atilde", "\u{e3}"), + (0o214, b"aring", "\u{e5}"), + (0o215, b"ccedilla", "\u{e7}"), + (0o216, b"eacute", "\u{e9}"), + (0o217, b"egrave", "\u{e8}"), + (0o220, b"ecircumflex", "\u{ea}"), + (0o221, b"edieresis", "\u{eb}"), + (0o222, b"iacute", "\u{ed}"), + (0o223, b"igrave", "\u{ec}"), + (0o224, b"icircumflex", "\u{ee}"), + (0o225, b"idieresis", "\u{ef}"), + (0o226, b"ntilde", "\u{f1}"), + (0o227, b"oacute", "\u{f3}"), + (0o230, b"ograve", "\u{f2}"), + (0o231, b"ocircumflex", "\u{f4}"), + 
(0o232, b"odieresis", "\u{f6}"), + (0o233, b"otilde", "\u{f5}"), + (0o234, b"uacute", "\u{fa}"), + (0o235, b"ugrave", "\u{f9}"), + (0o236, b"ucircumflex", "\u{fb}"), + (0o237, b"udieresis", "\u{fc}"), + (0o240, b"dagger", "\u{2020}"), + (0o241, b"degree", "\u{b0}"), + (0o242, b"cent", "\u{a2}"), + (0o243, b"sterling", "\u{a3}"), + (0o244, b"section", "\u{a7}"), + (0o245, b"bullet", "\u{2022}"), + (0o246, b"paragraph", "\u{b6}"), + (0o247, b"germandbls", "\u{df}"), + (0o250, b"registered", "\u{ae}"), + (0o251, b"copyright", "\u{a9}"), + (0o252, b"trademark", "\u{2122}"), + (0o253, b"acute", "\u{b4}"), + (0o254, b"dieresis", "\u{a8}"), + (0o256, b"AE", "\u{c6}"), + (0o257, b"Oslash", "\u{d8}"), + (0o261, b"plusminus", "\u{b1}"), + (0o264, b"yen", "\u{a5}"), + (0o265, b"mu", "\u{3bc}"), + (0o273, b"ordfeminine", "\u{aa}"), + (0o274, b"ordmasculine", "\u{ba}"), + (0o276, b"ae", "\u{e6}"), + (0o277, b"oslash", "\u{f8}"), + (0o300, b"questiondown", "\u{bf}"), + (0o301, b"exclamdown", "\u{a1}"), + (0o302, b"logicalnot", "\u{ac}"), + (0o304, b"florin", "\u{192}"), + (0o307, b"guillemotleft", "\u{ab}"), + (0o310, b"guillemotright", "\u{bb}"), + (0o311, b"ellipsis", "\u{2026}"), + (0o312, b"space", "\u{a0}"), + (0o313, b"Agrave", "\u{c0}"), + (0o314, b"Atilde", "\u{c3}"), + (0o315, b"Otilde", "\u{d5}"), + (0o316, b"OE", "\u{152}"), + (0o317, b"oe", "\u{153}"), + (0o320, b"endash", "\u{2013}"), + (0o321, b"emdash", "\u{2014}"), + (0o322, b"quotedblleft", "\u{201c}"), + (0o323, b"quotedblright", "\u{201d}"), + (0o324, b"quoteleft", "\u{2018}"), + (0o325, b"quoteright", "\u{2019}"), + (0o326, b"divide", "\u{f7}"), + (0o330, b"ydieresis", "\u{ff}"), + (0o331, b"Ydieresis", "\u{178}"), + (0o332, b"fraction", "\u{2044}"), + (0o333, b"currency", "\u{a4}"), + (0o334, b"guilsinglleft", "\u{2039}"), + (0o335, b"guilsinglright", "\u{203a}"), + (0o336, b"fi", "\u{fb01}"), + (0o337, b"fl", "\u{fb02}"), + (0o340, b"daggerdbl", "\u{2021}"), + (0o341, b"periodcentered", "\u{b7}"), + (0o342, 
b"quotesinglbase", "\u{201a}"), + (0o343, b"quotedblbase", "\u{201e}"), + (0o344, b"perthousand", "\u{2030}"), + (0o345, b"Acircumflex", "\u{c2}"), + (0o346, b"Ecircumflex", "\u{ca}"), + (0o347, b"Aacute", "\u{c1}"), + (0o350, b"Edieresis", "\u{cb}"), + (0o351, b"Egrave", "\u{c8}"), + (0o352, b"Iacute", "\u{cd}"), + (0o353, b"Icircumflex", "\u{ce}"), + (0o354, b"Idieresis", "\u{cf}"), + (0o355, b"Igrave", "\u{cc}"), + (0o356, b"Oacute", "\u{d3}"), + (0o357, b"Ocircumflex", "\u{d4}"), + (0o361, b"Ograve", "\u{d2}"), + (0o362, b"Uacute", "\u{da}"), + (0o363, b"Ucircumflex", "\u{db}"), + (0o364, b"Ugrave", "\u{d9}"), + (0o365, b"dotlessi", "\u{131}"), + (0o366, b"circumflex", "\u{2c6}"), + (0o367, b"tilde", "\u{2dc}"), + (0o370, b"macron", "\u{af}"), + (0o371, b"breve", "\u{2d8}"), + (0o372, b"dotaccent", "\u{2d9}"), + (0o373, b"ring", "\u{2da}"), + (0o374, b"cedilla", "\u{b8}"), + (0o375, b"hungarumlaut", "\u{2dd}"), + (0o376, b"ogonek", "\u{2db}"), + (0o377, b"caron", "\u{2c7}"), + ] +} + +builtin_simple_font_encoding_table! 
{ + pub const STANDARD; + [ + (0o040, b"space", " "), + (0o041, b"exclam", "!"), + (0o042, b"quotedbl", "\""), + (0o043, b"numbersign", "#"), + (0o044, b"dollar", "$"), + (0o045, b"percent", "%"), + (0o046, b"ampersand", "&"), + (0o047, b"quoteright", "\u{2019}"), + (0o050, b"parenleft", "("), + (0o051, b"parenright", ")"), + (0o052, b"asterisk", "*"), + (0o053, b"plus", "+"), + (0o054, b"comma", ","), + (0o055, b"hyphen", "-"), + (0o056, b"period", "."), + (0o057, b"slash", "/"), + (0o060, b"zero", "0"), + (0o061, b"one", "1"), + (0o062, b"two", "2"), + (0o063, b"three", "3"), + (0o064, b"four", "4"), + (0o065, b"five", "5"), + (0o066, b"six", "6"), + (0o067, b"seven", "7"), + (0o070, b"eight", "8"), + (0o071, b"nine", "9"), + (0o072, b"colon", ":"), + (0o073, b"semicolon", ";"), + (0o074, b"less", "<"), + (0o075, b"equal", "="), + (0o076, b"greater", ">"), + (0o077, b"question", "?"), + (0o100, b"at", "@"), + (0o101, b"A", "A"), + (0o102, b"B", "B"), + (0o103, b"C", "C"), + (0o104, b"D", "D"), + (0o105, b"E", "E"), + (0o106, b"F", "F"), + (0o107, b"G", "G"), + (0o110, b"H", "H"), + (0o111, b"I", "I"), + (0o112, b"J", "J"), + (0o113, b"K", "K"), + (0o114, b"L", "L"), + (0o115, b"M", "M"), + (0o116, b"N", "N"), + (0o117, b"O", "O"), + (0o120, b"P", "P"), + (0o121, b"Q", "Q"), + (0o122, b"R", "R"), + (0o123, b"S", "S"), + (0o124, b"T", "T"), + (0o125, b"U", "U"), + (0o126, b"V", "V"), + (0o127, b"W", "W"), + (0o130, b"X", "X"), + (0o131, b"Y", "Y"), + (0o132, b"Z", "Z"), + (0o133, b"bracketleft", "["), + (0o134, b"backslash", "\\"), + (0o135, b"bracketright", "]"), + (0o136, b"asciicircum", "^"), + (0o137, b"underscore", "_"), + (0o140, b"quoteleft", "\u{2018}"), + (0o141, b"a", "a"), + (0o142, b"b", "b"), + (0o143, b"c", "c"), + (0o144, b"d", "d"), + (0o145, b"e", "e"), + (0o146, b"f", "f"), + (0o147, b"g", "g"), + (0o150, b"h", "h"), + (0o151, b"i", "i"), + (0o152, b"j", "j"), + (0o153, b"k", "k"), + (0o154, b"l", "l"), + (0o155, b"m", "m"), + (0o156, b"n", "n"), 
+ (0o157, b"o", "o"), + (0o160, b"p", "p"), + (0o161, b"q", "q"), + (0o162, b"r", "r"), + (0o163, b"s", "s"), + (0o164, b"t", "t"), + (0o165, b"u", "u"), + (0o166, b"v", "v"), + (0o167, b"w", "w"), + (0o170, b"x", "x"), + (0o171, b"y", "y"), + (0o172, b"z", "z"), + (0o173, b"braceleft", "{"), + (0o174, b"bar", "|"), + (0o175, b"braceright", "}"), + (0o176, b"asciitilde", "~"), + (0o241, b"exclamdown", "\u{a1}"), + (0o242, b"cent", "\u{a2}"), + (0o243, b"sterling", "\u{a3}"), + (0o244, b"fraction", "\u{2044}"), + (0o245, b"yen", "\u{a5}"), + (0o246, b"florin", "\u{192}"), + (0o247, b"section", "\u{a7}"), + (0o250, b"currency", "\u{a4}"), + (0o251, b"quotesingle", "\'"), + (0o252, b"quotedblleft", "\u{201c}"), + (0o253, b"guillemotleft", "\u{ab}"), + (0o254, b"guilsinglleft", "\u{2039}"), + (0o255, b"guilsinglright", "\u{203a}"), + (0o256, b"fi", "\u{fb01}"), + (0o257, b"fl", "\u{fb02}"), + (0o261, b"endash", "\u{2013}"), + (0o262, b"dagger", "\u{2020}"), + (0o263, b"daggerdbl", "\u{2021}"), + (0o264, b"periodcentered", "\u{b7}"), + (0o266, b"paragraph", "\u{b6}"), + (0o267, b"bullet", "\u{2022}"), + (0o270, b"quotesinglbase", "\u{201a}"), + (0o271, b"quotedblbase", "\u{201e}"), + (0o272, b"quotedblright", "\u{201d}"), + (0o273, b"guillemotright", "\u{bb}"), + (0o274, b"ellipsis", "\u{2026}"), + (0o275, b"perthousand", "\u{2030}"), + (0o277, b"questiondown", "\u{bf}"), + (0o301, b"grave", "`"), + (0o302, b"acute", "\u{b4}"), + (0o303, b"circumflex", "\u{2c6}"), + (0o304, b"tilde", "\u{2dc}"), + (0o305, b"macron", "\u{af}"), + (0o306, b"breve", "\u{2d8}"), + (0o307, b"dotaccent", "\u{2d9}"), + (0o310, b"dieresis", "\u{a8}"), + (0o312, b"ring", "\u{2da}"), + (0o313, b"cedilla", "\u{b8}"), + (0o315, b"hungarumlaut", "\u{2dd}"), + (0o316, b"ogonek", "\u{2db}"), + (0o317, b"caron", "\u{2c7}"), + (0o320, b"emdash", "\u{2014}"), + (0o341, b"AE", "\u{c6}"), + (0o343, b"ordfeminine", "\u{aa}"), + (0o350, b"Lslash", "\u{141}"), + (0o351, b"Oslash", "\u{d8}"), + (0o352, b"OE", 
"\u{152}"), + (0o353, b"ordmasculine", "\u{ba}"), + (0o361, b"ae", "\u{e6}"), + (0o365, b"dotlessi", "\u{131}"), + (0o370, b"lslash", "\u{142}"), + (0o371, b"oslash", "\u{f8}"), + (0o372, b"oe", "\u{153}"), + (0o373, b"germandbls", "\u{df}"), + ] +} + +builtin_simple_font_encoding_table! { + pub const WIN_ANSI; + #[default = (b"bullet", "\u{2022}")] + [ + (0o040, b"space", " "), + (0o041, b"exclam", "!"), + (0o042, b"quotedbl", "\""), + (0o043, b"numbersign", "#"), + (0o044, b"dollar", "$"), + (0o045, b"percent", "%"), + (0o046, b"ampersand", "&"), + (0o047, b"quotesingle", "\'"), + (0o050, b"parenleft", "("), + (0o051, b"parenright", ")"), + (0o052, b"asterisk", "*"), + (0o053, b"plus", "+"), + (0o054, b"comma", ","), + (0o055, b"hyphen", "-"), + (0o056, b"period", "."), + (0o057, b"slash", "/"), + (0o060, b"zero", "0"), + (0o061, b"one", "1"), + (0o062, b"two", "2"), + (0o063, b"three", "3"), + (0o064, b"four", "4"), + (0o065, b"five", "5"), + (0o066, b"six", "6"), + (0o067, b"seven", "7"), + (0o070, b"eight", "8"), + (0o071, b"nine", "9"), + (0o072, b"colon", ":"), + (0o073, b"semicolon", ";"), + (0o074, b"less", "<"), + (0o075, b"equal", "="), + (0o076, b"greater", ">"), + (0o077, b"question", "?"), + (0o100, b"at", "@"), + (0o101, b"A", "A"), + (0o102, b"B", "B"), + (0o103, b"C", "C"), + (0o104, b"D", "D"), + (0o105, b"E", "E"), + (0o106, b"F", "F"), + (0o107, b"G", "G"), + (0o110, b"H", "H"), + (0o111, b"I", "I"), + (0o112, b"J", "J"), + (0o113, b"K", "K"), + (0o114, b"L", "L"), + (0o115, b"M", "M"), + (0o116, b"N", "N"), + (0o117, b"O", "O"), + (0o120, b"P", "P"), + (0o121, b"Q", "Q"), + (0o122, b"R", "R"), + (0o123, b"S", "S"), + (0o124, b"T", "T"), + (0o125, b"U", "U"), + (0o126, b"V", "V"), + (0o127, b"W", "W"), + (0o130, b"X", "X"), + (0o131, b"Y", "Y"), + (0o132, b"Z", "Z"), + (0o133, b"bracketleft", "["), + (0o134, b"backslash", "\\"), + (0o135, b"bracketright", "]"), + (0o136, b"asciicircum", "^"), + (0o137, b"underscore", "_"), + (0o140, b"grave", 
"`"), + (0o141, b"a", "a"), + (0o142, b"b", "b"), + (0o143, b"c", "c"), + (0o144, b"d", "d"), + (0o145, b"e", "e"), + (0o146, b"f", "f"), + (0o147, b"g", "g"), + (0o150, b"h", "h"), + (0o151, b"i", "i"), + (0o152, b"j", "j"), + (0o153, b"k", "k"), + (0o154, b"l", "l"), + (0o155, b"m", "m"), + (0o156, b"n", "n"), + (0o157, b"o", "o"), + (0o160, b"p", "p"), + (0o161, b"q", "q"), + (0o162, b"r", "r"), + (0o163, b"s", "s"), + (0o164, b"t", "t"), + (0o165, b"u", "u"), + (0o166, b"v", "v"), + (0o167, b"w", "w"), + (0o170, b"x", "x"), + (0o171, b"y", "y"), + (0o172, b"z", "z"), + (0o173, b"braceleft", "{"), + (0o174, b"bar", "|"), + (0o175, b"braceright", "}"), + (0o176, b"asciitilde", "~"), + (0o200, b"Euro", "\u{20ac}"), + (0o202, b"quotesinglbase", "\u{201a}"), + (0o203, b"florin", "\u{192}"), + (0o204, b"quotedblbase", "\u{201e}"), + (0o205, b"ellipsis", "\u{2026}"), + (0o206, b"dagger", "\u{2020}"), + (0o207, b"daggerdbl", "\u{2021}"), + (0o210, b"circumflex", "\u{2c6}"), + (0o211, b"perthousand", "\u{2030}"), + (0o212, b"Scaron", "\u{160}"), + (0o213, b"guilsinglleft", "\u{2039}"), + (0o214, b"OE", "\u{152}"), + (0o216, b"Zcaron", "\u{17d}"), + (0o221, b"quoteleft", "\u{2018}"), + (0o222, b"quoteright", "\u{2019}"), + (0o223, b"quotedblleft", "\u{201c}"), + (0o224, b"quotedblright", "\u{201d}"), + (0o225, b"bullet", "\u{2022}"), + (0o226, b"endash", "\u{2013}"), + (0o227, b"emdash", "\u{2014}"), + (0o230, b"tilde", "\u{2dc}"), + (0o231, b"trademark", "\u{2122}"), + (0o232, b"scaron", "\u{161}"), + (0o233, b"guilsinglright", "\u{203a}"), + (0o234, b"oe", "\u{153}"), + (0o236, b"zcaron", "\u{17e}"), + (0o237, b"Ydieresis", "\u{178}"), + (0o240, b"space", "\u{a0}"), + (0o241, b"exclamdown", "\u{a1}"), + (0o242, b"cent", "\u{a2}"), + (0o243, b"sterling", "\u{a3}"), + (0o244, b"currency", "\u{a4}"), + (0o245, b"yen", "\u{a5}"), + (0o246, b"brokenbar", "\u{a6}"), + (0o247, b"section", "\u{a7}"), + (0o250, b"dieresis", "\u{a8}"), + (0o251, b"copyright", "\u{a9}"), + 
(0o252, b"ordfeminine", "\u{aa}"), + (0o253, b"guillemotleft", "\u{ab}"), + (0o254, b"logicalnot", "\u{ac}"), + (0o255, b"hyphen", "\u{ad}"), + (0o256, b"registered", "\u{ae}"), + (0o257, b"macron", "\u{af}"), + (0o260, b"degree", "\u{b0}"), + (0o261, b"plusminus", "\u{b1}"), + (0o262, b"twosuperior", "\u{b2}"), + (0o263, b"threesuperior", "\u{b3}"), + (0o264, b"acute", "\u{b4}"), + (0o265, b"mu", "\u{3bc}"), + (0o266, b"paragraph", "\u{b6}"), + (0o267, b"periodcentered", "\u{b7}"), + (0o270, b"cedilla", "\u{b8}"), + (0o271, b"onesuperior", "\u{b9}"), + (0o272, b"ordmasculine", "\u{ba}"), + (0o273, b"guillemotright", "\u{bb}"), + (0o274, b"onequarter", "\u{bc}"), + (0o275, b"onehalf", "\u{bd}"), + (0o276, b"threequarters", "\u{be}"), + (0o277, b"questiondown", "\u{bf}"), + (0o300, b"Agrave", "\u{c0}"), + (0o301, b"Aacute", "\u{c1}"), + (0o302, b"Acircumflex", "\u{c2}"), + (0o303, b"Atilde", "\u{c3}"), + (0o304, b"Adieresis", "\u{c4}"), + (0o305, b"Aring", "\u{c5}"), + (0o306, b"AE", "\u{c6}"), + (0o307, b"Ccedilla", "\u{c7}"), + (0o310, b"Egrave", "\u{c8}"), + (0o311, b"Eacute", "\u{c9}"), + (0o312, b"Ecircumflex", "\u{ca}"), + (0o313, b"Edieresis", "\u{cb}"), + (0o314, b"Igrave", "\u{cc}"), + (0o315, b"Iacute", "\u{cd}"), + (0o316, b"Icircumflex", "\u{ce}"), + (0o317, b"Idieresis", "\u{cf}"), + (0o320, b"Eth", "\u{d0}"), + (0o321, b"Ntilde", "\u{d1}"), + (0o322, b"Ograve", "\u{d2}"), + (0o323, b"Oacute", "\u{d3}"), + (0o324, b"Ocircumflex", "\u{d4}"), + (0o325, b"Otilde", "\u{d5}"), + (0o326, b"Odieresis", "\u{d6}"), + (0o327, b"multiply", "\u{d7}"), + (0o330, b"Oslash", "\u{d8}"), + (0o331, b"Ugrave", "\u{d9}"), + (0o332, b"Uacute", "\u{da}"), + (0o333, b"Ucircumflex", "\u{db}"), + (0o334, b"Udieresis", "\u{dc}"), + (0o335, b"Yacute", "\u{dd}"), + (0o336, b"Thorn", "\u{de}"), + (0o337, b"germandbls", "\u{df}"), + (0o340, b"agrave", "\u{e0}"), + (0o341, b"aacute", "\u{e1}"), + (0o342, b"acircumflex", "\u{e2}"), + (0o343, b"atilde", "\u{e3}"), + (0o344, 
b"adieresis", "\u{e4}"), + (0o345, b"aring", "\u{e5}"), + (0o346, b"ae", "\u{e6}"), + (0o347, b"ccedilla", "\u{e7}"), + (0o350, b"egrave", "\u{e8}"), + (0o351, b"eacute", "\u{e9}"), + (0o352, b"ecircumflex", "\u{ea}"), + (0o353, b"edieresis", "\u{eb}"), + (0o354, b"igrave", "\u{ec}"), + (0o355, b"iacute", "\u{ed}"), + (0o356, b"icircumflex", "\u{ee}"), + (0o357, b"idieresis", "\u{ef}"), + (0o360, b"eth", "\u{f0}"), + (0o361, b"ntilde", "\u{f1}"), + (0o362, b"ograve", "\u{f2}"), + (0o363, b"oacute", "\u{f3}"), + (0o364, b"ocircumflex", "\u{f4}"), + (0o365, b"otilde", "\u{f5}"), + (0o366, b"odieresis", "\u{f6}"), + (0o367, b"divide", "\u{f7}"), + (0o370, b"oslash", "\u{f8}"), + (0o371, b"ugrave", "\u{f9}"), + (0o372, b"uacute", "\u{fa}"), + (0o373, b"ucircumflex", "\u{fb}"), + (0o374, b"udieresis", "\u{fc}"), + (0o375, b"yacute", "\u{fd}"), + (0o376, b"thorn", "\u{fe}"), + (0o377, b"ydieresis", "\u{ff}"), + ] +} + +builtin_simple_font_encoding_table! { + pub const PDF_DOC; + [ + (0o011, None, "\t"), + (0o012, None, "\n"), + (0o015, None, "\r"), + (0o030, b"breve", "\u{2d8}"), + (0o031, b"caron", "\u{2c7}"), + (0o032, b"circumflex", "\u{2c6}"), + (0o033, b"dotaccent", "\u{2d9}"), + (0o034, b"hungarumlaut", "\u{2dd}"), + (0o035, b"ogonek", "\u{2db}"), + (0o036, b"ring", "\u{2da}"), + (0o037, b"tilde", "\u{2dc}"), + (0o040, b"space", " "), + (0o041, b"exclam", "!"), + (0o042, b"quotedbl", "\""), + (0o043, b"numbersign", "#"), + (0o044, b"dollar", "$"), + (0o045, b"percent", "%"), + (0o046, b"ampersand", "&"), + (0o047, b"quotesingle", "\'"), + (0o050, b"parenleft", "("), + (0o051, b"parenright", ")"), + (0o052, b"asterisk", "*"), + (0o053, b"plus", "+"), + (0o054, b"comma", ","), + (0o055, b"hyphen", "-"), + (0o056, b"period", "."), + (0o057, b"slash", "/"), + (0o060, b"zero", "0"), + (0o061, b"one", "1"), + (0o062, b"two", "2"), + (0o063, b"three", "3"), + (0o064, b"four", "4"), + (0o065, b"five", "5"), + (0o066, b"six", "6"), + (0o067, b"seven", "7"), + (0o070, 
b"eight", "8"), + (0o071, b"nine", "9"), + (0o072, b"colon", ":"), + (0o073, b"semicolon", ";"), + (0o074, b"less", "<"), + (0o075, b"equal", "="), + (0o076, b"greater", ">"), + (0o077, b"question", "?"), + (0o100, b"at", "@"), + (0o101, b"A", "A"), + (0o102, b"B", "B"), + (0o103, b"C", "C"), + (0o104, b"D", "D"), + (0o105, b"E", "E"), + (0o106, b"F", "F"), + (0o107, b"G", "G"), + (0o110, b"H", "H"), + (0o111, b"I", "I"), + (0o112, b"J", "J"), + (0o113, b"K", "K"), + (0o114, b"L", "L"), + (0o115, b"M", "M"), + (0o116, b"N", "N"), + (0o117, b"O", "O"), + (0o120, b"P", "P"), + (0o121, b"Q", "Q"), + (0o122, b"R", "R"), + (0o123, b"S", "S"), + (0o124, b"T", "T"), + (0o125, b"U", "U"), + (0o126, b"V", "V"), + (0o127, b"W", "W"), + (0o130, b"X", "X"), + (0o131, b"Y", "Y"), + (0o132, b"Z", "Z"), + (0o133, b"bracketleft", "["), + (0o134, b"backslash", "\\"), + (0o135, b"bracketright", "]"), + (0o136, b"asciicircum", "^"), + (0o137, b"underscore", "_"), + (0o140, b"grave", "`"), + (0o141, b"a", "a"), + (0o142, b"b", "b"), + (0o143, b"c", "c"), + (0o144, b"d", "d"), + (0o145, b"e", "e"), + (0o146, b"f", "f"), + (0o147, b"g", "g"), + (0o150, b"h", "h"), + (0o151, b"i", "i"), + (0o152, b"j", "j"), + (0o153, b"k", "k"), + (0o154, b"l", "l"), + (0o155, b"m", "m"), + (0o156, b"n", "n"), + (0o157, b"o", "o"), + (0o160, b"p", "p"), + (0o161, b"q", "q"), + (0o162, b"r", "r"), + (0o163, b"s", "s"), + (0o164, b"t", "t"), + (0o165, b"u", "u"), + (0o166, b"v", "v"), + (0o167, b"w", "w"), + (0o170, b"x", "x"), + (0o171, b"y", "y"), + (0o172, b"z", "z"), + (0o173, b"braceleft", "{"), + (0o174, b"bar", "|"), + (0o175, b"braceright", "}"), + (0o176, b"asciitilde", "~"), + (0o200, b"bullet", "\u{2022}"), + (0o201, b"dagger", "\u{2020}"), + (0o202, b"daggerdbl", "\u{2021}"), + (0o203, b"ellipsis", "\u{2026}"), + (0o204, b"emdash", "\u{2014}"), + (0o205, b"endash", "\u{2013}"), + (0o206, b"florin", "\u{192}"), + (0o207, b"fraction", "\u{2044}"), + (0o210, b"guilsinglleft", "\u{2039}"), + 
(0o211, b"guilsinglright", "\u{203a}"), + (0o212, b"minus", "\u{2212}"), + (0o213, b"perthousand", "\u{2030}"), + (0o214, b"quotedblbase", "\u{201e}"), + (0o215, b"quotedblleft", "\u{201c}"), + (0o216, b"quotedblright", "\u{201d}"), + (0o217, b"quoteleft", "\u{2018}"), + (0o220, b"quoteright", "\u{2019}"), + (0o221, b"quotesinglbase", "\u{201a}"), + (0o222, b"trademark", "\u{2122}"), + (0o223, b"fi", "\u{fb01}"), + (0o224, b"fl", "\u{fb02}"), + (0o225, b"Lslash", "\u{141}"), + (0o226, b"OE", "\u{152}"), + (0o227, b"Scaron", "\u{160}"), + (0o230, b"Ydieresis", "\u{178}"), + (0o231, b"Zcaron", "\u{17d}"), + (0o232, b"dotlessi", "\u{131}"), + (0o233, b"lslash", "\u{142}"), + (0o234, b"oe", "\u{153}"), + (0o235, b"scaron", "\u{161}"), + (0o236, b"zcaron", "\u{17e}"), + (0o240, b"Euro", "\u{20ac}"), + (0o241, b"exclamdown", "\u{a1}"), + (0o242, b"cent", "\u{a2}"), + (0o243, b"sterling", "\u{a3}"), + (0o244, b"currency", "\u{a4}"), + (0o245, b"yen", "\u{a5}"), + (0o246, b"brokenbar", "\u{a6}"), + (0o247, b"section", "\u{a7}"), + (0o250, b"dieresis", "\u{a8}"), + (0o251, b"copyright", "\u{a9}"), + (0o252, b"ordfeminine", "\u{aa}"), + (0o253, b"guillemotleft", "\u{ab}"), + (0o254, b"logicalnot", "\u{ac}"), + (0o256, b"registered", "\u{ae}"), + (0o257, b"macron", "\u{af}"), + (0o260, b"degree", "\u{b0}"), + (0o261, b"plusminus", "\u{b1}"), + (0o262, b"twosuperior", "\u{b2}"), + (0o263, b"threesuperior", "\u{b3}"), + (0o264, b"acute", "\u{b4}"), + (0o265, b"mu", "\u{3bc}"), + (0o266, b"paragraph", "\u{b6}"), + (0o267, b"periodcentered", "\u{b7}"), + (0o270, b"cedilla", "\u{b8}"), + (0o271, b"onesuperior", "\u{b9}"), + (0o272, b"ordmasculine", "\u{ba}"), + (0o273, b"guillemotright", "\u{bb}"), + (0o274, b"onequarter", "\u{bc}"), + (0o275, b"onehalf", "\u{bd}"), + (0o276, b"threequarters", "\u{be}"), + (0o277, b"questiondown", "\u{bf}"), + (0o300, b"Agrave", "\u{c0}"), + (0o301, b"Aacute", "\u{c1}"), + (0o302, b"Acircumflex", "\u{c2}"), + (0o303, b"Atilde", "\u{c3}"), + 
(0o304, b"Adieresis", "\u{c4}"), + (0o305, b"Aring", "\u{c5}"), + (0o306, b"AE", "\u{c6}"), + (0o307, b"Ccedilla", "\u{c7}"), + (0o310, b"Egrave", "\u{c8}"), + (0o311, b"Eacute", "\u{c9}"), + (0o312, b"Ecircumflex", "\u{ca}"), + (0o313, b"Edieresis", "\u{cb}"), + (0o314, b"Igrave", "\u{cc}"), + (0o315, b"Iacute", "\u{cd}"), + (0o316, b"Icircumflex", "\u{ce}"), + (0o317, b"Idieresis", "\u{cf}"), + (0o320, b"Eth", "\u{d0}"), + (0o321, b"Ntilde", "\u{d1}"), + (0o322, b"Ograve", "\u{d2}"), + (0o323, b"Oacute", "\u{d3}"), + (0o324, b"Ocircumflex", "\u{d4}"), + (0o325, b"Otilde", "\u{d5}"), + (0o326, b"Odieresis", "\u{d6}"), + (0o327, b"multiply", "\u{d7}"), + (0o330, b"Oslash", "\u{d8}"), + (0o331, b"Ugrave", "\u{d9}"), + (0o332, b"Uacute", "\u{da}"), + (0o333, b"Ucircumflex", "\u{db}"), + (0o334, b"Udieresis", "\u{dc}"), + (0o335, b"Yacute", "\u{dd}"), + (0o336, b"Thorn", "\u{de}"), + (0o337, b"germandbls", "\u{df}"), + (0o340, b"agrave", "\u{e0}"), + (0o341, b"aacute", "\u{e1}"), + (0o342, b"acircumflex", "\u{e2}"), + (0o343, b"atilde", "\u{e3}"), + (0o344, b"adieresis", "\u{e4}"), + (0o345, b"aring", "\u{e5}"), + (0o346, b"ae", "\u{e6}"), + (0o347, b"ccedilla", "\u{e7}"), + (0o350, b"egrave", "\u{e8}"), + (0o351, b"eacute", "\u{e9}"), + (0o352, b"ecircumflex", "\u{ea}"), + (0o353, b"edieresis", "\u{eb}"), + (0o354, b"igrave", "\u{ec}"), + (0o355, b"iacute", "\u{ed}"), + (0o356, b"icircumflex", "\u{ee}"), + (0o357, b"idieresis", "\u{ef}"), + (0o360, b"eth", "\u{f0}"), + (0o361, b"ntilde", "\u{f1}"), + (0o362, b"ograve", "\u{f2}"), + (0o363, b"oacute", "\u{f3}"), + (0o364, b"ocircumflex", "\u{f4}"), + (0o365, b"otilde", "\u{f5}"), + (0o366, b"odieresis", "\u{f6}"), + (0o367, b"divide", "\u{f7}"), + (0o370, b"oslash", "\u{f8}"), + (0o371, b"ugrave", "\u{f9}"), + (0o372, b"uacute", "\u{fa}"), + (0o373, b"ucircumflex", "\u{fb}"), + (0o374, b"udieresis", "\u{fc}"), + (0o375, b"yacute", "\u{fd}"), + (0o376, b"thorn", "\u{fe}"), + (0o377, b"ydieresis", "\u{ff}"), + ] +} + 
+builtin_simple_font_encoding_table! { + pub const MAC_EXPERT; + [ + (0o040, b"space", " "), + (0o041, b"exclamsmall", "!"), + (0o042, b"Hungarumlautsmall", "\u{2dd}"), + (0o043, b"centoldstyle", "\u{a2}"), + (0o044, b"dollaroldstyle", "$"), + (0o045, b"dollarsuperior", "$"), + (0o046, b"ampersandsmall", "&"), + (0o047, b"Acutesmall", "\u{b4}"), + (0o050, b"parenleftsuperior", "\u{207d}"), + (0o051, b"parenrightsuperior", "\u{207e}"), + (0o052, b"twodotenleader", "\u{2025}"), + (0o053, b"onedotenleader", "\u{2024}"), + (0o054, b"comma", ","), + (0o055, b"hyphen", "-"), + (0o056, b"period", "."), + (0o057, b"fraction", "\u{2044}"), + (0o060, b"zerooldstyle", "0"), + (0o061, b"oneoldstyle", "1"), + (0o062, b"twooldstyle", "2"), + (0o063, b"threeoldstyle", "3"), + (0o064, b"fouroldstyle", "4"), + (0o065, b"fiveoldstyle", "5"), + (0o066, b"sixoldstyle", "6"), + (0o067, b"sevenoldstyle", "7"), + (0o070, b"eightoldstyle", "8"), + (0o071, b"nineoldstyle", "9"), + (0o072, b"colon", ":"), + (0o073, b"semicolon", ";"), + (0o075, b"threequartersemdash", "\u{2014}"), + (0o077, b"questionsmall", "?"), + (0o104, b"Ethsmall", "\u{f0}"), + (0o107, b"onequarter", "\u{bc}"), + (0o110, b"onehalf", "\u{bd}"), + (0o111, b"threequarters", "\u{be}"), + (0o112, b"oneeighth", "\u{215b}"), + (0o113, b"threeeighths", "\u{215c}"), + (0o114, b"fiveeighths", "\u{215d}"), + (0o115, b"seveneighths", "\u{215e}"), + (0o116, b"onethird", "\u{2153}"), + (0o117, b"twothirds", "\u{2154}"), + (0o126, b"ff", "\u{fb00}"), + (0o127, b"fi", "\u{fb01}"), + (0o130, b"fl", "\u{fb02}"), + (0o131, b"ffi", "\u{fb03}"), + (0o132, b"ffl", "\u{fb04}"), + (0o133, b"parenleftinferior", "\u{208d}"), + (0o135, b"parenrightinferior", "\u{208e}"), + (0o136, b"Circumflexsmall", "\u{2c6}"), + (0o137, b"hypheninferior", "-"), + (0o140, b"Gravesmall", "`"), + (0o141, b"Asmall", "a"), + (0o142, b"Bsmall", "b"), + (0o143, b"Csmall", "c"), + (0o144, b"Dsmall", "d"), + (0o145, b"Esmall", "e"), + (0o146, b"Fsmall", "f"), + (0o147, 
b"Gsmall", "g"), + (0o150, b"Hsmall", "h"), + (0o151, b"Ismall", "i"), + (0o152, b"Jsmall", "j"), + (0o153, b"Ksmall", "k"), + (0o154, b"Lsmall", "l"), + (0o155, b"Msmall", "m"), + (0o156, b"Nsmall", "n"), + (0o157, b"Osmall", "o"), + (0o160, b"Psmall", "p"), + (0o161, b"Qsmall", "q"), + (0o162, b"Rsmall", "r"), + (0o163, b"Ssmall", "s"), + (0o164, b"Tsmall", "t"), + (0o165, b"Usmall", "u"), + (0o166, b"Vsmall", "v"), + (0o167, b"Wsmall", "w"), + (0o170, b"Xsmall", "x"), + (0o171, b"Ysmall", "y"), + (0o172, b"Zsmall", "z"), + (0o173, b"colonmonetary", "\u{20a1}"), + (0o174, b"onefitted", "1"), + (0o175, b"rupiah", "Rp"), + (0o176, b"Tildesmall", "\u{2dc}"), + (0o201, b"asuperior", "a"), + (0o202, b"centsuperior", "\u{a2}"), + (0o207, b"Aacutesmall", "\u{e1}"), + (0o210, b"Agravesmall", "\u{e0}"), + (0o211, b"Acircumflexsmall", "\u{e2}"), + (0o212, b"Adieresissmall", "\u{e4}"), + (0o213, b"Atildesmall", "\u{e3}"), + (0o214, b"Aringsmall", "\u{e5}"), + (0o215, b"Ccedillasmall", "\u{e7}"), + (0o216, b"Eacutesmall", "\u{e9}"), + (0o217, b"Egravesmall", "\u{e8}"), + (0o220, b"Ecircumflexsmall", "\u{ea}"), + (0o221, b"Edieresissmall", "\u{eb}"), + (0o222, b"Iacutesmall", "\u{ed}"), + (0o223, b"Igravesmall", "\u{ec}"), + (0o224, b"Icircumflexsmall", "\u{ee}"), + (0o225, b"Idieresissmall", "\u{ef}"), + (0o226, b"Ntildesmall", "\u{f1}"), + (0o227, b"Oacutesmall", "\u{f3}"), + (0o230, b"Ogravesmall", "\u{f2}"), + (0o231, b"Ocircumflexsmall", "\u{f4}"), + (0o232, b"Odieresissmall", "\u{f6}"), + (0o233, b"Otildesmall", "\u{f5}"), + (0o234, b"Uacutesmall", "\u{fa}"), + (0o235, b"Ugravesmall", "\u{f9}"), + (0o236, b"Ucircumflexsmall", "\u{fb}"), + (0o237, b"Udieresissmall", "\u{fc}"), + (0o241, b"eightsuperior", "\u{2078}"), + (0o242, b"fourinferior", "\u{2084}"), + (0o243, b"threeinferior", "\u{2083}"), + (0o244, b"sixinferior", "\u{2086}"), + (0o245, b"eightinferior", "\u{2088}"), + (0o246, b"seveninferior", "\u{2087}"), + (0o247, b"Scaronsmall", "\u{161}"), + (0o251, 
b"centinferior", "\u{a2}"), + (0o252, b"twoinferior", "\u{2082}"), + (0o254, b"Dieresissmall", "\u{a8}"), + (0o256, b"Caronsmall", "\u{2c7}"), + (0o257, b"osuperior", "o"), + (0o260, b"fiveinferior", "\u{2085}"), + (0o262, b"commainferior", ","), + (0o263, b"periodinferior", "."), + (0o264, b"Yacutesmall", "\u{fd}"), + (0o266, b"dollarinferior", "$"), + (0o271, b"Thornsmall", "\u{fe}"), + (0o273, b"nineinferior", "\u{2089}"), + (0o274, b"zeroinferior", "\u{2080}"), + (0o275, b"Zcaronsmall", "\u{17e}"), + (0o276, b"AEsmall", "\u{e6}"), + (0o277, b"Oslashsmall", "\u{f8}"), + (0o300, b"questiondownsmall", "\u{bf}"), + (0o301, b"oneinferior", "\u{2081}"), + (0o302, b"Lslashsmall", "\u{142}"), + (0o311, b"Cedillasmall", "\u{b8}"), + (0o317, b"OEsmall", "\u{153}"), + (0o320, b"figuredash", "\u{2012}"), + (0o321, b"hyphensuperior", "-"), + (0o326, b"exclamdownsmall", "\u{a1}"), + (0o330, b"Ydieresissmall", "\u{ff}"), + (0o332, b"onesuperior", "\u{b9}"), + (0o333, b"twosuperior", "\u{b2}"), + (0o334, b"threesuperior", "\u{b3}"), + (0o335, b"foursuperior", "\u{2074}"), + (0o336, b"fivesuperior", "\u{2075}"), + (0o337, b"sixsuperior", "\u{2076}"), + (0o340, b"sevensuperior", "\u{2077}"), + (0o341, b"ninesuperior", "\u{2079}"), + (0o342, b"zerosuperior", "\u{2070}"), + (0o344, b"esuperior", "e"), + (0o345, b"rsuperior", "r"), + (0o346, b"tsuperior", "t"), + (0o351, b"isuperior", "i"), + (0o352, b"ssuperior", "s"), + (0o353, b"dsuperior", "d"), + (0o361, b"lsuperior", "l"), + (0o362, b"Ogoneksmall", "\u{2db}"), + (0o363, b"Brevesmall", "\u{2d8}"), + (0o364, b"Macronsmall", "\u{af}"), + (0o365, b"bsuperior", "b"), + (0o366, b"nsuperior", "\u{207f}"), + (0o367, b"msuperior", "m"), + (0o370, b"commasuperior", ","), + (0o371, b"periodsuperior", "."), + (0o372, b"Dotaccentsmall", "\u{2d9}"), + (0o373, b"Ringsmall", "\u{2da}"), + ] +} diff --git a/src/pdf/font/type_1_parse.rs b/src/pdf/font/type_1_parse.rs new file mode 100644 index 0000000..c557d5a --- /dev/null +++ 
b/src/pdf/font/type_1_parse.rs @@ -0,0 +1,1423 @@ +use crate::{ + pdf::{ + PdfObjects, + font::{PdfFontType1FontInfo, PdfFontType1Program}, + object::{PdfMatrix, PdfName, PdfRectangle, PdfStreamContents, PdfString, PdfVec2D}, + parse::{ + PdfInputPosition, PdfInputPositionKnown, PdfInputPositionNoCompare, PdfParseError, + }, + }, + util::ArcOrRef, +}; +use std::{ + cell::{Cell, RefCell}, + collections::BTreeMap, + fmt, + num::NonZero, + rc::Rc, + sync::Arc, +}; + +#[derive(Debug)] +enum PsBreakReason { + FoundEExec, + Error(PdfParseError), +} + +fn custom_err>(msg: impl ToString) -> Result { + Err(PdfParseError::Custom(msg.to_string()).into()) +} + +impl From for PsBreakReason { + fn from(value: PdfParseError) -> Self { + Self::Error(value) + } +} + +struct PsFileDecryptedSource { + source: Box>, + decoded: Vec, +} + +impl PsFileDecryptedSource { + fn get(&mut self, index: usize) -> Option { + loop { + if let Some(byte) = self.decoded.get(index) { + return Some(*byte); + } + self.decoded.push(self.source.next()?); + } + } +} + +#[derive(Clone)] +enum PsFileSource { + Bytes(Rc<[u8]>), + Decrypted(Rc>), +} + +impl PsFileSource { + fn get(&self, index: usize) -> Option { + match self { + PsFileSource::Bytes(bytes) => bytes.get(index).copied(), + PsFileSource::Decrypted(src) => src.borrow_mut().get(index), + } + } +} + +#[derive(Clone)] +struct PsFile { + id: u64, + source: PsFileSource, + pos: Rc>, +} + +impl PartialEq for PsFile { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Eq for PsFile {} + +impl PartialOrd for PsFile { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for PsFile { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.id.cmp(&other.id) + } +} + +impl fmt::Debug for PsFile { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { id, source: _, pos } = self; + f.debug_struct("PsFile") + .field("id", id) + .field("pos", pos) + .finish_non_exhaustive() 
+ } +} + +fn is_whitespace_char(v: u8) -> bool { + matches!(v, b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ') +} + +fn is_special_char(v: u8) -> bool { + matches!( + v, + b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%' + ) +} + +fn is_regular_char(v: u8) -> bool { + !(is_whitespace_char(v) || is_special_char(v)) +} + +struct NotALineEnd; + +#[derive(Clone)] +enum Token { + Integer(i128), + Real(f64), + ArrayStart, + ArrayEnd, + ProcedureStart, + ProcedureEnd, + ExecutableName(Vec), + LiteralName(Vec), + ImmediatelyEvaluatedName(Vec), + String(Vec), +} + +impl fmt::Debug for Token { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Integer(v) => f.debug_tuple("Integer").field(v).finish(), + Self::Real(v) => f.debug_tuple("Real").field(v).finish(), + Self::ArrayStart => write!(f, "ArrayStart"), + Self::ArrayEnd => write!(f, "ArrayEnd"), + Self::ProcedureStart => write!(f, "ProcedureStart"), + Self::ProcedureEnd => write!(f, "ProcedureEnd"), + Self::ExecutableName(name) => write!(f, "ExecutableName({})", name.escape_ascii()), + Self::LiteralName(name) => write!(f, "LiteralName({})", name.escape_ascii()), + Self::ImmediatelyEvaluatedName(name) => { + write!(f, "ImmediatelyEvaluatedName({})", name.escape_ascii()) + } + Self::String(contents) => { + write!(f, "String({})", contents.escape_ascii()) + } + } + } +} + +impl PsFile { + fn new(id: u64, source: PsFileSource, pos: usize, stream_pos: PdfInputPosition) -> Self { + Self { + id, + source, + pos: Rc::new(Cell::new(PdfInputPositionKnown { + pos, + containing_streams_pos: stream_pos.get().map(|v| v.pos), + })), + } + } + fn pos(&self) -> PdfInputPosition { + PdfInputPosition::new(Some(self.pos.get())) + } + fn peek_byte(&self) -> Option { + self.source.get(self.pos.get().pos) + } + fn next_byte(&mut self) -> Option { + if let Some(b) = self.source.get(self.pos.get().pos) { + self.pos.update(|mut pos| { + pos.pos += 1; + pos + }); + Some(b) + } else { + None + } + } 
+ fn try_parse_line_end(&mut self) -> Result<(), NotALineEnd> { + match self.peek_byte().ok_or(NotALineEnd)? { + b'\r' => { + self.next_byte(); + if self.peek_byte() == Some(b'\n') { + self.next_byte(); + } + Ok(()) + } + b'\x0C' | b'\n' => Ok(()), + _ => Err(NotALineEnd), + } + } + fn skip_whitespace(&mut self) { + while let Some(b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ') = self.peek_byte() { + self.next_byte(); + } + } + fn skip_comments_and_whitespace(&mut self) { + loop { + self.skip_whitespace(); + let Some(b'%') = self.peek_byte() else { + break; + }; + while self.peek_byte().is_some() { + if let Ok(()) = self.try_parse_line_end() { + break; + } + self.next_byte(); + } + } + } + fn parse_number(mut text: &[u8]) -> Option { + let full_text = text; + let sign = match text { + [sign @ (b'-' | b'+'), rest @ ..] => { + text = rest; + Some(*sign) + } + _ => None, + }; + let mut radix = Some(0u32); + let mut any_digits = false; + while let [digit @ b'0'..=b'9', rest @ ..] = text { + text = rest; + any_digits = true; + radix = radix + .and_then(|v| v.checked_mul(10)) + .and_then(|v| v.checked_add((digit - b'0').into())); + } + if let (Some(radix @ (2..=36)), [b'#', rest @ ..]) = (radix, text) { + text = rest; + if sign.is_some() || text.is_empty() { + return None; + } + let mut value = 0i128; + for &digit in text { + let digit = (digit as char).to_digit(radix)?; + value = value.checked_mul(radix.into())?; + value = value.checked_add(digit.into())?; + } + return Some(Token::Integer(value)); + } + let mut is_real = false; + if let [b'.', rest @ ..] = text { + text = rest; + is_real = true; + while let [b'0'..=b'9', rest @ ..] = text { + text = rest; + any_digits = true; + } + } + if !any_digits { + return None; + } + if let [b'e' | b'E', rest @ ..] = text { + text = rest; + is_real = true; + if let [b'+' | b'-', rest @ ..] = text { + text = rest; + } + let [b'0'..=b'9', ..] = text else { + return None; + }; + while let [b'0'..=b'9', rest @ ..] 
= text { + text = rest; + } + } + let full_text = str::from_utf8(full_text).ok()?; + if is_real { + Some(Token::Real(full_text.parse().ok()?)) + } else { + Some(Token::Integer(full_text.parse().ok()?)) + } + } + fn parse_string_after_l_paren(&mut self) -> Result { + let mut contents = Vec::new(); + let mut paren_level = NonZero::new(1usize).expect("non-zero"); + while let Some(b) = self.next_byte() { + contents.push(match b { + b'(' => { + paren_level = paren_level.checked_add(1).expect("overflow"); + b + } + b')' => { + let Some(new_paren_level) = NonZero::new(paren_level.get() - 1) else { + return Ok(Token::String(contents)); + }; + paren_level = new_paren_level; + b + } + b'\r' if self.peek_byte() == Some(b'\n') => { + self.next_byte(); + b'\n' + } + b'\r' | b'\n' => b'\n', + b'\\' => { + let pos = self.pos(); + let Some(b) = self.next_byte() else { + return Err(PdfParseError::InvalidStringEscape { pos }); + }; + match b { + b'\r' if self.peek_byte() == Some(b'\n') => { + self.next_byte(); + continue; + } + b'\r' | b'\n' => continue, + b'n' => b'\n', + b'r' => b'\r', + b't' => b'\t', + b'b' => b'\x08', + b'f' => b'\x0C', + b'(' | b')' | b'\\' => b, + b'0'..=b'7' => { + const MAX_OCTAL_DIGITS: usize = 3; + let mut value = b - b'0'; + let mut len = 1; + while len < MAX_OCTAL_DIGITS { + let Some(b @ b'0'..=b'7') = self.peek_byte() else { + break; + }; + value <<= 3; + value |= b - b'0'; + len += 1; + self.next_byte(); + } + value + } + _ => { + return Err(PdfParseError::InvalidStringEscape { pos }); + } + } + } + _ => b, + }); + } + Err(PdfParseError::TruncatedFile { pos: self.pos() }) + } + fn next_token(&mut self) -> Result, PdfParseError> { + self.skip_comments_and_whitespace(); + let Some(first_byte) = self.peek_byte() else { + return Ok(None); + }; + match first_byte { + b'(' => { + self.next_byte(); + self.parse_string_after_l_paren().map(Some) + } + b')' => todo!(), + b'<' => { + todo!("encoded string"); + } + b'>' => todo!(), + b'[' => { + self.next_byte(); 
+ Ok(Some(Token::ArrayStart)) + } + b']' => { + self.next_byte(); + Ok(Some(Token::ArrayEnd)) + } + b'{' => { + self.next_byte(); + Ok(Some(Token::ProcedureStart)) + } + b'}' => { + self.next_byte(); + Ok(Some(Token::ProcedureEnd)) + } + b'/' => { + self.next_byte(); + let is_immediately_evaluated_name = self.peek_byte() == Some(b'/'); + if is_immediately_evaluated_name { + self.next_byte(); + } + let mut name = Vec::new(); + while self.peek_byte().is_some_and(is_regular_char) { + name.extend(self.next_byte()); + } + Ok(Some(if is_immediately_evaluated_name { + Token::ImmediatelyEvaluatedName(name) + } else { + Token::LiteralName(name) + })) + } + _ => { + let mut name = Vec::new(); + name.extend(self.next_byte()); + while self.peek_byte().is_some_and(is_regular_char) { + name.extend(self.next_byte()); + } + if let Some(token) = Self::parse_number(&name) { + Ok(Some(token)) + } else { + Ok(Some(Token::ExecutableName(name))) + } + } + } + } + fn decrypt_for_eexec_helper( + mut self, + new_id: u64, + random_bytes: Option<[u8; 4]>, + next_byte: impl Fn(&mut Self) -> Option + 'static, + ) -> Result { + let read_first_4 = || -> Option<[u8; 4]> { + let b0 = next_byte(&mut self)?; + let b1 = next_byte(&mut self)?; + let b2 = next_byte(&mut self)?; + let b3 = next_byte(&mut self)?; + Some([b0, b1, b2, b3]) + }; + let random_bytes = random_bytes.or_else(read_first_4).ok_or_else(|| { + PdfParseError::Custom("postscript eexec operator: can't read the 4 random bytes".into()) + })?; + let mut r = 55665u16; + let c1 = 52845u16; + let c2 = 22719u16; + let mut decrypt_one = move |cipher: u8| -> u8 { + dbg!(cipher); + let plain = cipher ^ (r >> 8) as u8; + dbg!(plain); + r = (cipher as u16) + .wrapping_add(r) + .wrapping_mul(c1) + .wrapping_add(c2); + dbg!(r); + plain + }; + for b in random_bytes { + decrypt_one(b); + } + let stream_pos = self.pos(); + Ok(Self::new( + new_id, + PsFileSource::Decrypted(Rc::new(RefCell::new(PsFileDecryptedSource { + source: 
Box::new(std::iter::from_fn(move || { + dbg!(next_byte(&mut self)).map(decrypt_one) + })), + decoded: Vec::new(), + }))), + 0, + stream_pos, + )) + } + fn decrypt_for_eexec(mut self, new_id: u64) -> Result { + while let Some(b' ' | b'\t' | b'\r' | b'\n') = self.peek_byte() { + dbg!(self.next_byte()); + } + let start_pos = self.pos.get(); + let mut read_first_4_binary = || -> Option<[u8; 4]> { + let b0 = self.next_byte()?; + let b1 = self.next_byte()?; + let b2 = self.next_byte()?; + let b3 = self.next_byte()?; + let retval = [b0, b1, b2, b3]; + if retval.iter().all(u8::is_ascii_hexdigit) { + None + } else { + Some(retval) + } + }; + if let Some(random_bytes) = dbg!(read_first_4_binary()) { + self.decrypt_for_eexec_helper(new_id, Some(random_bytes), PsFile::next_byte) + } else { + self.pos.set(start_pos); + let next_byte = |this: &mut Self| { + let mut first_digit = None; + loop { + let byte = this.peek_byte()?; + if matches!(byte, b' ' | b'\t' | b'\r' | b'\n') { + this.next_byte(); + continue; + } + let digit = (byte as char).to_digit(0x10)?; + this.next_byte(); + if let Some(first_digit) = first_digit { + return Some(((first_digit << 4) | digit) as u8); + } else { + first_digit = Some(digit); + } + } + }; + self.decrypt_for_eexec_helper(new_id, None, next_byte) + } + } +} + +#[derive(Clone, Copy, Debug, Default)] +struct PsReal(f64); + +impl PartialOrd for PsReal { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Eq for PsReal {} + +impl PartialEq for PsReal { + fn eq(&self, other: &Self) -> bool { + self.cmp(other).is_eq() + } +} + +impl Ord for PsReal { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + let l = (!self.0.is_nan()).then_some(self.0); + let r = (!other.0.is_nan()).then_some(other.0); + l.partial_cmp(&r).expect("already checked for NaN") + } +} + +macro_rules! 
make_operator_enum { + ( + enum $enum_name:ident { + $($Variant:ident = $name:literal,)* + } + ) => { + #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)] + enum $enum_name { + $($Variant,)* + } + + impl $enum_name { + const VARIANTS: &[Self] = &[$(Self::$Variant,)*]; + fn name(self) -> &'static str { + match self { + $(Self::$Variant => $name,)* + } + } + } + }; +} + +make_operator_enum! { + enum PsOperator { + Array = "array", + ArrayStart = "[", + ArrayEnd = "]", + Begin = "begin", + CurrentDict = "currentdict", + CurrentFile = "currentfile", + Def = "def", + Dict = "dict", + Dup = "dup", + FontDirectory = "FontDirectory", + For = "for", + EExec = "eexec", + End = "end", + Exch = "exch", + IfElse = "ifelse", + Index = "index", + Known = "known", + Put = "put", + ReadOnly = "readonly", + } +} + +impl PsOperator { + fn run(self, parser: &mut PsParser) -> Result<(), PsBreakReason> { + match self { + PsOperator::Array => { + let Some(len) = parser.operand_stack.pop().and_then(|v| v.to_int()) else { + return custom_err( + "postscript array operator is missing required integer operand", + ); + }; + let Ok(len) = len.try_into() else { + return custom_err("postscript array operator passed invalid length"); + }; + let array = PsArray::from_elements(parser, vec![PsObject::Null; len]); + parser.operand_stack.push(PsObject::Array(array)); + Ok(()) + } + PsOperator::ArrayStart => { + parser.operand_stack.push(PsObject::Mark); + Ok(()) + } + PsOperator::ArrayEnd => { + let mut elements = Vec::new(); + while let Some(object) = parser.operand_stack.pop() { + match object { + PsObject::Mark => { + elements.reverse(); + let array = PsArray::from_elements(parser, elements); + parser.operand_stack.push(PsObject::Array(array)); + return Ok(()); + } + _ => elements.push(object), + } + } + custom_err("postscript ] operator is missing required mark operand") + } + PsOperator::Begin => { + let Some(PsObject::Dictionary(dict)) = parser.operand_stack.pop() else { + return 
custom_err( + "postscript begin operator is missing required dictionary operand", + ); + }; + parser.dictionary_stack.push(dict); + Ok(()) + } + PsOperator::CurrentDict => { + let Some(dict) = parser.dictionary_stack.last().cloned() else { + unreachable!(); + }; + parser.operand_stack.push(PsObject::Dictionary(dict)); + Ok(()) + } + PsOperator::CurrentFile => { + parser + .operand_stack + .push(PsObject::File(parser.tokenizer.clone())); + Ok(()) + } + PsOperator::Def => { + let Some(value) = parser.operand_stack.pop() else { + return custom_err("postscript def operator is missing required operand"); + }; + let Some(key) = parser.operand_stack.pop() else { + return custom_err("postscript def operator is missing required operand"); + }; + let Some(dict) = parser.dictionary_stack.last_mut() else { + unreachable!(); + }; + dict.insert(key, value); + Ok(()) + } + PsOperator::Dict => { + let Some(_capacity) = parser.operand_stack.pop().and_then(|v| v.to_int()) else { + return custom_err( + "postscript dict operator is missing required integer operand", + ); + }; + let dict = PsDictionary::new(parser); + parser.operand_stack.push(PsObject::Dictionary(dict)); + Ok(()) + } + PsOperator::Dup => { + let Some(value) = parser.operand_stack.pop() else { + return custom_err("postscript dup operator is missing required operand"); + }; + parser.operand_stack.push(value.clone()); + parser.operand_stack.push(value); + Ok(()) + } + PsOperator::EExec => { + if parser.break_at_eexec { + return Err(PsBreakReason::FoundEExec); + } + let Some(source) = parser.operand_stack.pop() else { + return custom_err("postscript eexec operator is missing required operand"); + }; + let file = match source { + PsObject::String(string) => todo!(), + PsObject::File(file) => file, + _ => { + return custom_err("postscript eexec operator has invalid operand"); + } + }; + dbg!(&parser.dictionary_stack); + dbg!(&parser.operand_stack); + let file = file.decrypt_for_eexec(parser.next_file_id)?; + 
parser.next_file_id += 1; + struct PutBackTokenizerOnDrop<'a> { + parser: &'a mut PsParser, + old_tokenizer: PsFile, + } + impl Drop for PutBackTokenizerOnDrop<'_> { + fn drop(&mut self) { + self.parser.tokenizer = self.old_tokenizer.clone(); + } + } + let put_back_tokenizer_on_drop = PutBackTokenizerOnDrop { + old_tokenizer: std::mem::replace(&mut parser.tokenizer, file), + parser, + }; + put_back_tokenizer_on_drop.parser.parse_file() + } + PsOperator::End => { + if parser.dictionary_stack.len() <= PsParser::MIN_DICTIONARY_STACK_SIZE { + return custom_err("postscript end operator without corresponding begin"); + } + parser.dictionary_stack.pop(); + Ok(()) + } + PsOperator::Exch => { + let Some([a, b]) = parser.operand_stack.last_chunk_mut() else { + return custom_err("postscript exch operator is missing required operands"); + }; + std::mem::swap(a, b); + Ok(()) + } + PsOperator::FontDirectory => { + parser.operand_stack.push(PsObject::Dictionary( + parser.font_directory.clone().expect("set in PsParser::new"), + )); + Ok(()) + } + PsOperator::For => { + let Some(PsObject::Procedure(proc)) = parser.operand_stack.pop() else { + return custom_err( + "postscript for operator is missing required procedure operand", + ); + }; + let Some(limit) = parser.operand_stack.pop() else { + return custom_err("postscript for operator is missing required limit operand"); + }; + let Some(increment) = parser.operand_stack.pop() else { + return custom_err( + "postscript for operator is missing required increment operand", + ); + }; + let Some(initial) = parser.operand_stack.pop() else { + return custom_err( + "postscript for operator is missing required initial operand", + ); + }; + let PsObject::Integer(initial) = initial else { + todo!("{initial:?}"); + }; + let PsObject::Integer(increment @ (..=-1 | 1..)) = increment else { + todo!("{increment:?}"); + }; + let PsObject::Integer(limit) = limit else { + todo!("{limit:?} {:?}", parser.operand_stack); + }; + let mut counter = initial; + 
let proc = proc.into_vec(); + loop { + if increment < 0 { + if counter < limit { + break; + } + } else if counter > limit { + break; + } + parser.operand_stack.push(PsObject::Integer(counter)); + parser.run_procedure(&proc)?; + counter = counter.checked_add(increment).ok_or_else(|| { + PdfParseError::Custom("postscript arithmetic overflow".into()) + })?; + } + Ok(()) + } + PsOperator::IfElse => { + let Some(PsObject::Procedure(else_proc)) = parser.operand_stack.pop() else { + return custom_err( + "postscript ifelse operator is missing required procedure operand", + ); + }; + let Some(PsObject::Procedure(then_proc)) = parser.operand_stack.pop() else { + return custom_err( + "postscript ifelse operator is missing required procedure operand", + ); + }; + let Some(PsObject::Boolean(cond)) = parser.operand_stack.pop() else { + return custom_err( + "postscript ifelse operator is missing required bool operand", + ); + }; + if cond { + parser.run_procedure(&then_proc.into_vec())?; + } else { + parser.run_procedure(&else_proc.into_vec())?; + } + Ok(()) + } + PsOperator::Index => { + let Some(index) = parser.operand_stack.pop().and_then(|v| v.to_int()) else { + return custom_err( + "postscript index operator is missing required integer operand", + ); + }; + let Some(object) = index + .try_into() + .ok() + .and_then(|index| parser.operand_stack.iter().nth_back(index).cloned()) + else { + return custom_err("postscript index operator passed invalid integer"); + }; + parser.operand_stack.push(object); + Ok(()) + } + PsOperator::Known => { + let Some(key) = parser.operand_stack.pop() else { + return custom_err("postscript known operator is missing required key operand"); + }; + let Some(PsObject::Dictionary(dictionary)) = parser.operand_stack.pop() else { + return custom_err( + "postscript known operator is missing required dictionary operand", + ); + }; + parser + .operand_stack + .push(PsObject::Boolean(dictionary.get(key).is_some())); + Ok(()) + } + PsOperator::Put => { + let 
Some(value) = parser.operand_stack.pop() else { + return custom_err("postscript put operator is missing required value operand"); + }; + let Some(key_or_index) = parser.operand_stack.pop() else { + return custom_err( + "postscript put operator is missing required key/index operand", + ); + }; + let Some(container) = parser.operand_stack.pop() else { + return custom_err( + "postscript put operator is missing required container operand", + ); + }; + match container { + PsObject::Array(array) => { + let array = array.rc(); + let mut array = array.borrow_mut(); + let Some(target) = key_or_index + .to_int() + .and_then(|index| array.get_mut(usize::try_from(index).ok()?)) + else { + return custom_err("postscript put operator has invalid index operand"); + }; + *target = value; + Ok(()) + } + PsObject::Dictionary(mut dict) => { + dict.insert(key_or_index, value); + Ok(()) + } + PsObject::String(s) => todo!(), + _ => custom_err("postscript put operator was passed invalid container operand"), + } + } + PsOperator::ReadOnly => { + // TODO: implement permissions + Ok(()) + } + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Default)] +struct PsDictionaryImpl { + named: BTreeMap, + other: BTreeMap, +} + +#[derive(Clone)] +struct PsDictionary { + id: usize, + weak: std::rc::Weak>, +} + +impl fmt::Debug for PsDictionary { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { id, weak } = self; + if let Some(weak) = weak.upgrade() { + if let Ok(weak) = weak.try_borrow() { + write!(f, "#{id} ")?; + let PsDictionaryImpl { named, other } = &*weak; + return f.debug_map().entries(named).entries(other).finish(); + } + } + f.debug_struct("PsDictionary") + .field("id", id) + .field("weak", &weak) + .finish() + } +} + +impl Ord for PsDictionary { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.id.cmp(&other.id) + } +} + +impl PartialOrd for PsDictionary { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + 
+impl Eq for PsDictionary {} + +impl PartialEq for PsDictionary { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl PsDictionary { + fn new(parser: &mut PsParser) -> Self { + Self::from_impl(parser, PsDictionaryImpl::default()) + } + fn rc(&self) -> Rc> { + self.weak.upgrade().expect("still in parser scope") + } + fn from_impl(parser: &mut PsParser, impl_: PsDictionaryImpl) -> Self { + let dict = Rc::new(RefCell::new(impl_)); + let weak = Rc::downgrade(&dict); + let id = parser.dictionaries.len(); + parser.dictionaries.push(dict); + Self { id, weak } + } + fn from_name_value_pairs<'a>( + parser: &mut PsParser, + iter: impl IntoIterator, + ) -> Self { + Self::from_impl( + parser, + PsDictionaryImpl { + named: BTreeMap::from_iter( + iter.into_iter() + .map(|(k, v)| (PsName(k.as_bytes().into()), v)), + ), + other: BTreeMap::new(), + }, + ) + } + fn get_named(&self, key: &PsName) -> Option { + self.rc().borrow().named.get(key).cloned() + } + fn insert(&mut self, key: PsObject, value: PsObject) -> Option { + let this = self.rc(); + let mut this = this.borrow_mut(); + let PsDictionaryImpl { named, other } = &mut *this; + match key { + PsObject::String(s) => named.insert(PsName(s.0.borrow().clone()), value), + PsObject::Name(name) => named.insert(name, value), + _ => other.insert(key, value), + } + } + fn insert_named(&mut self, name: PsName, value: PsObject) -> Option { + self.rc().borrow_mut().named.insert(name, value) + } + fn into_impl(self) -> PsDictionaryImpl { + self.rc().borrow().clone() + } + fn get(&self, key: PsObject) -> Option { + let this = self.rc(); + let this = this.borrow(); + let PsDictionaryImpl { named, other } = &*this; + match key { + PsObject::String(s) => named.get(&PsName(s.0.borrow().clone())).cloned(), + PsObject::Name(name) => named.get(&name).cloned(), + _ => other.get(&key).cloned(), + } + } +} + +#[derive(Clone)] +struct PsArray { + id: usize, + weak: std::rc::Weak>>, +} + +impl fmt::Debug for PsArray { + fn 
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self { id, weak } = self; + if let Some(weak) = weak.upgrade() { + if let Ok(weak) = weak.try_borrow() { + write!(f, "#{id} ")?; + return Vec::fmt(&weak, f); + } + } + f.debug_struct("PsArray") + .field("id", id) + .field("weak", &weak) + .finish() + } +} + +impl Ord for PsArray { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.id.cmp(&other.id) + } +} + +impl PartialOrd for PsArray { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Eq for PsArray {} + +impl PartialEq for PsArray { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl PsArray { + fn new(parser: &mut PsParser) -> Self { + Self::from_elements(parser, Vec::new()) + } + fn rc(&self) -> Rc>> { + self.weak.upgrade().expect("still in parser scope") + } + fn from_elements(parser: &mut PsParser, elements: Vec) -> Self { + let array = Rc::new(RefCell::new(elements)); + let weak = Rc::downgrade(&array); + let id = parser.arrays.len(); + parser.arrays.push(array); + Self { id, weak } + } + fn into_vec(self) -> Vec { + self.rc().borrow().clone() + } +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone)] +struct PsName(Vec); + +impl fmt::Debug for PsName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "PsName({})", self.0.escape_ascii()) + } +} + +impl From for PdfName { + fn from(value: PsName) -> Self { + PdfName::new(PdfInputPosition::empty(), ArcOrRef::Arc(value.0.into())) + } +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone)] +struct PsString(Rc>>); + +impl fmt::Debug for PsString { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "PsString({})", self.0.borrow().escape_ascii()) + } +} + +impl From for PdfString { + fn from(value: PsString) -> Self { + PdfString::new( + PdfInputPosition::empty(), + ArcOrRef::Arc(value.0.borrow().as_slice().into()), + ) + } +} + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] 
+enum PsObject { + Array(PsArray), + Procedure(PsArray), + Dictionary(PsDictionary), + Integer(i128), + Mark, + Name(PsName), + Operator(PsOperator), + Real(PsReal), + String(PsString), + Boolean(bool), + Null, + ExecutableName(PsName), + File(PsFile), +} + +impl PsObject { + fn to_int(&self) -> Option { + match self { + PsObject::Integer(v) => Some(*v), + PsObject::Real(PsReal(v)) => Some(*v as i128), + _ => None, + } + } + fn to_f32(&self) -> Option { + match self { + PsObject::Integer(v) => Some(*v as f32), + PsObject::Real(PsReal(v)) => Some(*v as f32), + _ => None, + } + } +} + +struct PsParser { + tokenizer: PsFile, + operand_stack: Vec, + dictionary_stack: Vec, + dictionaries: Vec>>, + arrays: Vec>>>, + next_file_id: u64, + break_at_eexec: bool, + font_directory: Option, +} + +impl PsParser { + const MIN_DICTIONARY_STACK_SIZE: usize = 3; + fn new(tokenizer: PsFile) -> Self { + let mut retval = Self { + next_file_id: tokenizer.id + 1, + tokenizer, + operand_stack: Vec::with_capacity(16), + dictionary_stack: Vec::with_capacity(8), + dictionaries: Vec::with_capacity(16), + arrays: Vec::with_capacity(16), + break_at_eexec: false, + font_directory: None, + }; + let mut system_dict = PsDictionary::from_name_value_pairs( + &mut retval, + PsOperator::VARIANTS + .iter() + .map(|&op| (op.name(), PsObject::Operator(op))), + ); + system_dict.insert_named(PsName(b"false".into()), PsObject::Boolean(false)); + system_dict.insert_named(PsName(b"true".into()), PsObject::Boolean(true)); + retval.dictionary_stack.push(system_dict); + let dict = PsDictionary::new(&mut retval); + retval.dictionary_stack.push(dict); + let dict = PsDictionary::new(&mut retval); + retval.dictionary_stack.push(dict); + retval.font_directory = Some(PsDictionary::new(&mut retval)); + retval + } + + fn run_name(&mut self, name: &PsName) -> Result<(), PsBreakReason> { + let Some(value) = self + .dictionary_stack + .iter() + .rev() + .find_map(|dict| dict.get_named(name)) + else { + todo!("unimplemented 
PS operator {name:?}"); + }; + match value { + PsObject::Integer(_) | PsObject::Real(_) | PsObject::Boolean(_) => { + self.operand_stack.push(value); + Ok(()) + } + PsObject::Name(v) => todo!(), + PsObject::ExecutableName(v) => todo!(), + PsObject::Null => todo!(), + PsObject::String(v) => todo!(), + PsObject::Array(v) => todo!(), + PsObject::Dictionary(v) => todo!(), + PsObject::Operator(value) => value.run(self), + PsObject::Mark => todo!(), + PsObject::Procedure(v) => todo!(), + PsObject::File(v) => todo!(), + } + } + + fn parse_procedure(&mut self) -> Result { + self.tokenizer.skip_comments_and_whitespace(); + let mut objects = Vec::new(); + while let Some(token) = self.tokenizer.next_token()? { + objects.push(match token { + Token::Integer(v) => PsObject::Integer(v), + Token::Real(v) => PsObject::Real(PsReal(v)), + Token::ArrayStart => PsObject::ExecutableName(PsName(b"[".into())), + Token::ArrayEnd => PsObject::ExecutableName(PsName(b"]".into())), + Token::ProcedureStart => PsObject::Procedure(self.parse_procedure()?), + Token::ProcedureEnd => return Ok(PsArray::from_elements(self, objects)), + Token::ExecutableName(name) => PsObject::ExecutableName(PsName(name.into())), + Token::LiteralName(name) => PsObject::Name(PsName(name.into())), + Token::ImmediatelyEvaluatedName(_) => todo!("{token:?}"), + Token::String(v) => PsObject::String(PsString(Rc::new(RefCell::new(v)))), + }); + } + custom_err("postscript missing closing }") + } + fn parse_file(&mut self) -> Result<(), PsBreakReason> { + self.tokenizer.skip_comments_and_whitespace(); + while let Some(token) = self.tokenizer.next_token()? 
{ + match token { + Token::Integer(v) => self.operand_stack.push(PsObject::Integer(v)), + Token::Real(v) => self.operand_stack.push(PsObject::Real(PsReal(v))), + Token::ArrayStart => self.run_name(&PsName(b"[".into()))?, + Token::ArrayEnd => self.run_name(&PsName(b"]".into()))?, + Token::ProcedureStart => { + let procedure = self.parse_procedure()?; + self.operand_stack.push(PsObject::Procedure(procedure)) + } + Token::ProcedureEnd => todo!(), + Token::ExecutableName(name) => { + let name = PsName(name.into()); + self.run_name(&name)? + } + Token::LiteralName(name) => { + self.operand_stack.push(PsObject::Name(PsName(name.into()))) + } + Token::ImmediatelyEvaluatedName(_) => todo!("{token:?}"), + Token::String(v) => self + .operand_stack + .push(PsObject::String(PsString(Rc::new(RefCell::new(v))))), + } + } + Ok(()) + } + fn parse_font_encoding( + &mut self, + value: PsArray, + ) -> Result]>, PdfParseError> { + let value = value.rc(); + let value = value.borrow(); + let mut vec = Vec::with_capacity(value.len()); + for entry in value.iter() { + match entry { + PsObject::Name(name) => { + if name.0 == b".notdef" { + vec.push(None); + } else { + vec.push(Some(PdfName::new( + self.tokenizer.pos(), + Arc::from(&*name.0), + ))); + } + } + _ => todo!("{entry:?}"), + } + } + Ok(Arc::from(vec)) + } + fn parse_font_bbox(&mut self, value: PsArray) -> Result { + let value = value.rc(); + let value = value.borrow(); + let mut vec = Vec::new(); + for entry in value.iter() { + let Some(v) = entry.to_f32() else { + return custom_err("postscript invalid FontBBox entry"); + }; + vec.push(v); + } + match <[f32; 4]>::try_from(vec) { + Ok([x1, y1, x2, y2]) => Ok(PdfRectangle::new( + PdfVec2D { + pos: PdfInputPositionNoCompare::empty(), + x: x1, + y: y1, + }, + PdfVec2D { + pos: PdfInputPositionNoCompare::empty(), + x: x2, + y: y2, + }, + )), + Err(_) => custom_err("postscript invalid FontBBox entry"), + } + } + fn parse_font_matrix(&mut self, value: PsArray) -> Result { + let value = 
value.rc(); + let value = value.borrow(); + let mut vec = Vec::new(); + for entry in value.iter() { + let Some(v) = entry.to_f32() else { + return custom_err("postscript invalid FontBBox entry"); + }; + vec.push(v); + } + match vec.try_into() { + Ok(elements) => Ok(PdfMatrix { + pos: PdfInputPositionNoCompare::empty(), + elements, + }), + Err(_) => custom_err("postscript invalid FontBBox entry"), + } + } + fn parse_font_info_dict( + &mut self, + font_info_dict: PsDictionary, + ) -> Result { + let PsDictionaryImpl { named, other: _ } = font_info_dict.into_impl(); + let mut family_name = None; + let mut full_name = None; + let mut notice = None; + let mut weight = None; + let mut version = None; + let mut italic_angle = None; + let mut is_fixed_pitch = None; + let mut underline_position = None; + let mut underline_thickness = None; + for (key, value) in named { + match (&*key.0, value) { + (b"FamilyName", PsObject::String(string)) => family_name = Some(string.into()), + (b"FullName", PsObject::String(string)) => full_name = Some(string.into()), + (b"Notice", PsObject::String(string)) => notice = Some(string.into()), + (b"Weight", PsObject::String(string)) => weight = Some(string.into()), + (b"version", PsObject::String(string)) => version = Some(string.into()), + (b"ItalicAngle", value) => { + if let Some(value) = value.to_f32() { + italic_angle = Some(value); + } else { + todo!("{value:?}") + } + } + (b"isFixedPitch", PsObject::Boolean(v)) => is_fixed_pitch = Some(v), + (b"UnderlinePosition", value) => { + if let Some(value) = value.to_f32() { + underline_position = Some(value); + } else { + todo!("{value:?}") + } + } + (b"UnderlineThickness", value) => { + if let Some(value) = value.to_f32() { + underline_thickness = Some(value); + } else { + todo!("{value:?}") + } + } + _ => {} + } + } + Ok(PdfFontType1FontInfo { + family_name, + full_name, + notice, + weight, + version, + italic_angle, + is_fixed_pitch, + underline_position, + underline_thickness, + }) + } + fn 
parse_font_dict( + &mut self, + dict: PsDictionary, + ) -> Result { + let PsDictionaryImpl { named, other } = dict.into_impl(); + let mut encoding = None; + let mut font_bbox = None; + let mut font_info = None; + let mut font_matrix = None; + let mut font_name = None; + for (key, value) in named { + match (&*key.0, value) { + (b"Encoding", PsObject::Array(value)) => { + encoding = Some(self.parse_font_encoding(value)?); + } + (b"FontBBox", PsObject::Array(value) | PsObject::Procedure(value)) => { + font_bbox = Some(self.parse_font_bbox(value)?); + } + (b"FontInfo", PsObject::Dictionary(value)) => { + font_info = Some(self.parse_font_info_dict(value)?); + } + (b"FontMatrix", PsObject::Array(value) | PsObject::Procedure(value)) => { + font_matrix = Some(self.parse_font_matrix(value)?); + } + (b"FontName", PsObject::Name(value)) => { + font_name = Some(value.into()); + } + (b"FontType", _) => { + // TODO + } + (b"PaintType", _) => { + // TODO + } + (_, value) => todo!("{key:?}: {value:?}"), + } + } + for (key, value) in other { + todo!("{key:?}: {value:?}"); + } + Ok(PdfFontType1Program { + encoding, + font_bbox, + font_info, + font_matrix, + font_name, + }) + } + fn parse(mut self) -> Result { + self.break_at_eexec = true; + match self.parse_file() { + Ok(()) => return custom_err("postscript eexec operator not found"), + Err(PsBreakReason::FoundEExec) => {} + Err(PsBreakReason::Error(e)) => return Err(e), + } + let Some(PsObject::File(_)) = self.operand_stack.pop() else { + return custom_err("postscript eexec operand not found"); + }; + let Some(PsObject::Dictionary(dict)) = self.operand_stack.pop() else { + todo!(); + }; + self.parse_font_dict(dict) + } + fn run_procedure(&mut self, proc: &[PsObject]) -> Result<(), PsBreakReason> { + for object in proc { + match object { + PsObject::Array(v) => todo!(), + PsObject::Procedure(v) => todo!(), + PsObject::Dictionary(v) => todo!(), + PsObject::Integer(_) => self.operand_stack.push(object.clone()), + PsObject::Mark => 
todo!(), + PsObject::Name(_) => self.operand_stack.push(object.clone()), + PsObject::ExecutableName(name) => self.run_name(name)?, + PsObject::Operator(v) => todo!(), + PsObject::Real(v) => todo!(), + PsObject::String(v) => todo!(), + PsObject::Boolean(v) => todo!(), + PsObject::Null => todo!(), + PsObject::File(v) => todo!(), + } + } + Ok(()) + } +} + +impl PdfStreamContents for PdfFontType1Program { + fn parse( + data: &[u8], + stream_pos: PdfInputPosition, + _objects: Arc, + ) -> Result { + PsParser::new(PsFile::new( + 0, + PsFileSource::Bytes(Rc::from(data)), + 0, + stream_pos, + )) + .parse() + } +} diff --git a/src/pdf/object.rs b/src/pdf/object.rs index 0931286..bf91fac 100644 --- a/src/pdf/object.rs +++ b/src/pdf/object.rs @@ -482,6 +482,81 @@ impl PdfParse for PdfStringOrNumber { } } +#[derive(Clone)] +pub enum PdfNameOrInteger { + Name(PdfName), + Integer(PdfInteger), +} + +impl fmt::Debug for PdfNameOrInteger { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Name(v) => v.fmt(f), + Self::Integer(v) => v.fmt(f), + } + } +} + +impl PdfNameOrInteger { + pub fn pos(self) -> PdfInputPosition { + match self { + Self::Name(v) => v.pos(), + Self::Integer(v) => v.pos(), + } + } +} + +impl PdfObjectDirect { + pub fn name_or_integer(&self) -> Option { + match *self { + PdfObjectDirect::Name(ref v) => Some(PdfNameOrInteger::Name(v.clone())), + PdfObjectDirect::Integer(v) => Some(PdfNameOrInteger::Integer(v)), + PdfObjectDirect::Boolean(_) + | PdfObjectDirect::Real(_) + | PdfObjectDirect::String(_) + | PdfObjectDirect::Array(_) + | PdfObjectDirect::Dictionary(_) + | PdfObjectDirect::Stream(_) + | PdfObjectDirect::Null(_) => None, + } + } +} + +impl PdfObjectNonNull { + pub fn name_or_integer(&self) -> Option { + match *self { + PdfObjectNonNull::Name(ref v) => Some(PdfNameOrInteger::Name(v.clone())), + PdfObjectNonNull::Integer(v) => Some(PdfNameOrInteger::Integer(v)), + PdfObjectNonNull::Boolean(_) + | PdfObjectNonNull::Real(_) + | 
PdfObjectNonNull::String(_) + | PdfObjectNonNull::Array(_) + | PdfObjectNonNull::Dictionary(_) + | PdfObjectNonNull::Stream(_) => None, + } + } +} + +impl IsPdfNull for PdfNameOrInteger { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfNameOrInteger { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("name or integer") + } + fn parse(object: PdfObject) -> Result { + let object = PdfObjectDirect::from(object); + object.name_or_integer().ok_or(PdfParseError::InvalidType { + pos: object.pos(), + ty: object.type_name(), + expected_ty: "name or integer", + }) + } +} + macro_rules! make_pdf_object { ( $( diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs index 4ec885e..1d57f5e 100644 --- a/src/pdf/parse.rs +++ b/src/pdf/parse.rs @@ -1302,7 +1302,7 @@ macro_rules! pdf_parse { $crate::__std::result::Result::Err($crate::pdf::parse::PdfParseError::InvalidName { pos: name.pos(), name, - expected_ty: $crate::__std::stringify!($Struct), + expected_ty: $crate::__std::stringify!($Enum), }) } } diff --git a/src/pdf/render.rs b/src/pdf/render.rs index 4fb56eb..8ffbcac 100644 --- a/src/pdf/render.rs +++ b/src/pdf/render.rs @@ -40,10 +40,7 @@ use crate::{ IsPdfNull, PdfMatrix, PdfName, PdfNumber, PdfObject, PdfObjectDirect, PdfStringOrNumber, PdfVec2D, }, - parse::{ - GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse, - PdfParseError, - }, + parse::{PdfInputPosition, PdfInputPositionNoCompare, PdfParse, PdfParseError}, }, pdf_parse, }; @@ -929,8 +926,21 @@ impl PdfRenderOperator for PdfOperatorShowTextWithGlyphPositioning { PdfStringOrNumber::String(s) => { for glyph in s.bytes().iter() { let positioning = std::mem::replace(&mut positioning, 0.0); - let encoding = font.encoding(); - todo!("{encoding:?}"); + let Some(encoding) = font.encoding() else { + todo!(); + }; + let table = encoding.table(|| { + let Some(font_encoding) = font + .font_descriptor() + .and_then(|v| v.font_file.as_ref()) + .and_then(|v| 
v.decoded_data().as_ref().ok()) + .and_then(|v| v.encoding.as_ref()) + else { + todo!() + }; + todo!("{font_encoding:?}"); + }); + todo!("{table:?}"); } } PdfStringOrNumber::Number(number) => positioning = number.as_f32(), diff --git a/src/util.rs b/src/util.rs index 1a4440c..9d576f8 100644 --- a/src/util.rs +++ b/src/util.rs @@ -104,6 +104,65 @@ impl fmt::Display for ArcOrRef<'_, T> { } } +/// a stable alternative to `CloneToUninit` for `Arc` +pub trait ArcFromRef { + /// like `Arc::new(Self::clone(self))` but works for unsized types too + fn arc_from_ref(&self) -> Arc; + /// generic version of `Arc::make_mut` + fn make_mut(this: &mut Arc) -> &mut Self; +} + +impl ArcFromRef for T { + fn arc_from_ref(&self) -> Arc { + Arc::new(Self::clone(self)) + } + fn make_mut(this: &mut Arc) -> &mut Self { + Arc::make_mut(this) + } +} + +impl ArcFromRef for [T] { + fn arc_from_ref(&self) -> Arc { + Arc::from(self) + } + fn make_mut(this: &mut Arc) -> &mut Self { + Arc::make_mut(this) + } +} + +impl ArcFromRef for str { + fn arc_from_ref(&self) -> Arc { + Arc::from(self) + } + fn make_mut(this: &mut Arc) -> &mut Self { + Arc::make_mut(this) + } +} + +impl<'a, T: ?Sized + ArcFromRef> ArcOrRef<'a, T> { + pub fn into_arc(this: Self) -> Arc { + match this { + ArcOrRef::Arc(v) => v, + ArcOrRef::Ref(v) => T::arc_from_ref(v), + } + } + pub fn make_arc(this: &mut Self) -> &mut Arc { + match this { + ArcOrRef::Arc(v) => v, + ArcOrRef::Ref(v) => { + *this = ArcOrRef::Arc(T::arc_from_ref(v)); + let ArcOrRef::Arc(v) = this else { + unreachable!(); + }; + v + } + } + } + pub fn make_mut(this: &mut Self) -> &mut T { + T::make_mut(Self::make_arc(this)) + } +} + trait DagDebugStateSealed {} #[expect(private_bounds)] From b8a97a2326b5bab5858b55a969822cbc5b25dc24 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 30 Dec 2025 22:48:56 -0800 Subject: [PATCH 09/42] parse ToUnicode stream --- src/pdf/font.rs | 105 ++++++++-- src/pdf/font/to_unicode_parse.rs | 325 +++++++++++++++++++++++++++++++ 
src/pdf/font/type_1_parse.rs | 126 ++++++++++-- src/pdf/parse.rs | 19 +- src/pdf/render.rs | 12 +- 5 files changed, 545 insertions(+), 42 deletions(-) create mode 100644 src/pdf/font/to_unicode_parse.rs diff --git a/src/pdf/font.rs b/src/pdf/font.rs index 04b62f3..2196bf6 100644 --- a/src/pdf/font.rs +++ b/src/pdf/font.rs @@ -1,5 +1,6 @@ use crate::{ pdf::{ + font::type_1_parse::PsFile, object::{ IsPdfNull, PdfArray, PdfDictionary, PdfMatrix, PdfName, PdfNameOrInteger, PdfObject, PdfObjectDirect, PdfRectangle, PdfStream, PdfString, @@ -15,26 +16,88 @@ use crate::{ use std::{borrow::Cow, collections::BTreeMap, fmt, sync::Arc}; mod tables; +mod to_unicode_parse; mod type_1_parse; pdf_parse! { - #[pdf(transparent)] - #[derive(Clone)] - // TODO: actually parse the stream - pub struct PdfFontToUnicode { - #[pdf] - stream: PdfStream, + #[pdf] + #[derive(Clone, Debug)] + pub struct PdfFontToUnicodeDictionary { + #[pdf(name = "UseCMap")] + pub base_map: Option, // TODO: parse + #[pdf(flatten)] + pub rest: PdfDictionary, } } +#[derive(Clone)] +pub struct PdfFontToUnicode { + pub base_map: Option, // TODO: parse + pub char_map_name: PdfName, + pub src_ranges: Arc<[std::ops::RangeInclusive]>, + pub to_unicode_map: Arc>>, +} + impl fmt::Debug for PdfFontToUnicode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - DagDebugState::scope(|_state| { - let Self { stream } = self; - f.debug_struct("PdfFontToUnicode") - .field("stream", stream) - .finish() - }) + struct DebugFn) -> fmt::Result>(F); + impl) -> fmt::Result> fmt::Debug for DebugFn { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + (self.0)(f) + } + } + let Self { + base_map, + char_map_name, + src_ranges, + to_unicode_map, + } = self; + f.debug_struct("PdfFontToUnicode") + .field("base_map", base_map) + .field("char_map_name", char_map_name) + .field( + "src_ranges", + &DebugFn(|f| { + f.debug_set() + .entries( + src_ranges + .iter() + .map(|v| v.start().bytes_debug()..=v.end().bytes_debug()), + 
) + .finish() + }), + ) + .field( + "to_unicode_map", + &DebugFn(|f| { + f.debug_map() + .entries(to_unicode_map.iter().map(|(k, v)| (k.bytes_debug(), v))) + .finish() + }), + ) + .finish() + } +} + +impl IsPdfNull for PdfFontToUnicode { + fn is_pdf_null(&self) -> bool { + false + } +} + +impl PdfParse for PdfFontToUnicode { + fn type_name() -> Cow<'static, str> { + Cow::Borrowed("PdfFontToUnicode") + } + fn parse(object: PdfObject) -> Result { + let stream = PdfStream::::parse(object)?; + let base_map = stream.dictionary().rest.base_map.clone(); + let decoded_data = stream.decoded_data().clone()?; + to_unicode_parse::ToUnicodeParser::new(PsFile::from_arc_bytes( + decoded_data, + stream.get_pdf_input_position(), + )) + .parse(base_map) } } @@ -858,6 +921,17 @@ pub struct PdfSimpleFontEncodingTable { pub table: ArcOrRef<'static, [PdfSimpleFontEncodingTableEntry; 0x100]>, } +impl PdfSimpleFontEncodingTable { + pub const fn empty() -> Self { + const EMPTY_ENTRY: PdfSimpleFontEncodingTableEntry = + PdfSimpleFontEncodingTableEntry::new_static(None, None); + const EMPTY_TABLE: &[PdfSimpleFontEncodingTableEntry; 0x100] = &[EMPTY_ENTRY; 0x100]; + Self { + table: ArcOrRef::Ref(EMPTY_TABLE), + } + } +} + #[derive(Clone, Debug)] pub enum PdfSimpleFontEncoding { Predefined(PdfSimpleFontEncodingPredefined), @@ -903,11 +977,12 @@ impl PdfParse for PdfSimpleFontEncoding { #[derive(Clone, Debug)] #[non_exhaustive] pub struct PdfFontType1Program { - pub encoding: Option]>>, - pub font_bbox: Option, + pub encoding: PdfSimpleFontEncodingTable, + pub font_bbox: PdfRectangle, pub font_info: Option, - pub font_matrix: Option, + pub font_matrix: PdfMatrix, pub font_name: Option, + pub vertical_writing_mode: bool, } #[derive(Clone, Debug)] diff --git a/src/pdf/font/to_unicode_parse.rs b/src/pdf/font/to_unicode_parse.rs new file mode 100644 index 0000000..e5c57be --- /dev/null +++ b/src/pdf/font/to_unicode_parse.rs @@ -0,0 +1,325 @@ +use std::{collections::BTreeMap, sync::Arc}; + +use 
crate::{ + pdf::{ + font::{ + PdfFontToUnicode, + type_1_parse::{PsFile, Token}, + }, + object::{PdfName, PdfObjectDirect, PdfString}, + parse::{PdfInputPosition, PdfParseError}, + }, + util::ArcOrRef, +}; + +pub(crate) struct ToUnicodeParser { + tokenizer: PsFile, +} + +#[track_caller] +fn invalid_token_err(pos: PdfInputPosition, token: Option) -> Result { + Err(PdfParseError::InvalidTokenInToUnicodeStream { + pos, + token: format!("{token:?}"), + }) +} + +impl ToUnicodeParser { + pub(crate) fn new(tokenizer: PsFile) -> Self { + Self { tokenizer } + } + fn expect_any_string(&mut self) -> Result, PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::String(string)) => Ok(string), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_string_with_len( + &mut self, + expected_len: usize, + ) -> Result, PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::String(string)) if string.len() == expected_len => Ok(string), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_literal_name( + &mut self, + expected_name: &[u8], + ) -> Result<(), PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::LiteralName(name)) if name == expected_name => Ok(()), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_any_literal_name(&mut self) -> Result, PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? 
{ + Some(Token::LiteralName(name)) => Ok(name), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_executable_name( + &mut self, + expected_name: &[u8], + ) -> Result<(), PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::ExecutableName(name)) if name == expected_name => Ok(()), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect(&mut self, expected_token: Token) -> Result<(), PdfParseError> { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(token) if token == expected_token => Ok(()), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn expect_integer(&mut self) -> Result { + self.tokenizer.skip_comments_and_whitespace(); + let pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::Integer(value)) => Ok(value), + token => invalid_token_err(pos, token), + } + } + pub(crate) fn parse_dict( + &mut self, + mut entry_callback: impl FnMut(Vec, PdfInputPosition, Token) -> Result<(), PdfParseError>, + ) -> Result<(), PdfParseError> { + self.expect(Token::DictStart)?; + loop { + self.tokenizer.skip_comments_and_whitespace(); + let name_pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::DictEnd) => return Ok(()), + Some(Token::LiteralName(name)) => { + self.tokenizer.skip_comments_and_whitespace(); + let value_pos = self.tokenizer.pos(); + let Some(value) = self.tokenizer.next_token()? 
else { + return invalid_token_err(value_pos, None); + }; + entry_callback(name, value_pos, value)?; + } + token => { + return invalid_token_err(name_pos, token); + } + } + } + } + pub(crate) fn parse( + mut self, + base_map: Option, + ) -> Result { + self.tokenizer.skip_comments_and_whitespace(); + self.expect_literal_name(b"CIDInit")?; + self.expect_literal_name(b"ProcSet")?; + self.expect_executable_name(b"findresource")?; + self.expect_executable_name(b"begin")?; + self.expect_integer()?; + self.expect_executable_name(b"dict")?; + self.expect_executable_name(b"begin")?; + self.expect_executable_name(b"begincmap")?; + self.expect_literal_name(b"CIDSystemInfo")?; + let mut registry = None; + let mut ordering = None; + let mut supplement = None; + self.parse_dict(|name, value_pos, value| match &*name { + b"Registry" => { + let Token::String(v) = value else { + return invalid_token_err(value_pos, Some(value)); + }; + registry = Some(v); + Ok(()) + } + b"Ordering" => { + let Token::String(v) = value else { + return invalid_token_err(value_pos, Some(value)); + }; + ordering = Some(v); + Ok(()) + } + b"Supplement" => { + let Token::Integer(v) = value else { + return invalid_token_err(value_pos, Some(value)); + }; + supplement = Some(v); + Ok(()) + } + _ => todo!("{}: {value:?}", name.escape_ascii()), + })?; + self.expect_executable_name(b"def")?; + self.expect_literal_name(b"CMapName")?; + self.tokenizer.skip_comments_and_whitespace(); + let char_map_name_pos = self.tokenizer.pos(); + let char_map_name = self.expect_any_literal_name()?; + self.expect_executable_name(b"def")?; + self.expect_literal_name(b"CMapType")?; + self.expect(Token::Integer(2))?; + self.expect_executable_name(b"def")?; + self.expect(Token::Integer(1))?; + self.expect_executable_name(b"begincodespacerange")?; + self.tokenizer.skip_comments_and_whitespace(); + let range_start_pos = self.tokenizer.pos(); + let range_start = self.expect_any_string()?; + if range_start.is_empty() { + return 
invalid_token_err(range_start_pos, Some(Token::String(range_start))); + } + self.tokenizer.skip_comments_and_whitespace(); + let range_end_pos = self.tokenizer.pos(); + let range_end = self.expect_string_with_len(range_start.len())?; + self.expect_executable_name(b"endcodespacerange")?; + let mut to_unicode_map: BTreeMap> = BTreeMap::new(); + let mut dest_str = String::new(); + let mut insert_mapping = |src_pos: PdfInputPosition, + src: &[u8], + dest_pos: PdfInputPosition, + dest_utf16_be: &[u8]| + -> Result<(), PdfParseError> { + dest_str.clear(); + for ch in char::decode_utf16( + dest_utf16_be + .chunks(2) + .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]])), + ) { + match ch { + Ok(ch) => dest_str.push(ch), + Err(_) => { + return Err(PdfParseError::InvalidUtf16 { pos: dest_pos }); + } + } + } + to_unicode_map.insert( + PdfString::new(src_pos, ArcOrRef::Arc(src.into())), + dest_str.as_str().into(), + ); + Ok(()) + }; + loop { + match self.tokenizer.next_token()? { + Some(Token::Integer(size)) => match self.tokenizer.next_token()? { + Some(Token::ExecutableName(name)) if name == b"beginbfrange" => { + for _ in 0..size { + self.tokenizer.skip_comments_and_whitespace(); + let src_pos = self.tokenizer.pos(); + let src_low = self.expect_string_with_len(range_start.len())?; + self.tokenizer.skip_comments_and_whitespace(); + let src_high_pos = self.tokenizer.pos(); + let src_high = self.expect_string_with_len(range_start.len())?; + if src_low.split_last().map(|(_, prefix)| prefix) + != src_high.split_last().map(|(_, prefix)| prefix) + { + return invalid_token_err( + src_high_pos, + Some(Token::String(src_high)), + ); + } + let src_last_range = *src_low.last().expect("known to be non-empty") + ..=*src_high.last().expect("known to be non-empty"); + self.tokenizer.skip_comments_and_whitespace(); + let dest_pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? 
{ + Some(Token::String(dest)) + if dest.len() >= 2 && dest.len() % 2 == 0 => + { + let mut src = src_low; + for (index, src_last_byte) in src_last_range.enumerate() { + *src.last_mut().expect("known to be non-empty") = + src_last_byte; + let mut dest = dest.clone(); + let [.., last] = &mut *dest else { + unreachable!(); + }; + *last += index as u8; + insert_mapping(src_pos, &src, dest_pos, &dest)?; + } + } + Some(token @ Token::String(_)) => { + todo!("odd number of dest bytes: {token:?}"); + } + Some(Token::ArrayStart) => { + let mut src = src_low; + for src_last_byte in src_last_range { + *src.last_mut().expect("known to be non-empty") = + src_last_byte; + self.tokenizer.skip_comments_and_whitespace(); + let dest_pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? { + Some(Token::String(dest)) + if dest.len() >= 2 && dest.len() % 2 == 0 => + { + insert_mapping(src_pos, &src, dest_pos, &dest)?; + } + Some(token @ Token::String(_)) => { + todo!("odd number of dest bytes: {token:?}"); + } + token => return invalid_token_err(dest_pos, token), + } + } + self.expect(Token::ArrayEnd)?; + } + token => return invalid_token_err(dest_pos, token), + } + } + self.expect_executable_name(b"endbfrange")?; + } + Some(Token::ExecutableName(name)) if name == b"beginbfchar" => { + for _ in 0..size { + self.tokenizer.skip_comments_and_whitespace(); + let src_pos = self.tokenizer.pos(); + let src = self.expect_string_with_len(range_start.len())?; + self.tokenizer.skip_comments_and_whitespace(); + let dest_pos = self.tokenizer.pos(); + match self.tokenizer.next_token()? 
{ + Some(Token::String(dest)) if dest.len() % 2 == 0 => { + insert_mapping(src_pos, &src, dest_pos, &dest)?; + } + Some(token @ Token::String(_)) => { + todo!("odd number of dest bytes: {token:?}"); + } + token => return invalid_token_err(dest_pos, token), + } + } + self.expect_executable_name(b"endbfchar")?; + } + token => todo!("{token:?}"), + }, + Some(Token::ExecutableName(name)) if name == b"endcmap" => { + break; + } + token => todo!("{token:?}"), + } + } + self.expect_executable_name(b"CMapName")?; + self.expect_executable_name(b"currentdict")?; + self.expect_literal_name(b"CMap")?; + self.expect_executable_name(b"defineresource")?; + self.expect_executable_name(b"pop")?; + self.expect_executable_name(b"end")?; + self.expect_executable_name(b"end")?; + self.tokenizer.skip_comments_and_whitespace(); + let eof_pos = self.tokenizer.pos(); + if let token @ Some(_) = self.tokenizer.next_token()? { + return invalid_token_err(eof_pos, token); + } + Ok(PdfFontToUnicode { + base_map, + char_map_name: PdfName::new(char_map_name_pos, Arc::<[u8]>::from(char_map_name)), + src_ranges: Arc::new([ + PdfString::new(range_start_pos, ArcOrRef::Arc(range_start.into())) + ..=PdfString::new(range_end_pos, ArcOrRef::Arc(range_end.into())), + ]), + to_unicode_map: Arc::new(to_unicode_map), + }) + } +} diff --git a/src/pdf/font/type_1_parse.rs b/src/pdf/font/type_1_parse.rs index c557d5a..1825e42 100644 --- a/src/pdf/font/type_1_parse.rs +++ b/src/pdf/font/type_1_parse.rs @@ -1,7 +1,10 @@ use crate::{ pdf::{ PdfObjects, - font::{PdfFontType1FontInfo, PdfFontType1Program}, + font::{ + PdfFontType1FontInfo, PdfFontType1Program, PdfSimpleFontEncodingTable, + PdfSimpleFontEncodingTableEntry, + }, object::{PdfMatrix, PdfName, PdfRectangle, PdfStreamContents, PdfString, PdfVec2D}, parse::{ PdfInputPosition, PdfInputPositionKnown, PdfInputPositionNoCompare, PdfParseError, @@ -52,7 +55,7 @@ impl PsFileDecryptedSource { #[derive(Clone)] enum PsFileSource { - Bytes(Rc<[u8]>), + 
Bytes(Arc<[u8]>), Decrypted(Rc>), } @@ -66,7 +69,7 @@ impl PsFileSource { } #[derive(Clone)] -struct PsFile { +pub(crate) struct PsFile { id: u64, source: PsFileSource, pos: Rc>, @@ -119,8 +122,8 @@ fn is_regular_char(v: u8) -> bool { struct NotALineEnd; -#[derive(Clone)] -enum Token { +#[derive(Clone, PartialEq)] +pub(crate) enum Token { Integer(i128), Real(f64), ArrayStart, @@ -131,6 +134,8 @@ enum Token { LiteralName(Vec), ImmediatelyEvaluatedName(Vec), String(Vec), + DictStart, + DictEnd, } impl fmt::Debug for Token { @@ -150,6 +155,8 @@ impl fmt::Debug for Token { Self::String(contents) => { write!(f, "String({})", contents.escape_ascii()) } + Self::DictStart => write!(f, "DictStart"), + Self::DictEnd => write!(f, "DictEnd"), } } } @@ -165,7 +172,10 @@ impl PsFile { })), } } - fn pos(&self) -> PdfInputPosition { + pub(crate) fn from_arc_bytes(bytes: Arc<[u8]>, stream_pos: PdfInputPosition) -> Self { + Self::new(0, PsFileSource::Bytes(bytes), 0, stream_pos) + } + pub(crate) fn pos(&self) -> PdfInputPosition { PdfInputPosition::new(Some(self.pos.get())) } fn peek_byte(&self) -> Option { @@ -200,7 +210,7 @@ impl PsFile { self.next_byte(); } } - fn skip_comments_and_whitespace(&mut self) { + pub(crate) fn skip_comments_and_whitespace(&mut self) { loop { self.skip_whitespace(); let Some(b'%') = self.peek_byte() else { @@ -340,7 +350,41 @@ impl PsFile { } Err(PdfParseError::TruncatedFile { pos: self.pos() }) } - fn next_token(&mut self) -> Result, PdfParseError> { + fn parse_string_after_l_angle(&mut self) -> Result, PdfParseError> { + let mut contents = Vec::new(); + let mut high_digit_value = None; + let mut push_digit_value = |value: u8| { + high_digit_value = match high_digit_value { + Some(high_digit_value) => { + contents.push((high_digit_value << 4) | value); + None + } + None => Some(value), + }; + }; + let string_pos = self.pos(); + loop { + let pos = self.pos(); + match self.next_byte() { + None => { + return Err(PdfParseError::TruncatedFile { pos }); + } 
+ Some(b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ') => {} + Some(b'>') => { + // if we have an odd trailing digit, add the final digit, otherwise doesn't modify contents + push_digit_value(0); + return Ok(contents); + } + Some(b) => { + let Some(value) = (b as char).to_digit(0x10) else { + return Err(PdfParseError::InvalidHexStringDigit { pos }); + }; + push_digit_value(value as u8); + } + } + } + } + pub(crate) fn next_token(&mut self) -> Result, PdfParseError> { self.skip_comments_and_whitespace(); let Some(first_byte) = self.peek_byte() else { return Ok(None); @@ -352,9 +396,26 @@ impl PsFile { } b')' => todo!(), b'<' => { - todo!("encoded string"); + self.next_byte(); + match self.peek_byte() { + Some(b'<') => { + self.next_byte(); + Ok(Some(Token::DictStart)) + } + Some(b'~') => todo!("base 85 encoded string"), + _ => Ok(Some(Token::String(self.parse_string_after_l_angle()?))), + } + } + b'>' => { + self.next_byte(); + match self.peek_byte() { + Some(b'>') => { + self.next_byte(); + Ok(Some(Token::DictEnd)) + } + _ => todo!("stray >"), + } } - b'>' => todo!(), b'[' => { self.next_byte(); Ok(Some(Token::ArrayStart)) @@ -724,12 +785,15 @@ impl PsOperator { let PsObject::Integer(initial) = initial else { todo!("{initial:?}"); }; - let PsObject::Integer(increment @ (..=-1 | 1..)) = increment else { + let PsObject::Integer(increment) = increment else { todo!("{increment:?}"); }; let PsObject::Integer(limit) = limit else { todo!("{limit:?} {:?}", parser.operand_stack); }; + if increment == 0 { + return custom_err("postscript for operator: increment can't be zero"); + }; let mut counter = initial; let proc = proc.into_vec(); loop { @@ -1158,6 +1222,8 @@ impl PsParser { Token::Real(v) => PsObject::Real(PsReal(v)), Token::ArrayStart => PsObject::ExecutableName(PsName(b"[".into())), Token::ArrayEnd => PsObject::ExecutableName(PsName(b"]".into())), + Token::DictStart => PsObject::ExecutableName(PsName(b"<<".into())), + Token::DictEnd => 
PsObject::ExecutableName(PsName(b">>".into())), Token::ProcedureStart => PsObject::Procedure(self.parse_procedure()?), Token::ProcedureEnd => return Ok(PsArray::from_elements(self, objects)), Token::ExecutableName(name) => PsObject::ExecutableName(PsName(name.into())), @@ -1176,6 +1242,8 @@ impl PsParser { Token::Real(v) => self.operand_stack.push(PsObject::Real(PsReal(v))), Token::ArrayStart => self.run_name(&PsName(b"[".into()))?, Token::ArrayEnd => self.run_name(&PsName(b"]".into()))?, + Token::DictStart => self.run_name(&PsName(b"<<".into()))?, + Token::DictEnd => self.run_name(&PsName(b">>".into()))?, Token::ProcedureStart => { let procedure = self.parse_procedure()?; self.operand_stack.push(PsObject::Procedure(procedure)) @@ -1199,26 +1267,30 @@ impl PsParser { fn parse_font_encoding( &mut self, value: PsArray, - ) -> Result]>, PdfParseError> { + ) -> Result { let value = value.rc(); let value = value.borrow(); - let mut vec = Vec::with_capacity(value.len()); + let mut retval = PdfSimpleFontEncodingTable::empty(); + let mut table_iter = ArcOrRef::make_mut(&mut retval.table).iter_mut(); for entry in value.iter() { match entry { PsObject::Name(name) => { - if name.0 == b".notdef" { - vec.push(None); + let name = if name.0 == b".notdef" { + None } else { - vec.push(Some(PdfName::new( - self.tokenizer.pos(), - Arc::from(&*name.0), - ))); + Some(PdfName::new(self.tokenizer.pos(), Arc::from(&*name.0))) + }; + if let Some(entry) = table_iter.next() { + *entry = PdfSimpleFontEncodingTableEntry { + name, + presumed_unicode: None, + }; } } _ => todo!("{entry:?}"), } } - Ok(Arc::from(vec)) + Ok(retval) } fn parse_font_bbox(&mut self, value: PsArray) -> Result { let value = value.rc(); @@ -1332,6 +1404,7 @@ impl PsParser { let mut font_info = None; let mut font_matrix = None; let mut font_name = None; + let mut vertical_writing_mode = false; for (key, value) in named { match (&*key.0, value) { (b"Encoding", PsObject::Array(value)) => { @@ -1349,6 +1422,7 @@ impl PsParser 
{ (b"FontName", PsObject::Name(value)) => { font_name = Some(value.into()); } + (b"WMode", PsObject::Boolean(v)) => vertical_writing_mode = v, (b"FontType", _) => { // TODO } @@ -1361,12 +1435,22 @@ impl PsParser { for (key, value) in other { todo!("{key:?}: {value:?}"); } + let Some(encoding) = encoding else { + return custom_err("postscript type 1 font must have Encoding"); + }; + let Some(font_bbox) = font_bbox else { + return custom_err("postscript type 1 font must have FontBBox"); + }; + let Some(font_matrix) = font_matrix else { + return custom_err("postscript type 1 font must have FontMatrix"); + }; Ok(PdfFontType1Program { encoding, font_bbox, font_info, font_matrix, font_name, + vertical_writing_mode, }) } fn parse(mut self) -> Result { @@ -1414,7 +1498,7 @@ impl PdfStreamContents for PdfFontType1Program { ) -> Result { PsParser::new(PsFile::new( 0, - PsFileSource::Bytes(Rc::from(data)), + PsFileSource::Bytes(Arc::from(data)), 0, stream_pos, )) diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs index 1d57f5e..4e5502a 100644 --- a/src/pdf/parse.rs +++ b/src/pdf/parse.rs @@ -296,6 +296,13 @@ pub enum PdfParseError { MissingSetFontOperator { pos: PdfInputPosition, }, + InvalidTokenInToUnicodeStream { + pos: PdfInputPosition, + token: String, + }, + InvalidUtf16 { + pos: PdfInputPosition, + }, } impl From for PdfParseError { @@ -345,7 +352,9 @@ impl GetPdfInputPosition for PdfParseError { | PdfParseError::CantRestoreGraphicsStateWithEmptyStack { pos } | PdfParseError::FontResourceNotFound { pos, .. } | PdfParseError::MissingBeginTextOperator { pos } - | PdfParseError::MissingSetFontOperator { pos } => pos, + | PdfParseError::MissingSetFontOperator { pos } + | PdfParseError::InvalidTokenInToUnicodeStream { pos, .. 
} + | PdfParseError::InvalidUtf16 { pos } => pos, PdfParseError::OperatorNotAllowedHere { ref operator } => operator.pos(), PdfParseError::OperatorHasTooFewOperands { ref operator } | PdfParseError::OperatorHasTooManyOperands { ref operator } => operator.pos(), @@ -488,7 +497,7 @@ impl fmt::Display for PdfParseError { ) } PdfParseError::MissingOperator { pos } => { - write!(f, "at {pos}: stream not allowed here") + write!(f, "at {pos}: missing operator") } PdfParseError::OperatorHasTooFewOperands { ref operator } => { write!( @@ -525,6 +534,12 @@ impl fmt::Display for PdfParseError { "at {pos}: missing set font `Tf` operator before this text showing operator" ) } + PdfParseError::InvalidTokenInToUnicodeStream { pos, ref token } => { + write!(f, "at {pos}: invalid token in ToUnicode stream: {token}") + } + PdfParseError::InvalidUtf16 { pos } => { + write!(f, "at {pos}: invalid UTF-16") + } } } } diff --git a/src/pdf/render.rs b/src/pdf/render.rs index 8ffbcac..586b6c4 100644 --- a/src/pdf/render.rs +++ b/src/pdf/render.rs @@ -35,7 +35,7 @@ use crate::{ PdfOperatorUnparsed, }, document_structure::{PdfPage, PdfResourcesDictionary}, - font::{PdfFont, PdfTodo}, + font::{PdfFont, PdfSimpleFontEncodingTableEntry, PdfTodo}, object::{ IsPdfNull, PdfMatrix, PdfName, PdfNumber, PdfObject, PdfObjectDirect, PdfStringOrNumber, PdfVec2D, @@ -934,13 +934,17 @@ impl PdfRenderOperator for PdfOperatorShowTextWithGlyphPositioning { .font_descriptor() .and_then(|v| v.font_file.as_ref()) .and_then(|v| v.decoded_data().as_ref().ok()) - .and_then(|v| v.encoding.as_ref()) + .map(|v| v.encoding.clone()) else { todo!() }; - todo!("{font_encoding:?}"); + font_encoding }); - todo!("{table:?}"); + let PdfSimpleFontEncodingTableEntry { + name, + presumed_unicode, + } = table.table[usize::from(*glyph)].clone(); + todo!("{name:?} {presumed_unicode:?} {:#?}", font.to_unicode()); } } PdfStringOrNumber::Number(number) => positioning = number.as_f32(), From 0688724f03dd808becdf1249e13b962ccf8db446 Mon 
Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Wed, 31 Dec 2025 19:43:19 -0800 Subject: [PATCH 10/42] WIP --- src/pdf/font.rs | 62 +++++++++++++++++++++- src/pdf/parse.rs | 21 ++++++-- src/pdf/render.rs | 127 ++++++++++++++++++++++++++++++---------------- 3 files changed, 160 insertions(+), 50 deletions(-) diff --git a/src/pdf/font.rs b/src/pdf/font.rs index 2196bf6..5f096b4 100644 --- a/src/pdf/font.rs +++ b/src/pdf/font.rs @@ -246,6 +246,10 @@ impl fmt::Debug for PdfFontDescriptor { } } +impl PdfFontDescriptor { + pub const DEFAULT_MISSING_WIDTH: f32 = 0.0; +} + pdf_parse! { #[pdf(name)] #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] @@ -520,6 +524,15 @@ impl PdfParse for PdfFontType1 { } } +impl GetPdfInputPosition for PdfFontType1 { + fn get_pdf_input_position(&self) -> PdfInputPosition { + match self { + PdfFontType1::Standard(v) => v.get_pdf_input_position(), + PdfFontType1::Other(v) => v.get_pdf_input_position(), + } + } +} + #[derive(Clone)] pub struct PdfFontType1Common { pub ty: PdfFontType, @@ -535,6 +548,29 @@ pub struct PdfFontType1Common { pub rest: PdfDictionary, } +impl PdfFontType1Common { + fn validate_first_last_char_and_widths( + pos: PdfInputPosition, + first_char: Option, + last_char: Option, + widths: Option<&[f32]>, + ) -> Result<(), PdfParseError> { + if first_char.is_some() || last_char.is_some() || widths.is_some() { + let (Some(first_char), Some(last_char), Some(widths)) = (first_char, last_char, widths) + else { + return Err(PdfParseError::InvalidFontFirstLastCharWidths { pos }); + }; + let Some(widths_len) = (u64::from(last_char) + 1).checked_sub(first_char.into()) else { + return Err(PdfParseError::InvalidFontFirstLastCharWidths { pos }); + }; + if u64::try_from(widths.len()).ok() != Some(widths_len) { + return Err(PdfParseError::InvalidFontFirstLastCharWidths { pos }); + } + } + Ok(()) + } +} + impl fmt::Debug for PdfFontType1Common { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 
DagDebugState::scope(|_state| { @@ -568,8 +604,16 @@ impl fmt::Debug for PdfFontType1Common { } } +impl GetPdfInputPosition for PdfFontType1Common { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.rest.pos() + } +} + pdf_parse! { - #[pdf] + #[pdf(validate = |pos| { + PdfFontType1Common::validate_first_last_char_and_widths(pos, first_char, last_char, widths.as_deref())?; + })] #[derive(Clone)] pub struct PdfFontType1Standard { #[pdf(name = "Type")] @@ -661,8 +705,16 @@ impl PdfFontType1Standard { } } +impl GetPdfInputPosition for PdfFontType1Standard { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.rest.pos() + } +} + pdf_parse! { - #[pdf] + #[pdf(validate = |pos| { + PdfFontType1Common::validate_first_last_char_and_widths(pos, Some(first_char), Some(last_char), Some(&widths))?; + })] #[derive(Clone)] pub struct PdfFontType1Other { #[pdf(name = "Type")] @@ -754,6 +806,12 @@ impl PdfFontType1Other { } } +impl GetPdfInputPosition for PdfFontType1Other { + fn get_pdf_input_position(&self) -> PdfInputPosition { + self.rest.pos() + } +} + pdf_parse! { #[pdf(name)] #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] diff --git a/src/pdf/parse.rs b/src/pdf/parse.rs index 4e5502a..3f0ff21 100644 --- a/src/pdf/parse.rs +++ b/src/pdf/parse.rs @@ -303,6 +303,9 @@ pub enum PdfParseError { InvalidUtf16 { pos: PdfInputPosition, }, + InvalidFontFirstLastCharWidths { + pos: PdfInputPosition, + }, } impl From for PdfParseError { @@ -354,7 +357,8 @@ impl GetPdfInputPosition for PdfParseError { | PdfParseError::MissingBeginTextOperator { pos } | PdfParseError::MissingSetFontOperator { pos } | PdfParseError::InvalidTokenInToUnicodeStream { pos, .. 
} - | PdfParseError::InvalidUtf16 { pos } => pos, + | PdfParseError::InvalidUtf16 { pos } + | PdfParseError::InvalidFontFirstLastCharWidths { pos } => pos, PdfParseError::OperatorNotAllowedHere { ref operator } => operator.pos(), PdfParseError::OperatorHasTooFewOperands { ref operator } | PdfParseError::OperatorHasTooManyOperands { ref operator } => operator.pos(), @@ -540,6 +544,9 @@ impl fmt::Display for PdfParseError { PdfParseError::InvalidUtf16 { pos } => { write!(f, "at {pos}: invalid UTF-16") } + PdfParseError::InvalidFontFirstLastCharWidths { pos } => { + write!(f, "at {pos}: invalid font first/last_char and/or widths") + } } } } @@ -992,7 +999,7 @@ macro_rules! pdf_parse { }; ( @impl - #[pdf] + #[pdf$(($(validate = |$pos:pat_param| $validate:expr)?))?] struct $Struct:ident$(<$($StructParam:ident $(: $StructBound:tt)?),* $(,)?>)? { $($(#[$($field_meta:tt)*])* $field_name:ident: $field_ty:ty,)* @@ -1046,6 +1053,10 @@ macro_rules! pdf_parse { $(#[$($field_meta)*])* $field_name: $field_ty })* + $($({ + let $pos = pos; + $validate + })?)? $crate::__std::result::Result::Ok(Self { $($field_name,)* }) @@ -1085,7 +1096,8 @@ macro_rules! pdf_parse { [$(#[$($field_meta:tt)*])*] $field_name:ident: $field_ty:ty ) => { - let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( + #[allow(unused_mut)] + let mut $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( $crate::pdf::object::PdfObject::Dictionary( $crate::pdf::object::PdfDictionary::from_fields($pos, $object), ), @@ -1097,7 +1109,8 @@ macro_rules! 
pdf_parse { $field_name:ident: $field_ty:ty ) => { let $field_name = $crate::__std::convert::AsRef::<[$crate::__std::primitive::u8]>::as_ref($name); - let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( + #[allow(unused_mut)] + let mut $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse( $object_mut .remove($field_name) .unwrap_or($crate::pdf::object::PdfObject::Null($crate::pdf::object::PdfNull::new($pos))), diff --git a/src/pdf/render.rs b/src/pdf/render.rs index 586b6c4..4603ae1 100644 --- a/src/pdf/render.rs +++ b/src/pdf/render.rs @@ -35,12 +35,15 @@ use crate::{ PdfOperatorUnparsed, }, document_structure::{PdfPage, PdfResourcesDictionary}, - font::{PdfFont, PdfSimpleFontEncodingTableEntry, PdfTodo}, + font::{PdfFont, PdfFontDescriptor, PdfSimpleFontEncodingTableEntry, PdfTodo}, object::{ - IsPdfNull, PdfMatrix, PdfName, PdfNumber, PdfObject, PdfObjectDirect, + IsPdfNull, PdfMatrix, PdfName, PdfNumber, PdfObject, PdfObjectDirect, PdfString, PdfStringOrNumber, PdfVec2D, }, - parse::{PdfInputPosition, PdfInputPositionNoCompare, PdfParse, PdfParseError}, + parse::{ + GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse, + PdfParseError, + }, }, pdf_parse, }; @@ -270,8 +273,7 @@ impl PdfGraphicsState { pos: PdfInputPosition, glyph_displacement: PdfVec2D, position_adjustment: f32, - has_char_spacing: bool, - has_word_spacing: bool, + use_word_spacing: bool, ) -> Result<(), PdfParseError> { let text_object = PdfTextObjectState::require(self.text_state.text_object.as_mut(), pos)?; let (tx, ty) = if self @@ -283,21 +285,19 @@ impl PdfGraphicsState { { let mut ty = (glyph_displacement.y - position_adjustment * 1e-3) * self.text_state.font_size; - if has_char_spacing { - ty += self.text_state.char_spacing; - } - if has_word_spacing { + if use_word_spacing { ty += self.text_state.word_spacing; + } else { + ty += self.text_state.char_spacing; } (0.0, ty) } else { let mut tx = (glyph_displacement.x - position_adjustment 
* 1e-3) * self.text_state.font_size; - if has_char_spacing { - tx += self.text_state.char_spacing; - } - if has_word_spacing { + if use_word_spacing { tx += self.text_state.word_spacing; + } else { + tx += self.text_state.char_spacing; } (tx * self.text_state.horizontal_scaling_percent * 1e-2, 0.0) }; @@ -362,6 +362,73 @@ impl<'a> PdfRenderState<'a> { ) -> Result<(), PdfParseError> { todo!() } + pub fn render_string( + &mut self, + operator_pos: PdfInputPosition, + position_adjustment: f32, + s: &PdfString, + ) -> Result<(), PdfParseError> { + for &glyph in s.bytes().iter() { + let font = self + .graphics_state + .text_state + .font + .as_ref() + .ok_or(PdfParseError::MissingSetFontOperator { pos: operator_pos })?; + let PdfFont::Type1(font) = font else { todo!() }; + let Some(encoding) = font.encoding() else { + todo!(); + }; + let Some(widths) = font.widths() else { + todo!(); + }; + let Some(first_char) = font.first_char() else { + todo!(); + }; + let Some(last_char) = font.last_char() else { + todo!(); + }; + let Some(font_descriptor) = font.font_descriptor() else { + todo!(); + }; + let Some(font_program) = font_descriptor + .font_file + .as_ref() + .and_then(|v| v.decoded_data().as_ref().ok()) + else { + todo!(); + }; + if font_program.vertical_writing_mode { + todo!(); + } + let width = if u32::from(glyph) >= first_char && u32::from(glyph) <= last_char { + widths[usize::from(glyph) - first_char as usize] + } else { + font_descriptor + .missing_width + .unwrap_or(PdfFontDescriptor::DEFAULT_MISSING_WIDTH) + }; + todo!("handle position_adjustment"); + let matrix = self.graphics_state.text_rendering_matrix(s.pos())?; + let table = encoding.table(|| font_program.encoding.clone()); + let PdfSimpleFontEncodingTableEntry { + name, + presumed_unicode, + } = table.table[usize::from(glyph)].clone(); + todo!("{name:?} {presumed_unicode:?} {:#?}", font.to_unicode()); + self.graphics_state.advance_text_matrix( + s.pos(), + PdfVec2D { + pos: 
font.get_pdf_input_position().into(), + x: width * 1e-3, + y: 0.0, + }, + position_adjustment, + glyph == 32, + )?; + } + Ok(()) + } } pub trait PdfRenderOperator: Into { @@ -913,45 +980,17 @@ impl PdfRenderOperator for PdfOperatorShowTextWithGlyphPositioning { pos, ref text_and_positioning, } = *self; - let font = state - .graphics_state - .text_state - .font - .as_ref() - .ok_or(PdfParseError::MissingSetFontOperator { pos: pos.0 })?; - let PdfFont::Type1(font) = font else { todo!() }; let mut positioning = 0.0; for text_or_positioning in text_and_positioning.iter() { match text_or_positioning { PdfStringOrNumber::String(s) => { - for glyph in s.bytes().iter() { - let positioning = std::mem::replace(&mut positioning, 0.0); - let Some(encoding) = font.encoding() else { - todo!(); - }; - let table = encoding.table(|| { - let Some(font_encoding) = font - .font_descriptor() - .and_then(|v| v.font_file.as_ref()) - .and_then(|v| v.decoded_data().as_ref().ok()) - .map(|v| v.encoding.clone()) - else { - todo!() - }; - font_encoding - }); - let PdfSimpleFontEncodingTableEntry { - name, - presumed_unicode, - } = table.table[usize::from(*glyph)].clone(); - todo!("{name:?} {presumed_unicode:?} {:#?}", font.to_unicode()); - } + let positioning = std::mem::replace(&mut positioning, 0.0); + state.render_string(pos.0, positioning, s)?; } PdfStringOrNumber::Number(number) => positioning = number.as_f32(), } } - let _ = state; - todo!("{text_and_positioning:?}") + Ok(()) } } From 2381421776858bcc982f9835cdd8b9f53cbc2db4 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Wed, 31 Dec 2025 20:34:10 -0800 Subject: [PATCH 11/42] add copyright headers and check script --- .gitignore | 4 +- README.md | 4 ++ parse_powerisa_pdf/parse_powerisa_pdf.py | 2 + parse_powerisa_pdf/quad_tree.py | 2 + parse_powerisa_pdf/set_by_id.py | 2 + pyproject.toml | 2 + scripts/check-copyright.sh | 63 ++++++++++++++++++++++++ 7 files changed, 78 insertions(+), 1 deletion(-) create mode 100755 
scripts/check-copyright.sh diff --git a/.gitignore b/.gitignore index 50e4eb1..59baa47 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information /.venv /.vscode *.egg-info __pycache__ *.log -/powerisa-instructions.xml \ No newline at end of file +/powerisa-instructions.xml diff --git a/README.md b/README.md index f8fae5d..7fd652f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ + parser for the OPF PowerISA 3.1C pdf to attempt to extract all instructions' pseudo-code including subscripts/superscripts and other formatting Usage: diff --git a/parse_powerisa_pdf/parse_powerisa_pdf.py b/parse_powerisa_pdf/parse_powerisa_pdf.py index c7187d1..a4afd09 100755 --- a/parse_powerisa_pdf/parse_powerisa_pdf.py +++ b/parse_powerisa_pdf/parse_powerisa_pdf.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information from __future__ import annotations from collections import defaultdict from collections.abc import Generator, Iterable, Iterator, Callable diff --git a/parse_powerisa_pdf/quad_tree.py b/parse_powerisa_pdf/quad_tree.py index 34343e8..bee9d76 100644 --- a/parse_powerisa_pdf/quad_tree.py +++ b/parse_powerisa_pdf/quad_tree.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information from __future__ import annotations from typing import Callable, Generic, Iterable, Iterator, TypeVar from math import frexp, isfinite, isnan, ldexp diff --git a/parse_powerisa_pdf/set_by_id.py b/parse_powerisa_pdf/set_by_id.py index 444741b..969f8d7 100644 --- a/parse_powerisa_pdf/set_by_id.py +++ b/parse_powerisa_pdf/set_by_id.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information from collections import abc from typing import Callable, Generic, Iterable, Iterator, Protocol, TypeAlias, TypeVar, overload diff --git a/pyproject.toml b/pyproject.toml 
index c2ec3e0..68c4029 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information [build-system] requires = ["setuptools >= 61.0"] build-backend = "setuptools.build_meta" diff --git a/scripts/check-copyright.sh b/scripts/check-copyright.sh new file mode 100755 index 0000000..640fb4b --- /dev/null +++ b/scripts/check-copyright.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# SPDX-License-Identifier: LGPL-3.0-or-later +# See Notices.txt for copyright information +set -e + +function fail() +{ + local error="$1" + echo "error: $error" >&2 + exit 1 +} + +function fail_file() +{ + local file="$1" line="$2" error="$3" + fail "$file:$((line + 1)): $error" +} + +function check_file() +{ + local file="$1" regexes=("${@:2}") + local lines + mapfile -t lines < "$file" + if (("${#lines[@]}" == 0)); then + return # empty file, no copyright needed + fi + local line + for line in "${!regexes[@]}"; do + eval '[[ "${lines[i]}" =~ '"${regexes[i]}"' ]]' || + fail_file "$file" "$line" "doesn't match regex: ${regexes[i]}" + done +} + +POUND_HEADER=('^"# SPDX-License-Identifier: LGPL-3.0-or-later"$' '^"# See Notices.txt for copyright information"$') +MD_HEADER=('^" parser for the OPF PowerISA 3.1C pdf to attempt to extract all instructions' pseudo-code including subscripts/superscripts and other formatting +# Using the new Rust code: + +Usage: +* Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from + +* Install Rust -- you need version 1.89.0 or later. + + Getting it from https://rustup.rs/ is recommended. + +* Compile and run: + + ```bash + cargo run -- path/to/downloaded/OPF_PowerISA_v3.1C.pdf > out.log + ``` + +* This will spit out lots of errors and then successfully create + the output file -- `powerisa-instructions.xml` in the current directory. 
+ +# Using the old Python code: + Usage: * Download the OPF PowerISA 3.1C pdf (yes you need that exact version) from * Obtain CPython 3.11 (the default `python3` in [Debian Bookworm](https://www.debian.org/releases/bookworm/)) From 38a1fb328bd44f26389c28fbf66716154f4113dc Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Tue, 6 Jan 2026 16:13:36 -0800 Subject: [PATCH 42/42] add build dependencies to readme --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 9fba609..f589559 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,15 @@ Usage: Getting it from https://rustup.rs/ is recommended. +* Install required build dependencies: + + On Debian 12: + + ```bash + sudo apt update + sudo apt install build-essential clang unzip + ``` + * Compile and run: ```bash