parses root successfully

This commit is contained in:
Jacob Lifshay 2025-12-24 21:49:57 -08:00
parent 5fbfaa8053
commit 83631cc4c6
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ
7 changed files with 623 additions and 118 deletions

View file

@ -5,15 +5,17 @@ use crate::{
GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse,
PdfParseError,
},
stream_filters::PdfStreamFilter,
},
pdf_parse,
util::ArcOrRef,
};
use std::{
any::TypeId,
borrow::Cow,
borrow::{Borrow, Cow},
collections::BTreeMap,
fmt::{self, Write},
iter::FusedIterator,
num::NonZero,
sync::{Arc, OnceLock},
};
@ -61,6 +63,12 @@ pub struct PdfName {
bytes: ArcOrRef<'static, [u8]>,
}
/// Allows a [`PdfName`] to be borrowed as the raw bytes of the name, so that
/// collections keyed by `PdfName` can be queried with a plain `&[u8]`.
///
/// NOTE(review): the `Borrow` contract requires `PdfName`'s `Ord`/`Eq`/`Hash`
/// to agree with those of `[u8]` — confirm that `PdfName` compares by its
/// bytes only.
impl Borrow<[u8]> for PdfName {
    fn borrow(&self) -> &[u8] {
        // Coerce the stored `ArcOrRef` down to the byte slice it wraps.
        let raw_bytes: &[u8] = &self.bytes;
        raw_bytes
    }
}
impl PdfName {
pub fn try_new(
pos: impl Into<PdfInputPositionNoCompare>,
@ -218,24 +226,51 @@ macro_rules! make_pdf_object {
$Variant:ident($ty:ty),
)+
) => {
#[derive(Clone, Debug)]
#[derive(Clone)]
pub enum PdfObjectNonNull {
$($Variant($ty),)*
}
#[derive(Clone, Debug)]
impl fmt::Debug for PdfObjectNonNull {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
$(Self::$Variant(v) => v.fmt(f),)*
}
}
}
#[derive(Clone)]
pub enum PdfObjectDirect {
$($Variant($ty),)*
Null(PdfNull),
}
#[derive(Clone, Debug)]
impl fmt::Debug for PdfObjectDirect {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
$(Self::$Variant(v) => v.fmt(f),)*
Self::Null(v) => v.fmt(f),
}
}
}
#[derive(Clone)]
pub enum PdfObject {
$($Variant($ty),)*
Null(PdfNull),
Indirect(PdfObjectIndirect),
}
impl fmt::Debug for PdfObject {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
$(Self::$Variant(v) => v.fmt(f),)*
Self::Null(v) => v.fmt(f),
Self::Indirect(v) => v.fmt(f),
}
}
}
$(
impl From<$ty> for PdfObjectNonNull {
fn from(value: $ty) -> Self {
@ -546,12 +581,12 @@ impl PdfObjectIndirect {
}
pub fn get(&self) -> PdfObjectDirect {
if let Some(objects) = self.objects.upgrade() {
if let Some(objects) = objects.objects.get() {
if let Some(objects) = objects.inner.get() {
let final_id = self.final_id.get().copied();
let limit = if final_id.is_some() { 1 } else { 1000usize };
let mut id = final_id.unwrap_or(self.id);
for _ in 0..limit {
if let Some(object) = objects.get(&self.id) {
if let Some(object) = objects.objects.get(&self.id) {
let retval = match object {
PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v),
PdfObject::Integer(v) => PdfObjectDirect::Integer(*v),
@ -628,18 +663,27 @@ impl PdfDictionary {
}
pub fn contains_key<Q: ?Sized>(&self, key: &Q) -> bool
where
PdfName: std::borrow::Borrow<Q> + Ord,
PdfName: std::borrow::Borrow<Q>,
Q: Ord,
{
self.fields.contains_key(key)
}
pub fn get<Q: ?Sized>(&self, key: &Q) -> Option<&PdfObject>
where
PdfName: std::borrow::Borrow<Q> + Ord,
PdfName: std::borrow::Borrow<Q>,
Q: Ord,
{
self.fields.get(key)
}
/// Looks up `key` and returns a clone of the stored object.
///
/// When the dictionary has no entry for `key`, a [`PdfObject::Null`] is
/// returned instead; that null carries this dictionary's own position.
pub fn get_or_null<Q: ?Sized>(&self, key: &Q) -> PdfObject
where
    PdfName: std::borrow::Borrow<Q>,
    Q: Ord,
{
    match self.get(key) {
        Some(found) => found.clone(),
        None => PdfObject::Null(PdfNull(self.pos)),
    }
}
/// Returns the input position recorded for this dictionary.
pub fn pos(&self) -> PdfInputPosition {
    // `self.pos` is a wrapper; hand back the position stored inside it.
    let position: PdfInputPosition = self.pos.0;
    position
}
@ -842,35 +886,6 @@ impl<T> std::ops::DerefMut for MaybeArray<T> {
}
}
pdf_parse! {
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum PdfStreamFilter {
#[pdf(name = "ASCIIHexDecode")]
AsciiHexDecode,
#[pdf(name = "ASCII85Decode")]
Ascii85Decode,
#[pdf(name = "LZWDecode")]
LzwDecode,
#[pdf(name = "FlateDecode")]
FlateDecode,
#[pdf(name = "RunLengthDecode")]
RunLengthDecode,
#[pdf(name = "CCITTFaxDecode")]
CcittFaxDecode,
#[pdf(name = "JBIG2Decode")]
Jbig2Decode,
#[pdf(name = "DCTDecode")]
DctDecode,
#[pdf(name = "JPXDecode")]
JpxDecode,
#[pdf(name = "Crypt")]
Crypt,
#[pdf(other)]
Unknown(PdfName),
}
}
impl<T> Default for MaybeArray<T> {
fn default() -> Self {
Self(Arc::default())
@ -936,47 +951,101 @@ impl PdfStreamDictionary {
pub(crate) fn parse_len_from_dictionary(
dictionary: &PdfDictionary,
) -> Result<usize, PdfParseError> {
PdfParse::parse(
dictionary
.get(&PdfName::new_static(Self::LENGTH_NAME.as_bytes()))
.cloned()
.unwrap_or_default(),
PdfParse::parse(dictionary.get_or_null(Self::LENGTH_NAME.as_bytes()))
}
}
/// Borrowing iterator over a stream's filters, each paired with the decode
/// parameters dictionary found at the same index.
#[derive(Debug, Clone, Default)]
pub struct PdfStreamDictionaryFiltersAndParms<'a> {
    /// Remaining filters together with their index in the filter list.
    filters: std::iter::Enumerate<std::slice::Iter<'a, PdfStreamFilter>>,
    /// Decode parameters, looked up by filter index.
    decode_parms: &'a [Option<PdfDictionary>],
}

impl<'a> PdfStreamDictionaryFiltersAndParms<'a> {
    /// Turns one enumerated filter into the iterator's item.
    ///
    /// The parameters are taken from `decode_parms` at the filter's index. If
    /// that slot is out of range or holds `None`, a shared default (empty)
    /// dictionary is used instead.
    fn item_helper(
        filter: (usize, &'a PdfStreamFilter),
        decode_parms: &'a [Option<PdfDictionary>],
    ) -> (&'a PdfStreamFilter, &'a PdfDictionary) {
        // Lazily created once and shared by every caller that needs a fallback.
        static EMPTY_DICTIONARY: OnceLock<PdfDictionary> = OnceLock::new();
        let (position, stream_filter) = filter;
        let parms = decode_parms
            .get(position)
            .and_then(Option::as_ref)
            .unwrap_or_else(|| EMPTY_DICTIONARY.get_or_init(PdfDictionary::default));
        (stream_filter, parms)
    }
}
impl<Rest> PdfStreamDictionary<Rest> {
pub fn filters_and_parms(
&self,
) -> impl Clone + ExactSizeIterator + DoubleEndedIterator<Item = (PdfStreamFilter, PdfDictionary)>
{
self.filters.iter().enumerate().map(|(index, filter)| {
(
filter.clone(),
self.decode_parms
.0
.get(index)
.cloned()
.flatten()
.unwrap_or_default(),
)
})
impl<'a> Iterator for PdfStreamDictionaryFiltersAndParms<'a> {
type Item = (&'a PdfStreamFilter, &'a PdfDictionary);
fn next(&mut self) -> Option<Self::Item> {
self.filters
.next()
.map(|filter| Self::item_helper(filter, self.decode_parms))
}
pub fn file_filters_and_parms(
&self,
) -> impl Clone + ExactSizeIterator + DoubleEndedIterator<Item = (PdfStreamFilter, PdfDictionary)>
fn size_hint(&self) -> (usize, Option<usize>) {
self.filters.size_hint()
}
fn nth(&mut self, n: usize) -> Option<Self::Item> {
self.filters
.nth(n)
.map(|filter| Self::item_helper(filter, self.decode_parms))
}
fn fold<B, F>(self, init: B, f: F) -> B
where
F: FnMut(B, Self::Item) -> B,
{
self.file_filters.iter().enumerate().map(|(index, filter)| {
(
filter.clone(),
self.file_decode_parms
.0
.get(index)
.cloned()
.flatten()
.unwrap_or_default(),
)
})
self.filters
.map(|filter| Self::item_helper(filter, self.decode_parms))
.fold(init, f)
}
}
// Marker impls: every iterator method of this type forwards to the inner
// enumerated slice iterator and relies on it for fused and exact-size behavior.
impl FusedIterator for PdfStreamDictionaryFiltersAndParms<'_> {}

impl ExactSizeIterator for PdfStreamDictionaryFiltersAndParms<'_> {}
/// Back-to-front iteration; every method forwards to the inner enumerated
/// slice iterator and converts the result with `item_helper`.
impl<'a> DoubleEndedIterator for PdfStreamDictionaryFiltersAndParms<'a> {
    fn next_back(&mut self) -> Option<Self::Item> {
        let entry = self.filters.next_back()?;
        Some(Self::item_helper(entry, self.decode_parms))
    }

    fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
        let entry = self.filters.nth_back(n)?;
        Some(Self::item_helper(entry, self.decode_parms))
    }

    fn rfold<B, F>(self, init: B, f: F) -> B
    where
        F: FnMut(B, Self::Item) -> B,
    {
        // Split the iterator so the closure only needs the parameter slice.
        let Self {
            filters,
            decode_parms,
        } = self;
        let mut combine = f;
        filters.rfold(init, |acc, entry| {
            combine(acc, Self::item_helper(entry, decode_parms))
        })
    }
}
impl<Rest> PdfStreamDictionary<Rest> {
pub fn filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> {
PdfStreamDictionaryFiltersAndParms {
filters: self.filters.iter().enumerate(),
decode_parms: &self.decode_parms,
}
}
pub fn file_filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> {
PdfStreamDictionaryFiltersAndParms {
filters: self.file_filters.iter().enumerate(),
decode_parms: &self.file_decode_parms,
}
}
}
@ -1001,16 +1070,64 @@ impl<Rest: PdfParse> UnparsedPdfStreamDictionary<Rest> {
pub struct PdfStream<Rest = PdfDictionary> {
pos: PdfInputPositionNoCompare,
dictionary: Arc<OnceLock<PdfStreamDictionary<Rest>>>,
data: Arc<[u8]>,
encoded_data: Arc<[u8]>,
decoded_data: Arc<OnceLock<Result<Arc<[u8]>, PdfParseError>>>,
}
/// Formatting helper that prints a byte slice as double-quoted, ASCII-escaped
/// text, 32 bytes per line. An empty slice prints as `""`.
struct DumpBytes<'a>(&'a [u8]);

impl fmt::Debug for DumpBytes<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug output is deliberately the same as the Display output.
        <Self as fmt::Display>::fmt(self, f)
    }
}

impl fmt::Display for DumpBytes<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        /// Number of input bytes rendered on each output line.
        const BYTES_PER_LINE: usize = 32;

        let bytes = self.0;
        if bytes.is_empty() {
            // `chunks` yields nothing for an empty slice, so emit the empty
            // quoted string explicitly.
            return f.write_str("\"\"");
        }
        for (line_index, line) in bytes.chunks(BYTES_PER_LINE).enumerate() {
            if line_index != 0 {
                f.write_str("\n")?;
            }
            write!(f, "\"{}\"", line.escape_ascii())?;
        }
        Ok(())
    }
}
impl<Rest: fmt::Debug> fmt::Debug for PdfStream<Rest> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("PdfStream")
.field("pos", &self.pos)
.field("dictionary", &self.dictionary)
.field("data", &format_args!("{:02x?}", self.data))
.finish()
let Self {
pos,
dictionary,
encoded_data,
decoded_data,
} = self;
let mut debug_struct = f.debug_struct("PdfStream");
debug_struct.field("pos", pos);
if let Some(dictionary) = dictionary.get() {
debug_struct.field("dictionary", dictionary);
} else {
debug_struct.field("dictionary", &format_args!("<not-yet-parsed>"));
}
debug_struct.field("encoded_data", &DumpBytes(encoded_data));
if let Some(decoded_data) = decoded_data.get() {
match decoded_data {
Ok(decoded_data) => debug_struct.field("decoded_data", &DumpBytes(decoded_data)),
Err(e) => debug_struct.field("decoded_data", &Err::<(), _>(e)),
};
} else {
debug_struct.field("decoded_data", &format_args!("<not-yet-decoded>"));
}
debug_struct.finish()
}
}
@ -1018,25 +1135,27 @@ impl<Rest> PdfStream<Rest> {
pub fn new(
pos: impl Into<PdfInputPositionNoCompare>,
dictionary: PdfStreamDictionary<Rest>,
data: Arc<[u8]>,
encoded_data: Arc<[u8]>,
) -> Self {
Self {
pos: pos.into(),
dictionary: Arc::new(OnceLock::from(dictionary)),
data,
encoded_data,
decoded_data: Arc::new(OnceLock::new()),
}
}
pub(crate) fn new_unparsed(
pos: impl Into<PdfInputPositionNoCompare>,
unparsed_dictionary: PdfDictionary,
data: Arc<[u8]>,
encoded_data: Arc<[u8]>,
) -> (Self, UnparsedPdfStreamDictionary<Rest>) {
let dictionary = Arc::new(OnceLock::new());
(
Self {
pos: pos.into(),
dictionary: dictionary.clone(),
data,
encoded_data,
decoded_data: Arc::new(OnceLock::new()),
},
UnparsedPdfStreamDictionary {
unparsed_dictionary,
@ -1049,8 +1168,29 @@ impl<Rest> PdfStream<Rest> {
.get()
.expect("haven't finished parsing all pdf object definitions yet")
}
pub fn data(&self) -> &Arc<[u8]> {
&self.data
pub fn encoded_data(&self) -> &Arc<[u8]> {
&self.encoded_data
}
/// Runs the stream's filter chain over the encoded bytes and returns the result.
///
/// When the dictionary lists no filters, the encoded buffer is returned as-is
/// (only the `Arc` is cloned). Otherwise each filter's output becomes the next
/// filter's input and the final bytes are copied into a new `Arc<[u8]>`.
///
/// # Errors
/// Returns the first error produced by a filter's `decode_stream_data`.
///
/// # Panics
/// Reaches `todo!()` when `dictionary.file` is set; that case is not
/// implemented yet.
/// NOTE(review): `self.dictionary()` looks like it panics if the dictionary has
/// not been parsed yet — confirm.
fn try_decode_data(&self) -> Result<Arc<[u8]>, PdfParseError> {
let dictionary = self.dictionary();
let (data, filters) = if let Some(file) = &dictionary.file {
// Streams whose data lives in an external file are not supported yet.
todo!()
} else {
(&self.encoded_data, dictionary.filters_and_parms())
};
// No filters: the encoded bytes already are the decoded bytes.
if filters.len() == 0 {
return Ok(data.clone());
}
let mut data: &[u8] = data;
// `buffer` owns the latest filter output so that `data` can keep borrowing it
// across loop iterations.
let mut buffer;
for (filter, filter_parms) in filters {
buffer = filter.decode_stream_data(filter_parms.clone(), self.pos.0, &data)?;
data = &buffer;
}
Ok(Arc::from(data))
}
/// Returns the decoded stream data, decoding it on first use.
///
/// The outcome of `try_decode_data` — success or error — is stored in the
/// `decoded_data` cell by `get_or_init`, and later calls return a reference
/// to that stored value.
pub fn decoded_data(&self) -> &Result<Arc<[u8]>, PdfParseError> {
    let decode = || self.try_decode_data();
    self.decoded_data.get_or_init(decode)
}
}
@ -1099,7 +1239,8 @@ impl<Rest: PdfParse> PdfParse for PdfStream<Rest> {
rest: Rest::parse(rest.clone().into())?,
}))
},
data: stream.data,
encoded_data: stream.encoded_data,
decoded_data: stream.decoded_data,
}),
object => Err(PdfParseError::InvalidType {
pos: object.get_pdf_input_position(),
@ -1109,3 +1250,37 @@ impl<Rest: PdfParse> PdfParse for PdfStream<Rest> {
}
}
}
// Type tag of an object-stream dictionary; used as the type of
// `PdfObjectStreamDictionary::ty`.
pdf_parse! {
#[derive(Clone, Copy, Debug, Hash, Default, PartialEq, Eq, PartialOrd, Ord)]
pub enum PdfObjectStreamType {
// Associated with the PDF name `ObjStm`; also the `Default` value.
#[pdf(name = "ObjStm")]
#[default]
ObjStm,
}
}
// Typed view of an object-stream dictionary. Each field is tied to a
// dictionary key through its `#[pdf(name = ...)]` attribute.
pdf_parse! {
#[derive(Clone, Debug)]
pub struct PdfObjectStreamDictionary {
// Stored under the key `Self::TYPE_NAME`.
#[pdf(name = Self::TYPE_NAME)]
pub ty: PdfObjectStreamType,
// Key `N` — presumably the number of objects held in the stream, per the
// PDF specification's object-stream entries; TODO confirm.
#[pdf(name = "N")]
pub n: usize,
// Key `First` — presumably the byte offset of the first object inside the
// decoded stream, per the PDF specification; TODO confirm.
#[pdf(name = "First")]
pub first: usize,
// Key `Extends` — optional indirect reference.
#[pdf(name = "Extends")]
pub extends: Option<PdfObjectIndirect>,
// NOTE(review): `flatten` presumably collects the remaining dictionary
// entries here — confirm against the `pdf_parse!` macro.
#[pdf(flatten)]
pub rest: PdfDictionary,
}
}
impl PdfObjectStreamDictionary {
pub const TYPE_NAME: &str = "Type";
pub(crate) fn parse_type_from_dictionary(
dictionary: &PdfDictionary,
) -> Result<PdfObjectStreamType, PdfParseError> {
PdfParse::parse(dictionary.get_or_null(Self::TYPE_NAME.as_bytes()))
}
}