parses root successfully

This commit is contained in:
Jacob Lifshay 2025-12-24 21:49:57 -08:00
parent 5fbfaa8053
commit 83631cc4c6
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ
7 changed files with 623 additions and 118 deletions

View file

@ -1,11 +1,11 @@
use crate::{
pdf::{
object::{
MaybeArray, PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull,
PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfReal, PdfStream,
PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject,
PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream,
PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary,
},
parse::{PdfInputPosition, PdfParse, PdfParseError},
parse::{GetPdfInputPosition, PdfInputPosition, PdfParse, PdfParseError},
},
pdf_parse,
util::ArcOrRef,
@ -21,9 +21,15 @@ use std::{
pub mod object;
pub mod parse;
pub mod stream_filters;
/// Payload stored behind `PdfObjects::inner`: the object table together with the object
/// streams that were found while parsing the body.
struct PdfObjectsInner {
/// Parsed objects keyed by their identifier.
objects: BTreeMap<PdfObjectIdentifier, PdfObject>,
/// Streams whose dictionary was accepted as an object-stream dictionary.
object_streams: Vec<PdfStream<PdfObjectStreamDictionary>>,
}
/// Holder for the parsed objects; the contents sit behind a `OnceLock`, so they can be set
/// only once.
// NOTE(review): `objects` and `inner` look like the before/after forms of a single field as
// shown in this diff view (the constructor further down also lists both) — confirm which one
// is current.
pub struct PdfObjects {
objects: OnceLock<BTreeMap<PdfObjectIdentifier, PdfObject>>,
inner: OnceLock<PdfObjectsInner>,
}
#[derive(Copy, Clone, Debug)]
@ -70,24 +76,12 @@ pdf_parse! {
pub struct PdfXRefStreamDictionaryRest {
#[pdf(name = "Type")]
pub ty: PdfXRefName,
#[pdf(name = "Size")]
pub size: usize,
#[pdf(name = "Index")]
pub index: Option<Arc<[usize]>>,
#[pdf(name = "Prev")]
pub prev: Option<usize>,
#[pdf(name = "W")]
pub w: Option<Arc<[usize]>>,
#[pdf(name = "Root")]
pub root: PdfDictionary,
#[pdf(name = "Encrypt")]
pub encrypt: Option<PdfDictionary>,
#[pdf(name = "Info")]
pub info: Option<PdfDictionary>,
#[pdf(name = "ID")]
pub id: Option<[PdfString; 2]>,
#[pdf(flatten)]
pub rest: PdfDictionary,
pub rest: PdfTrailerDictionary,
}
}
@ -308,6 +302,38 @@ struct PdfParser<'a> {
}
impl<'a> PdfParser<'a> {
/// Runs `f` on a temporary parser that reads from `tokenizer` instead of `self.tokenizer`.
///
/// The temporary parser shares `objects_arc` and takes over `objects_map` and
/// `unparsed_stream_dictionaries` for the duration of the call; once `f` returns, those three
/// fields are written back into `self`. `self.tokenizer` is left untouched.
///
/// Returns whatever `f` returns.
fn with_tokenizer<'b, R>(
    &mut self,
    tokenizer: PdfTokenizer<'b>,
    f: impl FnOnce(&mut PdfParser<'b>) -> R,
) -> R {
    // Lend the accumulated state to a parser bound to the replacement tokenizer.
    let mut scoped = PdfParser {
        objects_arc: self.objects_arc.clone(),
        objects_map: std::mem::take(&mut self.objects_map),
        unparsed_stream_dictionaries: std::mem::take(&mut self.unparsed_stream_dictionaries),
        tokenizer,
    };
    let result = f(&mut scoped);
    // Hand the (possibly updated) state back; the temporary tokenizer is simply dropped.
    self.objects_arc = scoped.objects_arc;
    self.objects_map = scoped.objects_map;
    self.unparsed_stream_dictionaries = scoped.unparsed_stream_dictionaries;
    result
}
fn parse_header(&mut self) -> Result<PdfHeader, PdfParseError> {
let Some(b'%') = self.tokenizer.bytes.first() else {
return Err(PdfParseError::NotAPdfFile);
@ -739,18 +765,94 @@ impl<'a> PdfParser<'a> {
Ok(Some(()))
}
}
/// Parses the contents of one object stream from the current tokenizer.
///
/// The input is expected to begin with `N` pairs of unsigned integers (an object number
/// followed by a byte position), where `N` is `object_stream.dictionary().rest.n`. After the
/// pairs, one object is parsed per pair, in the same order, and inserted into
/// `self.objects_map` under that object number with generation number 0.
///
/// # Errors
///
/// * `PdfParseError::InvalidObjectNumber` if an object number cannot be read.
/// * `PdfParseError::InvalidNumber` if a byte position cannot be read.
/// * `PdfParseError::DuplicateIndirectObjectDefinition` if an identifier is already present
///   in `self.objects_map`.
/// * Any error returned by `parse_object`.
fn parse_object_stream_inner(
    &mut self,
    object_stream: &PdfStream<PdfObjectStreamDictionary>,
) -> Result<(), PdfParseError> {
    let declared_count = object_stream.dictionary().rest.n;
    // `declared_count` comes straight from the file, so it must not be trusted as an
    // allocation size: a huge value would make `Vec::with_capacity` panic or abort. Every
    // pair needs at least one input byte, so the input length is a safe upper bound for the
    // capacity hint (the vector still grows on demand).
    let mut object_ids_and_byte_positions = Vec::<(PdfObjectIdentifier, usize)>::with_capacity(
        declared_count.min(self.tokenizer.bytes.len()),
    );
    for _ in 0..declared_count {
        self.skip_comments_and_whitespace();
        let Some((pos, object_number)) =
            self.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))?
        else {
            return Err(PdfParseError::InvalidObjectNumber {
                pos: self.tokenizer.pos(),
            });
        };
        self.skip_comments_and_whitespace();
        let Some((_, byte_position)) =
            self.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))?
        else {
            return Err(PdfParseError::InvalidNumber {
                pos: self.tokenizer.pos(),
            });
        };
        object_ids_and_byte_positions.push((
            PdfObjectIdentifier {
                pos: pos.into(),
                object_number,
                generation_number: 0,
            },
            byte_position,
        ));
    }
    // The byte positions are read but not used: objects are parsed back-to-back.
    // NOTE(review): ISO 32000-1 §7.5.7 defines these offsets relative to the stream's `First`
    // entry — confirm that sequential parsing is acceptable for all inputs.
    for (id, _byte_position) in object_ids_and_byte_positions {
        let object = self.parse_object()?;
        if self.objects_map.insert(id, object).is_some() {
            return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id });
        }
    }
    Ok(())
}
/// Parses every object stored in `object_stream` and adds it to the parser's object map.
///
/// The stream's decoded data is tokenized from position 0 by a temporary parser (see
/// `with_tokenizer`) that runs `parse_object_stream_inner`.
///
/// # Errors
///
/// An error from obtaining the decoded data is propagated unchanged. An error raised while
/// parsing the contents is wrapped in `PdfParseError::ObjectStreamParseError`, carrying the
/// stream's input position.
fn parse_object_stream(
    &mut self,
    object_stream: &PdfStream<PdfObjectStreamDictionary>,
) -> Result<(), PdfParseError> {
    let decoded = object_stream.decoded_data().as_ref()?;
    let tokenizer = PdfTokenizer::new(decoded, 0);
    let outcome =
        self.with_tokenizer(tokenizer, |inner| inner.parse_object_stream_inner(object_stream));
    match outcome {
        Ok(()) => Ok(()),
        Err(cause) => Err(PdfParseError::ObjectStreamParseError {
            stream_pos: object_stream.get_pdf_input_position(),
            error: Arc::new(cause),
        }),
    }
}
/// Parses the body of the file: every indirect object definition, then the contents of any
/// object streams, and finally stores the collected objects in `self.objects_arc`.
// NOTE(review): this span appears to carry both the removed and the added lines of the change
// (for example the two consecutive `.try_for_each` calls below) — confirm which lines are
// current before treating it as compilable code.
fn parse_body(&mut self) -> Result<(), PdfParseError> {
// Keep parsing indirect object definitions until the parser returns `None`.
while let Some(()) = self.parse_indirect_object_definition()? {}
// Store the collected map in `objects_arc.objects`; a failing `set` is treated as impossible.
let Ok(()) = self
.objects_arc
.objects
.set(std::mem::take(&mut self.objects_map))
else {
unreachable!();
};
// Call `finish_parsing` on each queued unparsed stream dictionary, stopping at the first error.
self.unparsed_stream_dictionaries
.drain(..)
.try_for_each(|v| v.finish_parsing())
.try_for_each(|v| v.finish_parsing())?;
// Collect the stream objects whose dictionary is accepted by
// `PdfObjectStreamDictionary::parse_type_from_dictionary`.
let mut object_streams: Vec<PdfStream<PdfObjectStreamDictionary>> = Vec::new();
for object in self.objects_map.values_mut() {
let stream = match object {
PdfObject::Stream(stream) => stream,
// Every other variant is skipped.
PdfObject::Boolean(_)
| PdfObject::Integer(_)
| PdfObject::Real(_)
| PdfObject::String(_)
| PdfObject::Name(_)
| PdfObject::Array(_)
| PdfObject::Dictionary(_)
| PdfObject::Null(_)
| PdfObject::Indirect(_) => continue,
};
if PdfObjectStreamDictionary::parse_type_from_dictionary(&stream.dictionary().rest)
.is_ok()
{
// Re-parse a clone of the object as a typed object stream.
object_streams.push(PdfStream::parse(object.clone())?);
}
}
// Parse the objects contained in each collected object stream.
for object_stream in &object_streams {
self.parse_object_stream(object_stream)?;
}
// Store the final object map together with the object streams in `objects_arc.inner`;
// a failing `set` is treated as impossible.
let Ok(()) = self.objects_arc.inner.set(PdfObjectsInner {
objects: std::mem::take(&mut self.objects_map),
object_streams,
}) else {
unreachable!();
};
Ok(())
}
fn parse_xref_table(&mut self) -> Result<(), PdfParseError> {
self.skip_comments_and_whitespace();
@ -844,7 +946,7 @@ impl Pdf {
pub fn parse(bytes: impl AsRef<[u8]>) -> Result<Pdf, PdfParseError> {
PdfParser {
objects_arc: Arc::new(PdfObjects {
objects: OnceLock::new(),
inner: OnceLock::new(),
}),
objects_map: BTreeMap::new(),
unparsed_stream_dictionaries: vec![],