Parse more of the PDF structure

This commit is contained in:
Jacob Lifshay 2025-12-26 01:13:52 -08:00
parent 83631cc4c6
commit e0993fdb4a
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ
9 changed files with 1549 additions and 100 deletions

View file

@ -1,11 +1,14 @@
use crate::{
pdf::{
document_structure::PdfDocumentCatalog,
object::{
PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject,
PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream,
PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary,
},
parse::{GetPdfInputPosition, PdfInputPosition, PdfParse, PdfParseError},
parse::{
GetPdfInputPosition, PdfInputPosition, PdfInputPositionKnown, PdfParse, PdfParseError,
},
},
pdf_parse,
util::ArcOrRef,
@ -19,12 +22,16 @@ use std::{
sync::{Arc, OnceLock},
};
pub mod content_stream;
pub mod document_structure;
pub mod font;
pub mod object;
pub mod parse;
pub mod stream_filters;
// Interior storage behind the parsed-object table: every object resolved so
// far, keyed by its identifier, plus the raw object streams encountered.
struct PdfObjectsInner {
// parsed objects, keyed by (object number, generation) identifier
objects: BTreeMap<PdfObjectIdentifier, PdfObject>,
// object streams seen while parsing; retained but not read anywhere yet
// (hence the dead_code allow) — presumably kept for later re-decoding. TODO confirm.
#[allow(dead_code)]
object_streams: Vec<PdfStream<PdfObjectStreamDictionary>>,
}
@ -43,6 +50,7 @@ impl PdfHeader {
}
pdf_parse! {
#[pdf]
#[derive(Clone, Debug)]
pub struct PdfTrailerDictionary {
#[pdf(name = "Size")]
@ -50,7 +58,7 @@ pdf_parse! {
#[pdf(name = "Prev")]
pub prev: Option<usize>,
#[pdf(name = "Root")]
pub root: PdfDictionary,
pub root: PdfDocumentCatalog,
#[pdf(name = "Encrypt")]
pub encrypt: Option<PdfDictionary>,
#[pdf(name = "Info")]
@ -63,6 +71,7 @@ pdf_parse! {
}
pdf_parse! {
#[pdf(name)]
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)]
pub enum PdfXRefName {
#[pdf(name = "XRef")]
@ -72,6 +81,7 @@ pdf_parse! {
}
pdf_parse! {
#[pdf]
#[derive(Clone, Debug)]
pub struct PdfXRefStreamDictionaryRest {
#[pdf(name = "Type")]
@ -97,6 +107,17 @@ pub enum PdfTrailer {
},
}
impl PdfTrailer {
/// Returns the trailer dictionary, regardless of how the trailer was
/// encoded in the file: either as a classic `trailer` dictionary or as
/// the dictionary of a cross-reference stream.
pub fn trailer_dictionary(&self) -> &PdfTrailerDictionary {
match self {
PdfTrailer::Trailer {
trailer_dictionary, ..
} => trailer_dictionary,
// For xref streams, the trailer fields are nested inside the stream
// dictionary; `rest.rest` reaches the embedded `PdfTrailerDictionary`
// (the return type guarantees the final field's type).
PdfTrailer::Stream { xref_stream, .. } => &xref_stream.dictionary().rest.rest,
}
}
}
pub struct Pdf {
pub header: PdfHeader,
pub objects: Arc<PdfObjects>,
@ -192,12 +213,12 @@ struct PdfTokenizerPeek<'a> {
#[derive(Clone)]
struct PdfTokenizer<'a> {
bytes: &'a [u8],
pos: usize,
pos: PdfInputPositionKnown,
peek_cache: Option<PdfTokenizerPeek<'a>>,
}
impl<'a> PdfTokenizer<'a> {
fn new(bytes: &'a [u8], pos: usize) -> Self {
fn new(bytes: &'a [u8], pos: PdfInputPositionKnown) -> Self {
Self {
bytes,
pos,
@ -205,14 +226,14 @@ impl<'a> PdfTokenizer<'a> {
}
}
fn pos(&self) -> PdfInputPosition {
PdfInputPosition::new(self.pos)
PdfInputPosition::new(Some(self.pos))
}
fn peek_byte(&mut self) -> Option<u8> {
self.bytes.get(self.pos).copied()
self.bytes.get(self.pos.pos).copied()
}
fn next_byte(&mut self) -> Option<u8> {
let b = self.bytes.get(self.pos)?;
self.pos += 1;
let b = self.bytes.get(self.pos.pos)?;
self.pos.pos += 1;
self.peek_cache = None;
Some(*b)
}
@ -229,14 +250,16 @@ impl<'a> PdfTokenizer<'a> {
let token = tokenizer.next()?;
self.peek_cache = Some(PdfTokenizerPeek {
token,
pos_after_token: tokenizer.pos,
pos_after_token: tokenizer.pos.pos,
});
Some(token)
}
fn read_bytes(&mut self, len: usize) -> Option<&'a [u8]> {
let retval = self.bytes.get(self.pos..self.pos.saturating_add(len))?;
let retval = self
.bytes
.get(self.pos.pos..self.pos.pos.saturating_add(len))?;
self.peek_cache = None;
self.pos += len;
self.pos.pos += len;
Some(retval)
}
}
@ -250,11 +273,11 @@ impl<'a> Iterator for PdfTokenizer<'a> {
pos_after_token,
}) = self.peek_cache.take()
{
self.pos = pos_after_token;
self.pos.pos = pos_after_token;
return Some(token);
}
loop {
let start_pos = self.pos;
let start_pos = self.pos.pos;
break match PdfCharCategory::new(self.next_byte()?) {
PdfCharCategory::Whitespace => continue,
PdfCharCategory::LParen => Some(PdfToken::LParen),
@ -272,22 +295,22 @@ impl<'a> Iterator for PdfTokenizer<'a> {
None | Some(b'\n') => break,
Some(b'\r') => {
if let Some(b'\n') = self.peek_byte() {
self.pos += 1;
self.pos.pos += 1;
}
break;
}
Some(_) => continue,
}
}
Some(PdfToken::Comment(&self.bytes[start_pos..self.pos]))
Some(PdfToken::Comment(&self.bytes[start_pos..self.pos.pos]))
}
PdfCharCategory::Regular => {
while let Some(PdfCharCategory::Regular) =
self.peek_byte().map(PdfCharCategory::new)
{
self.pos += 1;
self.pos.pos += 1;
}
Some(PdfToken::Regular(&self.bytes[start_pos..self.pos]))
Some(PdfToken::Regular(&self.bytes[start_pos..self.pos.pos]))
}
};
}
@ -647,7 +670,10 @@ impl<'a> PdfParser<'a> {
}
let Some(data) = self.tokenizer.read_bytes(len) else {
return Err(PdfParseError::TruncatedFile {
pos: PdfInputPosition::new(self.tokenizer.bytes.len()),
pos: PdfInputPosition::new(Some(PdfInputPositionKnown {
pos: self.tokenizer.bytes.len(),
..self.tokenizer.pos
})),
});
};
let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data));
@ -810,9 +836,22 @@ impl<'a> PdfParser<'a> {
object_stream: &PdfStream<PdfObjectStreamDictionary>,
) -> Result<(), PdfParseError> {
let data = object_stream.decoded_data().as_ref()?;
self.with_tokenizer(PdfTokenizer::new(data, 0), |parser| {
parser.parse_object_stream_inner(object_stream)
})
self.with_tokenizer(
PdfTokenizer::new(
data,
PdfInputPositionKnown {
pos: 0,
containing_streams_pos: Some(
object_stream
.get_pdf_input_position()
.get()
.expect("known to be set")
.pos,
),
},
),
|parser| parser.parse_object_stream_inner(object_stream),
)
.map_err(|e| PdfParseError::ObjectStreamParseError {
stream_pos: object_stream.get_pdf_input_position(),
error: Arc::new(e),
@ -913,7 +952,13 @@ impl<'a> PdfParser<'a> {
});
}
let old_tokenizer = self.tokenizer.clone();
self.tokenizer = PdfTokenizer::new(self.tokenizer.bytes, start_xref);
self.tokenizer = PdfTokenizer::new(
self.tokenizer.bytes,
PdfInputPositionKnown {
pos: start_xref,
containing_streams_pos: None,
},
);
let id = self.parse_object_identifier(false);
self.tokenizer = old_tokenizer;
let Some(id) = id? else {
@ -950,7 +995,13 @@ impl Pdf {
}),
objects_map: BTreeMap::new(),
unparsed_stream_dictionaries: vec![],
tokenizer: PdfTokenizer::new(bytes.as_ref(), 0),
tokenizer: PdfTokenizer::new(
bytes.as_ref(),
PdfInputPositionKnown {
pos: 0,
containing_streams_pos: None,
},
),
}
.parse_file()
}
@ -971,6 +1022,7 @@ mod tests {
#[test]
fn test_deserialize_dict() -> Result<(), PdfParseError> {
crate::pdf::parse::pdf_parse! {
#[pdf]
#[derive(Debug)]
#[allow(dead_code)]
struct TestStruct {