parsing more of the pdf structure
This commit is contained in:
parent
83631cc4c6
commit
e0993fdb4a
9 changed files with 1549 additions and 100 deletions
98
src/pdf.rs
98
src/pdf.rs
|
|
@ -1,11 +1,14 @@
|
|||
use crate::{
|
||||
pdf::{
|
||||
document_structure::PdfDocumentCatalog,
|
||||
object::{
|
||||
PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject,
|
||||
PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream,
|
||||
PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary,
|
||||
},
|
||||
parse::{GetPdfInputPosition, PdfInputPosition, PdfParse, PdfParseError},
|
||||
parse::{
|
||||
GetPdfInputPosition, PdfInputPosition, PdfInputPositionKnown, PdfParse, PdfParseError,
|
||||
},
|
||||
},
|
||||
pdf_parse,
|
||||
util::ArcOrRef,
|
||||
|
|
@ -19,12 +22,16 @@ use std::{
|
|||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
||||
pub mod content_stream;
|
||||
pub mod document_structure;
|
||||
pub mod font;
|
||||
pub mod object;
|
||||
pub mod parse;
|
||||
pub mod stream_filters;
|
||||
|
||||
/// Private container pairing a map of objects with a list of object streams.
struct PdfObjectsInner {
|
||||
/// Objects keyed by `PdfObjectIdentifier`; the `BTreeMap` keeps them ordered by that key.
objects: BTreeMap<PdfObjectIdentifier, PdfObject>,
|
||||
/// Streams whose dictionary type is `PdfObjectStreamDictionary`.
// NOTE(review): nothing in the visible code reads this field, which is presumably why
// the `dead_code` lint is silenced — confirm whether it is kept on purpose.
#[allow(dead_code)]
|
||||
object_streams: Vec<PdfStream<PdfObjectStreamDictionary>>,
|
||||
}
|
||||
|
||||
|
|
@ -43,6 +50,7 @@ impl PdfHeader {
|
|||
}
|
||||
|
||||
pdf_parse! {
|
||||
#[pdf]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PdfTrailerDictionary {
|
||||
#[pdf(name = "Size")]
|
||||
|
|
@ -50,7 +58,7 @@ pdf_parse! {
|
|||
#[pdf(name = "Prev")]
|
||||
pub prev: Option<usize>,
|
||||
#[pdf(name = "Root")]
|
||||
pub root: PdfDictionary,
|
||||
pub root: PdfDocumentCatalog,
|
||||
#[pdf(name = "Encrypt")]
|
||||
pub encrypt: Option<PdfDictionary>,
|
||||
#[pdf(name = "Info")]
|
||||
|
|
@ -63,6 +71,7 @@ pdf_parse! {
|
|||
}
|
||||
|
||||
pdf_parse! {
|
||||
#[pdf(name)]
|
||||
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)]
|
||||
pub enum PdfXRefName {
|
||||
#[pdf(name = "XRef")]
|
||||
|
|
@ -72,6 +81,7 @@ pdf_parse! {
|
|||
}
|
||||
|
||||
pdf_parse! {
|
||||
#[pdf]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PdfXRefStreamDictionaryRest {
|
||||
#[pdf(name = "Type")]
|
||||
|
|
@ -97,6 +107,17 @@ pub enum PdfTrailer {
|
|||
},
|
||||
}
|
||||
|
||||
impl PdfTrailer {
|
||||
/// Returns a reference to the `PdfTrailerDictionary` regardless of which variant
/// of `PdfTrailer` this is.
///
/// * `PdfTrailer::Trailer` — returns its `trailer_dictionary` field directly.
/// * `PdfTrailer::Stream` — returns the value nested at `rest.rest` inside the
///   dictionary of `xref_stream`.
pub fn trailer_dictionary(&self) -> &PdfTrailerDictionary {
|
||||
match self {
|
||||
PdfTrailer::Trailer {
|
||||
trailer_dictionary, ..
|
||||
} => trailer_dictionary,
|
||||
// The stream's dictionary wraps the trailer dictionary two `rest` levels deep.
PdfTrailer::Stream { xref_stream, .. } => &xref_stream.dictionary().rest.rest,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Pdf {
|
||||
pub header: PdfHeader,
|
||||
pub objects: Arc<PdfObjects>,
|
||||
|
|
@ -192,12 +213,12 @@ struct PdfTokenizerPeek<'a> {
|
|||
/// Tokenizer state over a borrowed byte slice: the input, the current position and
/// an optional cached peeked token.
#[derive(Clone)]
|
||||
struct PdfTokenizer<'a> {
|
||||
/// Input bytes being tokenized; indexed by the byte offset stored in `pos`.
bytes: &'a [u8],
|
||||
// NOTE(review): two `pos` lines follow because this text is a rendered diff. The
// `usize` line looks like the pre-change field and the `PdfInputPositionKnown` line
// its replacement — confirm against the repository before relying on either.
pos: usize,
|
||||
/// Current read position; its inner `pos` field is the byte index into `bytes`.
pos: PdfInputPositionKnown,
|
||||
/// Token produced by a previous peek together with the position just after it;
/// reset to `None` whenever bytes are consumed directly.
peek_cache: Option<PdfTokenizerPeek<'a>>,
|
||||
}
|
||||
|
||||
impl<'a> PdfTokenizer<'a> {
|
||||
fn new(bytes: &'a [u8], pos: usize) -> Self {
|
||||
fn new(bytes: &'a [u8], pos: PdfInputPositionKnown) -> Self {
|
||||
Self {
|
||||
bytes,
|
||||
pos,
|
||||
|
|
@ -205,14 +226,14 @@ impl<'a> PdfTokenizer<'a> {
|
|||
}
|
||||
}
|
||||
fn pos(&self) -> PdfInputPosition {
|
||||
PdfInputPosition::new(self.pos)
|
||||
PdfInputPosition::new(Some(self.pos))
|
||||
}
|
||||
fn peek_byte(&mut self) -> Option<u8> {
|
||||
self.bytes.get(self.pos).copied()
|
||||
self.bytes.get(self.pos.pos).copied()
|
||||
}
|
||||
fn next_byte(&mut self) -> Option<u8> {
|
||||
let b = self.bytes.get(self.pos)?;
|
||||
self.pos += 1;
|
||||
let b = self.bytes.get(self.pos.pos)?;
|
||||
self.pos.pos += 1;
|
||||
self.peek_cache = None;
|
||||
Some(*b)
|
||||
}
|
||||
|
|
@ -229,14 +250,16 @@ impl<'a> PdfTokenizer<'a> {
|
|||
let token = tokenizer.next()?;
|
||||
self.peek_cache = Some(PdfTokenizerPeek {
|
||||
token,
|
||||
pos_after_token: tokenizer.pos,
|
||||
pos_after_token: tokenizer.pos.pos,
|
||||
});
|
||||
Some(token)
|
||||
}
|
||||
fn read_bytes(&mut self, len: usize) -> Option<&'a [u8]> {
|
||||
let retval = self.bytes.get(self.pos..self.pos.saturating_add(len))?;
|
||||
let retval = self
|
||||
.bytes
|
||||
.get(self.pos.pos..self.pos.pos.saturating_add(len))?;
|
||||
self.peek_cache = None;
|
||||
self.pos += len;
|
||||
self.pos.pos += len;
|
||||
Some(retval)
|
||||
}
|
||||
}
|
||||
|
|
@ -250,11 +273,11 @@ impl<'a> Iterator for PdfTokenizer<'a> {
|
|||
pos_after_token,
|
||||
}) = self.peek_cache.take()
|
||||
{
|
||||
self.pos = pos_after_token;
|
||||
self.pos.pos = pos_after_token;
|
||||
return Some(token);
|
||||
}
|
||||
loop {
|
||||
let start_pos = self.pos;
|
||||
let start_pos = self.pos.pos;
|
||||
break match PdfCharCategory::new(self.next_byte()?) {
|
||||
PdfCharCategory::Whitespace => continue,
|
||||
PdfCharCategory::LParen => Some(PdfToken::LParen),
|
||||
|
|
@ -272,22 +295,22 @@ impl<'a> Iterator for PdfTokenizer<'a> {
|
|||
None | Some(b'\n') => break,
|
||||
Some(b'\r') => {
|
||||
if let Some(b'\n') = self.peek_byte() {
|
||||
self.pos += 1;
|
||||
self.pos.pos += 1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
Some(_) => continue,
|
||||
}
|
||||
}
|
||||
Some(PdfToken::Comment(&self.bytes[start_pos..self.pos]))
|
||||
Some(PdfToken::Comment(&self.bytes[start_pos..self.pos.pos]))
|
||||
}
|
||||
PdfCharCategory::Regular => {
|
||||
while let Some(PdfCharCategory::Regular) =
|
||||
self.peek_byte().map(PdfCharCategory::new)
|
||||
{
|
||||
self.pos += 1;
|
||||
self.pos.pos += 1;
|
||||
}
|
||||
Some(PdfToken::Regular(&self.bytes[start_pos..self.pos]))
|
||||
Some(PdfToken::Regular(&self.bytes[start_pos..self.pos.pos]))
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
@ -647,7 +670,10 @@ impl<'a> PdfParser<'a> {
|
|||
}
|
||||
let Some(data) = self.tokenizer.read_bytes(len) else {
|
||||
return Err(PdfParseError::TruncatedFile {
|
||||
pos: PdfInputPosition::new(self.tokenizer.bytes.len()),
|
||||
pos: PdfInputPosition::new(Some(PdfInputPositionKnown {
|
||||
pos: self.tokenizer.bytes.len(),
|
||||
..self.tokenizer.pos
|
||||
})),
|
||||
});
|
||||
};
|
||||
let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data));
|
||||
|
|
@ -810,9 +836,22 @@ impl<'a> PdfParser<'a> {
|
|||
object_stream: &PdfStream<PdfObjectStreamDictionary>,
|
||||
) -> Result<(), PdfParseError> {
|
||||
let data = object_stream.decoded_data().as_ref()?;
|
||||
self.with_tokenizer(PdfTokenizer::new(data, 0), |parser| {
|
||||
parser.parse_object_stream_inner(object_stream)
|
||||
})
|
||||
self.with_tokenizer(
|
||||
PdfTokenizer::new(
|
||||
data,
|
||||
PdfInputPositionKnown {
|
||||
pos: 0,
|
||||
containing_streams_pos: Some(
|
||||
object_stream
|
||||
.get_pdf_input_position()
|
||||
.get()
|
||||
.expect("known to be set")
|
||||
.pos,
|
||||
),
|
||||
},
|
||||
),
|
||||
|parser| parser.parse_object_stream_inner(object_stream),
|
||||
)
|
||||
.map_err(|e| PdfParseError::ObjectStreamParseError {
|
||||
stream_pos: object_stream.get_pdf_input_position(),
|
||||
error: Arc::new(e),
|
||||
|
|
@ -913,7 +952,13 @@ impl<'a> PdfParser<'a> {
|
|||
});
|
||||
}
|
||||
let old_tokenizer = self.tokenizer.clone();
|
||||
self.tokenizer = PdfTokenizer::new(self.tokenizer.bytes, start_xref);
|
||||
self.tokenizer = PdfTokenizer::new(
|
||||
self.tokenizer.bytes,
|
||||
PdfInputPositionKnown {
|
||||
pos: start_xref,
|
||||
containing_streams_pos: None,
|
||||
},
|
||||
);
|
||||
let id = self.parse_object_identifier(false);
|
||||
self.tokenizer = old_tokenizer;
|
||||
let Some(id) = id? else {
|
||||
|
|
@ -950,7 +995,13 @@ impl Pdf {
|
|||
}),
|
||||
objects_map: BTreeMap::new(),
|
||||
unparsed_stream_dictionaries: vec![],
|
||||
tokenizer: PdfTokenizer::new(bytes.as_ref(), 0),
|
||||
tokenizer: PdfTokenizer::new(
|
||||
bytes.as_ref(),
|
||||
PdfInputPositionKnown {
|
||||
pos: 0,
|
||||
containing_streams_pos: None,
|
||||
},
|
||||
),
|
||||
}
|
||||
.parse_file()
|
||||
}
|
||||
|
|
@ -971,6 +1022,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_deserialize_dict() -> Result<(), PdfParseError> {
|
||||
crate::pdf::parse::pdf_parse! {
|
||||
#[pdf]
|
||||
#[derive(Debug)]
|
||||
#[allow(dead_code)]
|
||||
struct TestStruct {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue