parse content streams into a list of operators
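
Split the file-level state out of PdfParser into a new PdfFileParser wrapper:
PdfParser now keeps only the tokenizer and the shared PdfObjects handle, so the
same machinery can tokenize content streams. parse_object is replaced by
parse_object_or_operator, which returns a three-way result instead of failing
on bare operator tokens; callers that expect a plain object reject the other
two cases via error_on_stream_or_operator.

A rough sketch of the dispatch (hypothetical caller; the real call sites are in
the diff below):

    match parser.parse_object_or_operator()? {
        PdfObjectOrStreamDictionaryOrOperator::Object(object) => {
            // an ordinary object: boolean, number, string, name, array, ...
        }
        PdfObjectOrStreamDictionaryOrOperator::StreamDictionary {
            dictionary,
            stream_kw_pos,
        } => {
            // a dictionary followed by the `stream` keyword; only
            // PdfFileParser can finish it via parse_stream_after_dictionary
        }
        PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => {
            // a bare content-stream operator, collected into the operator
            // list when parsing content streams
        }
    }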

Jacob Lifshay 2025-12-28 00:08:39 -08:00
parent 13dcea1dab
commit aba6368948
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ
5 changed files with 1541 additions and 408 deletions


@@ -1,5 +1,6 @@
use crate::{
pdf::{
content_stream::PdfOperatorUnparsed,
document_structure::PdfDocumentCatalog,
object::{
PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject,
@@ -318,67 +319,35 @@ impl<'a> Iterator for PdfTokenizer<'a> {
}
struct PdfParser<'a> {
objects_arc: Arc<PdfObjects>,
objects_map: BTreeMap<PdfObjectIdentifier, PdfObject>,
unparsed_stream_dictionaries: Vec<UnparsedPdfStreamDictionary<PdfDictionary>>,
objects: Arc<PdfObjects>,
tokenizer: PdfTokenizer<'a>,
}
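/// what [`PdfParser::parse_object_or_operator`] produced: a complete object,
/// a stream dictionary whose data still has to be read, or a bare
/// content-stream operator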
enum PdfObjectOrStreamDictionaryOrOperator {
StreamDictionary {
dictionary: PdfDictionary,
stream_kw_pos: PdfInputPosition,
},
Object(PdfObject),
Operator(PdfOperatorUnparsed),
}
impl PdfObjectOrStreamDictionaryOrOperator {
fn error_on_stream_or_operator(self) -> Result<PdfObject, PdfParseError> {
match self {
PdfObjectOrStreamDictionaryOrOperator::StreamDictionary {
dictionary: _,
stream_kw_pos,
} => Err(PdfParseError::StreamNotAllowedHere { pos: stream_kw_pos }),
PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object),
PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => {
Err(PdfParseError::OperatorNotAllowedHere { operator })
}
}
}
}
impl<'a> PdfParser<'a> {
fn with_tokenizer<'b, R>(
&mut self,
tokenizer: PdfTokenizer<'b>,
f: impl FnOnce(&mut PdfParser<'b>) -> R,
) -> R {
let PdfParser {
objects_arc,
objects_map,
unparsed_stream_dictionaries,
tokenizer: _,
} = self;
let objects_arc = objects_arc.clone();
let objects_map = std::mem::take(objects_map);
let unparsed_stream_dictionaries = std::mem::take(unparsed_stream_dictionaries);
let mut new_parser = PdfParser {
objects_arc,
objects_map,
unparsed_stream_dictionaries,
tokenizer,
};
let retval = f(&mut new_parser);
let PdfParser {
objects_arc,
objects_map,
unparsed_stream_dictionaries,
tokenizer: _,
} = new_parser;
self.objects_arc = objects_arc;
self.objects_map = objects_map;
self.unparsed_stream_dictionaries = unparsed_stream_dictionaries;
retval
}
fn parse_header(&mut self) -> Result<PdfHeader, PdfParseError> {
let Some(b'%') = self.tokenizer.bytes.first() else {
return Err(PdfParseError::NotAPdfFile);
};
let Some(PdfToken::Comment(header)) = self.tokenizer.next() else {
unreachable!()
};
let Ok(header) = str::from_utf8(header) else {
return Err(PdfParseError::NotAPdfFile);
};
let header = header.trim_end_matches(['\n', '\r']);
let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else {
return Err(PdfParseError::NotAPdfFile);
};
let Some((major_str, minor_str)) = version.split_once('.') else {
return Err(PdfParseError::NotAPdfFile);
};
let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else {
return Err(PdfParseError::NotAPdfFile);
};
Ok(PdfHeader { major, minor })
}
fn skip_comments_and_whitespace(&mut self) {
self.tokenizer.skip_whitespace();
while let Some(PdfToken::Comment(_)) = self.tokenizer.peek() {
@@ -449,7 +418,7 @@ impl<'a> PdfParser<'a> {
return Ok(None);
};
if let Some(PdfToken::Regular(b"R")) = self.tokenizer.next() {
Ok(Some(PdfObjectIndirect::new(&self.objects_arc, id)))
Ok(Some(PdfObjectIndirect::new(&self.objects, id)))
} else {
self.tokenizer = old_tokenizer;
Ok(None)
@@ -604,7 +573,10 @@ impl<'a> PdfParser<'a> {
self.tokenizer.next();
return Ok(PdfArray::from_elements(array_pos, Arc::from(contents)));
}
contents.push(self.parse_object()?);
contents.push(
self.parse_object_or_operator()?
.error_on_stream_or_operator()?,
);
}
}
/// assumes `self.tokenizer.peek_byte() == Some(b'<')`
@@ -630,11 +602,17 @@ impl<'a> PdfParser<'a> {
Arc::new(contents),
));
}
let name = PdfName::parse(self.parse_object()?.into())?;
let name = PdfName::parse(
self.parse_object_or_operator()?
.error_on_stream_or_operator()?,
)?;
let name_pos = name.pos();
match contents.entry(name) {
std::collections::btree_map::Entry::Vacant(entry) => {
entry.insert(self.parse_object()?.into());
entry.insert(
self.parse_object_or_operator()?
.error_on_stream_or_operator()?,
);
}
std::collections::btree_map::Entry::Occupied(entry) => {
return Err(PdfParseError::DuplicateDictionaryKey {
@@ -645,21 +623,146 @@ impl<'a> PdfParser<'a> {
}
}
}
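/// parses the next object, but instead of erroring also allows a stream
/// dictionary (a dictionary followed by the `stream` keyword, left for the
/// caller to finish) or a bare content-stream operator token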
fn parse_object_or_operator(
&mut self,
) -> Result<PdfObjectOrStreamDictionaryOrOperator, PdfParseError> {
self.skip_comments_and_whitespace();
if let Some(indirect) = self.parse_indirect_object()? {
return Ok(PdfObjectOrStreamDictionaryOrOperator::Object(
indirect.into(),
));
}
let pos = self.tokenizer.pos();
Ok(PdfObjectOrStreamDictionaryOrOperator::Object(
match self
.tokenizer
.next()
.ok_or(PdfParseError::TruncatedFile { pos })?
{
PdfToken::Regular(b"true") => PdfObject::Boolean(PdfBoolean::new(pos, true)),
PdfToken::Regular(b"false") => PdfObject::Boolean(PdfBoolean::new(pos, false)),
PdfToken::Regular(b"null") => PdfObject::Null(PdfNull::new(pos)),
PdfToken::Regular(
number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] | [b'0'..=b'9' | b'.', ..]),
) => {
// parse number
let Ok(number) = str::from_utf8(number) else {
return Err(PdfParseError::InvalidNumber { pos });
};
let mut parts = number
.strip_prefix(&['+', '-'])
.unwrap_or(number)
.split('.');
let integer_part = parts
.next()
.expect("split always returns at least one part");
let fraction_part = parts.next();
if parts.next().is_some() {
return Err(PdfParseError::InvalidNumber { pos });
}
if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) {
return Err(PdfParseError::InvalidNumber { pos });
}
if !integer_part.bytes().all(|v| v.is_ascii_digit()) {
return Err(PdfParseError::InvalidNumber { pos });
}
if let Some(fraction_part) = fraction_part {
if !fraction_part.bytes().all(|v| v.is_ascii_digit()) {
return Err(PdfParseError::InvalidNumber { pos });
}
PdfObject::Real(PdfReal::new(
pos,
number
.parse()
.map_err(|_| PdfParseError::InvalidNumber { pos })?,
))
} else {
PdfObject::Integer(PdfInteger::new(
pos,
number
.parse()
.map_err(|_| PdfParseError::InvalidNumber { pos })?,
))
}
}
PdfToken::Regular(name) => {
return Ok(PdfObjectOrStreamDictionaryOrOperator::Operator(
PdfOperatorUnparsed::new(pos, ArcOrRef::Arc(name.into())),
));
}
PdfToken::LParen => PdfObject::String(self.parse_string_after_l_paren()?),
PdfToken::RParen => todo!(),
PdfToken::LAngle => {
if self.tokenizer.peek_byte() == Some(b'<') {
let dictionary = self.parse_dictionary_after_one_l_angle()?;
self.skip_comments_and_whitespace();
if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() {
return Ok(PdfObjectOrStreamDictionaryOrOperator::StreamDictionary {
dictionary,
stream_kw_pos: self.tokenizer.pos(),
});
} else {
dictionary.into()
}
} else {
self.parse_string_after_l_angle()?.into()
}
}
PdfToken::RAngle => todo!(),
PdfToken::LBracket => self.parse_array_after_l_bracket()?.into(),
PdfToken::RBracket => todo!(),
PdfToken::LBrace => todo!(),
PdfToken::RBrace => todo!(),
PdfToken::FSlash => self.parse_name_after_f_slash()?.into(),
PdfToken::Comment(_) => unreachable!(),
},
))
}
}
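/// file-level parsing on top of [`PdfParser`]: the header, indirect object
/// definitions and their map, streams, the xref table, and the trailer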
struct PdfFileParser<'a> {
parser: PdfParser<'a>,
objects_map: BTreeMap<PdfObjectIdentifier, PdfObject>,
}
impl<'a> PdfFileParser<'a> {
fn parse_header(&mut self) -> Result<PdfHeader, PdfParseError> {
let Some(b'%') = self.parser.tokenizer.bytes.first() else {
return Err(PdfParseError::NotAPdfFile);
};
let Some(PdfToken::Comment(header)) = self.parser.tokenizer.next() else {
unreachable!()
};
let Ok(header) = str::from_utf8(header) else {
return Err(PdfParseError::NotAPdfFile);
};
let header = header.trim_end_matches(['\n', '\r']);
let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else {
return Err(PdfParseError::NotAPdfFile);
};
let Some((major_str, minor_str)) = version.split_once('.') else {
return Err(PdfParseError::NotAPdfFile);
};
let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else {
return Err(PdfParseError::NotAPdfFile);
};
Ok(PdfHeader { major, minor })
}
/// assumes `self.tokenizer.peek() == Some(PdfToken::Regular(b"stream"))`
fn parse_stream_after_dictionary(
&mut self,
dictionary: PdfDictionary,
) -> Result<PdfStream, PdfParseError> {
self.tokenizer.skip_whitespace();
let stream_pos = self.tokenizer.pos();
let stream = self.tokenizer.next();
self.parser.tokenizer.skip_whitespace();
let stream_pos = self.parser.tokenizer.pos();
let stream = self.parser.tokenizer.next();
assert_eq!(stream, Some(PdfToken::Regular(b"stream")));
let len = PdfStreamDictionary::parse_len_from_dictionary(&dictionary)?;
let eol_pos = self.tokenizer.pos();
match self.tokenizer.next_byte() {
let dictionary = PdfStreamDictionary::parse(dictionary.into())?;
let eol_pos = self.parser.tokenizer.pos();
match self.parser.tokenizer.next_byte() {
None => return Err(PdfParseError::TruncatedFile { pos: eol_pos }),
Some(b'\r') => {
let Some(b'\n') = self.tokenizer.next_byte() else {
let Some(b'\n') = self.parser.tokenizer.next_byte() else {
return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword {
pos: eol_pos,
});
@@ -668,121 +771,56 @@ impl<'a> PdfParser<'a> {
Some(b'\n') => {}
_ => return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos }),
}
let Some(data) = self.tokenizer.read_bytes(len) else {
let Some(data) = self.parser.tokenizer.read_bytes(dictionary.len) else {
return Err(PdfParseError::TruncatedFile {
pos: PdfInputPosition::new(Some(PdfInputPositionKnown {
pos: self.tokenizer.bytes.len(),
..self.tokenizer.pos
pos: self.parser.tokenizer.bytes.len(),
..self.parser.tokenizer.pos
})),
});
};
let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data));
self.unparsed_stream_dictionaries.push(unparsed);
self.skip_comments_and_whitespace();
let pos = self.tokenizer.pos();
if let Some(PdfToken::Regular(b"endstream")) = self.tokenizer.next() {
let stream = PdfStream::new(
stream_pos,
&self.parser.objects,
dictionary,
Arc::from(data),
);
self.parser.skip_comments_and_whitespace();
let pos = self.parser.tokenizer.pos();
if let Some(PdfToken::Regular(b"endstream")) = self.parser.tokenizer.next() {
Ok(stream)
} else {
Err(PdfParseError::MissingEndStreamKeyword { pos })
}
}
fn parse_object(&mut self) -> Result<PdfObject, PdfParseError> {
self.skip_comments_and_whitespace();
if let Some(indirect) = self.parse_indirect_object()? {
return Ok(indirect.into());
}
let pos = self.tokenizer.pos();
match self
.tokenizer
.next()
.ok_or(PdfParseError::TruncatedFile { pos })?
{
PdfToken::Regular(b"true") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, true))),
PdfToken::Regular(b"false") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, false))),
PdfToken::Regular(b"null") => Ok(PdfObject::Null(PdfNull::new(pos))),
PdfToken::Regular(
number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] | [b'0'..=b'9' | b'.', ..]),
) => {
// parse number
let Ok(number) = str::from_utf8(number) else {
return Err(PdfParseError::InvalidNumber { pos });
};
let mut parts = number
.strip_prefix(&['+', '-'])
.unwrap_or(number)
.split('.');
let integer_part = parts
.next()
.expect("split always returns at least one part");
let fraction_part = parts.next();
if parts.next().is_some() {
return Err(PdfParseError::InvalidNumber { pos });
}
if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) {
return Err(PdfParseError::InvalidNumber { pos });
}
if !integer_part.bytes().all(|v| v.is_ascii_digit()) {
return Err(PdfParseError::InvalidNumber { pos });
}
if let Some(fraction_part) = fraction_part {
if !fraction_part.bytes().all(|v| v.is_ascii_digit()) {
return Err(PdfParseError::InvalidNumber { pos });
}
Ok(PdfObject::Real(PdfReal::new(
pos,
number
.parse()
.map_err(|_| PdfParseError::InvalidNumber { pos })?,
)))
} else {
Ok(PdfObject::Integer(PdfInteger::new(
pos,
number
.parse()
.map_err(|_| PdfParseError::InvalidNumber { pos })?,
)))
}
match self.parser.parse_object_or_operator()? {
PdfObjectOrStreamDictionaryOrOperator::StreamDictionary {
dictionary,
stream_kw_pos: _,
} => Ok(PdfObject::Stream(
self.parse_stream_after_dictionary(dictionary)?,
)),
PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object),
PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => {
Err(PdfParseError::OperatorNotAllowedHere { operator })
}
PdfToken::Regular(items) => todo!("{:?}", str::from_utf8(items)),
PdfToken::LParen => self.parse_string_after_l_paren().map(PdfObject::String),
PdfToken::RParen => todo!(),
PdfToken::LAngle => {
if self.tokenizer.peek_byte() == Some(b'<') {
let dictionary = self.parse_dictionary_after_one_l_angle()?;
self.skip_comments_and_whitespace();
if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() {
self.parse_stream_after_dictionary(dictionary)
.map(PdfObject::Stream)
} else {
Ok(dictionary.into())
}
} else {
self.parse_string_after_l_angle().map(PdfObject::String)
}
}
PdfToken::RAngle => todo!(),
PdfToken::LBracket => self.parse_array_after_l_bracket().map(PdfObject::Array),
PdfToken::RBracket => todo!(),
PdfToken::LBrace => todo!(),
PdfToken::RBrace => todo!(),
PdfToken::FSlash => self.parse_name_after_f_slash().map(PdfObject::Name),
PdfToken::Comment(_) => unreachable!(),
}
}
fn parse_indirect_object_definition(&mut self) -> Result<Option<()>, PdfParseError> {
self.skip_comments_and_whitespace();
let Some(id) = self.parse_object_identifier(false)? else {
self.parser.skip_comments_and_whitespace();
let Some(id) = self.parser.parse_object_identifier(false)? else {
return Ok(None);
};
self.skip_comments_and_whitespace();
let obj_pos = self.tokenizer.pos();
let Some(PdfToken::Regular(b"obj")) = self.tokenizer.next() else {
self.parser.skip_comments_and_whitespace();
let obj_pos = self.parser.tokenizer.pos();
let Some(PdfToken::Regular(b"obj")) = self.parser.tokenizer.next() else {
return Err(PdfParseError::MissingObj { pos: obj_pos });
};
let object = self.parse_object()?;
self.skip_comments_and_whitespace();
let end_obj_pos = self.tokenizer.pos();
let Some(PdfToken::Regular(b"endobj")) = self.tokenizer.next() else {
self.parser.skip_comments_and_whitespace();
let end_obj_pos = self.parser.tokenizer.pos();
let Some(PdfToken::Regular(b"endobj")) = self.parser.tokenizer.next() else {
return Err(PdfParseError::MissingEndObj { pos: end_obj_pos });
};
if self.objects_map.insert(id, object).is_some() {
@@ -791,53 +829,13 @@ impl<'a> PdfParser<'a> {
Ok(Some(()))
}
}
fn parse_object_stream_inner(
&mut self,
object_stream: &PdfStream<PdfObjectStreamDictionary>,
) -> Result<(), PdfParseError> {
let mut object_ids_and_byte_positions =
Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n);
for _ in 0..object_stream.dictionary().rest.n {
self.skip_comments_and_whitespace();
let Some((pos, object_number)) =
self.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))?
else {
return Err(PdfParseError::InvalidObjectNumber {
pos: self.tokenizer.pos(),
});
};
self.skip_comments_and_whitespace();
let Some((_, byte_position)) =
self.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))?
else {
return Err(PdfParseError::InvalidNumber {
pos: self.tokenizer.pos(),
});
};
object_ids_and_byte_positions.push((
PdfObjectIdentifier {
pos: pos.into(),
object_number,
generation_number: 0,
},
byte_position,
));
}
for (id, _byte_position) in object_ids_and_byte_positions {
let object = self.parse_object()?;
if self.objects_map.insert(id, object).is_some() {
return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id });
}
}
Ok(())
}
fn parse_object_stream(
&mut self,
object_stream: &PdfStream<PdfObjectStreamDictionary>,
) -> Result<(), PdfParseError> {
let data = object_stream.decoded_data().as_ref()?;
self.with_tokenizer(
PdfTokenizer::new(
let mut parser = PdfParser {
tokenizer: PdfTokenizer::new(
data,
PdfInputPositionKnown {
pos: 0,
@@ -850,18 +848,48 @@ impl<'a> PdfParser<'a> {
),
},
),
|parser| parser.parse_object_stream_inner(object_stream),
)
.map_err(|e| PdfParseError::ObjectStreamParseError {
stream_pos: object_stream.get_pdf_input_position(),
error: Arc::new(e),
})
objects: self.parser.objects.clone(),
};
let mut object_ids_and_byte_positions =
Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n);
for _ in 0..object_stream.dictionary().rest.n {
parser.skip_comments_and_whitespace();
let Some((pos, object_number)) =
parser.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))?
else {
return Err(PdfParseError::InvalidObjectNumber {
pos: parser.tokenizer.pos(),
});
};
parser.skip_comments_and_whitespace();
let Some((_, byte_position)) =
parser.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))?
else {
return Err(PdfParseError::InvalidNumber {
pos: parser.tokenizer.pos(),
});
};
object_ids_and_byte_positions.push((
PdfObjectIdentifier {
pos: pos.into(),
object_number,
generation_number: 0,
},
byte_position,
));
}
for (id, _byte_position) in object_ids_and_byte_positions {
let object = parser
.parse_object_or_operator()?
.error_on_stream_or_operator()?;
if self.objects_map.insert(id, object).is_some() {
return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id });
}
}
Ok(())
}
fn parse_body(&mut self) -> Result<(), PdfParseError> {
while let Some(()) = self.parse_indirect_object_definition()? {}
self.unparsed_stream_dictionaries
.drain(..)
.try_for_each(|v| v.finish_parsing())?;
let mut object_streams: Vec<PdfStream<PdfObjectStreamDictionary>> = Vec::new();
for object in self.objects_map.values_mut() {
let stream = match object {
@@ -885,7 +913,7 @@ impl<'a> PdfParser<'a> {
for object_stream in &object_streams {
self.parse_object_stream(object_stream)?;
}
let Ok(()) = self.objects_arc.inner.set(PdfObjectsInner {
let Ok(()) = self.parser.objects.inner.set(PdfObjectsInner {
objects: std::mem::take(&mut self.objects_map),
object_streams,
}) else {
@@ -894,19 +922,19 @@ impl<'a> PdfParser<'a> {
Ok(())
}
fn parse_xref_table(&mut self) -> Result<(), PdfParseError> {
self.skip_comments_and_whitespace();
let xref_pos = self.tokenizer.pos();
let Some(PdfToken::Regular(b"xref")) = self.tokenizer.peek() else {
self.parser.skip_comments_and_whitespace();
let xref_pos = self.parser.tokenizer.pos();
let Some(PdfToken::Regular(b"xref")) = self.parser.tokenizer.peek() else {
return Ok(());
};
todo!("{xref_pos}")
}
fn parse_trailer(&mut self) -> Result<PdfTrailer, PdfParseError> {
self.skip_comments_and_whitespace();
let trailer_pos = self.tokenizer.pos();
let trailer_dictionary = match self.tokenizer.peek() {
self.parser.skip_comments_and_whitespace();
let trailer_pos = self.parser.tokenizer.pos();
let trailer_dictionary = match self.parser.tokenizer.peek() {
Some(PdfToken::Regular(b"trailer")) => {
self.tokenizer.next();
self.parser.tokenizer.next();
Some(PdfTrailerDictionary::parse(self.parse_object()?)?)
}
Some(PdfToken::Regular(b"startxref")) => None,
@@ -914,34 +942,35 @@ impl<'a> PdfParser<'a> {
return Err(PdfParseError::MissingTrailer { pos: trailer_pos });
}
};
self.skip_comments_and_whitespace();
let start_xref_kw_pos = self.tokenizer.pos();
let Some(PdfToken::Regular(b"startxref")) = self.tokenizer.next() else {
self.parser.skip_comments_and_whitespace();
let start_xref_kw_pos = self.parser.tokenizer.pos();
let Some(PdfToken::Regular(b"startxref")) = self.parser.tokenizer.next() else {
return Err(PdfParseError::MissingStartXRefKeyword {
pos: start_xref_kw_pos,
});
};
let start_xref_pos = self.tokenizer.pos();
let Some((start_xref_pos, start_xref)) =
self.parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))?
let start_xref_pos = self.parser.tokenizer.pos();
let Some((start_xref_pos, start_xref)) = self
.parser
.parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))?
else {
return Err(PdfParseError::MissingStartXRefValue {
pos: start_xref_pos,
});
};
self.tokenizer.skip_whitespace();
let eof_comment_pos = self.tokenizer.pos();
self.parser.tokenizer.skip_whitespace();
let eof_comment_pos = self.parser.tokenizer.pos();
let Some(PdfToken::Comment(b"%%EOF" | b"%%EOF\r" | b"%%EOF\r\n" | b"%%EOF\n")) =
self.tokenizer.next()
self.parser.tokenizer.next()
else {
return Err(PdfParseError::MissingEofComment {
pos: eof_comment_pos,
});
};
self.tokenizer.skip_whitespace();
if let Some(byte) = self.tokenizer.peek_byte() {
self.parser.tokenizer.skip_whitespace();
if let Some(byte) = self.parser.tokenizer.peek_byte() {
return Err(PdfParseError::UnexpectedByte {
pos: self.tokenizer.pos(),
pos: self.parser.tokenizer.pos(),
byte,
});
}
@@ -951,24 +980,28 @@ impl<'a> PdfParser<'a> {
start_xref,
});
}
let old_tokenizer = self.tokenizer.clone();
self.tokenizer = PdfTokenizer::new(
self.tokenizer.bytes,
PdfInputPositionKnown {
pos: start_xref,
containing_streams_pos: None,
},
);
let id = self.parse_object_identifier(false);
self.tokenizer = old_tokenizer;
let id = PdfParser {
tokenizer: PdfTokenizer::new(
self.parser.tokenizer.bytes,
PdfInputPositionKnown {
pos: start_xref,
containing_streams_pos: None,
},
),
objects: self.parser.objects.clone(),
}
.parse_object_identifier(false);
let Some(id) = id? else {
return Err(PdfParseError::InvalidStartXRefValue {
pos: start_xref_pos,
start_xref,
});
};
let xref_stream =
PdfStream::parse(PdfObjectIndirect::new(&self.objects_arc, id).get().into())?;
let xref_stream = PdfStream::parse(
PdfObjectIndirect::new(&self.parser.objects, id)
.get()
.into(),
)?;
Ok(PdfTrailer::Stream {
xref_stream,
start_xref,
@@ -979,9 +1012,14 @@ impl<'a> PdfParser<'a> {
self.parse_body()?;
self.parse_xref_table()?;
let trailer = self.parse_trailer()?;
for page in trailer.trailer_dictionary().root.pages.pages().iter() {
for content in page.contents.iter() {
content.decoded_data().as_ref()?;
}
}
Ok(Pdf {
header,
objects: self.objects_arc,
objects: self.parser.objects,
trailer,
})
}
@@ -989,19 +1027,20 @@ impl<'a> PdfParser<'a> {
impl Pdf {
pub fn parse(bytes: impl AsRef<[u8]>) -> Result<Pdf, PdfParseError> {
PdfParser {
objects_arc: Arc::new(PdfObjects {
inner: OnceLock::new(),
}),
PdfFileParser {
parser: PdfParser {
objects: Arc::new(PdfObjects {
inner: OnceLock::new(),
}),
tokenizer: PdfTokenizer::new(
bytes.as_ref(),
PdfInputPositionKnown {
pos: 0,
containing_streams_pos: None,
},
),
},
objects_map: BTreeMap::new(),
unparsed_stream_dictionaries: vec![],
tokenizer: PdfTokenizer::new(
bytes.as_ref(),
PdfInputPositionKnown {
pos: 0,
containing_streams_pos: None,
},
),
}
.parse_file()
}