parse content streams into a list of operators

parent 13dcea1dab
commit aba6368948

5 changed files with 1541 additions and 408 deletions
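For orientation, a minimal usage sketch of the entry point this change feeds into. Only `Pdf::parse` and `PdfParseError` are taken from the diff below; the file path and the way the outcome is reported are illustrative assumptions, not part of the commit:

    // Hedged sketch, not part of the commit: drive the parser end to end.
    fn main() {
        // Hypothetical input path; any byte source works, since
        // `Pdf::parse` takes `impl AsRef<[u8]>` (see the last hunk below).
        let bytes = std::fs::read("example.pdf").expect("failed to read input");
        match Pdf::parse(bytes) {
            Ok(_pdf) => println!("parsed; page content streams were decoded"),
            Err(_err) => println!("parse failed"),
        }
    }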
src/pdf.rs: 557 changed lines
@@ -1,5 +1,6 @@
 use crate::{
     pdf::{
+        content_stream::PdfOperatorUnparsed,
         document_structure::PdfDocumentCatalog,
         object::{
             PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject,
@@ -318,67 +319,35 @@ impl<'a> Iterator for PdfTokenizer<'a> {
 }

 struct PdfParser<'a> {
-    objects_arc: Arc<PdfObjects>,
-    objects_map: BTreeMap<PdfObjectIdentifier, PdfObject>,
-    unparsed_stream_dictionaries: Vec<UnparsedPdfStreamDictionary<PdfDictionary>>,
+    objects: Arc<PdfObjects>,
     tokenizer: PdfTokenizer<'a>,
 }

+enum PdfObjectOrStreamDictionaryOrOperator {
+    StreamDictionary {
+        dictionary: PdfDictionary,
+        stream_kw_pos: PdfInputPosition,
+    },
+    Object(PdfObject),
+    Operator(PdfOperatorUnparsed),
+}
+
+impl PdfObjectOrStreamDictionaryOrOperator {
+    fn error_on_stream_or_operator(self) -> Result<PdfObject, PdfParseError> {
+        match self {
+            PdfObjectOrStreamDictionaryOrOperator::StreamDictionary {
+                dictionary: _,
+                stream_kw_pos,
+            } => Err(PdfParseError::StreamNotAllowedHere { pos: stream_kw_pos }),
+            PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object),
+            PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => {
+                Err(PdfParseError::OperatorNotAllowedHere { operator })
+            }
+        }
+    }
+}
+
 impl<'a> PdfParser<'a> {
-    fn with_tokenizer<'b, R>(
-        &mut self,
-        tokenizer: PdfTokenizer<'b>,
-        f: impl FnOnce(&mut PdfParser<'b>) -> R,
-    ) -> R {
-        let PdfParser {
-            objects_arc,
-            objects_map,
-            unparsed_stream_dictionaries,
-            tokenizer: _,
-        } = self;
-        let objects_arc = objects_arc.clone();
-        let objects_map = std::mem::take(objects_map);
-        let unparsed_stream_dictionaries = std::mem::take(unparsed_stream_dictionaries);
-        let mut new_parser = PdfParser {
-            objects_arc,
-            objects_map,
-            unparsed_stream_dictionaries,
-            tokenizer,
-        };
-        let retval = f(&mut new_parser);
-        let PdfParser {
-            objects_arc,
-            objects_map,
-            unparsed_stream_dictionaries,
-            tokenizer: _,
-        } = new_parser;
-        self.objects_arc = objects_arc;
-        self.objects_map = objects_map;
-        self.unparsed_stream_dictionaries = unparsed_stream_dictionaries;
-        retval
-    }
-    fn parse_header(&mut self) -> Result<PdfHeader, PdfParseError> {
-        let Some(b'%') = self.tokenizer.bytes.first() else {
-            return Err(PdfParseError::NotAPdfFile);
-        };
-        let Some(PdfToken::Comment(header)) = self.tokenizer.next() else {
-            unreachable!()
-        };
-        let Ok(header) = str::from_utf8(header) else {
-            return Err(PdfParseError::NotAPdfFile);
-        };
-        let header = header.trim_end_matches(['\n', '\r']);
-        let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else {
-            return Err(PdfParseError::NotAPdfFile);
-        };
-        let Some((major_str, minor_str)) = version.split_once('.') else {
-            return Err(PdfParseError::NotAPdfFile);
-        };
-        let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else {
-            return Err(PdfParseError::NotAPdfFile);
-        };
-        Ok(PdfHeader { major, minor })
-    }
     fn skip_comments_and_whitespace(&mut self) {
         self.tokenizer.skip_whitespace();
         while let Some(PdfToken::Comment(_)) = self.tokenizer.peek() {
@@ -449,7 +418,7 @@ impl<'a> PdfParser<'a> {
             return Ok(None);
         };
         if let Some(PdfToken::Regular(b"R")) = self.tokenizer.next() {
-            Ok(Some(PdfObjectIndirect::new(&self.objects_arc, id)))
+            Ok(Some(PdfObjectIndirect::new(&self.objects, id)))
         } else {
             self.tokenizer = old_tokenizer;
             Ok(None)
@@ -604,7 +573,10 @@ impl<'a> PdfParser<'a> {
                 self.tokenizer.next();
                 return Ok(PdfArray::from_elements(array_pos, Arc::from(contents)));
             }
-            contents.push(self.parse_object()?);
+            contents.push(
+                self.parse_object_or_operator()?
+                    .error_on_stream_or_operator()?,
+            );
         }
     }
     /// assumes `self.tokenizer.peek_byte() == Some(b'<')`
@@ -630,11 +602,17 @@ impl<'a> PdfParser<'a> {
                     Arc::new(contents),
                 ));
             }
-            let name = PdfName::parse(self.parse_object()?.into())?;
+            let name = PdfName::parse(
+                self.parse_object_or_operator()?
+                    .error_on_stream_or_operator()?,
+            )?;
             let name_pos = name.pos();
             match contents.entry(name) {
                 std::collections::btree_map::Entry::Vacant(entry) => {
-                    entry.insert(self.parse_object()?.into());
+                    entry.insert(
+                        self.parse_object_or_operator()?
+                            .error_on_stream_or_operator()?,
+                    );
                 }
                 std::collections::btree_map::Entry::Occupied(entry) => {
                     return Err(PdfParseError::DuplicateDictionaryKey {
@@ -645,21 +623,146 @@ impl<'a> PdfParser<'a> {
             }
         }
     }
+    fn parse_object_or_operator(
+        &mut self,
+    ) -> Result<PdfObjectOrStreamDictionaryOrOperator, PdfParseError> {
+        self.skip_comments_and_whitespace();
+        if let Some(indirect) = self.parse_indirect_object()? {
+            return Ok(PdfObjectOrStreamDictionaryOrOperator::Object(
+                indirect.into(),
+            ));
+        }
+        let pos = self.tokenizer.pos();
+        Ok(PdfObjectOrStreamDictionaryOrOperator::Object(
+            match self
+                .tokenizer
+                .next()
+                .ok_or(PdfParseError::TruncatedFile { pos })?
+            {
+                PdfToken::Regular(b"true") => PdfObject::Boolean(PdfBoolean::new(pos, true)),
+                PdfToken::Regular(b"false") => PdfObject::Boolean(PdfBoolean::new(pos, false)),
+                PdfToken::Regular(b"null") => PdfObject::Null(PdfNull::new(pos)),
+                PdfToken::Regular(
+                    number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] | [b'0'..=b'9' | b'.', ..]),
+                ) => {
+                    // parse number
+                    let Ok(number) = str::from_utf8(number) else {
+                        return Err(PdfParseError::InvalidNumber { pos });
+                    };
+                    let mut parts = number
+                        .strip_prefix(&['+', '-'])
+                        .unwrap_or(number)
+                        .split('.');
+                    let integer_part = parts
+                        .next()
+                        .expect("split always returns at least one part");
+                    let fraction_part = parts.next();
+                    if parts.next().is_some() {
+                        return Err(PdfParseError::InvalidNumber { pos });
+                    }
+                    if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) {
+                        return Err(PdfParseError::InvalidNumber { pos });
+                    }
+                    if !integer_part.bytes().all(|v| v.is_ascii_digit()) {
+                        return Err(PdfParseError::InvalidNumber { pos });
+                    }
+                    if let Some(fraction_part) = fraction_part {
+                        if !fraction_part.bytes().all(|v| v.is_ascii_digit()) {
+                            return Err(PdfParseError::InvalidNumber { pos });
+                        }
+                        PdfObject::Real(PdfReal::new(
+                            pos,
+                            number
+                                .parse()
+                                .map_err(|_| PdfParseError::InvalidNumber { pos })?,
+                        ))
+                    } else {
+                        PdfObject::Integer(PdfInteger::new(
+                            pos,
+                            number
+                                .parse()
+                                .map_err(|_| PdfParseError::InvalidNumber { pos })?,
+                        ))
+                    }
+                }
+                PdfToken::Regular(name) => {
+                    return Ok(PdfObjectOrStreamDictionaryOrOperator::Operator(
+                        PdfOperatorUnparsed::new(pos, ArcOrRef::Arc(name.into())),
+                    ));
+                }
+                PdfToken::LParen => PdfObject::String(self.parse_string_after_l_paren()?),
+                PdfToken::RParen => todo!(),
+                PdfToken::LAngle => {
+                    if self.tokenizer.peek_byte() == Some(b'<') {
+                        let dictionary = self.parse_dictionary_after_one_l_angle()?;
+                        self.skip_comments_and_whitespace();
+                        if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() {
+                            return Ok(PdfObjectOrStreamDictionaryOrOperator::StreamDictionary {
+                                dictionary,
+                                stream_kw_pos: self.tokenizer.pos(),
+                            });
+                        } else {
+                            dictionary.into()
+                        }
+                    } else {
+                        self.parse_string_after_l_angle()?.into()
+                    }
+                }
+                PdfToken::RAngle => todo!(),
+                PdfToken::LBracket => self.parse_array_after_l_bracket()?.into(),
+                PdfToken::RBracket => todo!(),
+                PdfToken::LBrace => todo!(),
+                PdfToken::RBrace => todo!(),
+                PdfToken::FSlash => self.parse_name_after_f_slash()?.into(),
+                PdfToken::Comment(_) => unreachable!(),
+            },
+        ))
+    }
+}
+
+struct PdfFileParser<'a> {
+    parser: PdfParser<'a>,
+    objects_map: BTreeMap<PdfObjectIdentifier, PdfObject>,
+}
+
+impl<'a> PdfFileParser<'a> {
+    fn parse_header(&mut self) -> Result<PdfHeader, PdfParseError> {
+        let Some(b'%') = self.parser.tokenizer.bytes.first() else {
+            return Err(PdfParseError::NotAPdfFile);
+        };
+        let Some(PdfToken::Comment(header)) = self.parser.tokenizer.next() else {
+            unreachable!()
+        };
+        let Ok(header) = str::from_utf8(header) else {
+            return Err(PdfParseError::NotAPdfFile);
+        };
+        let header = header.trim_end_matches(['\n', '\r']);
+        let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else {
+            return Err(PdfParseError::NotAPdfFile);
+        };
+        let Some((major_str, minor_str)) = version.split_once('.') else {
+            return Err(PdfParseError::NotAPdfFile);
+        };
+        let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else {
+            return Err(PdfParseError::NotAPdfFile);
+        };
+        Ok(PdfHeader { major, minor })
+    }
     /// assumes `self.tokenizer.peek() == Some(PdfToken::Regular(b"stream"))`
     fn parse_stream_after_dictionary(
         &mut self,
         dictionary: PdfDictionary,
     ) -> Result<PdfStream, PdfParseError> {
-        self.tokenizer.skip_whitespace();
-        let stream_pos = self.tokenizer.pos();
-        let stream = self.tokenizer.next();
+        self.parser.tokenizer.skip_whitespace();
+        let stream_pos = self.parser.tokenizer.pos();
+        let stream = self.parser.tokenizer.next();
         assert_eq!(stream, Some(PdfToken::Regular(b"stream")));
-        let len = PdfStreamDictionary::parse_len_from_dictionary(&dictionary)?;
-        let eol_pos = self.tokenizer.pos();
-        match self.tokenizer.next_byte() {
+        let dictionary = PdfStreamDictionary::parse(dictionary.into())?;
+        let eol_pos = self.parser.tokenizer.pos();
+        match self.parser.tokenizer.next_byte() {
             None => return Err(PdfParseError::TruncatedFile { pos: eol_pos }),
             Some(b'\r') => {
-                let Some(b'\n') = self.tokenizer.next_byte() else {
+                let Some(b'\n') = self.parser.tokenizer.next_byte() else {
                     return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword {
                         pos: eol_pos,
                     });
@@ -668,121 +771,56 @@ impl<'a> PdfParser<'a> {
             Some(b'\n') => {}
             _ => return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos }),
         }
-        let Some(data) = self.tokenizer.read_bytes(len) else {
+        let Some(data) = self.parser.tokenizer.read_bytes(dictionary.len) else {
             return Err(PdfParseError::TruncatedFile {
                 pos: PdfInputPosition::new(Some(PdfInputPositionKnown {
-                    pos: self.tokenizer.bytes.len(),
-                    ..self.tokenizer.pos
+                    pos: self.parser.tokenizer.bytes.len(),
+                    ..self.parser.tokenizer.pos
                 })),
             });
         };
-        let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data));
-        self.unparsed_stream_dictionaries.push(unparsed);
-        self.skip_comments_and_whitespace();
-        let pos = self.tokenizer.pos();
-        if let Some(PdfToken::Regular(b"endstream")) = self.tokenizer.next() {
+        let stream = PdfStream::new(
+            stream_pos,
+            &self.parser.objects,
+            dictionary,
+            Arc::from(data),
+        );
+        self.parser.skip_comments_and_whitespace();
+        let pos = self.parser.tokenizer.pos();
+        if let Some(PdfToken::Regular(b"endstream")) = self.parser.tokenizer.next() {
             Ok(stream)
         } else {
             Err(PdfParseError::MissingEndStreamKeyword { pos })
         }
     }
     fn parse_object(&mut self) -> Result<PdfObject, PdfParseError> {
-        self.skip_comments_and_whitespace();
-        if let Some(indirect) = self.parse_indirect_object()? {
-            return Ok(indirect.into());
-        }
-        let pos = self.tokenizer.pos();
-        match self
-            .tokenizer
-            .next()
-            .ok_or(PdfParseError::TruncatedFile { pos })?
-        {
-            PdfToken::Regular(b"true") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, true))),
-            PdfToken::Regular(b"false") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, false))),
-            PdfToken::Regular(b"null") => Ok(PdfObject::Null(PdfNull::new(pos))),
-            PdfToken::Regular(
-                number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] | [b'0'..=b'9' | b'.', ..]),
-            ) => {
-                // parse number
-                let Ok(number) = str::from_utf8(number) else {
-                    return Err(PdfParseError::InvalidNumber { pos });
-                };
-                let mut parts = number
-                    .strip_prefix(&['+', '-'])
-                    .unwrap_or(number)
-                    .split('.');
-                let integer_part = parts
-                    .next()
-                    .expect("split always returns at least one part");
-                let fraction_part = parts.next();
-                if parts.next().is_some() {
-                    return Err(PdfParseError::InvalidNumber { pos });
-                }
-                if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) {
-                    return Err(PdfParseError::InvalidNumber { pos });
-                }
-                if !integer_part.bytes().all(|v| v.is_ascii_digit()) {
-                    return Err(PdfParseError::InvalidNumber { pos });
-                }
-                if let Some(fraction_part) = fraction_part {
-                    if !fraction_part.bytes().all(|v| v.is_ascii_digit()) {
-                        return Err(PdfParseError::InvalidNumber { pos });
-                    }
-                    Ok(PdfObject::Real(PdfReal::new(
-                        pos,
-                        number
-                            .parse()
-                            .map_err(|_| PdfParseError::InvalidNumber { pos })?,
-                    )))
-                } else {
-                    Ok(PdfObject::Integer(PdfInteger::new(
-                        pos,
-                        number
-                            .parse()
-                            .map_err(|_| PdfParseError::InvalidNumber { pos })?,
-                    )))
-                }
-            }
-            PdfToken::Regular(items) => todo!("{:?}", str::from_utf8(items)),
-            PdfToken::LParen => self.parse_string_after_l_paren().map(PdfObject::String),
-            PdfToken::RParen => todo!(),
-            PdfToken::LAngle => {
-                if self.tokenizer.peek_byte() == Some(b'<') {
-                    let dictionary = self.parse_dictionary_after_one_l_angle()?;
-                    self.skip_comments_and_whitespace();
-                    if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() {
-                        self.parse_stream_after_dictionary(dictionary)
-                            .map(PdfObject::Stream)
-                    } else {
-                        Ok(dictionary.into())
-                    }
-                } else {
-                    self.parse_string_after_l_angle().map(PdfObject::String)
-                }
-            }
-            PdfToken::RAngle => todo!(),
-            PdfToken::LBracket => self.parse_array_after_l_bracket().map(PdfObject::Array),
-            PdfToken::RBracket => todo!(),
-            PdfToken::LBrace => todo!(),
-            PdfToken::RBrace => todo!(),
-            PdfToken::FSlash => self.parse_name_after_f_slash().map(PdfObject::Name),
-            PdfToken::Comment(_) => unreachable!(),
-        }
+        match self.parser.parse_object_or_operator()? {
+            PdfObjectOrStreamDictionaryOrOperator::StreamDictionary {
+                dictionary,
+                stream_kw_pos: _,
+            } => Ok(PdfObject::Stream(
+                self.parse_stream_after_dictionary(dictionary)?,
+            )),
+            PdfObjectOrStreamDictionaryOrOperator::Object(object) => Ok(object),
+            PdfObjectOrStreamDictionaryOrOperator::Operator(operator) => {
+                Err(PdfParseError::OperatorNotAllowedHere { operator })
+            }
+        }
     }
     fn parse_indirect_object_definition(&mut self) -> Result<Option<()>, PdfParseError> {
-        self.skip_comments_and_whitespace();
-        let Some(id) = self.parse_object_identifier(false)? else {
+        self.parser.skip_comments_and_whitespace();
+        let Some(id) = self.parser.parse_object_identifier(false)? else {
            return Ok(None);
        };
-        self.skip_comments_and_whitespace();
-        let obj_pos = self.tokenizer.pos();
-        let Some(PdfToken::Regular(b"obj")) = self.tokenizer.next() else {
+        self.parser.skip_comments_and_whitespace();
+        let obj_pos = self.parser.tokenizer.pos();
+        let Some(PdfToken::Regular(b"obj")) = self.parser.tokenizer.next() else {
            return Err(PdfParseError::MissingObj { pos: obj_pos });
        };
        let object = self.parse_object()?;
-        self.skip_comments_and_whitespace();
-        let end_obj_pos = self.tokenizer.pos();
-        let Some(PdfToken::Regular(b"endobj")) = self.tokenizer.next() else {
+        self.parser.skip_comments_and_whitespace();
+        let end_obj_pos = self.parser.tokenizer.pos();
+        let Some(PdfToken::Regular(b"endobj")) = self.parser.tokenizer.next() else {
            return Err(PdfParseError::MissingEndObj { pos: end_obj_pos });
        };
        if self.objects_map.insert(id, object).is_some() {
@@ -791,53 +829,13 @@ impl<'a> PdfParser<'a> {
         Ok(Some(()))
     }
-    fn parse_object_stream_inner(
-        &mut self,
-        object_stream: &PdfStream<PdfObjectStreamDictionary>,
-    ) -> Result<(), PdfParseError> {
-        let mut object_ids_and_byte_positions =
-            Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n);
-        for _ in 0..object_stream.dictionary().rest.n {
-            self.skip_comments_and_whitespace();
-            let Some((pos, object_number)) =
-                self.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))?
-            else {
-                return Err(PdfParseError::InvalidObjectNumber {
-                    pos: self.tokenizer.pos(),
-                });
-            };
-            self.skip_comments_and_whitespace();
-            let Some((_, byte_position)) =
-                self.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))?
-            else {
-                return Err(PdfParseError::InvalidNumber {
-                    pos: self.tokenizer.pos(),
-                });
-            };
-            object_ids_and_byte_positions.push((
-                PdfObjectIdentifier {
-                    pos: pos.into(),
-                    object_number,
-                    generation_number: 0,
-                },
-                byte_position,
-            ));
-        }
-        for (id, _byte_position) in object_ids_and_byte_positions {
-            let object = self.parse_object()?;
-            if self.objects_map.insert(id, object).is_some() {
-                return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id });
-            }
-        }
-        Ok(())
-    }
     fn parse_object_stream(
         &mut self,
         object_stream: &PdfStream<PdfObjectStreamDictionary>,
     ) -> Result<(), PdfParseError> {
         let data = object_stream.decoded_data().as_ref()?;
-        self.with_tokenizer(
-            PdfTokenizer::new(
+        let mut parser = PdfParser {
+            tokenizer: PdfTokenizer::new(
                 data,
                 PdfInputPositionKnown {
                     pos: 0,
@@ -850,18 +848,48 @@ impl<'a> PdfParser<'a> {
                     ),
                 },
             ),
-            |parser| parser.parse_object_stream_inner(object_stream),
-        )
-        .map_err(|e| PdfParseError::ObjectStreamParseError {
-            stream_pos: object_stream.get_pdf_input_position(),
-            error: Arc::new(e),
-        })
+            objects: self.parser.objects.clone(),
+        };
+        let mut object_ids_and_byte_positions =
+            Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n);
+        for _ in 0..object_stream.dictionary().rest.n {
+            parser.skip_comments_and_whitespace();
+            let Some((pos, object_number)) =
+                parser.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))?
+            else {
+                return Err(PdfParseError::InvalidObjectNumber {
+                    pos: parser.tokenizer.pos(),
+                });
+            };
+            parser.skip_comments_and_whitespace();
+            let Some((_, byte_position)) =
+                parser.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))?
+            else {
+                return Err(PdfParseError::InvalidNumber {
+                    pos: parser.tokenizer.pos(),
+                });
+            };
+            object_ids_and_byte_positions.push((
+                PdfObjectIdentifier {
+                    pos: pos.into(),
+                    object_number,
+                    generation_number: 0,
+                },
+                byte_position,
+            ));
+        }
+        for (id, _byte_position) in object_ids_and_byte_positions {
+            let object = parser
+                .parse_object_or_operator()?
+                .error_on_stream_or_operator()?;
+            if self.objects_map.insert(id, object).is_some() {
+                return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id });
+            }
+        }
+        Ok(())
     }
     fn parse_body(&mut self) -> Result<(), PdfParseError> {
         while let Some(()) = self.parse_indirect_object_definition()? {}
-        self.unparsed_stream_dictionaries
-            .drain(..)
-            .try_for_each(|v| v.finish_parsing())?;
         let mut object_streams: Vec<PdfStream<PdfObjectStreamDictionary>> = Vec::new();
         for object in self.objects_map.values_mut() {
             let stream = match object {
@@ -885,7 +913,7 @@ impl<'a> PdfParser<'a> {
         for object_stream in &object_streams {
             self.parse_object_stream(object_stream)?;
         }
-        let Ok(()) = self.objects_arc.inner.set(PdfObjectsInner {
+        let Ok(()) = self.parser.objects.inner.set(PdfObjectsInner {
             objects: std::mem::take(&mut self.objects_map),
             object_streams,
         }) else {
@@ -894,19 +922,19 @@ impl<'a> PdfParser<'a> {
         Ok(())
     }
     fn parse_xref_table(&mut self) -> Result<(), PdfParseError> {
-        self.skip_comments_and_whitespace();
-        let xref_pos = self.tokenizer.pos();
-        let Some(PdfToken::Regular(b"xref")) = self.tokenizer.peek() else {
+        self.parser.skip_comments_and_whitespace();
+        let xref_pos = self.parser.tokenizer.pos();
+        let Some(PdfToken::Regular(b"xref")) = self.parser.tokenizer.peek() else {
            return Ok(());
        };
        todo!("{xref_pos}")
    }
    fn parse_trailer(&mut self) -> Result<PdfTrailer, PdfParseError> {
-        self.skip_comments_and_whitespace();
-        let trailer_pos = self.tokenizer.pos();
-        let trailer_dictionary = match self.tokenizer.peek() {
+        self.parser.skip_comments_and_whitespace();
+        let trailer_pos = self.parser.tokenizer.pos();
+        let trailer_dictionary = match self.parser.tokenizer.peek() {
            Some(PdfToken::Regular(b"trailer")) => {
-                self.tokenizer.next();
+                self.parser.tokenizer.next();
                Some(PdfTrailerDictionary::parse(self.parse_object()?)?)
            }
            Some(PdfToken::Regular(b"startxref")) => None,
@@ -914,34 +942,35 @@ impl<'a> PdfParser<'a> {
                 return Err(PdfParseError::MissingTrailer { pos: trailer_pos });
             }
         };
-        self.skip_comments_and_whitespace();
-        let start_xref_kw_pos = self.tokenizer.pos();
-        let Some(PdfToken::Regular(b"startxref")) = self.tokenizer.next() else {
+        self.parser.skip_comments_and_whitespace();
+        let start_xref_kw_pos = self.parser.tokenizer.pos();
+        let Some(PdfToken::Regular(b"startxref")) = self.parser.tokenizer.next() else {
             return Err(PdfParseError::MissingStartXRefKeyword {
                 pos: start_xref_kw_pos,
             });
         };
-        let start_xref_pos = self.tokenizer.pos();
-        let Some((start_xref_pos, start_xref)) =
-            self.parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))?
+        let start_xref_pos = self.parser.tokenizer.pos();
+        let Some((start_xref_pos, start_xref)) = self
+            .parser
+            .parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))?
         else {
             return Err(PdfParseError::MissingStartXRefValue {
                 pos: start_xref_pos,
             });
         };
-        self.tokenizer.skip_whitespace();
-        let eof_comment_pos = self.tokenizer.pos();
+        self.parser.tokenizer.skip_whitespace();
+        let eof_comment_pos = self.parser.tokenizer.pos();
         let Some(PdfToken::Comment(b"%%EOF" | b"%%EOF\r" | b"%%EOF\r\n" | b"%%EOF\n")) =
-            self.tokenizer.next()
+            self.parser.tokenizer.next()
         else {
             return Err(PdfParseError::MissingEofComment {
                 pos: eof_comment_pos,
             });
         };
-        self.tokenizer.skip_whitespace();
-        if let Some(byte) = self.tokenizer.peek_byte() {
+        self.parser.tokenizer.skip_whitespace();
+        if let Some(byte) = self.parser.tokenizer.peek_byte() {
             return Err(PdfParseError::UnexpectedByte {
-                pos: self.tokenizer.pos(),
+                pos: self.parser.tokenizer.pos(),
                 byte,
             });
         }
@@ -951,24 +980,28 @@ impl<'a> PdfParser<'a> {
                 start_xref,
             });
         }
-        let old_tokenizer = self.tokenizer.clone();
-        self.tokenizer = PdfTokenizer::new(
-            self.tokenizer.bytes,
-            PdfInputPositionKnown {
-                pos: start_xref,
-                containing_streams_pos: None,
-            },
-        );
-        let id = self.parse_object_identifier(false);
-        self.tokenizer = old_tokenizer;
+        let id = PdfParser {
+            tokenizer: PdfTokenizer::new(
+                self.parser.tokenizer.bytes,
+                PdfInputPositionKnown {
+                    pos: start_xref,
+                    containing_streams_pos: None,
+                },
+            ),
+            objects: self.parser.objects.clone(),
+        }
+        .parse_object_identifier(false);
         let Some(id) = id? else {
             return Err(PdfParseError::InvalidStartXRefValue {
                 pos: start_xref_pos,
                 start_xref,
             });
         };
-        let xref_stream =
-            PdfStream::parse(PdfObjectIndirect::new(&self.objects_arc, id).get().into())?;
+        let xref_stream = PdfStream::parse(
+            PdfObjectIndirect::new(&self.parser.objects, id)
+                .get()
+                .into(),
+        )?;
         Ok(PdfTrailer::Stream {
             xref_stream,
             start_xref,
@@ -979,9 +1012,14 @@ impl<'a> PdfParser<'a> {
         self.parse_body()?;
         self.parse_xref_table()?;
         let trailer = self.parse_trailer()?;
+        for page in trailer.trailer_dictionary().root.pages.pages().iter() {
+            for content in page.contents.iter() {
+                content.decoded_data().as_ref()?;
+            }
+        }
         Ok(Pdf {
             header,
-            objects: self.objects_arc,
+            objects: self.parser.objects,
             trailer,
         })
     }
@@ -989,19 +1027,20 @@ impl<'a> PdfParser<'a> {

 impl Pdf {
     pub fn parse(bytes: impl AsRef<[u8]>) -> Result<Pdf, PdfParseError> {
-        PdfParser {
-            objects_arc: Arc::new(PdfObjects {
-                inner: OnceLock::new(),
-            }),
+        PdfFileParser {
+            parser: PdfParser {
+                objects: Arc::new(PdfObjects {
+                    inner: OnceLock::new(),
+                }),
+                tokenizer: PdfTokenizer::new(
+                    bytes.as_ref(),
+                    PdfInputPositionKnown {
+                        pos: 0,
+                        containing_streams_pos: None,
+                    },
+                ),
+            },
             objects_map: BTreeMap::new(),
-            unparsed_stream_dictionaries: vec![],
-            tokenizer: PdfTokenizer::new(
-                bytes.as_ref(),
-                PdfInputPositionKnown {
-                    pos: 0,
-                    containing_streams_pos: None,
-                },
-            ),
         }
         .parse_file()
     }
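Two of the rules the parser above enforces, restated as standalone hedged sketches. They mirror the logic in the hunks above but are not part of the commit, and the function names are made up. The first is the PDF rule that the `stream` keyword must be followed by CRLF or a bare LF, never a lone CR; the second is the integer/real classification applied to numeric tokens.

    // Hedged sketches mirroring checks from the diff above; illustrative only.

    /// EOL rule after the `stream` keyword: CRLF or LF, never a lone CR.
    /// Returns how many EOL bytes to skip, or None if the data is malformed.
    fn eol_after_stream_keyword(bytes: &[u8]) -> Option<usize> {
        match bytes {
            [b'\r', b'\n', ..] => Some(2),
            [b'\n', ..] => Some(1),
            _ => None, // a lone b'\r' (or anything else) is an error
        }
    }

    /// Classify a numeric token the way the parser does: optional sign,
    /// at most one '.', only ASCII digits otherwise, at least one digit.
    fn classify_number(token: &str) -> Option<&'static str> {
        let body = token.strip_prefix(&['+', '-']).unwrap_or(token);
        let mut parts = body.split('.');
        let integer_part = parts.next().expect("split yields at least one part");
        let fraction_part = parts.next();
        if parts.next().is_some() {
            return None; // more than one '.'
        }
        if integer_part.is_empty() && fraction_part.is_none_or(|f| f.is_empty()) {
            return None; // no digits at all, e.g. "." or "+"
        }
        if !integer_part.bytes().all(|b| b.is_ascii_digit()) {
            return None;
        }
        match fraction_part {
            Some(f) if !f.bytes().all(|b| b.is_ascii_digit()) => None,
            Some(_) => Some("Real"),    // "3.14", "7.", ".5"
            None => Some("Integer"),    // "42", "-17"
        }
    }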