// parse_powerisa_pdf/src/pdf.rs
// (snapshot metadata from the code viewer, kept as comments so the file compiles:)
// 2025-12-24 07:12:48 -08:00
// 930 lines
// 33 KiB
// Rust

use crate::{
pdf::{
object::{
MaybeArray, PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull,
PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfReal, PdfStream,
PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary,
},
parse::{PdfInputPosition, PdfParse, PdfParseError},
},
pdf_parse,
util::ArcOrRef,
};
use std::{
collections::BTreeMap,
convert::Infallible,
fmt,
num::NonZero,
str::FromStr,
sync::{Arc, OnceLock},
};
pub mod object;
pub mod parse;
/// Table of every indirect object in the file, keyed by object identifier.
///
/// Wrapped in a [`OnceLock`] because the table is only populated once the
/// whole body has been parsed (see `PdfParser::parse_body`), while
/// `PdfObjectIndirect` handles pointing at it are created earlier.
pub struct PdfObjects {
    // Set exactly once by `PdfParser::parse_body`.
    objects: OnceLock<BTreeMap<PdfObjectIdentifier, PdfObject>>,
}
/// The `%PDF-M.m` version from the header comment on the first line.
#[derive(Copy, Clone, Debug)]
pub struct PdfHeader {
    /// Major version; `NonZero` since a major version of 0 is not parsed
    /// as a valid header (see `PdfParser::parse_header`).
    pub major: NonZero<u16>,
    /// Minor version.
    pub minor: u16,
}
impl PdfHeader {
    /// Magic prefix of the header line, including the leading `%`.
    pub const PREFIX: &str = "%PDF-";
}
// Trailer dictionary of a classic (cross-reference-table) PDF file — the
// dictionary that follows the `trailer` keyword. Plain `//` comments are
// used here because they are stripped before macro expansion, whereas
// `///` doc comments would reach `pdf_parse!` as attributes.
pdf_parse! {
    #[derive(Clone, Debug)]
    pub struct PdfTrailerDictionary {
        // Total number of entries in the cross-reference table.
        #[pdf(name = "Size")]
        pub size: usize,
        // Byte offset of the previous cross-reference section, if any.
        #[pdf(name = "Prev")]
        pub prev: Option<usize>,
        // The document catalog dictionary.
        #[pdf(name = "Root")]
        pub root: PdfDictionary,
        // Encryption dictionary; present only in encrypted files.
        #[pdf(name = "Encrypt")]
        pub encrypt: Option<PdfDictionary>,
        // Document information (metadata) dictionary.
        #[pdf(name = "Info")]
        pub info: Option<PdfDictionary>,
        // Pair of file identifier strings.
        #[pdf(name = "ID")]
        pub id: Option<[PdfString; 2]>,
        // Any keys not matched by the named fields above.
        #[pdf(flatten)]
        pub rest: PdfDictionary,
    }
}
// The `/Type` value of a cross-reference stream dictionary; only the name
// `XRef` is accepted. (Plain `//` comments: stripped before the macro
// sees the tokens.)
pdf_parse! {
    #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)]
    pub enum PdfXRefName {
        #[pdf(name = "XRef")]
        #[default]
        XRef,
    }
}
// Dictionary of a cross-reference *stream* (the modern replacement for
// the classic xref table + trailer); it carries both the xref layout
// fields and the usual trailer fields. Plain `//` comments are used so
// the macro's token stream is unchanged.
pdf_parse! {
    #[derive(Clone, Debug)]
    pub struct PdfXRefStreamDictionaryRest {
        // Must be the name `XRef`.
        #[pdf(name = "Type")]
        pub ty: PdfXRefName,
        // Total number of cross-reference entries.
        #[pdf(name = "Size")]
        pub size: usize,
        // Pairs of (first object number, count) describing subsections.
        #[pdf(name = "Index")]
        pub index: Option<Arc<[usize]>>,
        // Byte offset of the previous cross-reference stream, if any.
        #[pdf(name = "Prev")]
        pub prev: Option<usize>,
        // Byte widths of the fields of each cross-reference entry.
        #[pdf(name = "W")]
        pub w: Option<Arc<[usize]>>,
        // The document catalog dictionary.
        #[pdf(name = "Root")]
        pub root: PdfDictionary,
        // Encryption dictionary; present only in encrypted files.
        #[pdf(name = "Encrypt")]
        pub encrypt: Option<PdfDictionary>,
        // Document information (metadata) dictionary.
        #[pdf(name = "Info")]
        pub info: Option<PdfDictionary>,
        // Pair of file identifier strings.
        #[pdf(name = "ID")]
        pub id: Option<[PdfString; 2]>,
        // Any keys not matched by the named fields above.
        #[pdf(flatten)]
        pub rest: PdfDictionary,
    }
}
/// How the file's trailer information was found: a classic `trailer`
/// dictionary, or a cross-reference stream pointed at by `startxref`.
#[derive(Clone, Debug)]
pub enum PdfTrailer {
    /// Classic form: `trailer << ... >> startxref N %%EOF`.
    Trailer {
        trailer_dictionary: PdfTrailerDictionary,
        /// Byte offset parsed after the `startxref` keyword.
        start_xref: usize,
    },
    /// Stream form: `startxref` points at an indirect stream object whose
    /// dictionary carries the trailer fields.
    Stream {
        xref_stream: PdfStream<PdfXRefStreamDictionaryRest>,
        /// Byte offset parsed after the `startxref` keyword.
        start_xref: usize,
    },
}
/// A parsed PDF file: version header, indirect object table, and trailer.
pub struct Pdf {
    pub header: PdfHeader,
    pub objects: Arc<PdfObjects>,
    pub trailer: PdfTrailer,
}
/// Lexical class of a single input byte: PDF distinguishes whitespace,
/// the delimiter characters, and "regular" characters (everything else).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum PdfCharCategory {
    Regular,
    Whitespace,
    LParen,
    RParen,
    LAngle,
    RAngle,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    FSlash,
    Percent,
}
impl PdfCharCategory {
    /// Classify one byte of input.
    fn new(b: u8) -> Self {
        // NUL, TAB, LF, FF, CR, and space are the PDF whitespace bytes.
        const WHITESPACE: &[u8] = b"\0\t\n\x0C\r ";
        if WHITESPACE.contains(&b) {
            return Self::Whitespace;
        }
        match b {
            b'(' => Self::LParen,
            b')' => Self::RParen,
            b'<' => Self::LAngle,
            b'>' => Self::RAngle,
            b'[' => Self::LBracket,
            b']' => Self::RBracket,
            b'{' => Self::LBrace,
            b'}' => Self::RBrace,
            b'/' => Self::FSlash,
            b'%' => Self::Percent,
            _ => Self::Regular,
        }
    }
}
/// A lexical token: a run of regular characters, a single delimiter, or a
/// `%` comment (payload slices borrow from the input).
#[derive(Clone, Copy, PartialEq)]
enum PdfToken<'a> {
    Regular(&'a [u8]),
    LParen,
    RParen,
    LAngle,
    RAngle,
    LBracket,
    RBracket,
    LBrace,
    RBrace,
    FSlash,
    Comment(&'a [u8]),
}
impl<'a> fmt::Debug for PdfToken<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        /// Show a byte payload as a string when it is valid UTF-8, and as
        /// a raw byte slice otherwise.
        fn write_payload(f: &mut fmt::Formatter<'_>, label: &str, contents: &[u8]) -> fmt::Result {
            match str::from_utf8(contents) {
                Ok(text) => write!(f, "{label}({text:?})"),
                Err(_) => write!(f, "{label}({contents:?})"),
            }
        }
        match self {
            Self::Regular(contents) => write_payload(f, "Regular", contents),
            Self::Comment(contents) => write_payload(f, "Comment", contents),
            Self::LParen => f.write_str("LParen"),
            Self::RParen => f.write_str("RParen"),
            Self::LAngle => f.write_str("LAngle"),
            Self::RAngle => f.write_str("RAngle"),
            Self::LBracket => f.write_str("LBracket"),
            Self::RBracket => f.write_str("RBracket"),
            Self::LBrace => f.write_str("LBrace"),
            Self::RBrace => f.write_str("RBrace"),
            Self::FSlash => f.write_str("FSlash"),
        }
    }
}
/// Cached result of `PdfTokenizer::peek`: the token together with the
/// byte offset the tokenizer will stand at after consuming it.
#[derive(Clone)]
struct PdfTokenizerPeek<'a> {
    token: PdfToken<'a>,
    pos_after_token: usize,
}
/// Byte-level tokenizer over the raw PDF input with one-token lookahead.
///
/// `Clone` is used to snapshot/restore the tokenizer for backtracking.
#[derive(Clone)]
struct PdfTokenizer<'a> {
    bytes: &'a [u8],
    // Current byte offset into `bytes`.
    pos: usize,
    // One-token lookahead; must be invalidated whenever `pos` moves by
    // raw bytes rather than through `next()`.
    peek_cache: Option<PdfTokenizerPeek<'a>>,
}
impl<'a> PdfTokenizer<'a> {
    /// Create a tokenizer over `bytes`, starting at byte offset `pos`.
    fn new(bytes: &'a [u8], pos: usize) -> Self {
        Self {
            bytes,
            pos,
            peek_cache: None,
        }
    }
    /// Current byte offset wrapped as a `PdfInputPosition` for error reports.
    fn pos(&self) -> PdfInputPosition {
        PdfInputPosition::new(self.pos)
    }
    /// Look at the byte at the current offset without consuming it.
    fn peek_byte(&mut self) -> Option<u8> {
        self.bytes.get(self.pos).copied()
    }
    /// Consume and return the byte at the current offset.
    ///
    /// Advancing by raw bytes invalidates the token peek cache, since the
    /// cached token was computed for the old position.
    fn next_byte(&mut self) -> Option<u8> {
        let b = self.bytes.get(self.pos)?;
        self.pos += 1;
        self.peek_cache = None;
        Some(*b)
    }
    /// Skip a run of PDF whitespace bytes (NUL, TAB, LF, FF, CR, space).
    fn skip_whitespace(&mut self) {
        while let Some(PdfCharCategory::Whitespace) = self.peek_byte().map(PdfCharCategory::new) {
            self.next_byte();
        }
    }
    /// Return the next token without consuming it.
    ///
    /// The token and the position just past it are cached so a following
    /// `next()` doesn't re-tokenize; any byte-level advance clears the cache.
    fn peek(&mut self) -> Option<PdfToken<'a>> {
        if let Some(PdfTokenizerPeek { token, .. }) = self.peek_cache {
            return Some(token);
        }
        let mut tokenizer = self.clone();
        let token = tokenizer.next()?;
        self.peek_cache = Some(PdfTokenizerPeek {
            token,
            pos_after_token: tokenizer.pos,
        });
        Some(token)
    }
    /// Consume exactly `len` raw bytes (used for stream data); `None` if
    /// fewer than `len` bytes remain.
    fn read_bytes(&mut self, len: usize) -> Option<&'a [u8]> {
        // `saturating_add` keeps a huge `len` from overflowing the range
        // end; the range is then simply out of bounds and `get` is `None`.
        let retval = self.bytes.get(self.pos..self.pos.saturating_add(len))?;
        self.peek_cache = None;
        self.pos += len;
        Some(retval)
    }
}
impl<'a> Iterator for PdfTokenizer<'a> {
    type Item = PdfToken<'a>;
    fn next(&mut self) -> Option<Self::Item> {
        // Serve (and consume) a previously peeked token if one is cached.
        if let Some(PdfTokenizerPeek {
            token,
            pos_after_token,
        }) = self.peek_cache.take()
        {
            self.pos = pos_after_token;
            return Some(token);
        }
        loop {
            let start_pos = self.pos;
            break match PdfCharCategory::new(self.next_byte()?) {
                // Whitespace separates tokens but produces none.
                PdfCharCategory::Whitespace => continue,
                PdfCharCategory::LParen => Some(PdfToken::LParen),
                PdfCharCategory::RParen => Some(PdfToken::RParen),
                PdfCharCategory::LAngle => Some(PdfToken::LAngle),
                PdfCharCategory::RAngle => Some(PdfToken::RAngle),
                PdfCharCategory::LBracket => Some(PdfToken::LBracket),
                PdfCharCategory::RBracket => Some(PdfToken::RBracket),
                PdfCharCategory::LBrace => Some(PdfToken::LBrace),
                PdfCharCategory::RBrace => Some(PdfToken::RBrace),
                PdfCharCategory::FSlash => Some(PdfToken::FSlash),
                PdfCharCategory::Percent => {
                    // A comment runs to end of line; the token slice keeps
                    // the `%` and the line-terminator byte(s).
                    loop {
                        match self.next_byte() {
                            None | Some(b'\n') => break,
                            Some(b'\r') => {
                                // Fold CRLF into a single end-of-line.
                                if let Some(b'\n') = self.peek_byte() {
                                    self.pos += 1;
                                }
                                break;
                            }
                            Some(_) => continue,
                        }
                    }
                    Some(PdfToken::Comment(&self.bytes[start_pos..self.pos]))
                }
                PdfCharCategory::Regular => {
                    // Greedily take the maximal run of regular characters.
                    while let Some(PdfCharCategory::Regular) =
                        self.peek_byte().map(PdfCharCategory::new)
                    {
                        self.pos += 1;
                    }
                    Some(PdfToken::Regular(&self.bytes[start_pos..self.pos]))
                }
            };
        }
    }
}
/// Single-use state for one parse of a whole file.
struct PdfParser<'a> {
    // Shared handle through which `PdfObjectIndirect`s resolve; its inner
    // map is published once the body has been parsed.
    objects_arc: Arc<PdfObjects>,
    // Objects collected during `parse_body`, then moved into `objects_arc`.
    objects_map: BTreeMap<PdfObjectIdentifier, PdfObject>,
    // Stream dictionaries whose full parse is deferred to the end of
    // `parse_body` (after the object table exists).
    unparsed_stream_dictionaries: Vec<UnparsedPdfStreamDictionary<PdfDictionary>>,
    tokenizer: PdfTokenizer<'a>,
}
impl<'a> PdfParser<'a> {
fn parse_header(&mut self) -> Result<PdfHeader, PdfParseError> {
let Some(b'%') = self.tokenizer.bytes.first() else {
return Err(PdfParseError::NotAPdfFile);
};
let Some(PdfToken::Comment(header)) = self.tokenizer.next() else {
unreachable!()
};
let Ok(header) = str::from_utf8(header) else {
return Err(PdfParseError::NotAPdfFile);
};
let header = header.trim_end_matches(['\n', '\r']);
let Some(version) = header.strip_prefix(PdfHeader::PREFIX) else {
return Err(PdfParseError::NotAPdfFile);
};
let Some((major_str, minor_str)) = version.split_once('.') else {
return Err(PdfParseError::NotAPdfFile);
};
let (Ok(major), Ok(minor)) = (major_str.parse(), minor_str.parse()) else {
return Err(PdfParseError::NotAPdfFile);
};
Ok(PdfHeader { major, minor })
}
fn skip_comments_and_whitespace(&mut self) {
self.tokenizer.skip_whitespace();
while let Some(PdfToken::Comment(_)) = self.tokenizer.peek() {
self.tokenizer.next();
self.tokenizer.skip_whitespace();
}
}
    /// Parse a token made solely of ASCII digits into `T`.
    ///
    /// Returns `Ok(None)` (tokenizer rewound) when the next token is not a
    /// pure digit run. When the digits don't fit in `T`, `on_parse_failed`
    /// decides the outcome: its `Option<Infallible>` payload proves it can
    /// only produce `None` or an error, never a fabricated value.
    fn parse_digits<T: FromStr>(
        &mut self,
        on_parse_failed: impl FnOnce(PdfInputPosition) -> Result<Option<Infallible>, PdfParseError>,
    ) -> Result<Option<(PdfInputPosition, T)>, PdfParseError> {
        self.skip_comments_and_whitespace();
        let old_tokenizer = self.tokenizer.clone();
        let pos = self.tokenizer.pos();
        let Some(PdfToken::Regular(number)) = self.tokenizer.next() else {
            self.tokenizer = old_tokenizer;
            return Ok(None);
        };
        if !number.iter().all(|b| b.is_ascii_digit()) {
            self.tokenizer = old_tokenizer;
            return Ok(None);
        }
        // All bytes are ASCII digits, so UTF-8 conversion cannot fail; a
        // failure here therefore means the value is out of range for `T`.
        let Some(number) = str::from_utf8(number).ok().and_then(|v| v.parse().ok()) else {
            self.tokenizer = old_tokenizer;
            return Ok(match on_parse_failed(pos)? {
                None => None,
            });
        };
        Ok(Some((pos, number)))
    }
    /// Parse an `object_number generation_number` pair (two digit runs).
    ///
    /// With `return_none_for_out_of_range`, overflowing numbers rewind and
    /// yield `Ok(None)` (used when merely probing for an `N G R`
    /// reference); otherwise they are hard errors.
    fn parse_object_identifier(
        &mut self,
        return_none_for_out_of_range: bool,
    ) -> Result<Option<PdfObjectIdentifier>, PdfParseError> {
        let old_tokenizer = self.tokenizer.clone();
        let Some((pos, object_number)) = self.parse_digits(|pos| {
            if return_none_for_out_of_range {
                Ok(None)
            } else {
                Err(PdfParseError::InvalidObjectNumber { pos })
            }
        })?
        else {
            self.tokenizer = old_tokenizer;
            return Ok(None);
        };
        let Some((_pos, generation_number)) = self.parse_digits(|pos| {
            if return_none_for_out_of_range {
                Ok(None)
            } else {
                Err(PdfParseError::InvalidGenerationNumber { pos })
            }
        })?
        else {
            self.tokenizer = old_tokenizer;
            return Ok(None);
        };
        // The identifier carries the position of the object number.
        Ok(Some(PdfObjectIdentifier {
            pos: pos.into(),
            object_number,
            generation_number,
        }))
    }
    /// Try to parse an `N G R` indirect reference.
    ///
    /// Rewinds the tokenizer and returns `Ok(None)` if the upcoming tokens
    /// are not a full reference (e.g. just an integer).
    fn parse_indirect_object(&mut self) -> Result<Option<PdfObjectIndirect>, PdfParseError> {
        let old_tokenizer = self.tokenizer.clone();
        let Some(id) = self.parse_object_identifier(true)? else {
            self.tokenizer = old_tokenizer;
            return Ok(None);
        };
        if let Some(PdfToken::Regular(b"R")) = self.tokenizer.next() {
            Ok(Some(PdfObjectIndirect::new(&self.objects_arc, id)))
        } else {
            self.tokenizer = old_tokenizer;
            Ok(None)
        }
    }
    /// Parse a literal string `(...)`, the opening `(` already consumed.
    ///
    /// Handles nested balanced parentheses and `\`-escapes, and normalizes
    /// every unescaped end-of-line (CR, LF, or CRLF) to a single LF.
    fn parse_string_after_l_paren(&mut self) -> Result<PdfString, PdfParseError> {
        let mut contents = Vec::new();
        // Depth of unescaped parentheses; the string ends when the depth
        // would return to zero.
        let mut paren_level = NonZero::new(1usize).expect("non-zero");
        let string_pos = self.tokenizer.pos();
        while let Some(b) = self.tokenizer.next_byte() {
            contents.push(match b {
                b'(' => {
                    paren_level = paren_level.checked_add(1).expect("overflow");
                    b
                }
                b')' => {
                    // Depth hitting zero means this `)` closes the string.
                    let Some(new_paren_level) = NonZero::new(paren_level.get() - 1) else {
                        return Ok(PdfString::new(
                            string_pos,
                            ArcOrRef::Arc(Arc::from(contents)),
                        ));
                    };
                    paren_level = new_paren_level;
                    b
                }
                // Unescaped CRLF / CR / LF are all stored as LF.
                b'\r' if self.tokenizer.peek_byte() == Some(b'\n') => {
                    self.tokenizer.next_byte();
                    b'\n'
                }
                b'\r' | b'\n' => b'\n',
                b'\\' => {
                    let pos = self.tokenizer.pos();
                    let Some(b) = self.tokenizer.next_byte() else {
                        return Err(PdfParseError::InvalidStringEscape { pos });
                    };
                    match b {
                        // Escaped line ending: line continuation, emits nothing.
                        b'\r' if self.tokenizer.peek_byte() == Some(b'\n') => {
                            self.tokenizer.next_byte();
                            continue;
                        }
                        b'\r' | b'\n' => continue,
                        b'n' => b'\n',
                        b'r' => b'\r',
                        b't' => b'\t',
                        b'b' => b'\x08',
                        b'f' => b'\x0C',
                        b'(' | b')' | b'\\' => b,
                        // Octal escape of up to three digits; bits shifted
                        // out of the u8 are silently discarded.
                        b'0'..=b'7' => {
                            const MAX_OCTAL_DIGITS: usize = 3;
                            let mut value = b - b'0';
                            let mut len = 1;
                            while len < MAX_OCTAL_DIGITS {
                                let Some(b @ b'0'..=b'7') = self.tokenizer.peek_byte() else {
                                    break;
                                };
                                value <<= 3;
                                value |= b - b'0';
                                len += 1;
                                self.tokenizer.next_byte();
                            }
                            value
                        }
                        _ => {
                            // NOTE(review): stricter than ISO 32000-1, which says
                            // an unrecognized escape should just drop the
                            // backslash and keep the character; erroring here is
                            // presumably deliberate strictness — confirm.
                            return Err(PdfParseError::InvalidStringEscape { pos });
                        }
                    }
                }
                _ => b,
            });
        }
        // Input ended before the closing `)`.
        Err(PdfParseError::TruncatedFile {
            pos: self.tokenizer.pos(),
        })
    }
    /// Parse a hex string `<...>`, the opening `<` already consumed.
    ///
    /// Whitespace between digits is skipped; an odd number of digits is
    /// completed with a trailing `0` low nibble.
    fn parse_string_after_l_angle(&mut self) -> Result<PdfString, PdfParseError> {
        let mut contents = Vec::new();
        // Pending high nibble; a byte is emitted on every second hex digit.
        let mut high_digit_value = None;
        let mut push_digit_value = |value: u8| {
            high_digit_value = match high_digit_value {
                Some(high_digit_value) => {
                    contents.push((high_digit_value << 4) | value);
                    None
                }
                None => Some(value),
            };
        };
        let string_pos = self.tokenizer.pos();
        loop {
            let pos = self.tokenizer.pos();
            match self.tokenizer.next_byte() {
                None => {
                    return Err(PdfParseError::TruncatedFile { pos });
                }
                // Whitespace between digits is ignored.
                Some(b) if PdfCharCategory::new(b) == PdfCharCategory::Whitespace => {}
                Some(b'>') => {
                    // if we have an odd trailing digit, add the final digit, otherwise doesn't modify contents
                    push_digit_value(0);
                    return Ok(PdfString::new(
                        string_pos,
                        Arc::<[u8]>::from(contents).into(),
                    ));
                }
                Some(b) => {
                    let Some(value) = (b as char).to_digit(0x10) else {
                        return Err(PdfParseError::InvalidHexStringDigit { pos });
                    };
                    push_digit_value(value as u8);
                }
            }
        }
    }
    /// Parse a name object, the leading `/` already consumed.
    ///
    /// The name is the following run of regular characters (possibly
    /// empty); `#xx` escapes are decoded from two hex digits into one byte.
    fn parse_name_after_f_slash(&mut self) -> Result<PdfName, PdfParseError> {
        let mut name = vec![];
        let name_pos = self.tokenizer.pos();
        loop {
            // Any non-regular byte (whitespace or delimiter) ends the name
            // without being consumed.
            let Some(PdfCharCategory::Regular) =
                self.tokenizer.peek_byte().map(PdfCharCategory::new)
            else {
                return Ok(PdfName::new(name_pos, ArcOrRef::Arc(Arc::from(name))));
            };
            let pos = self.tokenizer.pos();
            match self
                .tokenizer
                .next_byte()
                .expect("just checked that it's not None")
            {
                b'#' => {
                    // `#` must be followed by exactly two hex digits.
                    let mut value = 0u8;
                    for _ in 0..2 {
                        let Some(digit) = self
                            .tokenizer
                            .next_byte()
                            .and_then(|b| (b as char).to_digit(0x10))
                        else {
                            return Err(PdfParseError::InvalidNameEscape { pos });
                        };
                        value <<= 4;
                        value |= digit as u8;
                    }
                    name.push(value);
                }
                b => name.push(b),
            }
        }
    }
fn parse_array_after_l_bracket(&mut self) -> Result<PdfArray, PdfParseError> {
let array_pos = self.tokenizer.pos();
let mut contents: Vec<PdfObject> = Vec::new();
loop {
self.skip_comments_and_whitespace();
if let Some(PdfToken::RBracket) = self.tokenizer.peek() {
self.tokenizer.next();
return Ok(PdfArray::from_elements(array_pos, Arc::from(contents)));
}
contents.push(self.parse_object()?);
}
}
    /// assumes `self.tokenizer.peek_byte() == Some(b'<')`
    ///
    /// Parse a dictionary `<< ... >>`: the first `<` was already consumed
    /// by the caller as an `LAngle` token, the second is consumed here.
    fn parse_dictionary_after_one_l_angle(&mut self) -> Result<PdfDictionary, PdfParseError> {
        let l_angle = self.tokenizer.next_byte();
        assert_eq!(l_angle, Some(b'<'));
        let dictionary_pos = self.tokenizer.pos();
        let mut contents: BTreeMap<PdfName, PdfObject> = BTreeMap::new();
        loop {
            self.skip_comments_and_whitespace();
            // A `>` token closes the dictionary only when immediately
            // followed by a second raw `>` byte.
            if let Some(PdfToken::RAngle) = self.tokenizer.peek() {
                self.tokenizer.next();
                let pos = self.tokenizer.pos();
                let b'>' = self
                    .tokenizer
                    .next_byte()
                    .ok_or(PdfParseError::TruncatedFile { pos })?
                else {
                    return Err(PdfParseError::InvalidDictionaryClosingDoubleRAngle { pos });
                };
                return Ok(PdfDictionary::from_fields(
                    dictionary_pos,
                    Arc::new(contents),
                ));
            }
            // Each entry is a name key followed by an arbitrary object.
            let name = PdfName::parse(self.parse_object()?.into())?;
            let name_pos = name.pos();
            match contents.entry(name) {
                std::collections::btree_map::Entry::Vacant(entry) => {
                    entry.insert(self.parse_object()?.into());
                }
                std::collections::btree_map::Entry::Occupied(entry) => {
                    // Duplicate keys are rejected rather than last-wins.
                    return Err(PdfParseError::DuplicateDictionaryKey {
                        pos: name_pos,
                        name: entry.key().clone(),
                    });
                }
            }
        }
    }
    /// assumes `self.tokenizer.peek() == Some(PdfToken::Regular(b"stream"))`
    ///
    /// Parse a `stream ... endstream` body following `dictionary`,
    /// consuming exactly the number of raw bytes given by the
    /// dictionary's length entry.
    fn parse_stream_after_dictionary(
        &mut self,
        dictionary: PdfDictionary,
    ) -> Result<PdfStream, PdfParseError> {
        self.tokenizer.skip_whitespace();
        let stream_pos = self.tokenizer.pos();
        let stream = self.tokenizer.next();
        assert_eq!(stream, Some(PdfToken::Regular(b"stream")));
        // The length must be known up front to slice out the raw data.
        let len = PdfStreamDictionary::parse_len_from_dictionary(&dictionary)?;
        // `stream` must be followed by LF or CRLF; a bare CR is invalid.
        let eol_pos = self.tokenizer.pos();
        match self.tokenizer.next_byte() {
            None => return Err(PdfParseError::TruncatedFile { pos: eol_pos }),
            Some(b'\r') => {
                let Some(b'\n') = self.tokenizer.next_byte() else {
                    return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword {
                        pos: eol_pos,
                    });
                };
            }
            Some(b'\n') => {}
            _ => return Err(PdfParseError::InvalidOrMissingEolAfterStreamKeyword { pos: eol_pos }),
        }
        let Some(data) = self.tokenizer.read_bytes(len) else {
            return Err(PdfParseError::TruncatedFile {
                pos: PdfInputPosition::new(self.tokenizer.bytes.len()),
            });
        };
        // Full parsing of the stream dictionary is deferred; it is queued
        // here and finished at the end of `parse_body`.
        let (stream, unparsed) = PdfStream::new_unparsed(stream_pos, dictionary, Arc::from(data));
        self.unparsed_stream_dictionaries.push(unparsed);
        self.skip_comments_and_whitespace();
        let pos = self.tokenizer.pos();
        if let Some(PdfToken::Regular(b"endstream")) = self.tokenizer.next() {
            Ok(stream)
        } else {
            Err(PdfParseError::MissingEndStreamKeyword { pos })
        }
    }
    /// Parse one object at the current position: indirect reference,
    /// boolean, null, number, string, name, array, dictionary, or stream.
    fn parse_object(&mut self) -> Result<PdfObject, PdfParseError> {
        self.skip_comments_and_whitespace();
        // `N G R` references must be tried first, because they also begin
        // with an integer token.
        if let Some(indirect) = self.parse_indirect_object()? {
            return Ok(indirect.into());
        }
        let pos = self.tokenizer.pos();
        match self
            .tokenizer
            .next()
            .ok_or(PdfParseError::TruncatedFile { pos })?
        {
            PdfToken::Regular(b"true") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, true))),
            PdfToken::Regular(b"false") => Ok(PdfObject::Boolean(PdfBoolean::new(pos, false))),
            PdfToken::Regular(b"null") => Ok(PdfObject::Null(PdfNull::new(pos))),
            // A token starting with an optional sign then a digit or `.`
            // is a number (integer or real).
            PdfToken::Regular(
                number @ ([b'+' | b'-', b'0'..=b'9' | b'.', ..] | [b'0'..=b'9' | b'.', ..]),
            ) => {
                // parse number
                let Ok(number) = str::from_utf8(number) else {
                    return Err(PdfParseError::InvalidNumber { pos });
                };
                // Split into integer/fraction around at most one `.`; the
                // sign is stripped for validation only, not for parsing.
                let mut parts = number
                    .strip_prefix(&['+', '-'])
                    .unwrap_or(number)
                    .split('.');
                let integer_part = parts
                    .next()
                    .expect("split always returns at least one part");
                let fraction_part = parts.next();
                // More than one `.` is invalid.
                if parts.next().is_some() {
                    return Err(PdfParseError::InvalidNumber { pos });
                }
                // A bare sign or lone `.` with no digits at all is invalid.
                if integer_part.is_empty() && fraction_part.is_none_or(|v| v.is_empty()) {
                    return Err(PdfParseError::InvalidNumber { pos });
                }
                if !integer_part.bytes().all(|v| v.is_ascii_digit()) {
                    return Err(PdfParseError::InvalidNumber { pos });
                }
                if let Some(fraction_part) = fraction_part {
                    if !fraction_part.bytes().all(|v| v.is_ascii_digit()) {
                        return Err(PdfParseError::InvalidNumber { pos });
                    }
                    // Presence of a `.` makes it a real number.
                    Ok(PdfObject::Real(PdfReal::new(
                        pos,
                        number
                            .parse()
                            .map_err(|_| PdfParseError::InvalidNumber { pos })?,
                    )))
                } else {
                    Ok(PdfObject::Integer(PdfInteger::new(
                        pos,
                        number
                            .parse()
                            .map_err(|_| PdfParseError::InvalidNumber { pos })?,
                    )))
                }
            }
            // Any other bare keyword is not handled yet.
            PdfToken::Regular(items) => todo!("{:?}", str::from_utf8(items)),
            PdfToken::LParen => self.parse_string_after_l_paren().map(PdfObject::String),
            PdfToken::RParen => todo!(),
            PdfToken::LAngle => {
                // `<<` starts a dictionary (possibly a stream); a single
                // `<` starts a hex string.
                if self.tokenizer.peek_byte() == Some(b'<') {
                    let dictionary = self.parse_dictionary_after_one_l_angle()?;
                    self.skip_comments_and_whitespace();
                    if let Some(PdfToken::Regular(b"stream")) = self.tokenizer.peek() {
                        self.parse_stream_after_dictionary(dictionary)
                            .map(PdfObject::Stream)
                    } else {
                        Ok(dictionary.into())
                    }
                } else {
                    self.parse_string_after_l_angle().map(PdfObject::String)
                }
            }
            PdfToken::RAngle => todo!(),
            PdfToken::LBracket => self.parse_array_after_l_bracket().map(PdfObject::Array),
            PdfToken::RBracket => todo!(),
            PdfToken::LBrace => todo!(),
            PdfToken::RBrace => todo!(),
            PdfToken::FSlash => self.parse_name_after_f_slash().map(PdfObject::Name),
            // Comments were skipped at the top of this function.
            PdfToken::Comment(_) => unreachable!(),
        }
    }
    /// Parse one `N G obj <object> endobj` definition into `objects_map`.
    ///
    /// Returns `Ok(None)` when the next tokens do not start with an object
    /// identifier, i.e. the body is finished and the trailer follows.
    fn parse_indirect_object_definition(&mut self) -> Result<Option<()>, PdfParseError> {
        self.skip_comments_and_whitespace();
        let Some(id) = self.parse_object_identifier(false)? else {
            return Ok(None);
        };
        self.skip_comments_and_whitespace();
        let obj_pos = self.tokenizer.pos();
        let Some(PdfToken::Regular(b"obj")) = self.tokenizer.next() else {
            return Err(PdfParseError::MissingObj { pos: obj_pos });
        };
        let object = self.parse_object()?;
        self.skip_comments_and_whitespace();
        let end_obj_pos = self.tokenizer.pos();
        let Some(PdfToken::Regular(b"endobj")) = self.tokenizer.next() else {
            return Err(PdfParseError::MissingEndObj { pos: end_obj_pos });
        };
        // Redefining the same object/generation pair is an error.
        if self.objects_map.insert(id, object).is_some() {
            Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id })
        } else {
            Ok(Some(()))
        }
    }
    /// Parse every indirect object definition, publish the object table,
    /// then finish parsing the queued stream dictionaries.
    fn parse_body(&mut self) -> Result<(), PdfParseError> {
        while let Some(()) = self.parse_indirect_object_definition()? {}
        // Publish the completed map into the shared `PdfObjects`; `set`
        // can only fail if it was already set, i.e. `parse_body` ran twice.
        let Ok(()) = self
            .objects_arc
            .objects
            .set(std::mem::take(&mut self.objects_map))
        else {
            unreachable!();
        };
        // Stream dictionaries were queued during parsing and are finished
        // only now — presumably because they may reference indirect
        // objects that only exist once the table is populated (confirm in
        // `UnparsedPdfStreamDictionary::finish_parsing`).
        self.unparsed_stream_dictionaries
            .drain(..)
            .try_for_each(|v| v.finish_parsing())
    }
fn parse_xref_table(&mut self) -> Result<(), PdfParseError> {
self.skip_comments_and_whitespace();
let xref_pos = self.tokenizer.pos();
let Some(PdfToken::Regular(b"xref")) = self.tokenizer.peek() else {
return Ok(());
};
todo!("{xref_pos}")
}
    /// Parse the file trailer: an optional `trailer` dictionary, the
    /// `startxref` offset, and the final `%%EOF` marker.
    ///
    /// When there is no `trailer` dictionary, `startxref` must point at an
    /// already-parsed cross-reference stream object, which is resolved and
    /// returned instead.
    fn parse_trailer(&mut self) -> Result<PdfTrailer, PdfParseError> {
        self.skip_comments_and_whitespace();
        let trailer_pos = self.tokenizer.pos();
        let trailer_dictionary = match self.tokenizer.peek() {
            Some(PdfToken::Regular(b"trailer")) => {
                self.tokenizer.next();
                Some(PdfTrailerDictionary::parse(self.parse_object()?)?)
            }
            // No `trailer` keyword: expect the xref-stream form below.
            Some(PdfToken::Regular(b"startxref")) => None,
            _ => {
                return Err(PdfParseError::MissingTrailer { pos: trailer_pos });
            }
        };
        self.skip_comments_and_whitespace();
        let start_xref_kw_pos = self.tokenizer.pos();
        let Some(PdfToken::Regular(b"startxref")) = self.tokenizer.next() else {
            return Err(PdfParseError::MissingStartXRefKeyword {
                pos: start_xref_kw_pos,
            });
        };
        // Shadowed on success by the digits' own position.
        let start_xref_pos = self.tokenizer.pos();
        let Some((start_xref_pos, start_xref)) =
            self.parse_digits(|pos| Err(PdfParseError::IntegerOutOfRange { pos }))?
        else {
            return Err(PdfParseError::MissingStartXRefValue {
                pos: start_xref_pos,
            });
        };
        self.tokenizer.skip_whitespace();
        let eof_comment_pos = self.tokenizer.pos();
        // `%%EOF`, optionally terminated by CR, LF, or CRLF.
        let Some(PdfToken::Comment(b"%%EOF" | b"%%EOF\r" | b"%%EOF\r\n" | b"%%EOF\n")) =
            self.tokenizer.next()
        else {
            return Err(PdfParseError::MissingEofComment {
                pos: eof_comment_pos,
            });
        };
        self.tokenizer.skip_whitespace();
        // Anything but whitespace after `%%EOF` is rejected.
        if let Some(byte) = self.tokenizer.peek_byte() {
            return Err(PdfParseError::UnexpectedByte {
                pos: self.tokenizer.pos(),
                byte,
            });
        }
        if let Some(trailer_dictionary) = trailer_dictionary {
            return Ok(PdfTrailer::Trailer {
                trailer_dictionary,
                start_xref,
            });
        }
        // Xref-stream form: temporarily jump the tokenizer to the
        // `startxref` offset just to read the object identifier there,
        // then restore the position.
        let old_tokenizer = self.tokenizer.clone();
        self.tokenizer = PdfTokenizer::new(self.tokenizer.bytes, start_xref);
        let id = self.parse_object_identifier(false);
        self.tokenizer = old_tokenizer;
        let Some(id) = id? else {
            return Err(PdfParseError::InvalidStartXRefValue {
                pos: start_xref_pos,
                start_xref,
            });
        };
        // Resolve the identifier against the already-parsed object table.
        let xref_stream =
            PdfStream::parse(PdfObjectIndirect::new(&self.objects_arc, id).get().into())?;
        Ok(PdfTrailer::Stream {
            xref_stream,
            start_xref,
        })
    }
fn parse_file(mut self) -> Result<Pdf, PdfParseError> {
let header = self.parse_header()?;
self.parse_body()?;
self.parse_xref_table()?;
let trailer = self.parse_trailer()?;
Ok(Pdf {
header,
objects: self.objects_arc,
trailer,
})
}
}
impl Pdf {
    /// Parse a complete in-memory PDF file.
    pub fn parse(bytes: impl AsRef<[u8]>) -> Result<Pdf, PdfParseError> {
        // Object table starts empty and is filled during body parsing.
        let objects_arc = Arc::new(PdfObjects {
            objects: OnceLock::new(),
        });
        let parser = PdfParser {
            objects_arc,
            objects_map: BTreeMap::new(),
            unparsed_stream_dictionaries: Vec::new(),
            tokenizer: PdfTokenizer::new(bytes.as_ref(), 0),
        };
        parser.parse_file()
    }
}
#[cfg(test)]
mod tests {
    use crate::{
        pdf::{
            object::{
                PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject, PdfString,
            },
            parse::{PdfInputPosition, PdfParse, PdfParseError},
        },
        util::ArcOrRef,
    };
    /// Deserializing a dictionary into a `pdf_parse!` struct: named fields
    /// are extracted, and unmatched keys land in the `flatten` field.
    #[test]
    fn test_deserialize_dict() -> Result<(), PdfParseError> {
        crate::pdf::parse::pdf_parse! {
            #[derive(Debug)]
            #[allow(dead_code)]
            struct TestStruct {
                #[pdf(name = "a")]
                a: i32,
                #[pdf(name = "c")]
                c: i32,
                #[pdf(name = "b")]
                b: i32,
                #[pdf(flatten)]
                rest: PdfDictionary,
            }
        }
        let v: TestStruct = PdfParse::parse(PdfObject::from(PdfDictionary::from_iter([
            (
                PdfName::new_static(b"a"),
                PdfInteger::new(PdfInputPosition::empty(), 1).into(),
            ),
            (
                PdfName::new_static(b"c"),
                PdfInteger::new(PdfInputPosition::empty(), 7).into(),
            ),
            (
                PdfName::new_static(b"b"),
                PdfInteger::new(PdfInputPosition::empty(), 5).into(),
            ),
            (
                PdfName::new_static(b"d"),
                PdfBoolean::new(PdfInputPosition::empty(), false).into(),
            ),
            (
                PdfName::new_static(b"e"),
                PdfNull::new(PdfInputPosition::empty()).into(),
            ),
            (
                PdfName::new_static(b"f"),
                PdfString::new(PdfInputPosition::empty(), ArcOrRef::Ref(b"test")).into(),
            ),
        ])))?;
        // Expected: `d` and `f` flow into `rest`, while the null entry `e`
        // is dropped entirely.
        let expected = TestStruct {
            a: 1,
            c: 7,
            b: 5,
            rest: PdfDictionary::from_iter([
                (
                    PdfName::new_static(b"d"),
                    PdfBoolean::new(PdfInputPosition::empty(), false).into(),
                ),
                (
                    PdfName::new_static(b"f"),
                    PdfString::new(PdfInputPosition::empty(), ArcOrRef::Ref(b"test")).into(),
                ),
            ]),
        };
        // Compared via Debug output — presumably because these types don't
        // implement PartialEq.
        assert_eq!(format!("{v:?}"), format!("{expected:?}"));
        Ok(())
    }
}