parses root successfully

This commit is contained in:
Jacob Lifshay 2025-12-24 21:49:57 -08:00
parent 5fbfaa8053
commit 83631cc4c6
Signed by: programmerjake
SSH key fingerprint: SHA256:HnFTLGpSm4Q4Fj502oCFisjZSoakwEuTsJJMSke63RQ
7 changed files with 623 additions and 118 deletions

50
Cargo.lock generated
View file

@ -2,6 +2,56 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "adler2"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
[[package]]
name = "cfg-if"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "crc32fast"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
dependencies = [
"cfg-if",
]
[[package]]
name = "flate2"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "miniz_oxide"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
dependencies = [
"adler2",
"simd-adler32",
]
[[package]]
name = "parse_powerisa_pdf"
version = "0.1.0"
dependencies = [
"flate2",
]
[[package]]
name = "simd-adler32"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2"

View file

@ -5,4 +5,5 @@ edition = "2024"
license = "LGPL-3.0-or-later"
[dependencies]
flate2 = "1.1.5"

View file

@ -1,11 +1,11 @@
use crate::{
pdf::{
object::{
MaybeArray, PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull,
PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfReal, PdfStream,
PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject,
PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream,
PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary,
},
parse::{PdfInputPosition, PdfParse, PdfParseError},
parse::{GetPdfInputPosition, PdfInputPosition, PdfParse, PdfParseError},
},
pdf_parse,
util::ArcOrRef,
@ -21,9 +21,15 @@ use std::{
pub mod object;
pub mod parse;
pub mod stream_filters;
/// The fully parsed object table that `PdfObjects` publishes once body parsing has finished.
struct PdfObjectsInner {
/// Every parsed object, keyed by its object identifier.
objects: BTreeMap<PdfObjectIdentifier, PdfObject>,
/// The streams that were recognized as object streams while parsing the body.
object_streams: Vec<PdfStream<PdfObjectStreamDictionary>>,
}
pub struct PdfObjects {
objects: OnceLock<BTreeMap<PdfObjectIdentifier, PdfObject>>,
inner: OnceLock<PdfObjectsInner>,
}
#[derive(Copy, Clone, Debug)]
@ -70,24 +76,12 @@ pdf_parse! {
pub struct PdfXRefStreamDictionaryRest {
#[pdf(name = "Type")]
pub ty: PdfXRefName,
#[pdf(name = "Size")]
pub size: usize,
#[pdf(name = "Index")]
pub index: Option<Arc<[usize]>>,
#[pdf(name = "Prev")]
pub prev: Option<usize>,
#[pdf(name = "W")]
pub w: Option<Arc<[usize]>>,
#[pdf(name = "Root")]
pub root: PdfDictionary,
#[pdf(name = "Encrypt")]
pub encrypt: Option<PdfDictionary>,
#[pdf(name = "Info")]
pub info: Option<PdfDictionary>,
#[pdf(name = "ID")]
pub id: Option<[PdfString; 2]>,
#[pdf(flatten)]
pub rest: PdfDictionary,
pub rest: PdfTrailerDictionary,
}
}
@ -308,6 +302,38 @@ struct PdfParser<'a> {
}
impl<'a> PdfParser<'a> {
/// Runs `f` on a temporary parser that reads from `tokenizer` but shares this parser's state.
///
/// The object map and the list of not-yet-parsed stream dictionaries are moved into the
/// temporary parser for the duration of the call and moved back afterwards, so everything
/// `f` adds to them ends up on `self`. `self.tokenizer` is not touched.
fn with_tokenizer<'b, R>(
    &mut self,
    tokenizer: PdfTokenizer<'b>,
    f: impl FnOnce(&mut PdfParser<'b>) -> R,
) -> R {
    let mut scoped = PdfParser {
        objects_arc: self.objects_arc.clone(),
        objects_map: std::mem::take(&mut self.objects_map),
        unparsed_stream_dictionaries: std::mem::take(&mut self.unparsed_stream_dictionaries),
        tokenizer,
    };
    let output = f(&mut scoped);
    // Exhaustive destructuring: adding a field to `PdfParser` forces this hand-back to be
    // revisited instead of silently dropping the new field's state.
    let PdfParser {
        objects_arc,
        objects_map,
        unparsed_stream_dictionaries,
        tokenizer: _,
    } = scoped;
    self.objects_arc = objects_arc;
    self.objects_map = objects_map;
    self.unparsed_stream_dictionaries = unparsed_stream_dictionaries;
    output
}
fn parse_header(&mut self) -> Result<PdfHeader, PdfParseError> {
let Some(b'%') = self.tokenizer.bytes.first() else {
return Err(PdfParseError::NotAPdfFile);
@ -739,18 +765,94 @@ impl<'a> PdfParser<'a> {
Ok(Some(()))
}
}
/// Parses the objects packed into an object stream, reading from the current tokenizer.
///
/// The input starts with `N` pairs of integers (an object number followed by a byte
/// position), which are followed by the objects themselves. Each object is inserted into
/// `self.objects_map` under its object number with generation number 0.
///
/// # Errors
///
/// Returns `InvalidObjectNumber` / `InvalidNumber` when a header pair is missing or
/// malformed, any error produced by `parse_object`, and
/// `DuplicateIndirectObjectDefinition` when an object number is already present in
/// `self.objects_map`.
fn parse_object_stream_inner(
    &mut self,
    object_stream: &PdfStream<PdfObjectStreamDictionary>,
) -> Result<(), PdfParseError> {
    let object_count = object_stream.dictionary().rest.n;
    // `N` comes straight from the file, so it must not be trusted as an allocation size:
    // `Vec::with_capacity` panics (or the allocator aborts) on absurd values. Every header
    // pair occupies at least one byte of input, so the input length is a safe upper bound
    // on how many pairs can really be present.
    let capacity = object_count.min(self.tokenizer.bytes.len());
    let mut object_ids_and_byte_positions =
        Vec::<(PdfObjectIdentifier, usize)>::with_capacity(capacity);
    for _ in 0..object_count {
        self.skip_comments_and_whitespace();
        let Some((pos, object_number)) =
            self.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))?
        else {
            return Err(PdfParseError::InvalidObjectNumber {
                pos: self.tokenizer.pos(),
            });
        };
        self.skip_comments_and_whitespace();
        let Some((_, byte_position)) =
            self.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))?
        else {
            return Err(PdfParseError::InvalidNumber {
                pos: self.tokenizer.pos(),
            });
        };
        object_ids_and_byte_positions.push((
            PdfObjectIdentifier {
                pos: pos.into(),
                object_number,
                generation_number: 0,
            },
            byte_position,
        ));
    }
    // NOTE(review): the recorded byte positions (and the dictionary's `First` entry) are not
    // used; objects are read back-to-back in header order. Presumably fine for streams that
    // pack objects contiguously and in order -- confirm against the PDF object stream rules.
    for (id, _byte_position) in object_ids_and_byte_positions {
        let object = self.parse_object()?;
        if self.objects_map.insert(id, object).is_some() {
            return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id });
        }
    }
    Ok(())
}
/// Decodes `object_stream` and parses the objects it contains into `self.objects_map`.
///
/// A failure to decode the stream data is returned as-is (cloned from the cached result);
/// a failure while parsing the decoded contents is wrapped in `ObjectStreamParseError`
/// carrying the position of the stream.
fn parse_object_stream(
    &mut self,
    object_stream: &PdfStream<PdfObjectStreamDictionary>,
) -> Result<(), PdfParseError> {
    let decoded = object_stream.decoded_data().as_ref()?;
    let tokenizer = PdfTokenizer::new(decoded, 0);
    let outcome =
        self.with_tokenizer(tokenizer, |inner| inner.parse_object_stream_inner(object_stream));
    match outcome {
        Ok(()) => Ok(()),
        Err(cause) => Err(PdfParseError::ObjectStreamParseError {
            stream_pos: object_stream.get_pdf_input_position(),
            error: Arc::new(cause),
        }),
    }
}
fn parse_body(&mut self) -> Result<(), PdfParseError> {
while let Some(()) = self.parse_indirect_object_definition()? {}
let Ok(()) = self
.objects_arc
.objects
.set(std::mem::take(&mut self.objects_map))
else {
unreachable!();
};
self.unparsed_stream_dictionaries
.drain(..)
.try_for_each(|v| v.finish_parsing())
.try_for_each(|v| v.finish_parsing())?;
let mut object_streams: Vec<PdfStream<PdfObjectStreamDictionary>> = Vec::new();
for object in self.objects_map.values_mut() {
let stream = match object {
PdfObject::Stream(stream) => stream,
PdfObject::Boolean(_)
| PdfObject::Integer(_)
| PdfObject::Real(_)
| PdfObject::String(_)
| PdfObject::Name(_)
| PdfObject::Array(_)
| PdfObject::Dictionary(_)
| PdfObject::Null(_)
| PdfObject::Indirect(_) => continue,
};
if PdfObjectStreamDictionary::parse_type_from_dictionary(&stream.dictionary().rest)
.is_ok()
{
object_streams.push(PdfStream::parse(object.clone())?);
}
}
for object_stream in &object_streams {
self.parse_object_stream(object_stream)?;
}
let Ok(()) = self.objects_arc.inner.set(PdfObjectsInner {
objects: std::mem::take(&mut self.objects_map),
object_streams,
}) else {
unreachable!();
};
Ok(())
}
fn parse_xref_table(&mut self) -> Result<(), PdfParseError> {
self.skip_comments_and_whitespace();
@ -844,7 +946,7 @@ impl Pdf {
pub fn parse(bytes: impl AsRef<[u8]>) -> Result<Pdf, PdfParseError> {
PdfParser {
objects_arc: Arc::new(PdfObjects {
objects: OnceLock::new(),
inner: OnceLock::new(),
}),
objects_map: BTreeMap::new(),
unparsed_stream_dictionaries: vec![],

View file

@ -5,15 +5,17 @@ use crate::{
GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse,
PdfParseError,
},
stream_filters::PdfStreamFilter,
},
pdf_parse,
util::ArcOrRef,
};
use std::{
any::TypeId,
borrow::Cow,
borrow::{Borrow, Cow},
collections::BTreeMap,
fmt::{self, Write},
iter::FusedIterator,
num::NonZero,
sync::{Arc, OnceLock},
};
@ -61,6 +63,12 @@ pub struct PdfName {
bytes: ArcOrRef<'static, [u8]>,
}
// Lets a `PdfName` be borrowed as its raw bytes, so that collections keyed by `PdfName` can be
// queried with a plain `&[u8]` without building a `PdfName` first.
// NOTE(review): `Borrow` requires `Eq`/`Ord`/`Hash` of `PdfName` to agree with those of `[u8]`
// -- confirm that `PdfName`'s comparison impls look only at `bytes`.
impl Borrow<[u8]> for PdfName {
fn borrow(&self) -> &[u8] {
&self.bytes
}
}
impl PdfName {
pub fn try_new(
pos: impl Into<PdfInputPositionNoCompare>,
@ -218,24 +226,51 @@ macro_rules! make_pdf_object {
$Variant:ident($ty:ty),
)+
) => {
#[derive(Clone, Debug)]
#[derive(Clone)]
pub enum PdfObjectNonNull {
$($Variant($ty),)*
}
#[derive(Clone, Debug)]
impl fmt::Debug for PdfObjectNonNull {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
$(Self::$Variant(v) => v.fmt(f),)*
}
}
}
#[derive(Clone)]
pub enum PdfObjectDirect {
$($Variant($ty),)*
Null(PdfNull),
}
#[derive(Clone, Debug)]
impl fmt::Debug for PdfObjectDirect {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
$(Self::$Variant(v) => v.fmt(f),)*
Self::Null(v) => v.fmt(f),
}
}
}
#[derive(Clone)]
pub enum PdfObject {
$($Variant($ty),)*
Null(PdfNull),
Indirect(PdfObjectIndirect),
}
impl fmt::Debug for PdfObject {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
$(Self::$Variant(v) => v.fmt(f),)*
Self::Null(v) => v.fmt(f),
Self::Indirect(v) => v.fmt(f),
}
}
}
$(
impl From<$ty> for PdfObjectNonNull {
fn from(value: $ty) -> Self {
@ -546,12 +581,12 @@ impl PdfObjectIndirect {
}
pub fn get(&self) -> PdfObjectDirect {
if let Some(objects) = self.objects.upgrade() {
if let Some(objects) = objects.objects.get() {
if let Some(objects) = objects.inner.get() {
let final_id = self.final_id.get().copied();
let limit = if final_id.is_some() { 1 } else { 1000usize };
let mut id = final_id.unwrap_or(self.id);
for _ in 0..limit {
if let Some(object) = objects.get(&self.id) {
if let Some(object) = objects.objects.get(&self.id) {
let retval = match object {
PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v),
PdfObject::Integer(v) => PdfObjectDirect::Integer(*v),
@ -628,18 +663,27 @@ impl PdfDictionary {
}
pub fn contains_key<Q: ?Sized>(&self, key: &Q) -> bool
where
PdfName: std::borrow::Borrow<Q> + Ord,
PdfName: std::borrow::Borrow<Q>,
Q: Ord,
{
self.fields.contains_key(key)
}
pub fn get<Q: ?Sized>(&self, key: &Q) -> Option<&PdfObject>
where
PdfName: std::borrow::Borrow<Q> + Ord,
PdfName: std::borrow::Borrow<Q>,
Q: Ord,
{
self.fields.get(key)
}
/// Returns a clone of the value stored under `key`, or a null object positioned at this
/// dictionary when the key is absent.
pub fn get_or_null<Q: ?Sized>(&self, key: &Q) -> PdfObject
where
    PdfName: std::borrow::Borrow<Q>,
    Q: Ord,
{
    match self.get(key) {
        Some(found) => found.clone(),
        None => PdfObject::Null(PdfNull(self.pos)),
    }
}
pub fn pos(&self) -> PdfInputPosition {
self.pos.0
}
@ -842,35 +886,6 @@ impl<T> std::ops::DerefMut for MaybeArray<T> {
}
}
pdf_parse! {
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum PdfStreamFilter {
#[pdf(name = "ASCIIHexDecode")]
AsciiHexDecode,
#[pdf(name = "ASCII85Decode")]
Ascii85Decode,
#[pdf(name = "LZWDecode")]
LzwDecode,
#[pdf(name = "FlateDecode")]
FlateDecode,
#[pdf(name = "RunLengthDecode")]
RunLengthDecode,
#[pdf(name = "CCITTFaxDecode")]
CcittFaxDecode,
#[pdf(name = "JBIG2Decode")]
Jbig2Decode,
#[pdf(name = "DCTDecode")]
DctDecode,
#[pdf(name = "JPXDecode")]
JpxDecode,
#[pdf(name = "Crypt")]
Crypt,
#[pdf(other)]
Unknown(PdfName),
}
}
impl<T> Default for MaybeArray<T> {
fn default() -> Self {
Self(Arc::default())
@ -936,47 +951,101 @@ impl PdfStreamDictionary {
pub(crate) fn parse_len_from_dictionary(
dictionary: &PdfDictionary,
) -> Result<usize, PdfParseError> {
PdfParse::parse(
dictionary
.get(&PdfName::new_static(Self::LENGTH_NAME.as_bytes()))
.cloned()
.unwrap_or_default(),
PdfParse::parse(dictionary.get_or_null(Self::LENGTH_NAME.as_bytes()))
}
}
/// Iterator over a stream dictionary's filters, each paired with its decode parameters.
#[derive(Debug, Clone, Default)]
pub struct PdfStreamDictionaryFiltersAndParms<'a> {
// The filters together with their index; the index selects the matching `decode_parms` entry.
filters: std::iter::Enumerate<std::slice::Iter<'a, PdfStreamFilter>>,
// Decode parameters, looked up by filter index; entries may be missing or `None`.
decode_parms: &'a [Option<PdfDictionary>],
}
impl<'a> PdfStreamDictionaryFiltersAndParms<'a> {
    /// Pairs an enumerated filter with the decode parameters stored at the same index.
    ///
    /// When `decode_parms` has no entry at that index, or the entry is `None`, a shared
    /// empty dictionary is returned instead.
    fn item_helper(
        filter: (usize, &'a PdfStreamFilter),
        decode_parms: &'a [Option<PdfDictionary>],
    ) -> (&'a PdfStreamFilter, &'a PdfDictionary) {
        static EMPTY_DICTIONARY: OnceLock<PdfDictionary> = OnceLock::new();
        let (position, stream_filter) = filter;
        let parms = decode_parms
            .get(position)
            .and_then(Option::as_ref)
            .unwrap_or_else(|| EMPTY_DICTIONARY.get_or_init(PdfDictionary::default));
        (stream_filter, parms)
    }
}
impl<Rest> PdfStreamDictionary<Rest> {
pub fn filters_and_parms(
&self,
) -> impl Clone + ExactSizeIterator + DoubleEndedIterator<Item = (PdfStreamFilter, PdfDictionary)>
{
self.filters.iter().enumerate().map(|(index, filter)| {
(
filter.clone(),
self.decode_parms
.0
.get(index)
.cloned()
.flatten()
.unwrap_or_default(),
)
})
impl<'a> Iterator for PdfStreamDictionaryFiltersAndParms<'a> {
type Item = (&'a PdfStreamFilter, &'a PdfDictionary);
fn next(&mut self) -> Option<Self::Item> {
self.filters
.next()
.map(|filter| Self::item_helper(filter, self.decode_parms))
}
pub fn file_filters_and_parms(
&self,
) -> impl Clone + ExactSizeIterator + DoubleEndedIterator<Item = (PdfStreamFilter, PdfDictionary)>
fn size_hint(&self) -> (usize, Option<usize>) {
self.filters.size_hint()
}
fn nth(&mut self, n: usize) -> Option<Self::Item> {
self.filters
.nth(n)
.map(|filter| Self::item_helper(filter, self.decode_parms))
}
fn fold<B, F>(self, init: B, f: F) -> B
where
F: FnMut(B, Self::Item) -> B,
{
self.file_filters.iter().enumerate().map(|(index, filter)| {
(
filter.clone(),
self.file_decode_parms
.0
.get(index)
.cloned()
.flatten()
.unwrap_or_default(),
)
})
self.filters
.map(|filter| Self::item_helper(filter, self.decode_parms))
.fold(init, f)
}
}
impl<'a> FusedIterator for PdfStreamDictionaryFiltersAndParms<'a> {}
impl<'a> ExactSizeIterator for PdfStreamDictionaryFiltersAndParms<'a> {}
/// Back-to-front iteration; every method forwards to the underlying enumerated filter
/// iterator and attaches the decode parameters through `item_helper`.
impl<'a> DoubleEndedIterator for PdfStreamDictionaryFiltersAndParms<'a> {
    fn next_back(&mut self) -> Option<Self::Item> {
        let entry = self.filters.next_back()?;
        Some(Self::item_helper(entry, self.decode_parms))
    }
    fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
        let entry = self.filters.nth_back(n)?;
        Some(Self::item_helper(entry, self.decode_parms))
    }
    fn rfold<B, F>(self, init: B, f: F) -> B
    where
        F: FnMut(B, Self::Item) -> B,
    {
        let Self {
            filters,
            decode_parms,
        } = self;
        filters
            .map(|entry| Self::item_helper(entry, decode_parms))
            .rfold(init, f)
    }
}
impl<Rest> PdfStreamDictionary<Rest> {
/// Returns an iterator over `filters`, each paired with the `decode_parms` entry at the
/// same index.
pub fn filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> {
PdfStreamDictionaryFiltersAndParms {
filters: self.filters.iter().enumerate(),
decode_parms: &self.decode_parms,
}
}
/// Returns an iterator over `file_filters`, each paired with the `file_decode_parms` entry
/// at the same index.
pub fn file_filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> {
PdfStreamDictionaryFiltersAndParms {
filters: self.file_filters.iter().enumerate(),
decode_parms: &self.file_decode_parms,
}
}
}
@ -1001,16 +1070,64 @@ impl<Rest: PdfParse> UnparsedPdfStreamDictionary<Rest> {
pub struct PdfStream<Rest = PdfDictionary> {
pos: PdfInputPositionNoCompare,
dictionary: Arc<OnceLock<PdfStreamDictionary<Rest>>>,
data: Arc<[u8]>,
encoded_data: Arc<[u8]>,
decoded_data: Arc<OnceLock<Result<Arc<[u8]>, PdfParseError>>>,
}
/// Formats a byte slice as quoted, ASCII-escaped string literals, 32 bytes per line.
///
/// An empty slice is rendered as `""`. `Debug` and `Display` produce the same output.
struct DumpBytes<'a>(&'a [u8]);

impl<'a> fmt::Debug for DumpBytes<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        <Self as fmt::Display>::fmt(self, f)
    }
}

impl fmt::Display for DumpBytes<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let bytes = self.0;
        if bytes.is_empty() {
            // `chunks` yields nothing for an empty slice, so emit the empty literal directly.
            return f.write_str("\"\"");
        }
        for (line_index, line) in bytes.chunks(32).enumerate() {
            if line_index != 0 {
                f.write_str("\n")?;
            }
            write!(f, "\"{}\"", line.escape_ascii())?;
        }
        Ok(())
    }
}
impl<Rest: fmt::Debug> fmt::Debug for PdfStream<Rest> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("PdfStream")
.field("pos", &self.pos)
.field("dictionary", &self.dictionary)
.field("data", &format_args!("{:02x?}", self.data))
.finish()
let Self {
pos,
dictionary,
encoded_data,
decoded_data,
} = self;
let mut debug_struct = f.debug_struct("PdfStream");
debug_struct.field("pos", pos);
if let Some(dictionary) = dictionary.get() {
debug_struct.field("dictionary", dictionary);
} else {
debug_struct.field("dictionary", &format_args!("<not-yet-parsed>"));
}
debug_struct.field("encoded_data", &DumpBytes(encoded_data));
if let Some(decoded_data) = decoded_data.get() {
match decoded_data {
Ok(decoded_data) => debug_struct.field("decoded_data", &DumpBytes(decoded_data)),
Err(e) => debug_struct.field("decoded_data", &Err::<(), _>(e)),
};
} else {
debug_struct.field("decoded_data", &format_args!("<not-yet-decoded>"));
}
debug_struct.finish()
}
}
@ -1018,25 +1135,27 @@ impl<Rest> PdfStream<Rest> {
pub fn new(
pos: impl Into<PdfInputPositionNoCompare>,
dictionary: PdfStreamDictionary<Rest>,
data: Arc<[u8]>,
encoded_data: Arc<[u8]>,
) -> Self {
Self {
pos: pos.into(),
dictionary: Arc::new(OnceLock::from(dictionary)),
data,
encoded_data,
decoded_data: Arc::new(OnceLock::new()),
}
}
pub(crate) fn new_unparsed(
pos: impl Into<PdfInputPositionNoCompare>,
unparsed_dictionary: PdfDictionary,
data: Arc<[u8]>,
encoded_data: Arc<[u8]>,
) -> (Self, UnparsedPdfStreamDictionary<Rest>) {
let dictionary = Arc::new(OnceLock::new());
(
Self {
pos: pos.into(),
dictionary: dictionary.clone(),
data,
encoded_data,
decoded_data: Arc::new(OnceLock::new()),
},
UnparsedPdfStreamDictionary {
unparsed_dictionary,
@ -1049,8 +1168,29 @@ impl<Rest> PdfStream<Rest> {
.get()
.expect("haven't finished parsing all pdf object definitions yet")
}
pub fn data(&self) -> &Arc<[u8]> {
&self.data
pub fn encoded_data(&self) -> &Arc<[u8]> {
&self.encoded_data
}
fn try_decode_data(&self) -> Result<Arc<[u8]>, PdfParseError> {
let dictionary = self.dictionary();
let (data, filters) = if let Some(file) = &dictionary.file {
todo!()
} else {
(&self.encoded_data, dictionary.filters_and_parms())
};
if filters.len() == 0 {
return Ok(data.clone());
}
let mut data: &[u8] = data;
let mut buffer;
for (filter, filter_parms) in filters {
buffer = filter.decode_stream_data(filter_parms.clone(), self.pos.0, &data)?;
data = &buffer;
}
Ok(Arc::from(data))
}
/// Returns the decoded stream data, decoding it on first use and caching the outcome.
///
/// Failures are cached as well, so a failed decode is not retried on later calls.
pub fn decoded_data(&self) -> &Result<Arc<[u8]>, PdfParseError> {
self.decoded_data.get_or_init(|| self.try_decode_data())
}
}
@ -1099,7 +1239,8 @@ impl<Rest: PdfParse> PdfParse for PdfStream<Rest> {
rest: Rest::parse(rest.clone().into())?,
}))
},
data: stream.data,
encoded_data: stream.encoded_data,
decoded_data: stream.decoded_data,
}),
object => Err(PdfParseError::InvalidType {
pos: object.get_pdf_input_position(),
@ -1109,3 +1250,37 @@ impl<Rest: PdfParse> PdfParse for PdfStream<Rest> {
}
}
}
// Value of the `Type` entry of an object stream dictionary; `ObjStm` is the only variant.
pdf_parse! {
#[derive(Clone, Copy, Debug, Hash, Default, PartialEq, Eq, PartialOrd, Ord)]
pub enum PdfObjectStreamType {
#[pdf(name = "ObjStm")]
#[default]
ObjStm,
}
}
// Entries of an object stream's dictionary beyond the common stream entries.
pdf_parse! {
#[derive(Clone, Debug)]
pub struct PdfObjectStreamDictionary {
// Must parse as `ObjStm`; used to recognize object streams.
#[pdf(name = Self::TYPE_NAME)]
pub ty: PdfObjectStreamType,
// Number of (object number, byte position) pairs at the start of the decoded data.
#[pdf(name = "N")]
pub n: usize,
// NOTE(review): presumably the byte offset of the first object in the decoded data -- confirm.
#[pdf(name = "First")]
pub first: usize,
// NOTE(review): presumably a reference to another object stream this one extends -- confirm.
#[pdf(name = "Extends")]
pub extends: Option<PdfObjectIndirect>,
// Any remaining dictionary entries.
#[pdf(flatten)]
pub rest: PdfDictionary,
}
}
impl PdfObjectStreamDictionary {
/// Dictionary key under which the object stream type is stored.
pub const TYPE_NAME: &str = "Type";
/// Parses only the `Type` entry of `dictionary`, treating a missing entry as a null object.
///
/// This allows checking whether a dictionary describes an object stream without parsing
/// the whole dictionary.
pub(crate) fn parse_type_from_dictionary(
dictionary: &PdfDictionary,
) -> Result<PdfObjectStreamType, PdfParseError> {
PdfParse::parse(dictionary.get_or_null(Self::TYPE_NAME.as_bytes()))
}
}

View file

@ -144,7 +144,7 @@ impl PartialEq for PdfInputPositionNoCompare {
}
}
#[derive(Debug)]
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum PdfParseError {
Custom(String),
@ -231,6 +231,19 @@ pub enum PdfParseError {
pos: PdfInputPosition,
start_xref: usize,
},
UnknownStreamFilter {
pos: PdfInputPosition,
filter: PdfName,
},
StreamFilterError {
pos: PdfInputPosition,
filter: PdfName,
error: String,
},
ObjectStreamParseError {
stream_pos: PdfInputPosition,
error: Arc<PdfParseError>,
},
}
impl From<std::convert::Infallible> for PdfParseError {
@ -239,6 +252,12 @@ impl From<std::convert::Infallible> for PdfParseError {
}
}
// Converts a borrowed error into an owned one by cloning it, which lets `?` be applied to a
// `Result<_, &PdfParseError>` in functions that return `PdfParseError`.
impl<'a> From<&'a Self> for PdfParseError {
fn from(value: &'a Self) -> Self {
value.clone()
}
}
impl GetPdfInputPosition for PdfParseError {
fn get_pdf_input_position(&self) -> PdfInputPosition {
match *self {
@ -266,7 +285,12 @@ impl GetPdfInputPosition for PdfParseError {
| PdfParseError::MissingStartXRefValue { pos }
| PdfParseError::MissingEofComment { pos }
| PdfParseError::UnexpectedByte { pos, .. }
| PdfParseError::InvalidStartXRefValue { pos, .. } => pos,
| PdfParseError::InvalidStartXRefValue { pos, .. }
| PdfParseError::UnknownStreamFilter { pos, .. }
| PdfParseError::StreamFilterError { pos, .. }
| PdfParseError::ObjectStreamParseError {
stream_pos: pos, ..
} => pos,
}
}
}
@ -376,7 +400,7 @@ impl fmt::Display for PdfParseError {
write!(f, "at {pos}: missing `%%EOF` comment")
}
PdfParseError::UnexpectedByte { pos, byte } => {
write!(f, "at {pos}: unexpected byte {}", byte.escape_ascii())
write!(f, "at {pos}: unexpected byte '{}'", byte.escape_ascii())
}
PdfParseError::InvalidStartXRefValue { pos, start_xref } => {
write!(
@ -384,6 +408,23 @@ impl fmt::Display for PdfParseError {
"at {pos}: invalid `startxref` value: {start_xref} ({start_xref:#x})"
)
}
PdfParseError::UnknownStreamFilter { pos, ref filter } => {
write!(f, "at {pos}: unknown stream filter: {filter}")
}
PdfParseError::StreamFilterError {
pos,
ref filter,
ref error,
} => {
write!(f, "at {pos}: stream filter {filter} error: {error}")
}
PdfParseError::ObjectStreamParseError {
stream_pos,
ref error,
} => {
write!(f, "at {stream_pos}: object stream error: ")?;
error.fmt(f)
}
}
}
}
@ -785,12 +826,10 @@ macro_rules! pdf_parse {
[$(#[$($field_meta:tt)*])*]
$field_name:ident: $field_ty:ty
) => {
let $field_name = $crate::pdf::object::PdfName::new_static(
$crate::__std::convert::AsRef::<[u8]>::as_ref($name),
);
let $field_name = $crate::__std::convert::AsRef::<[u8]>::as_ref($name);
let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse(
$object_mut
.remove(&$field_name)
.remove($field_name)
.unwrap_or($crate::pdf::object::PdfObject::Null($crate::pdf::object::PdfNull::new($pos))),
)?;
};

65
src/pdf/stream_filters.rs Normal file
View file

@ -0,0 +1,65 @@
use crate::pdf::{
object::{PdfDictionary, PdfName},
parse::{PdfInputPosition, PdfParse, PdfParseError},
pdf_parse,
};
pub mod flate;
// A stream filter, identified by its PDF name.
// Names that match none of the listed variants are kept verbatim in `Unknown`.
pdf_parse! {
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum PdfStreamFilter {
#[pdf(name = "ASCIIHexDecode")]
AsciiHexDecode,
#[pdf(name = "ASCII85Decode")]
Ascii85Decode,
#[pdf(name = "LZWDecode")]
LzwDecode,
#[pdf(name = "FlateDecode")]
FlateDecode,
#[pdf(name = "RunLengthDecode")]
RunLengthDecode,
#[pdf(name = "CCITTFaxDecode")]
CcittFaxDecode,
#[pdf(name = "JBIG2Decode")]
Jbig2Decode,
#[pdf(name = "DCTDecode")]
DctDecode,
#[pdf(name = "JPXDecode")]
JpxDecode,
#[pdf(name = "Crypt")]
Crypt,
// Catch-all for filter names that are not recognized above.
#[pdf(other)]
Unknown(PdfName),
}
}
impl PdfStreamFilter {
    /// Decodes `encoded_data` with this filter, using `filter_parms` as its parameters.
    ///
    /// `stream_pos` is the position of the stream being decoded and is only used for
    /// error reporting.
    ///
    /// # Errors
    ///
    /// - `UnknownStreamFilter` for `Unknown` filters.
    /// - `StreamFilterError` for filters whose decoding is not implemented yet (everything
    ///   except `FlateDecode`), and for failures inside an implemented filter.
    /// - Any error from parsing `filter_parms`.
    pub fn decode_stream_data(
        &self,
        filter_parms: PdfDictionary,
        stream_pos: PdfInputPosition,
        encoded_data: &[u8],
    ) -> Result<Vec<u8>, PdfParseError> {
        match self {
            PdfStreamFilter::FlateDecode => {
                flate::PdfFilterParmsFlateDecode::parse(filter_parms.into())?
                    .decode_stream_data(stream_pos, encoded_data)
            }
            // Valid filters that are reachable from input data: report them as errors
            // rather than panicking until they are implemented. Variants are listed
            // explicitly so that adding a new one forces a decision here.
            PdfStreamFilter::AsciiHexDecode
            | PdfStreamFilter::Ascii85Decode
            | PdfStreamFilter::LzwDecode
            | PdfStreamFilter::RunLengthDecode
            | PdfStreamFilter::CcittFaxDecode
            | PdfStreamFilter::Jbig2Decode
            | PdfStreamFilter::DctDecode
            | PdfStreamFilter::JpxDecode
            | PdfStreamFilter::Crypt => Err(PdfParseError::StreamFilterError {
                pos: stream_pos,
                filter: self.clone().into(),
                error: "decoding this filter is not implemented yet".to_string(),
            }),
            PdfStreamFilter::Unknown(filter) => Err(PdfParseError::UnknownStreamFilter {
                pos: stream_pos,
                filter: filter.clone(),
            }),
        }
    }
}

View file

@ -0,0 +1,73 @@
use crate::pdf::{
object::PdfDictionary,
parse::{PdfInputPosition, PdfParseError},
pdf_parse,
stream_filters::PdfStreamFilter,
};
use std::{io::Read, num::NonZero};
// Decode parameters of the `FlateDecode` filter.
// The named entries are optional; the accessor methods on this type supply the fallback
// value when an entry is absent.
pdf_parse! {
#[derive(Clone, Debug, Default)]
pub struct PdfFilterParmsFlateDecode {
#[pdf(name = "Predictor")]
pub predictor: Option<NonZero<u32>>,
#[pdf(name = "Colors")]
pub colors: Option<NonZero<u32>>,
#[pdf(name = "BitsPerComponent")]
pub bits_per_component: Option<NonZero<u32>>,
#[pdf(name = "Columns")]
pub columns: Option<NonZero<u32>>,
// Any remaining dictionary entries.
#[pdf(flatten)]
pub rest: PdfDictionary,
}
}
impl PdfFilterParmsFlateDecode {
    /// The filter these parameters belong to.
    pub const FILTER: PdfStreamFilter = PdfStreamFilter::FlateDecode;
    /// Predictor used when `predictor` is absent.
    pub const DEFAULT_PREDICTOR: NonZero<u32> = const { NonZero::new(1).unwrap() };
    /// Color component count used when `colors` is absent.
    pub const DEFAULT_COLORS: NonZero<u32> = const { NonZero::new(1).unwrap() };
    /// Bits per component used when `bits_per_component` is absent.
    pub const DEFAULT_BITS_PER_COMPONENT: NonZero<u32> = const { NonZero::new(8).unwrap() };
    /// Column count used when `columns` is absent.
    pub const DEFAULT_COLUMNS: NonZero<u32> = const { NonZero::new(1).unwrap() };

    /// Returns the predictor, falling back to [`Self::DEFAULT_PREDICTOR`].
    pub fn predictor(&self) -> NonZero<u32> {
        self.predictor.unwrap_or(Self::DEFAULT_PREDICTOR)
    }

    /// Returns the number of color components, falling back to [`Self::DEFAULT_COLORS`].
    pub fn colors(&self) -> NonZero<u32> {
        self.colors.unwrap_or(Self::DEFAULT_COLORS)
    }

    /// Returns the bits per component, falling back to
    /// [`Self::DEFAULT_BITS_PER_COMPONENT`].
    pub fn bits_per_component(&self) -> NonZero<u32> {
        self.bits_per_component
            .unwrap_or(Self::DEFAULT_BITS_PER_COMPONENT)
    }

    /// Returns the number of columns, falling back to [`Self::DEFAULT_COLUMNS`].
    pub fn columns(&self) -> NonZero<u32> {
        self.columns.unwrap_or(Self::DEFAULT_COLUMNS)
    }

    /// Inflates zlib-compressed `encoded_data`.
    ///
    /// `stream_pos` is the position of the stream being decoded and is only used for
    /// error reporting.
    ///
    /// # Errors
    ///
    /// Returns `StreamFilterError` when the data cannot be decompressed, or when the
    /// predictor is anything other than [`Self::DEFAULT_PREDICTOR`], since undoing other
    /// predictors is not implemented yet.
    pub fn decode_stream_data(
        &self,
        stream_pos: PdfInputPosition,
        encoded_data: &[u8],
    ) -> Result<Vec<u8>, PdfParseError> {
        // Exhaustive destructuring so that adding a field forces this method to be
        // revisited. `colors`, `bits_per_component` and `columns` are not consulted because
        // no predictor that needs them is implemented.
        let Self {
            predictor: _,
            colors: _,
            bits_per_component: _,
            columns: _,
            rest: _,
        } = self;
        let mut decoded_data = vec![];
        flate2::bufread::ZlibDecoder::new(encoded_data)
            .read_to_end(&mut decoded_data)
            .map_err(|e| PdfParseError::StreamFilterError {
                pos: stream_pos,
                filter: Self::FILTER.into(),
                error: e.to_string(),
            })?;
        let predictor = self.predictor();
        if predictor == Self::DEFAULT_PREDICTOR {
            return Ok(decoded_data);
        }
        // The predictor value comes from input data, so an unsupported one must be an
        // error, not a panic.
        Err(PdfParseError::StreamFilterError {
            pos: stream_pos,
            filter: Self::FILTER.into(),
            error: format!("unsupported predictor: {predictor}"),
        })
    }
}