parses root successfully
This commit is contained in:
parent
5fbfaa8053
commit
83631cc4c6
7 changed files with 623 additions and 118 deletions
50
Cargo.lock
generated
50
Cargo.lock
generated
|
|
@ -2,6 +2,56 @@
|
|||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "adler2"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.8.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
|
||||
dependencies = [
|
||||
"adler2",
|
||||
"simd-adler32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parse_powerisa_pdf"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"flate2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "simd-adler32"
|
||||
version = "0.3.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2"
|
||||
|
|
|
|||
|
|
@ -5,4 +5,5 @@ edition = "2024"
|
|||
license = "LGPL-3.0-or-later"
|
||||
|
||||
[dependencies]
|
||||
flate2 = "1.1.5"
|
||||
|
||||
|
|
|
|||
154
src/pdf.rs
154
src/pdf.rs
|
|
@ -1,11 +1,11 @@
|
|||
use crate::{
|
||||
pdf::{
|
||||
object::{
|
||||
MaybeArray, PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull,
|
||||
PdfObject, PdfObjectIdentifier, PdfObjectIndirect, PdfReal, PdfStream,
|
||||
PdfArray, PdfBoolean, PdfDictionary, PdfInteger, PdfName, PdfNull, PdfObject,
|
||||
PdfObjectIdentifier, PdfObjectIndirect, PdfObjectStreamDictionary, PdfReal, PdfStream,
|
||||
PdfStreamDictionary, PdfString, UnparsedPdfStreamDictionary,
|
||||
},
|
||||
parse::{PdfInputPosition, PdfParse, PdfParseError},
|
||||
parse::{GetPdfInputPosition, PdfInputPosition, PdfParse, PdfParseError},
|
||||
},
|
||||
pdf_parse,
|
||||
util::ArcOrRef,
|
||||
|
|
@ -21,9 +21,15 @@ use std::{
|
|||
|
||||
pub mod object;
|
||||
pub mod parse;
|
||||
pub mod stream_filters;
|
||||
|
||||
struct PdfObjectsInner {
|
||||
objects: BTreeMap<PdfObjectIdentifier, PdfObject>,
|
||||
object_streams: Vec<PdfStream<PdfObjectStreamDictionary>>,
|
||||
}
|
||||
|
||||
pub struct PdfObjects {
|
||||
objects: OnceLock<BTreeMap<PdfObjectIdentifier, PdfObject>>,
|
||||
inner: OnceLock<PdfObjectsInner>,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
|
|
@ -70,24 +76,12 @@ pdf_parse! {
|
|||
pub struct PdfXRefStreamDictionaryRest {
|
||||
#[pdf(name = "Type")]
|
||||
pub ty: PdfXRefName,
|
||||
#[pdf(name = "Size")]
|
||||
pub size: usize,
|
||||
#[pdf(name = "Index")]
|
||||
pub index: Option<Arc<[usize]>>,
|
||||
#[pdf(name = "Prev")]
|
||||
pub prev: Option<usize>,
|
||||
#[pdf(name = "W")]
|
||||
pub w: Option<Arc<[usize]>>,
|
||||
#[pdf(name = "Root")]
|
||||
pub root: PdfDictionary,
|
||||
#[pdf(name = "Encrypt")]
|
||||
pub encrypt: Option<PdfDictionary>,
|
||||
#[pdf(name = "Info")]
|
||||
pub info: Option<PdfDictionary>,
|
||||
#[pdf(name = "ID")]
|
||||
pub id: Option<[PdfString; 2]>,
|
||||
#[pdf(flatten)]
|
||||
pub rest: PdfDictionary,
|
||||
pub rest: PdfTrailerDictionary,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -308,6 +302,38 @@ struct PdfParser<'a> {
|
|||
}
|
||||
|
||||
impl<'a> PdfParser<'a> {
|
||||
fn with_tokenizer<'b, R>(
|
||||
&mut self,
|
||||
tokenizer: PdfTokenizer<'b>,
|
||||
f: impl FnOnce(&mut PdfParser<'b>) -> R,
|
||||
) -> R {
|
||||
let PdfParser {
|
||||
objects_arc,
|
||||
objects_map,
|
||||
unparsed_stream_dictionaries,
|
||||
tokenizer: _,
|
||||
} = self;
|
||||
let objects_arc = objects_arc.clone();
|
||||
let objects_map = std::mem::take(objects_map);
|
||||
let unparsed_stream_dictionaries = std::mem::take(unparsed_stream_dictionaries);
|
||||
let mut new_parser = PdfParser {
|
||||
objects_arc,
|
||||
objects_map,
|
||||
unparsed_stream_dictionaries,
|
||||
tokenizer,
|
||||
};
|
||||
let retval = f(&mut new_parser);
|
||||
let PdfParser {
|
||||
objects_arc,
|
||||
objects_map,
|
||||
unparsed_stream_dictionaries,
|
||||
tokenizer: _,
|
||||
} = new_parser;
|
||||
self.objects_arc = objects_arc;
|
||||
self.objects_map = objects_map;
|
||||
self.unparsed_stream_dictionaries = unparsed_stream_dictionaries;
|
||||
retval
|
||||
}
|
||||
fn parse_header(&mut self) -> Result<PdfHeader, PdfParseError> {
|
||||
let Some(b'%') = self.tokenizer.bytes.first() else {
|
||||
return Err(PdfParseError::NotAPdfFile);
|
||||
|
|
@ -739,18 +765,94 @@ impl<'a> PdfParser<'a> {
|
|||
Ok(Some(()))
|
||||
}
|
||||
}
|
||||
fn parse_object_stream_inner(
|
||||
&mut self,
|
||||
object_stream: &PdfStream<PdfObjectStreamDictionary>,
|
||||
) -> Result<(), PdfParseError> {
|
||||
let mut object_ids_and_byte_positions =
|
||||
Vec::<(PdfObjectIdentifier, usize)>::with_capacity(object_stream.dictionary().rest.n);
|
||||
for _ in 0..object_stream.dictionary().rest.n {
|
||||
self.skip_comments_and_whitespace();
|
||||
let Some((pos, object_number)) =
|
||||
self.parse_digits(|pos| Err(PdfParseError::InvalidObjectNumber { pos }))?
|
||||
else {
|
||||
return Err(PdfParseError::InvalidObjectNumber {
|
||||
pos: self.tokenizer.pos(),
|
||||
});
|
||||
};
|
||||
self.skip_comments_and_whitespace();
|
||||
let Some((_, byte_position)) =
|
||||
self.parse_digits(|pos| Err(PdfParseError::InvalidNumber { pos }))?
|
||||
else {
|
||||
return Err(PdfParseError::InvalidNumber {
|
||||
pos: self.tokenizer.pos(),
|
||||
});
|
||||
};
|
||||
object_ids_and_byte_positions.push((
|
||||
PdfObjectIdentifier {
|
||||
pos: pos.into(),
|
||||
object_number,
|
||||
generation_number: 0,
|
||||
},
|
||||
byte_position,
|
||||
));
|
||||
}
|
||||
for (id, _byte_position) in object_ids_and_byte_positions {
|
||||
let object = self.parse_object()?;
|
||||
if self.objects_map.insert(id, object).is_some() {
|
||||
return Err(PdfParseError::DuplicateIndirectObjectDefinition { pos: id.pos.0, id });
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn parse_object_stream(
|
||||
&mut self,
|
||||
object_stream: &PdfStream<PdfObjectStreamDictionary>,
|
||||
) -> Result<(), PdfParseError> {
|
||||
let data = object_stream.decoded_data().as_ref()?;
|
||||
self.with_tokenizer(PdfTokenizer::new(data, 0), |parser| {
|
||||
parser.parse_object_stream_inner(object_stream)
|
||||
})
|
||||
.map_err(|e| PdfParseError::ObjectStreamParseError {
|
||||
stream_pos: object_stream.get_pdf_input_position(),
|
||||
error: Arc::new(e),
|
||||
})
|
||||
}
|
||||
fn parse_body(&mut self) -> Result<(), PdfParseError> {
|
||||
while let Some(()) = self.parse_indirect_object_definition()? {}
|
||||
let Ok(()) = self
|
||||
.objects_arc
|
||||
.objects
|
||||
.set(std::mem::take(&mut self.objects_map))
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
self.unparsed_stream_dictionaries
|
||||
.drain(..)
|
||||
.try_for_each(|v| v.finish_parsing())
|
||||
.try_for_each(|v| v.finish_parsing())?;
|
||||
let mut object_streams: Vec<PdfStream<PdfObjectStreamDictionary>> = Vec::new();
|
||||
for object in self.objects_map.values_mut() {
|
||||
let stream = match object {
|
||||
PdfObject::Stream(stream) => stream,
|
||||
PdfObject::Boolean(_)
|
||||
| PdfObject::Integer(_)
|
||||
| PdfObject::Real(_)
|
||||
| PdfObject::String(_)
|
||||
| PdfObject::Name(_)
|
||||
| PdfObject::Array(_)
|
||||
| PdfObject::Dictionary(_)
|
||||
| PdfObject::Null(_)
|
||||
| PdfObject::Indirect(_) => continue,
|
||||
};
|
||||
if PdfObjectStreamDictionary::parse_type_from_dictionary(&stream.dictionary().rest)
|
||||
.is_ok()
|
||||
{
|
||||
object_streams.push(PdfStream::parse(object.clone())?);
|
||||
}
|
||||
}
|
||||
for object_stream in &object_streams {
|
||||
self.parse_object_stream(object_stream)?;
|
||||
}
|
||||
let Ok(()) = self.objects_arc.inner.set(PdfObjectsInner {
|
||||
objects: std::mem::take(&mut self.objects_map),
|
||||
object_streams,
|
||||
}) else {
|
||||
unreachable!();
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
fn parse_xref_table(&mut self) -> Result<(), PdfParseError> {
|
||||
self.skip_comments_and_whitespace();
|
||||
|
|
@ -844,7 +946,7 @@ impl Pdf {
|
|||
pub fn parse(bytes: impl AsRef<[u8]>) -> Result<Pdf, PdfParseError> {
|
||||
PdfParser {
|
||||
objects_arc: Arc::new(PdfObjects {
|
||||
objects: OnceLock::new(),
|
||||
inner: OnceLock::new(),
|
||||
}),
|
||||
objects_map: BTreeMap::new(),
|
||||
unparsed_stream_dictionaries: vec![],
|
||||
|
|
|
|||
|
|
@ -5,15 +5,17 @@ use crate::{
|
|||
GetPdfInputPosition, PdfInputPosition, PdfInputPositionNoCompare, PdfParse,
|
||||
PdfParseError,
|
||||
},
|
||||
stream_filters::PdfStreamFilter,
|
||||
},
|
||||
pdf_parse,
|
||||
util::ArcOrRef,
|
||||
};
|
||||
use std::{
|
||||
any::TypeId,
|
||||
borrow::Cow,
|
||||
borrow::{Borrow, Cow},
|
||||
collections::BTreeMap,
|
||||
fmt::{self, Write},
|
||||
iter::FusedIterator,
|
||||
num::NonZero,
|
||||
sync::{Arc, OnceLock},
|
||||
};
|
||||
|
|
@ -61,6 +63,12 @@ pub struct PdfName {
|
|||
bytes: ArcOrRef<'static, [u8]>,
|
||||
}
|
||||
|
||||
impl Borrow<[u8]> for PdfName {
|
||||
fn borrow(&self) -> &[u8] {
|
||||
&self.bytes
|
||||
}
|
||||
}
|
||||
|
||||
impl PdfName {
|
||||
pub fn try_new(
|
||||
pos: impl Into<PdfInputPositionNoCompare>,
|
||||
|
|
@ -218,24 +226,51 @@ macro_rules! make_pdf_object {
|
|||
$Variant:ident($ty:ty),
|
||||
)+
|
||||
) => {
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone)]
|
||||
pub enum PdfObjectNonNull {
|
||||
$($Variant($ty),)*
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
impl fmt::Debug for PdfObjectNonNull {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
$(Self::$Variant(v) => v.fmt(f),)*
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum PdfObjectDirect {
|
||||
$($Variant($ty),)*
|
||||
Null(PdfNull),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
impl fmt::Debug for PdfObjectDirect {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
$(Self::$Variant(v) => v.fmt(f),)*
|
||||
Self::Null(v) => v.fmt(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum PdfObject {
|
||||
$($Variant($ty),)*
|
||||
Null(PdfNull),
|
||||
Indirect(PdfObjectIndirect),
|
||||
}
|
||||
|
||||
impl fmt::Debug for PdfObject {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
$(Self::$Variant(v) => v.fmt(f),)*
|
||||
Self::Null(v) => v.fmt(f),
|
||||
Self::Indirect(v) => v.fmt(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$(
|
||||
impl From<$ty> for PdfObjectNonNull {
|
||||
fn from(value: $ty) -> Self {
|
||||
|
|
@ -546,12 +581,12 @@ impl PdfObjectIndirect {
|
|||
}
|
||||
pub fn get(&self) -> PdfObjectDirect {
|
||||
if let Some(objects) = self.objects.upgrade() {
|
||||
if let Some(objects) = objects.objects.get() {
|
||||
if let Some(objects) = objects.inner.get() {
|
||||
let final_id = self.final_id.get().copied();
|
||||
let limit = if final_id.is_some() { 1 } else { 1000usize };
|
||||
let mut id = final_id.unwrap_or(self.id);
|
||||
for _ in 0..limit {
|
||||
if let Some(object) = objects.get(&self.id) {
|
||||
if let Some(object) = objects.objects.get(&self.id) {
|
||||
let retval = match object {
|
||||
PdfObject::Boolean(v) => PdfObjectDirect::Boolean(*v),
|
||||
PdfObject::Integer(v) => PdfObjectDirect::Integer(*v),
|
||||
|
|
@ -628,18 +663,27 @@ impl PdfDictionary {
|
|||
}
|
||||
pub fn contains_key<Q: ?Sized>(&self, key: &Q) -> bool
|
||||
where
|
||||
PdfName: std::borrow::Borrow<Q> + Ord,
|
||||
PdfName: std::borrow::Borrow<Q>,
|
||||
Q: Ord,
|
||||
{
|
||||
self.fields.contains_key(key)
|
||||
}
|
||||
pub fn get<Q: ?Sized>(&self, key: &Q) -> Option<&PdfObject>
|
||||
where
|
||||
PdfName: std::borrow::Borrow<Q> + Ord,
|
||||
PdfName: std::borrow::Borrow<Q>,
|
||||
Q: Ord,
|
||||
{
|
||||
self.fields.get(key)
|
||||
}
|
||||
pub fn get_or_null<Q: ?Sized>(&self, key: &Q) -> PdfObject
|
||||
where
|
||||
PdfName: std::borrow::Borrow<Q>,
|
||||
Q: Ord,
|
||||
{
|
||||
self.get(key)
|
||||
.cloned()
|
||||
.unwrap_or(PdfObject::Null(PdfNull(self.pos)))
|
||||
}
|
||||
pub fn pos(&self) -> PdfInputPosition {
|
||||
self.pos.0
|
||||
}
|
||||
|
|
@ -842,35 +886,6 @@ impl<T> std::ops::DerefMut for MaybeArray<T> {
|
|||
}
|
||||
}
|
||||
|
||||
pdf_parse! {
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
#[non_exhaustive]
|
||||
pub enum PdfStreamFilter {
|
||||
#[pdf(name = "ASCIIHexDecode")]
|
||||
AsciiHexDecode,
|
||||
#[pdf(name = "ASCII85Decode")]
|
||||
Ascii85Decode,
|
||||
#[pdf(name = "LZWDecode")]
|
||||
LzwDecode,
|
||||
#[pdf(name = "FlateDecode")]
|
||||
FlateDecode,
|
||||
#[pdf(name = "RunLengthDecode")]
|
||||
RunLengthDecode,
|
||||
#[pdf(name = "CCITTFaxDecode")]
|
||||
CcittFaxDecode,
|
||||
#[pdf(name = "JBIG2Decode")]
|
||||
Jbig2Decode,
|
||||
#[pdf(name = "DCTDecode")]
|
||||
DctDecode,
|
||||
#[pdf(name = "JPXDecode")]
|
||||
JpxDecode,
|
||||
#[pdf(name = "Crypt")]
|
||||
Crypt,
|
||||
#[pdf(other)]
|
||||
Unknown(PdfName),
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Default for MaybeArray<T> {
|
||||
fn default() -> Self {
|
||||
Self(Arc::default())
|
||||
|
|
@ -936,47 +951,101 @@ impl PdfStreamDictionary {
|
|||
pub(crate) fn parse_len_from_dictionary(
|
||||
dictionary: &PdfDictionary,
|
||||
) -> Result<usize, PdfParseError> {
|
||||
PdfParse::parse(
|
||||
dictionary
|
||||
.get(&PdfName::new_static(Self::LENGTH_NAME.as_bytes()))
|
||||
.cloned()
|
||||
.unwrap_or_default(),
|
||||
PdfParse::parse(dictionary.get_or_null(Self::LENGTH_NAME.as_bytes()))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct PdfStreamDictionaryFiltersAndParms<'a> {
|
||||
filters: std::iter::Enumerate<std::slice::Iter<'a, PdfStreamFilter>>,
|
||||
decode_parms: &'a [Option<PdfDictionary>],
|
||||
}
|
||||
|
||||
impl<'a> PdfStreamDictionaryFiltersAndParms<'a> {
|
||||
fn item_helper(
|
||||
filter: (usize, &'a PdfStreamFilter),
|
||||
decode_parms: &'a [Option<PdfDictionary>],
|
||||
) -> (&'a PdfStreamFilter, &'a PdfDictionary) {
|
||||
static EMPTY_DICTIONARY: OnceLock<PdfDictionary> = OnceLock::new();
|
||||
let (index, filter) = filter;
|
||||
(
|
||||
filter,
|
||||
match decode_parms.get(index) {
|
||||
Some(Some(v)) => v,
|
||||
_ => EMPTY_DICTIONARY.get_or_init(PdfDictionary::default),
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rest> PdfStreamDictionary<Rest> {
|
||||
pub fn filters_and_parms(
|
||||
&self,
|
||||
) -> impl Clone + ExactSizeIterator + DoubleEndedIterator<Item = (PdfStreamFilter, PdfDictionary)>
|
||||
{
|
||||
self.filters.iter().enumerate().map(|(index, filter)| {
|
||||
(
|
||||
filter.clone(),
|
||||
self.decode_parms
|
||||
.0
|
||||
.get(index)
|
||||
.cloned()
|
||||
.flatten()
|
||||
.unwrap_or_default(),
|
||||
)
|
||||
})
|
||||
impl<'a> Iterator for PdfStreamDictionaryFiltersAndParms<'a> {
|
||||
type Item = (&'a PdfStreamFilter, &'a PdfDictionary);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.filters
|
||||
.next()
|
||||
.map(|filter| Self::item_helper(filter, self.decode_parms))
|
||||
}
|
||||
pub fn file_filters_and_parms(
|
||||
&self,
|
||||
) -> impl Clone + ExactSizeIterator + DoubleEndedIterator<Item = (PdfStreamFilter, PdfDictionary)>
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.filters.size_hint()
|
||||
}
|
||||
|
||||
fn nth(&mut self, n: usize) -> Option<Self::Item> {
|
||||
self.filters
|
||||
.nth(n)
|
||||
.map(|filter| Self::item_helper(filter, self.decode_parms))
|
||||
}
|
||||
|
||||
fn fold<B, F>(self, init: B, f: F) -> B
|
||||
where
|
||||
F: FnMut(B, Self::Item) -> B,
|
||||
{
|
||||
self.file_filters.iter().enumerate().map(|(index, filter)| {
|
||||
(
|
||||
filter.clone(),
|
||||
self.file_decode_parms
|
||||
.0
|
||||
.get(index)
|
||||
.cloned()
|
||||
.flatten()
|
||||
.unwrap_or_default(),
|
||||
)
|
||||
})
|
||||
self.filters
|
||||
.map(|filter| Self::item_helper(filter, self.decode_parms))
|
||||
.fold(init, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FusedIterator for PdfStreamDictionaryFiltersAndParms<'a> {}
|
||||
|
||||
impl<'a> ExactSizeIterator for PdfStreamDictionaryFiltersAndParms<'a> {}
|
||||
|
||||
impl<'a> DoubleEndedIterator for PdfStreamDictionaryFiltersAndParms<'a> {
|
||||
fn next_back(&mut self) -> Option<Self::Item> {
|
||||
self.filters
|
||||
.next_back()
|
||||
.map(|filter| Self::item_helper(filter, self.decode_parms))
|
||||
}
|
||||
|
||||
fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
|
||||
self.filters
|
||||
.nth_back(n)
|
||||
.map(|filter| Self::item_helper(filter, self.decode_parms))
|
||||
}
|
||||
|
||||
fn rfold<B, F>(self, init: B, f: F) -> B
|
||||
where
|
||||
F: FnMut(B, Self::Item) -> B,
|
||||
{
|
||||
self.filters
|
||||
.map(|filter| Self::item_helper(filter, self.decode_parms))
|
||||
.rfold(init, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rest> PdfStreamDictionary<Rest> {
|
||||
pub fn filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> {
|
||||
PdfStreamDictionaryFiltersAndParms {
|
||||
filters: self.filters.iter().enumerate(),
|
||||
decode_parms: &self.decode_parms,
|
||||
}
|
||||
}
|
||||
pub fn file_filters_and_parms<'a>(&'a self) -> PdfStreamDictionaryFiltersAndParms<'a> {
|
||||
PdfStreamDictionaryFiltersAndParms {
|
||||
filters: self.file_filters.iter().enumerate(),
|
||||
decode_parms: &self.file_decode_parms,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1001,16 +1070,64 @@ impl<Rest: PdfParse> UnparsedPdfStreamDictionary<Rest> {
|
|||
pub struct PdfStream<Rest = PdfDictionary> {
|
||||
pos: PdfInputPositionNoCompare,
|
||||
dictionary: Arc<OnceLock<PdfStreamDictionary<Rest>>>,
|
||||
data: Arc<[u8]>,
|
||||
encoded_data: Arc<[u8]>,
|
||||
decoded_data: Arc<OnceLock<Result<Arc<[u8]>, PdfParseError>>>,
|
||||
}
|
||||
|
||||
/// Formatting helper that renders a byte slice as ASCII-escaped, quoted
/// string literals, 32 bytes per line.
struct DumpBytes<'a>(&'a [u8]);

impl<'a> fmt::Debug for DumpBytes<'a> {
    /// Debug output is identical to the `Display` output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        <Self as fmt::Display>::fmt(self, f)
    }
}

impl fmt::Display for DumpBytes<'_> {
    /// Writes one `"..."` literal per 32-byte chunk, with chunks separated by
    /// a newline. An empty slice is written as a single `""`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let bytes = self.0;
        if bytes.is_empty() {
            return write!(f, "\"{}\"", bytes.escape_ascii());
        }
        for (index, chunk) in bytes.chunks(32).enumerate() {
            // Separator goes between chunks only, never before the first one.
            if index != 0 {
                f.write_str("\n")?;
            }
            write!(f, "\"{}\"", chunk.escape_ascii())?;
        }
        Ok(())
    }
}
|
||||
|
||||
impl<Rest: fmt::Debug> fmt::Debug for PdfStream<Rest> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("PdfStream")
|
||||
.field("pos", &self.pos)
|
||||
.field("dictionary", &self.dictionary)
|
||||
.field("data", &format_args!("{:02x?}", self.data))
|
||||
.finish()
|
||||
let Self {
|
||||
pos,
|
||||
dictionary,
|
||||
encoded_data,
|
||||
decoded_data,
|
||||
} = self;
|
||||
let mut debug_struct = f.debug_struct("PdfStream");
|
||||
debug_struct.field("pos", pos);
|
||||
if let Some(dictionary) = dictionary.get() {
|
||||
debug_struct.field("dictionary", dictionary);
|
||||
} else {
|
||||
debug_struct.field("dictionary", &format_args!("<not-yet-parsed>"));
|
||||
}
|
||||
debug_struct.field("encoded_data", &DumpBytes(encoded_data));
|
||||
if let Some(decoded_data) = decoded_data.get() {
|
||||
match decoded_data {
|
||||
Ok(decoded_data) => debug_struct.field("decoded_data", &DumpBytes(decoded_data)),
|
||||
Err(e) => debug_struct.field("decoded_data", &Err::<(), _>(e)),
|
||||
};
|
||||
} else {
|
||||
debug_struct.field("decoded_data", &format_args!("<not-yet-decoded>"));
|
||||
}
|
||||
debug_struct.finish()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1018,25 +1135,27 @@ impl<Rest> PdfStream<Rest> {
|
|||
pub fn new(
|
||||
pos: impl Into<PdfInputPositionNoCompare>,
|
||||
dictionary: PdfStreamDictionary<Rest>,
|
||||
data: Arc<[u8]>,
|
||||
encoded_data: Arc<[u8]>,
|
||||
) -> Self {
|
||||
Self {
|
||||
pos: pos.into(),
|
||||
dictionary: Arc::new(OnceLock::from(dictionary)),
|
||||
data,
|
||||
encoded_data,
|
||||
decoded_data: Arc::new(OnceLock::new()),
|
||||
}
|
||||
}
|
||||
pub(crate) fn new_unparsed(
|
||||
pos: impl Into<PdfInputPositionNoCompare>,
|
||||
unparsed_dictionary: PdfDictionary,
|
||||
data: Arc<[u8]>,
|
||||
encoded_data: Arc<[u8]>,
|
||||
) -> (Self, UnparsedPdfStreamDictionary<Rest>) {
|
||||
let dictionary = Arc::new(OnceLock::new());
|
||||
(
|
||||
Self {
|
||||
pos: pos.into(),
|
||||
dictionary: dictionary.clone(),
|
||||
data,
|
||||
encoded_data,
|
||||
decoded_data: Arc::new(OnceLock::new()),
|
||||
},
|
||||
UnparsedPdfStreamDictionary {
|
||||
unparsed_dictionary,
|
||||
|
|
@ -1049,8 +1168,29 @@ impl<Rest> PdfStream<Rest> {
|
|||
.get()
|
||||
.expect("haven't finished parsing all pdf object definitions yet")
|
||||
}
|
||||
pub fn data(&self) -> &Arc<[u8]> {
|
||||
&self.data
|
||||
pub fn encoded_data(&self) -> &Arc<[u8]> {
|
||||
&self.encoded_data
|
||||
}
|
||||
fn try_decode_data(&self) -> Result<Arc<[u8]>, PdfParseError> {
|
||||
let dictionary = self.dictionary();
|
||||
let (data, filters) = if let Some(file) = &dictionary.file {
|
||||
todo!()
|
||||
} else {
|
||||
(&self.encoded_data, dictionary.filters_and_parms())
|
||||
};
|
||||
if filters.len() == 0 {
|
||||
return Ok(data.clone());
|
||||
}
|
||||
let mut data: &[u8] = data;
|
||||
let mut buffer;
|
||||
for (filter, filter_parms) in filters {
|
||||
buffer = filter.decode_stream_data(filter_parms.clone(), self.pos.0, &data)?;
|
||||
data = &buffer;
|
||||
}
|
||||
Ok(Arc::from(data))
|
||||
}
|
||||
/// Returns the decoded stream data, decoding it on first use and caching
/// the outcome (success or error) for later calls.
pub fn decoded_data(&self) -> &Result<Arc<[u8]>, PdfParseError> {
    let cache = &self.decoded_data;
    cache.get_or_init(|| self.try_decode_data())
}
|
||||
}
|
||||
|
||||
|
|
@ -1099,7 +1239,8 @@ impl<Rest: PdfParse> PdfParse for PdfStream<Rest> {
|
|||
rest: Rest::parse(rest.clone().into())?,
|
||||
}))
|
||||
},
|
||||
data: stream.data,
|
||||
encoded_data: stream.encoded_data,
|
||||
decoded_data: stream.decoded_data,
|
||||
}),
|
||||
object => Err(PdfParseError::InvalidType {
|
||||
pos: object.get_pdf_input_position(),
|
||||
|
|
@ -1109,3 +1250,37 @@ impl<Rest: PdfParse> PdfParse for PdfStream<Rest> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
pdf_parse! {
|
||||
#[derive(Clone, Copy, Debug, Hash, Default, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum PdfObjectStreamType {
|
||||
#[pdf(name = "ObjStm")]
|
||||
#[default]
|
||||
ObjStm,
|
||||
}
|
||||
}
|
||||
|
||||
pdf_parse! {
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PdfObjectStreamDictionary {
|
||||
#[pdf(name = Self::TYPE_NAME)]
|
||||
pub ty: PdfObjectStreamType,
|
||||
#[pdf(name = "N")]
|
||||
pub n: usize,
|
||||
#[pdf(name = "First")]
|
||||
pub first: usize,
|
||||
#[pdf(name = "Extends")]
|
||||
pub extends: Option<PdfObjectIndirect>,
|
||||
#[pdf(flatten)]
|
||||
pub rest: PdfDictionary,
|
||||
}
|
||||
}
|
||||
|
||||
impl PdfObjectStreamDictionary {
|
||||
pub const TYPE_NAME: &str = "Type";
|
||||
pub(crate) fn parse_type_from_dictionary(
|
||||
dictionary: &PdfDictionary,
|
||||
) -> Result<PdfObjectStreamType, PdfParseError> {
|
||||
PdfParse::parse(dictionary.get_or_null(Self::TYPE_NAME.as_bytes()))
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -144,7 +144,7 @@ impl PartialEq for PdfInputPositionNoCompare {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone)]
|
||||
#[non_exhaustive]
|
||||
pub enum PdfParseError {
|
||||
Custom(String),
|
||||
|
|
@ -231,6 +231,19 @@ pub enum PdfParseError {
|
|||
pos: PdfInputPosition,
|
||||
start_xref: usize,
|
||||
},
|
||||
UnknownStreamFilter {
|
||||
pos: PdfInputPosition,
|
||||
filter: PdfName,
|
||||
},
|
||||
StreamFilterError {
|
||||
pos: PdfInputPosition,
|
||||
filter: PdfName,
|
||||
error: String,
|
||||
},
|
||||
ObjectStreamParseError {
|
||||
stream_pos: PdfInputPosition,
|
||||
error: Arc<PdfParseError>,
|
||||
},
|
||||
}
|
||||
|
||||
impl From<std::convert::Infallible> for PdfParseError {
|
||||
|
|
@ -239,6 +252,12 @@ impl From<std::convert::Infallible> for PdfParseError {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a Self> for PdfParseError {
|
||||
fn from(value: &'a Self) -> Self {
|
||||
value.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl GetPdfInputPosition for PdfParseError {
|
||||
fn get_pdf_input_position(&self) -> PdfInputPosition {
|
||||
match *self {
|
||||
|
|
@ -266,7 +285,12 @@ impl GetPdfInputPosition for PdfParseError {
|
|||
| PdfParseError::MissingStartXRefValue { pos }
|
||||
| PdfParseError::MissingEofComment { pos }
|
||||
| PdfParseError::UnexpectedByte { pos, .. }
|
||||
| PdfParseError::InvalidStartXRefValue { pos, .. } => pos,
|
||||
| PdfParseError::InvalidStartXRefValue { pos, .. }
|
||||
| PdfParseError::UnknownStreamFilter { pos, .. }
|
||||
| PdfParseError::StreamFilterError { pos, .. }
|
||||
| PdfParseError::ObjectStreamParseError {
|
||||
stream_pos: pos, ..
|
||||
} => pos,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -376,7 +400,7 @@ impl fmt::Display for PdfParseError {
|
|||
write!(f, "at {pos}: missing `%%EOF` comment")
|
||||
}
|
||||
PdfParseError::UnexpectedByte { pos, byte } => {
|
||||
write!(f, "at {pos}: unexpected byte {}", byte.escape_ascii())
|
||||
write!(f, "at {pos}: unexpected byte '{}'", byte.escape_ascii())
|
||||
}
|
||||
PdfParseError::InvalidStartXRefValue { pos, start_xref } => {
|
||||
write!(
|
||||
|
|
@ -384,6 +408,23 @@ impl fmt::Display for PdfParseError {
|
|||
"at {pos}: invalid `startxref` value: {start_xref} ({start_xref:#x})"
|
||||
)
|
||||
}
|
||||
PdfParseError::UnknownStreamFilter { pos, ref filter } => {
|
||||
write!(f, "at {pos}: unknown stream filter: {filter}")
|
||||
}
|
||||
PdfParseError::StreamFilterError {
|
||||
pos,
|
||||
ref filter,
|
||||
ref error,
|
||||
} => {
|
||||
write!(f, "at {pos}: stream filter {filter} error: {error}")
|
||||
}
|
||||
PdfParseError::ObjectStreamParseError {
|
||||
stream_pos,
|
||||
ref error,
|
||||
} => {
|
||||
write!(f, "at {stream_pos}: object stream error: ")?;
|
||||
error.fmt(f)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -785,12 +826,10 @@ macro_rules! pdf_parse {
|
|||
[$(#[$($field_meta:tt)*])*]
|
||||
$field_name:ident: $field_ty:ty
|
||||
) => {
|
||||
let $field_name = $crate::pdf::object::PdfName::new_static(
|
||||
$crate::__std::convert::AsRef::<[u8]>::as_ref($name),
|
||||
);
|
||||
let $field_name = $crate::__std::convert::AsRef::<[u8]>::as_ref($name);
|
||||
let $field_name = <$field_ty as $crate::pdf::parse::PdfParse>::parse(
|
||||
$object_mut
|
||||
.remove(&$field_name)
|
||||
.remove($field_name)
|
||||
.unwrap_or($crate::pdf::object::PdfObject::Null($crate::pdf::object::PdfNull::new($pos))),
|
||||
)?;
|
||||
};
|
||||
|
|
|
|||
65
src/pdf/stream_filters.rs
Normal file
65
src/pdf/stream_filters.rs
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
use crate::pdf::{
|
||||
object::{PdfDictionary, PdfName},
|
||||
parse::{PdfInputPosition, PdfParse, PdfParseError},
|
||||
pdf_parse,
|
||||
};
|
||||
|
||||
pub mod flate;
|
||||
|
||||
pdf_parse! {
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
#[non_exhaustive]
|
||||
pub enum PdfStreamFilter {
|
||||
#[pdf(name = "ASCIIHexDecode")]
|
||||
AsciiHexDecode,
|
||||
#[pdf(name = "ASCII85Decode")]
|
||||
Ascii85Decode,
|
||||
#[pdf(name = "LZWDecode")]
|
||||
LzwDecode,
|
||||
#[pdf(name = "FlateDecode")]
|
||||
FlateDecode,
|
||||
#[pdf(name = "RunLengthDecode")]
|
||||
RunLengthDecode,
|
||||
#[pdf(name = "CCITTFaxDecode")]
|
||||
CcittFaxDecode,
|
||||
#[pdf(name = "JBIG2Decode")]
|
||||
Jbig2Decode,
|
||||
#[pdf(name = "DCTDecode")]
|
||||
DctDecode,
|
||||
#[pdf(name = "JPXDecode")]
|
||||
JpxDecode,
|
||||
#[pdf(name = "Crypt")]
|
||||
Crypt,
|
||||
#[pdf(other)]
|
||||
Unknown(PdfName),
|
||||
}
|
||||
}
|
||||
|
||||
impl PdfStreamFilter {
|
||||
pub fn decode_stream_data(
|
||||
&self,
|
||||
filter_parms: PdfDictionary,
|
||||
stream_pos: PdfInputPosition,
|
||||
encoded_data: &[u8],
|
||||
) -> Result<Vec<u8>, PdfParseError> {
|
||||
match self {
|
||||
PdfStreamFilter::AsciiHexDecode => todo!(),
|
||||
PdfStreamFilter::Ascii85Decode => todo!(),
|
||||
PdfStreamFilter::LzwDecode => todo!(),
|
||||
PdfStreamFilter::FlateDecode => {
|
||||
flate::PdfFilterParmsFlateDecode::parse(filter_parms.into())?
|
||||
.decode_stream_data(stream_pos, encoded_data)
|
||||
}
|
||||
PdfStreamFilter::RunLengthDecode => todo!(),
|
||||
PdfStreamFilter::CcittFaxDecode => todo!(),
|
||||
PdfStreamFilter::Jbig2Decode => todo!(),
|
||||
PdfStreamFilter::DctDecode => todo!(),
|
||||
PdfStreamFilter::JpxDecode => todo!(),
|
||||
PdfStreamFilter::Crypt => todo!(),
|
||||
PdfStreamFilter::Unknown(filter) => Err(PdfParseError::UnknownStreamFilter {
|
||||
pos: stream_pos,
|
||||
filter: filter.clone(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
73
src/pdf/stream_filters/flate.rs
Normal file
73
src/pdf/stream_filters/flate.rs
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
use crate::pdf::{
|
||||
object::PdfDictionary,
|
||||
parse::{PdfInputPosition, PdfParseError},
|
||||
pdf_parse,
|
||||
stream_filters::PdfStreamFilter,
|
||||
};
|
||||
use std::{io::Read, num::NonZero};
|
||||
|
||||
pdf_parse! {
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct PdfFilterParmsFlateDecode {
|
||||
#[pdf(name = "Predictor")]
|
||||
pub predictor: Option<NonZero<u32>>,
|
||||
#[pdf(name = "Colors")]
|
||||
pub colors: Option<NonZero<u32>>,
|
||||
#[pdf(name = "BitsPerComponent")]
|
||||
pub bits_per_component: Option<NonZero<u32>>,
|
||||
#[pdf(name = "Columns")]
|
||||
pub columns: Option<NonZero<u32>>,
|
||||
#[pdf(flatten)]
|
||||
pub rest: PdfDictionary,
|
||||
}
|
||||
}
|
||||
|
||||
impl PdfFilterParmsFlateDecode {
|
||||
pub const FILTER: PdfStreamFilter = PdfStreamFilter::FlateDecode;
|
||||
pub const DEFAULT_PREDICTOR: NonZero<u32> = const { NonZero::new(1).unwrap() };
|
||||
pub const DEFAULT_COLORS: NonZero<u32> = const { NonZero::new(1).unwrap() };
|
||||
pub const DEFAULT_BITS_PER_COMPONENT: NonZero<u32> = const { NonZero::new(8).unwrap() };
|
||||
pub const DEFAULT_COLUMNS: NonZero<u32> = const { NonZero::new(1).unwrap() };
|
||||
pub fn predictor(&self) -> NonZero<u32> {
|
||||
self.predictor.unwrap_or(Self::DEFAULT_PREDICTOR)
|
||||
}
|
||||
pub fn colors(&self) -> NonZero<u32> {
|
||||
self.colors.unwrap_or(Self::DEFAULT_COLORS)
|
||||
}
|
||||
pub fn bits_per_component(&self) -> NonZero<u32> {
|
||||
self.bits_per_component
|
||||
.unwrap_or(Self::DEFAULT_BITS_PER_COMPONENT)
|
||||
}
|
||||
pub fn columns(&self) -> NonZero<u32> {
|
||||
self.columns.unwrap_or(Self::DEFAULT_COLUMNS)
|
||||
}
|
||||
pub fn decode_stream_data(
|
||||
&self,
|
||||
stream_pos: PdfInputPosition,
|
||||
encoded_data: &[u8],
|
||||
) -> Result<Vec<u8>, PdfParseError> {
|
||||
let Self {
|
||||
predictor: _,
|
||||
colors: _,
|
||||
bits_per_component: _,
|
||||
columns: _,
|
||||
rest: _,
|
||||
} = self;
|
||||
let mut decoded_data = vec![];
|
||||
flate2::bufread::ZlibDecoder::new(encoded_data)
|
||||
.read_to_end(&mut decoded_data)
|
||||
.map_err(|e| PdfParseError::StreamFilterError {
|
||||
pos: stream_pos,
|
||||
filter: Self::FILTER.into(),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
let predictor = self.predictor();
|
||||
let colors = self.colors();
|
||||
let bits_per_component = self.bits_per_component();
|
||||
let columns = self.columns();
|
||||
match predictor {
|
||||
Self::DEFAULT_PREDICTOR => Ok(decoded_data),
|
||||
_ => todo!("{predictor}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue