parse ToUnicode stream
This commit is contained in:
parent
d7727289eb
commit
b8a97a2326
5 changed files with 545 additions and 42 deletions
105
src/pdf/font.rs
105
src/pdf/font.rs
|
|
@ -1,5 +1,6 @@
|
|||
use crate::{
|
||||
pdf::{
|
||||
font::type_1_parse::PsFile,
|
||||
object::{
|
||||
IsPdfNull, PdfArray, PdfDictionary, PdfMatrix, PdfName, PdfNameOrInteger, PdfObject,
|
||||
PdfObjectDirect, PdfRectangle, PdfStream, PdfString,
|
||||
|
|
@ -15,26 +16,88 @@ use crate::{
|
|||
use std::{borrow::Cow, collections::BTreeMap, fmt, sync::Arc};
|
||||
|
||||
mod tables;
|
||||
mod to_unicode_parse;
|
||||
mod type_1_parse;
|
||||
|
||||
pdf_parse! {
|
||||
#[pdf(transparent)]
|
||||
#[derive(Clone)]
|
||||
// TODO: actually parse the stream
|
||||
pub struct PdfFontToUnicode {
|
||||
#[pdf]
|
||||
stream: PdfStream,
|
||||
#[pdf]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PdfFontToUnicodeDictionary {
|
||||
#[pdf(name = "UseCMap")]
|
||||
pub base_map: Option<PdfObjectDirect>, // TODO: parse
|
||||
#[pdf(flatten)]
|
||||
pub rest: PdfDictionary,
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PdfFontToUnicode {
|
||||
pub base_map: Option<PdfObjectDirect>, // TODO: parse
|
||||
pub char_map_name: PdfName,
|
||||
pub src_ranges: Arc<[std::ops::RangeInclusive<PdfString>]>,
|
||||
pub to_unicode_map: Arc<BTreeMap<PdfString, Arc<str>>>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for PdfFontToUnicode {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
DagDebugState::scope(|_state| {
|
||||
let Self { stream } = self;
|
||||
f.debug_struct("PdfFontToUnicode")
|
||||
.field("stream", stream)
|
||||
.finish()
|
||||
})
|
||||
struct DebugFn<F: Fn(&mut fmt::Formatter<'_>) -> fmt::Result>(F);
|
||||
impl<F: Fn(&mut fmt::Formatter<'_>) -> fmt::Result> fmt::Debug for DebugFn<F> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
(self.0)(f)
|
||||
}
|
||||
}
|
||||
let Self {
|
||||
base_map,
|
||||
char_map_name,
|
||||
src_ranges,
|
||||
to_unicode_map,
|
||||
} = self;
|
||||
f.debug_struct("PdfFontToUnicode")
|
||||
.field("base_map", base_map)
|
||||
.field("char_map_name", char_map_name)
|
||||
.field(
|
||||
"src_ranges",
|
||||
&DebugFn(|f| {
|
||||
f.debug_set()
|
||||
.entries(
|
||||
src_ranges
|
||||
.iter()
|
||||
.map(|v| v.start().bytes_debug()..=v.end().bytes_debug()),
|
||||
)
|
||||
.finish()
|
||||
}),
|
||||
)
|
||||
.field(
|
||||
"to_unicode_map",
|
||||
&DebugFn(|f| {
|
||||
f.debug_map()
|
||||
.entries(to_unicode_map.iter().map(|(k, v)| (k.bytes_debug(), v)))
|
||||
.finish()
|
||||
}),
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl IsPdfNull for PdfFontToUnicode {
    /// Always reports `false`: this implementation never treats a `PdfFontToUnicode` as null.
    fn is_pdf_null(&self) -> bool {
        false
    }
}
|
||||
|
||||
impl PdfParse for PdfFontToUnicode {
|
||||
fn type_name() -> Cow<'static, str> {
|
||||
Cow::Borrowed("PdfFontToUnicode")
|
||||
}
|
||||
fn parse(object: PdfObject) -> Result<Self, PdfParseError> {
|
||||
let stream = PdfStream::<PdfFontToUnicodeDictionary>::parse(object)?;
|
||||
let base_map = stream.dictionary().rest.base_map.clone();
|
||||
let decoded_data = stream.decoded_data().clone()?;
|
||||
to_unicode_parse::ToUnicodeParser::new(PsFile::from_arc_bytes(
|
||||
decoded_data,
|
||||
stream.get_pdf_input_position(),
|
||||
))
|
||||
.parse(base_map)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -858,6 +921,17 @@ pub struct PdfSimpleFontEncodingTable {
|
|||
pub table: ArcOrRef<'static, [PdfSimpleFontEncodingTableEntry; 0x100]>,
|
||||
}
|
||||
|
||||
impl PdfSimpleFontEncodingTable {
|
||||
pub const fn empty() -> Self {
|
||||
const EMPTY_ENTRY: PdfSimpleFontEncodingTableEntry =
|
||||
PdfSimpleFontEncodingTableEntry::new_static(None, None);
|
||||
const EMPTY_TABLE: &[PdfSimpleFontEncodingTableEntry; 0x100] = &[EMPTY_ENTRY; 0x100];
|
||||
Self {
|
||||
table: ArcOrRef::Ref(EMPTY_TABLE),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum PdfSimpleFontEncoding {
|
||||
Predefined(PdfSimpleFontEncodingPredefined),
|
||||
|
|
@ -903,11 +977,12 @@ impl PdfParse for PdfSimpleFontEncoding {
|
|||
#[derive(Clone, Debug)]
|
||||
#[non_exhaustive]
|
||||
pub struct PdfFontType1Program {
|
||||
pub encoding: Option<Arc<[Option<PdfName>]>>,
|
||||
pub font_bbox: Option<PdfRectangle>,
|
||||
pub encoding: PdfSimpleFontEncodingTable,
|
||||
pub font_bbox: PdfRectangle,
|
||||
pub font_info: Option<PdfFontType1FontInfo>,
|
||||
pub font_matrix: Option<PdfMatrix>,
|
||||
pub font_matrix: PdfMatrix,
|
||||
pub font_name: Option<PdfName>,
|
||||
pub vertical_writing_mode: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
|
|
|
|||
325
src/pdf/font/to_unicode_parse.rs
Normal file
325
src/pdf/font/to_unicode_parse.rs
Normal file
|
|
@ -0,0 +1,325 @@
|
|||
use std::{collections::BTreeMap, sync::Arc};
|
||||
|
||||
use crate::{
|
||||
pdf::{
|
||||
font::{
|
||||
PdfFontToUnicode,
|
||||
type_1_parse::{PsFile, Token},
|
||||
},
|
||||
object::{PdfName, PdfObjectDirect, PdfString},
|
||||
parse::{PdfInputPosition, PdfParseError},
|
||||
},
|
||||
util::ArcOrRef,
|
||||
};
|
||||
|
||||
pub(crate) struct ToUnicodeParser {
|
||||
tokenizer: PsFile,
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn invalid_token_err<T>(pos: PdfInputPosition, token: Option<Token>) -> Result<T, PdfParseError> {
|
||||
Err(PdfParseError::InvalidTokenInToUnicodeStream {
|
||||
pos,
|
||||
token: format!("{token:?}"),
|
||||
})
|
||||
}
|
||||
|
||||
impl ToUnicodeParser {
|
||||
pub(crate) fn new(tokenizer: PsFile) -> Self {
|
||||
Self { tokenizer }
|
||||
}
|
||||
fn expect_any_string(&mut self) -> Result<Vec<u8>, PdfParseError> {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::String(string)) => Ok(string),
|
||||
token => invalid_token_err(pos, token),
|
||||
}
|
||||
}
|
||||
pub(crate) fn expect_string_with_len(
|
||||
&mut self,
|
||||
expected_len: usize,
|
||||
) -> Result<Vec<u8>, PdfParseError> {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::String(string)) if string.len() == expected_len => Ok(string),
|
||||
token => invalid_token_err(pos, token),
|
||||
}
|
||||
}
|
||||
pub(crate) fn expect_literal_name(
|
||||
&mut self,
|
||||
expected_name: &[u8],
|
||||
) -> Result<(), PdfParseError> {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::LiteralName(name)) if name == expected_name => Ok(()),
|
||||
token => invalid_token_err(pos, token),
|
||||
}
|
||||
}
|
||||
pub(crate) fn expect_any_literal_name(&mut self) -> Result<Vec<u8>, PdfParseError> {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::LiteralName(name)) => Ok(name),
|
||||
token => invalid_token_err(pos, token),
|
||||
}
|
||||
}
|
||||
pub(crate) fn expect_executable_name(
|
||||
&mut self,
|
||||
expected_name: &[u8],
|
||||
) -> Result<(), PdfParseError> {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::ExecutableName(name)) if name == expected_name => Ok(()),
|
||||
token => invalid_token_err(pos, token),
|
||||
}
|
||||
}
|
||||
pub(crate) fn expect(&mut self, expected_token: Token) -> Result<(), PdfParseError> {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(token) if token == expected_token => Ok(()),
|
||||
token => invalid_token_err(pos, token),
|
||||
}
|
||||
}
|
||||
pub(crate) fn expect_integer(&mut self) -> Result<i128, PdfParseError> {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::Integer(value)) => Ok(value),
|
||||
token => invalid_token_err(pos, token),
|
||||
}
|
||||
}
|
||||
pub(crate) fn parse_dict(
|
||||
&mut self,
|
||||
mut entry_callback: impl FnMut(Vec<u8>, PdfInputPosition, Token) -> Result<(), PdfParseError>,
|
||||
) -> Result<(), PdfParseError> {
|
||||
self.expect(Token::DictStart)?;
|
||||
loop {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let name_pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::DictEnd) => return Ok(()),
|
||||
Some(Token::LiteralName(name)) => {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let value_pos = self.tokenizer.pos();
|
||||
let Some(value) = self.tokenizer.next_token()? else {
|
||||
return invalid_token_err(value_pos, None);
|
||||
};
|
||||
entry_callback(name, value_pos, value)?;
|
||||
}
|
||||
token => {
|
||||
return invalid_token_err(name_pos, token);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) fn parse(
|
||||
mut self,
|
||||
base_map: Option<PdfObjectDirect>,
|
||||
) -> Result<PdfFontToUnicode, PdfParseError> {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
self.expect_literal_name(b"CIDInit")?;
|
||||
self.expect_literal_name(b"ProcSet")?;
|
||||
self.expect_executable_name(b"findresource")?;
|
||||
self.expect_executable_name(b"begin")?;
|
||||
self.expect_integer()?;
|
||||
self.expect_executable_name(b"dict")?;
|
||||
self.expect_executable_name(b"begin")?;
|
||||
self.expect_executable_name(b"begincmap")?;
|
||||
self.expect_literal_name(b"CIDSystemInfo")?;
|
||||
let mut registry = None;
|
||||
let mut ordering = None;
|
||||
let mut supplement = None;
|
||||
self.parse_dict(|name, value_pos, value| match &*name {
|
||||
b"Registry" => {
|
||||
let Token::String(v) = value else {
|
||||
return invalid_token_err(value_pos, Some(value));
|
||||
};
|
||||
registry = Some(v);
|
||||
Ok(())
|
||||
}
|
||||
b"Ordering" => {
|
||||
let Token::String(v) = value else {
|
||||
return invalid_token_err(value_pos, Some(value));
|
||||
};
|
||||
ordering = Some(v);
|
||||
Ok(())
|
||||
}
|
||||
b"Supplement" => {
|
||||
let Token::Integer(v) = value else {
|
||||
return invalid_token_err(value_pos, Some(value));
|
||||
};
|
||||
supplement = Some(v);
|
||||
Ok(())
|
||||
}
|
||||
_ => todo!("{}: {value:?}", name.escape_ascii()),
|
||||
})?;
|
||||
self.expect_executable_name(b"def")?;
|
||||
self.expect_literal_name(b"CMapName")?;
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let char_map_name_pos = self.tokenizer.pos();
|
||||
let char_map_name = self.expect_any_literal_name()?;
|
||||
self.expect_executable_name(b"def")?;
|
||||
self.expect_literal_name(b"CMapType")?;
|
||||
self.expect(Token::Integer(2))?;
|
||||
self.expect_executable_name(b"def")?;
|
||||
self.expect(Token::Integer(1))?;
|
||||
self.expect_executable_name(b"begincodespacerange")?;
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let range_start_pos = self.tokenizer.pos();
|
||||
let range_start = self.expect_any_string()?;
|
||||
if range_start.is_empty() {
|
||||
return invalid_token_err(range_start_pos, Some(Token::String(range_start)));
|
||||
}
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let range_end_pos = self.tokenizer.pos();
|
||||
let range_end = self.expect_string_with_len(range_start.len())?;
|
||||
self.expect_executable_name(b"endcodespacerange")?;
|
||||
let mut to_unicode_map: BTreeMap<PdfString, Arc<str>> = BTreeMap::new();
|
||||
let mut dest_str = String::new();
|
||||
let mut insert_mapping = |src_pos: PdfInputPosition,
|
||||
src: &[u8],
|
||||
dest_pos: PdfInputPosition,
|
||||
dest_utf16_be: &[u8]|
|
||||
-> Result<(), PdfParseError> {
|
||||
dest_str.clear();
|
||||
for ch in char::decode_utf16(
|
||||
dest_utf16_be
|
||||
.chunks(2)
|
||||
.map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]])),
|
||||
) {
|
||||
match ch {
|
||||
Ok(ch) => dest_str.push(ch),
|
||||
Err(_) => {
|
||||
return Err(PdfParseError::InvalidUtf16 { pos: dest_pos });
|
||||
}
|
||||
}
|
||||
}
|
||||
to_unicode_map.insert(
|
||||
PdfString::new(src_pos, ArcOrRef::Arc(src.into())),
|
||||
dest_str.as_str().into(),
|
||||
);
|
||||
Ok(())
|
||||
};
|
||||
loop {
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::Integer(size)) => match self.tokenizer.next_token()? {
|
||||
Some(Token::ExecutableName(name)) if name == b"beginbfrange" => {
|
||||
for _ in 0..size {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let src_pos = self.tokenizer.pos();
|
||||
let src_low = self.expect_string_with_len(range_start.len())?;
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let src_high_pos = self.tokenizer.pos();
|
||||
let src_high = self.expect_string_with_len(range_start.len())?;
|
||||
if src_low.split_last().map(|(_, prefix)| prefix)
|
||||
!= src_high.split_last().map(|(_, prefix)| prefix)
|
||||
{
|
||||
return invalid_token_err(
|
||||
src_high_pos,
|
||||
Some(Token::String(src_high)),
|
||||
);
|
||||
}
|
||||
let src_last_range = *src_low.last().expect("known to be non-empty")
|
||||
..=*src_high.last().expect("known to be non-empty");
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let dest_pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::String(dest))
|
||||
if dest.len() >= 2 && dest.len() % 2 == 0 =>
|
||||
{
|
||||
let mut src = src_low;
|
||||
for (index, src_last_byte) in src_last_range.enumerate() {
|
||||
*src.last_mut().expect("known to be non-empty") =
|
||||
src_last_byte;
|
||||
let mut dest = dest.clone();
|
||||
let [.., last] = &mut *dest else {
|
||||
unreachable!();
|
||||
};
|
||||
*last += index as u8;
|
||||
insert_mapping(src_pos, &src, dest_pos, &dest)?;
|
||||
}
|
||||
}
|
||||
Some(token @ Token::String(_)) => {
|
||||
todo!("odd number of dest bytes: {token:?}");
|
||||
}
|
||||
Some(Token::ArrayStart) => {
|
||||
let mut src = src_low;
|
||||
for src_last_byte in src_last_range {
|
||||
*src.last_mut().expect("known to be non-empty") =
|
||||
src_last_byte;
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let dest_pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::String(dest))
|
||||
if dest.len() >= 2 && dest.len() % 2 == 0 =>
|
||||
{
|
||||
insert_mapping(src_pos, &src, dest_pos, &dest)?;
|
||||
}
|
||||
Some(token @ Token::String(_)) => {
|
||||
todo!("odd number of dest bytes: {token:?}");
|
||||
}
|
||||
token => return invalid_token_err(dest_pos, token),
|
||||
}
|
||||
}
|
||||
self.expect(Token::ArrayEnd)?;
|
||||
}
|
||||
token => return invalid_token_err(dest_pos, token),
|
||||
}
|
||||
}
|
||||
self.expect_executable_name(b"endbfrange")?;
|
||||
}
|
||||
Some(Token::ExecutableName(name)) if name == b"beginbfchar" => {
|
||||
for _ in 0..size {
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let src_pos = self.tokenizer.pos();
|
||||
let src = self.expect_string_with_len(range_start.len())?;
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let dest_pos = self.tokenizer.pos();
|
||||
match self.tokenizer.next_token()? {
|
||||
Some(Token::String(dest)) if dest.len() % 2 == 0 => {
|
||||
insert_mapping(src_pos, &src, dest_pos, &dest)?;
|
||||
}
|
||||
Some(token @ Token::String(_)) => {
|
||||
todo!("odd number of dest bytes: {token:?}");
|
||||
}
|
||||
token => return invalid_token_err(dest_pos, token),
|
||||
}
|
||||
}
|
||||
self.expect_executable_name(b"endbfchar")?;
|
||||
}
|
||||
token => todo!("{token:?}"),
|
||||
},
|
||||
Some(Token::ExecutableName(name)) if name == b"endcmap" => {
|
||||
break;
|
||||
}
|
||||
token => todo!("{token:?}"),
|
||||
}
|
||||
}
|
||||
self.expect_executable_name(b"CMapName")?;
|
||||
self.expect_executable_name(b"currentdict")?;
|
||||
self.expect_literal_name(b"CMap")?;
|
||||
self.expect_executable_name(b"defineresource")?;
|
||||
self.expect_executable_name(b"pop")?;
|
||||
self.expect_executable_name(b"end")?;
|
||||
self.expect_executable_name(b"end")?;
|
||||
self.tokenizer.skip_comments_and_whitespace();
|
||||
let eof_pos = self.tokenizer.pos();
|
||||
if let token @ Some(_) = self.tokenizer.next_token()? {
|
||||
return invalid_token_err(eof_pos, token);
|
||||
}
|
||||
Ok(PdfFontToUnicode {
|
||||
base_map,
|
||||
char_map_name: PdfName::new(char_map_name_pos, Arc::<[u8]>::from(char_map_name)),
|
||||
src_ranges: Arc::new([
|
||||
PdfString::new(range_start_pos, ArcOrRef::Arc(range_start.into()))
|
||||
..=PdfString::new(range_end_pos, ArcOrRef::Arc(range_end.into())),
|
||||
]),
|
||||
to_unicode_map: Arc::new(to_unicode_map),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,10 @@
|
|||
use crate::{
|
||||
pdf::{
|
||||
PdfObjects,
|
||||
font::{PdfFontType1FontInfo, PdfFontType1Program},
|
||||
font::{
|
||||
PdfFontType1FontInfo, PdfFontType1Program, PdfSimpleFontEncodingTable,
|
||||
PdfSimpleFontEncodingTableEntry,
|
||||
},
|
||||
object::{PdfMatrix, PdfName, PdfRectangle, PdfStreamContents, PdfString, PdfVec2D},
|
||||
parse::{
|
||||
PdfInputPosition, PdfInputPositionKnown, PdfInputPositionNoCompare, PdfParseError,
|
||||
|
|
@ -52,7 +55,7 @@ impl PsFileDecryptedSource {
|
|||
|
||||
#[derive(Clone)]
|
||||
enum PsFileSource {
|
||||
Bytes(Rc<[u8]>),
|
||||
Bytes(Arc<[u8]>),
|
||||
Decrypted(Rc<RefCell<PsFileDecryptedSource>>),
|
||||
}
|
||||
|
||||
|
|
@ -66,7 +69,7 @@ impl PsFileSource {
|
|||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct PsFile {
|
||||
pub(crate) struct PsFile {
|
||||
id: u64,
|
||||
source: PsFileSource,
|
||||
pos: Rc<Cell<PdfInputPositionKnown>>,
|
||||
|
|
@ -119,8 +122,8 @@ fn is_regular_char(v: u8) -> bool {
|
|||
|
||||
struct NotALineEnd;
|
||||
|
||||
#[derive(Clone)]
|
||||
enum Token {
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) enum Token {
|
||||
Integer(i128),
|
||||
Real(f64),
|
||||
ArrayStart,
|
||||
|
|
@ -131,6 +134,8 @@ enum Token {
|
|||
LiteralName(Vec<u8>),
|
||||
ImmediatelyEvaluatedName(Vec<u8>),
|
||||
String(Vec<u8>),
|
||||
DictStart,
|
||||
DictEnd,
|
||||
}
|
||||
|
||||
impl fmt::Debug for Token {
|
||||
|
|
@ -150,6 +155,8 @@ impl fmt::Debug for Token {
|
|||
Self::String(contents) => {
|
||||
write!(f, "String({})", contents.escape_ascii())
|
||||
}
|
||||
Self::DictStart => write!(f, "DictStart"),
|
||||
Self::DictEnd => write!(f, "DictEnd"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -165,7 +172,10 @@ impl PsFile {
|
|||
})),
|
||||
}
|
||||
}
|
||||
fn pos(&self) -> PdfInputPosition {
|
||||
pub(crate) fn from_arc_bytes(bytes: Arc<[u8]>, stream_pos: PdfInputPosition) -> Self {
|
||||
Self::new(0, PsFileSource::Bytes(bytes), 0, stream_pos)
|
||||
}
|
||||
pub(crate) fn pos(&self) -> PdfInputPosition {
|
||||
PdfInputPosition::new(Some(self.pos.get()))
|
||||
}
|
||||
fn peek_byte(&self) -> Option<u8> {
|
||||
|
|
@ -200,7 +210,7 @@ impl PsFile {
|
|||
self.next_byte();
|
||||
}
|
||||
}
|
||||
fn skip_comments_and_whitespace(&mut self) {
|
||||
pub(crate) fn skip_comments_and_whitespace(&mut self) {
|
||||
loop {
|
||||
self.skip_whitespace();
|
||||
let Some(b'%') = self.peek_byte() else {
|
||||
|
|
@ -340,7 +350,41 @@ impl PsFile {
|
|||
}
|
||||
Err(PdfParseError::TruncatedFile { pos: self.pos() })
|
||||
}
|
||||
fn next_token(&mut self) -> Result<Option<Token>, PdfParseError> {
|
||||
fn parse_string_after_l_angle(&mut self) -> Result<Vec<u8>, PdfParseError> {
|
||||
let mut contents = Vec::new();
|
||||
let mut high_digit_value = None;
|
||||
let mut push_digit_value = |value: u8| {
|
||||
high_digit_value = match high_digit_value {
|
||||
Some(high_digit_value) => {
|
||||
contents.push((high_digit_value << 4) | value);
|
||||
None
|
||||
}
|
||||
None => Some(value),
|
||||
};
|
||||
};
|
||||
let string_pos = self.pos();
|
||||
loop {
|
||||
let pos = self.pos();
|
||||
match self.next_byte() {
|
||||
None => {
|
||||
return Err(PdfParseError::TruncatedFile { pos });
|
||||
}
|
||||
Some(b'\0' | b'\t' | b'\n' | b'\x0C' | b'\r' | b' ') => {}
|
||||
Some(b'>') => {
|
||||
// if we have an odd trailing digit, add the final digit, otherwise doesn't modify contents
|
||||
push_digit_value(0);
|
||||
return Ok(contents);
|
||||
}
|
||||
Some(b) => {
|
||||
let Some(value) = (b as char).to_digit(0x10) else {
|
||||
return Err(PdfParseError::InvalidHexStringDigit { pos });
|
||||
};
|
||||
push_digit_value(value as u8);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) fn next_token(&mut self) -> Result<Option<Token>, PdfParseError> {
|
||||
self.skip_comments_and_whitespace();
|
||||
let Some(first_byte) = self.peek_byte() else {
|
||||
return Ok(None);
|
||||
|
|
@ -352,9 +396,26 @@ impl PsFile {
|
|||
}
|
||||
b')' => todo!(),
|
||||
b'<' => {
|
||||
todo!("encoded string");
|
||||
self.next_byte();
|
||||
match self.peek_byte() {
|
||||
Some(b'<') => {
|
||||
self.next_byte();
|
||||
Ok(Some(Token::DictStart))
|
||||
}
|
||||
Some(b'~') => todo!("base 85 encoded string"),
|
||||
_ => Ok(Some(Token::String(self.parse_string_after_l_angle()?))),
|
||||
}
|
||||
}
|
||||
b'>' => {
|
||||
self.next_byte();
|
||||
match self.peek_byte() {
|
||||
Some(b'>') => {
|
||||
self.next_byte();
|
||||
Ok(Some(Token::DictEnd))
|
||||
}
|
||||
_ => todo!("stray >"),
|
||||
}
|
||||
}
|
||||
b'>' => todo!(),
|
||||
b'[' => {
|
||||
self.next_byte();
|
||||
Ok(Some(Token::ArrayStart))
|
||||
|
|
@ -724,12 +785,15 @@ impl PsOperator {
|
|||
let PsObject::Integer(initial) = initial else {
|
||||
todo!("{initial:?}");
|
||||
};
|
||||
let PsObject::Integer(increment @ (..=-1 | 1..)) = increment else {
|
||||
let PsObject::Integer(increment) = increment else {
|
||||
todo!("{increment:?}");
|
||||
};
|
||||
let PsObject::Integer(limit) = limit else {
|
||||
todo!("{limit:?} {:?}", parser.operand_stack);
|
||||
};
|
||||
if increment == 0 {
|
||||
return custom_err("postscript for operator: increment can't be zero");
|
||||
};
|
||||
let mut counter = initial;
|
||||
let proc = proc.into_vec();
|
||||
loop {
|
||||
|
|
@ -1158,6 +1222,8 @@ impl PsParser {
|
|||
Token::Real(v) => PsObject::Real(PsReal(v)),
|
||||
Token::ArrayStart => PsObject::ExecutableName(PsName(b"[".into())),
|
||||
Token::ArrayEnd => PsObject::ExecutableName(PsName(b"]".into())),
|
||||
Token::DictStart => PsObject::ExecutableName(PsName(b"<<".into())),
|
||||
Token::DictEnd => PsObject::ExecutableName(PsName(b">>".into())),
|
||||
Token::ProcedureStart => PsObject::Procedure(self.parse_procedure()?),
|
||||
Token::ProcedureEnd => return Ok(PsArray::from_elements(self, objects)),
|
||||
Token::ExecutableName(name) => PsObject::ExecutableName(PsName(name.into())),
|
||||
|
|
@ -1176,6 +1242,8 @@ impl PsParser {
|
|||
Token::Real(v) => self.operand_stack.push(PsObject::Real(PsReal(v))),
|
||||
Token::ArrayStart => self.run_name(&PsName(b"[".into()))?,
|
||||
Token::ArrayEnd => self.run_name(&PsName(b"]".into()))?,
|
||||
Token::DictStart => self.run_name(&PsName(b"<<".into()))?,
|
||||
Token::DictEnd => self.run_name(&PsName(b">>".into()))?,
|
||||
Token::ProcedureStart => {
|
||||
let procedure = self.parse_procedure()?;
|
||||
self.operand_stack.push(PsObject::Procedure(procedure))
|
||||
|
|
@ -1199,26 +1267,30 @@ impl PsParser {
|
|||
fn parse_font_encoding(
|
||||
&mut self,
|
||||
value: PsArray,
|
||||
) -> Result<Arc<[Option<PdfName>]>, PdfParseError> {
|
||||
) -> Result<PdfSimpleFontEncodingTable, PdfParseError> {
|
||||
let value = value.rc();
|
||||
let value = value.borrow();
|
||||
let mut vec = Vec::with_capacity(value.len());
|
||||
let mut retval = PdfSimpleFontEncodingTable::empty();
|
||||
let mut table_iter = ArcOrRef::make_mut(&mut retval.table).iter_mut();
|
||||
for entry in value.iter() {
|
||||
match entry {
|
||||
PsObject::Name(name) => {
|
||||
if name.0 == b".notdef" {
|
||||
vec.push(None);
|
||||
let name = if name.0 == b".notdef" {
|
||||
None
|
||||
} else {
|
||||
vec.push(Some(PdfName::new(
|
||||
self.tokenizer.pos(),
|
||||
Arc::from(&*name.0),
|
||||
)));
|
||||
Some(PdfName::new(self.tokenizer.pos(), Arc::from(&*name.0)))
|
||||
};
|
||||
if let Some(entry) = table_iter.next() {
|
||||
*entry = PdfSimpleFontEncodingTableEntry {
|
||||
name,
|
||||
presumed_unicode: None,
|
||||
};
|
||||
}
|
||||
}
|
||||
_ => todo!("{entry:?}"),
|
||||
}
|
||||
}
|
||||
Ok(Arc::from(vec))
|
||||
Ok(retval)
|
||||
}
|
||||
fn parse_font_bbox(&mut self, value: PsArray) -> Result<PdfRectangle, PdfParseError> {
|
||||
let value = value.rc();
|
||||
|
|
@ -1332,6 +1404,7 @@ impl PsParser {
|
|||
let mut font_info = None;
|
||||
let mut font_matrix = None;
|
||||
let mut font_name = None;
|
||||
let mut vertical_writing_mode = false;
|
||||
for (key, value) in named {
|
||||
match (&*key.0, value) {
|
||||
(b"Encoding", PsObject::Array(value)) => {
|
||||
|
|
@ -1349,6 +1422,7 @@ impl PsParser {
|
|||
(b"FontName", PsObject::Name(value)) => {
|
||||
font_name = Some(value.into());
|
||||
}
|
||||
(b"WMode", PsObject::Boolean(v)) => vertical_writing_mode = v,
|
||||
(b"FontType", _) => {
|
||||
// TODO
|
||||
}
|
||||
|
|
@ -1361,12 +1435,22 @@ impl PsParser {
|
|||
for (key, value) in other {
|
||||
todo!("{key:?}: {value:?}");
|
||||
}
|
||||
let Some(encoding) = encoding else {
|
||||
return custom_err("postscript type 1 font must have Encoding");
|
||||
};
|
||||
let Some(font_bbox) = font_bbox else {
|
||||
return custom_err("postscript type 1 font must have FontBBox");
|
||||
};
|
||||
let Some(font_matrix) = font_matrix else {
|
||||
return custom_err("postscript type 1 font must have FontMatrix");
|
||||
};
|
||||
Ok(PdfFontType1Program {
|
||||
encoding,
|
||||
font_bbox,
|
||||
font_info,
|
||||
font_matrix,
|
||||
font_name,
|
||||
vertical_writing_mode,
|
||||
})
|
||||
}
|
||||
fn parse(mut self) -> Result<PdfFontType1Program, PdfParseError> {
|
||||
|
|
@ -1414,7 +1498,7 @@ impl PdfStreamContents for PdfFontType1Program {
|
|||
) -> Result<Self, PdfParseError> {
|
||||
PsParser::new(PsFile::new(
|
||||
0,
|
||||
PsFileSource::Bytes(Rc::from(data)),
|
||||
PsFileSource::Bytes(Arc::from(data)),
|
||||
0,
|
||||
stream_pos,
|
||||
))
|
||||
|
|
|
|||
|
|
@ -296,6 +296,13 @@ pub enum PdfParseError {
|
|||
MissingSetFontOperator {
|
||||
pos: PdfInputPosition,
|
||||
},
|
||||
InvalidTokenInToUnicodeStream {
|
||||
pos: PdfInputPosition,
|
||||
token: String,
|
||||
},
|
||||
InvalidUtf16 {
|
||||
pos: PdfInputPosition,
|
||||
},
|
||||
}
|
||||
|
||||
impl From<std::convert::Infallible> for PdfParseError {
|
||||
|
|
@ -345,7 +352,9 @@ impl GetPdfInputPosition for PdfParseError {
|
|||
| PdfParseError::CantRestoreGraphicsStateWithEmptyStack { pos }
|
||||
| PdfParseError::FontResourceNotFound { pos, .. }
|
||||
| PdfParseError::MissingBeginTextOperator { pos }
|
||||
| PdfParseError::MissingSetFontOperator { pos } => pos,
|
||||
| PdfParseError::MissingSetFontOperator { pos }
|
||||
| PdfParseError::InvalidTokenInToUnicodeStream { pos, .. }
|
||||
| PdfParseError::InvalidUtf16 { pos } => pos,
|
||||
PdfParseError::OperatorNotAllowedHere { ref operator } => operator.pos(),
|
||||
PdfParseError::OperatorHasTooFewOperands { ref operator }
|
||||
| PdfParseError::OperatorHasTooManyOperands { ref operator } => operator.pos(),
|
||||
|
|
@ -488,7 +497,7 @@ impl fmt::Display for PdfParseError {
|
|||
)
|
||||
}
|
||||
PdfParseError::MissingOperator { pos } => {
|
||||
write!(f, "at {pos}: stream not allowed here")
|
||||
write!(f, "at {pos}: missing operator")
|
||||
}
|
||||
PdfParseError::OperatorHasTooFewOperands { ref operator } => {
|
||||
write!(
|
||||
|
|
@ -525,6 +534,12 @@ impl fmt::Display for PdfParseError {
|
|||
"at {pos}: missing set font `Tf` operator before this text showing operator"
|
||||
)
|
||||
}
|
||||
PdfParseError::InvalidTokenInToUnicodeStream { pos, ref token } => {
|
||||
write!(f, "at {pos}: invalid token in ToUnicode stream: {token}")
|
||||
}
|
||||
PdfParseError::InvalidUtf16 { pos } => {
|
||||
write!(f, "at {pos}: invalid UTF-16")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ use crate::{
|
|||
PdfOperatorUnparsed,
|
||||
},
|
||||
document_structure::{PdfPage, PdfResourcesDictionary},
|
||||
font::{PdfFont, PdfTodo},
|
||||
font::{PdfFont, PdfSimpleFontEncodingTableEntry, PdfTodo},
|
||||
object::{
|
||||
IsPdfNull, PdfMatrix, PdfName, PdfNumber, PdfObject, PdfObjectDirect,
|
||||
PdfStringOrNumber, PdfVec2D,
|
||||
|
|
@ -934,13 +934,17 @@ impl PdfRenderOperator for PdfOperatorShowTextWithGlyphPositioning {
|
|||
.font_descriptor()
|
||||
.and_then(|v| v.font_file.as_ref())
|
||||
.and_then(|v| v.decoded_data().as_ref().ok())
|
||||
.and_then(|v| v.encoding.as_ref())
|
||||
.map(|v| v.encoding.clone())
|
||||
else {
|
||||
todo!()
|
||||
};
|
||||
todo!("{font_encoding:?}");
|
||||
font_encoding
|
||||
});
|
||||
todo!("{table:?}");
|
||||
let PdfSimpleFontEncodingTableEntry {
|
||||
name,
|
||||
presumed_unicode,
|
||||
} = table.table[usize::from(*glyph)].clone();
|
||||
todo!("{name:?} {presumed_unicode:?} {:#?}", font.to_unicode());
|
||||
}
|
||||
}
|
||||
PdfStringOrNumber::Number(number) => positioning = number.as_f32(),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue