summary refs log tree commit diff
path: root/libqpdf/Pl_QPDFTokenizer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'libqpdf/Pl_QPDFTokenizer.cc')
-rw-r--r-- libqpdf/Pl_QPDFTokenizer.cc 198
1 files changed, 198 insertions, 0 deletions
diff --git a/libqpdf/Pl_QPDFTokenizer.cc b/libqpdf/Pl_QPDFTokenizer.cc
new file mode 100644
index 0000000..ea13fb7
--- /dev/null
+++ b/libqpdf/Pl_QPDFTokenizer.cc
@@ -0,0 +1,198 @@
+#include <qpdf/Pl_QPDFTokenizer.hh>
+#include <qpdf/QPDF_String.hh>
+#include <qpdf/QPDF_Name.hh>
+#include <qpdf/QTC.hh>
+#include <stdexcept>
+#include <vector>
+#include <string.h>
+
+// Construct the tokenizing pipeline stage. identifier and next are passed
+// straight through to the Pipeline base class. All state flags start out
+// false, the saved "unread" character starts as NUL, and the inline-image
+// look-back buffer is zero-filled.
+Pl_QPDFTokenizer::Pl_QPDFTokenizer(char const* identifier, Pipeline* next) :
+ Pipeline(identifier, next),
+ newline_after_next_token(false),
+ just_wrote_nl(false),
+ last_char_was_cr(false),
+ unread_char(false),
+ char_to_unread('\0'),
+ in_inline_image(false)
+{
+ // Start with an all-zero look-back window; processChar() shifts
+ // characters into it while scanning for the end of an inline image.
+ memset(this->image_buf, 0, IMAGE_BUF_SIZE);
+}
+
+// Destructor. The body is empty: nothing is released explicitly here.
+Pl_QPDFTokenizer::~Pl_QPDFTokenizer()
+{
+}
+
+// Forward len bytes starting at buf to the next pipeline stage and record
+// whether the last byte forwarded was a newline. A zero-length request is
+// a no-op and leaves just_wrote_nl untouched.
+//
+// buf is const, but the downstream write() is handed a mutable
+// unsigned char pointer, so the bytes are copied into a scratch buffer
+// first. The scratch buffer is owned by a std::vector so that it is
+// released even when the downstream write() throws.
+void
+Pl_QPDFTokenizer::writeNext(char const* buf, size_t len)
+{
+    if (len == 0)
+    {
+        return;
+    }
+
+    std::vector<unsigned char> scratch(buf, buf + len);
+    getNext()->write(&scratch[0], len);
+    this->just_wrote_nl = (buf[len-1] == '\n');
+}
+
+// Write one token to the next pipeline stage. String and name tokens are
+// re-serialized from their decoded value through QPDF_String / QPDF_Name;
+// every other token type is written using its raw text.
+void
+Pl_QPDFTokenizer::writeToken(QPDFTokenizer::Token& token)
+{
+    std::string text = token.getRawValue();
+
+    if (token.getType() == QPDFTokenizer::tt_string)
+    {
+        text = QPDF_String(token.getValue()).unparse();
+    }
+    else if (token.getType() == QPDFTokenizer::tt_name)
+    {
+        text = QPDF_Name(token.getValue()).unparse();
+    }
+
+    writeNext(text.c_str(), text.length());
+}
+
+// Consume a single input character.
+//
+// While inside an inline image, characters are copied through unchanged
+// until the sequence <whitespace> E I <whitespace> is seen. Otherwise the
+// character is handed to the tokenizer: a completed token is written out
+// (followed by any pending newline), and characters that do not complete
+// a token get line-ending normalization (CR and CR LF become a single LF).
+void
+Pl_QPDFTokenizer::processChar(char ch)
+{
+    if (this->in_inline_image)
+    {
+        // Slide the look-back window by one character and check it
+        // for EI surrounded by whitespace. If that pattern happens to
+        // occur inside the image data itself we will stop too early,
+        // but that is considered excessively unlikely, and this path
+        // is only used for content stream normalization, which is
+        // mainly a debugging / human inspection aid.
+        memmove(this->image_buf,
+                this->image_buf + 1,
+                IMAGE_BUF_SIZE - 1);
+        this->image_buf[IMAGE_BUF_SIZE - 1] = ch;
+
+        // Note: strchr also matches the terminating NUL of the set, so
+        // a zero byte in the first or last position counts as
+        // whitespace here.
+        char const* const space_chars = " \t\n\v\f\r";
+        bool const found_ei =
+            (strchr(space_chars, this->image_buf[0]) &&
+             (this->image_buf[1] == 'E') &&
+             (this->image_buf[2] == 'I') &&
+             strchr(space_chars, this->image_buf[3]));
+
+        if (! found_ei)
+        {
+            writeNext(&ch, 1);
+            return;
+        }
+
+        // The E and I have already been copied to the output, so all
+        // that remains is to end the line and go back to tokenizing.
+        writeNext("\n", 1);
+        this->in_inline_image = false;
+        QTC::TC("qpdf", "Pl_QPDFTokenizer found EI");
+        return;
+    }
+
+    tokenizer.presentCharacter(ch);
+    QPDFTokenizer::Token token;
+    bool const token_ready =
+        tokenizer.getToken(token, this->unread_char, this->char_to_unread);
+
+    if (! token_ready)
+    {
+        // A \n that directly follows a \r is always dropped.
+        bool const lf_after_cr = ((ch == '\n') && this->last_char_was_cr);
+
+        // A \r is rewritten as \n.
+        this->last_char_was_cr = (ch == '\r');
+        if (this->last_char_was_cr)
+        {
+            ch = '\n';
+        }
+
+        if (! this->tokenizer.betweenTokens())
+        {
+            // Part-way through a token: hold the newline back until
+            // the token has been written.
+            if (ch == '\n')
+            {
+                this->newline_after_next_token = true;
+            }
+        }
+        else if (! lf_after_cr)
+        {
+            writeNext(&ch, 1);
+        }
+        return;
+    }
+
+    writeToken(token);
+    if (this->newline_after_next_token)
+    {
+        writeNext("\n", 1);
+        this->newline_after_next_token = false;
+    }
+
+    bool const starts_inline_image =
+        ((token.getType() == QPDFTokenizer::tt_word) &&
+         (token.getValue() == "ID"));
+    if (starts_inline_image)
+    {
+        // Stop tokenizing until the matching EI has been seen. Any
+        // character the tokenizer handed back is written out directly
+        // instead of being tokenized again.
+        this->in_inline_image = true;
+        if (this->unread_char)
+        {
+            writeNext(&this->char_to_unread, 1);
+            this->unread_char = false;
+        }
+    }
+}
+
+
+// If a character is waiting to be re-read, run it through processChar()
+// again. Processing that character must clear the unread flag; if it is
+// still set afterwards a std::logic_error is thrown.
+void
+Pl_QPDFTokenizer::checkUnread()
+{
+    if (! this->unread_char)
+    {
+        return;
+    }
+
+    processChar(this->char_to_unread);
+    if (! this->unread_char)
+    {
+        return;
+    }
+
+    throw std::logic_error(
+        "INTERNAL ERROR: unread_char still true after processing "
+        "unread character");
+}
+
+// Pipeline entry point: process len bytes from buf one character at a
+// time. Any pending unread character is handled before the first byte
+// and again after every byte.
+void
+Pl_QPDFTokenizer::write(unsigned char* buf, size_t len)
+{
+    checkUnread();
+
+    unsigned char const* const stop = buf + len;
+    for (unsigned char const* cur = buf; cur != stop; ++cur)
+    {
+        processChar(*cur);
+        checkUnread();
+    }
+}
+
+// Signal end of input. Unless an inline image is still open, any token
+// completed by EOF is written, followed by a handed-back character if
+// there is one (with \r turned into \n). The output is then terminated
+// with a newline if the last byte written was not one, and finish() is
+// called on the next pipeline stage.
+void
+Pl_QPDFTokenizer::finish()
+{
+    this->tokenizer.presentEOF();
+
+    if (! this->in_inline_image)
+    {
+        QPDFTokenizer::Token last_token;
+        bool const have_token =
+            tokenizer.getToken(
+                last_token, this->unread_char, this->char_to_unread);
+        if (have_token)
+        {
+            writeToken(last_token);
+            if (unread_char)
+            {
+                // Same CR -> LF rewrite that processChar() applies.
+                if ('\r' == this->char_to_unread)
+                {
+                    this->char_to_unread = '\n';
+                }
+                writeNext(&this->char_to_unread, 1);
+            }
+        }
+    }
+
+    // Make sure the output ends with a newline.
+    if (! this->just_wrote_nl)
+    {
+        writeNext("\n", 1);
+    }
+
+    getNext()->finish();
+}