diff options
Diffstat (limited to 'libxml/xml.l')
-rw-r--r-- | libxml/xml.l | 68 |
1 files changed, 43 insertions, 25 deletions
diff --git a/libxml/xml.l b/libxml/xml.l index 13ff219..d5d8237 100644 --- a/libxml/xml.l +++ b/libxml/xml.l @@ -31,7 +31,6 @@ #include <vector> #include <stdio.h> #include "xml.h" -//#include "message.h" #define YY_NEVER_INTERACTIVE 1 #define YY_NO_INPUT 1 @@ -42,7 +41,7 @@ struct xmlYY_state std::string fileName; int lineNr = 1; const char * inputString = 0; //!< the code fragment as text - yy_size_t inputPosition = 0; //!< read offset during parsing + int inputPosition = 0; //!< read offset during parsing std::string name; bool isEnd = false; bool selfClose = false; @@ -54,14 +53,16 @@ struct xmlYY_state int cdataContext; int commentContext; char stringChar; + std::string encoding; std::vector<std::string> xpath; + std::function<XMLParser::Transcode> transcodeFunc; }; #if USE_STATE2STRING static const char *stateToString(int state); #endif -static yy_size_t yyread(yyscan_t yyscanner,char *buf,yy_size_t max_size); +static int yyread(yyscan_t yyscanner,char *buf,int max_size); static void initElement(yyscan_t yyscanner); static void addCharacters(yyscan_t yyscanner); static void addElement(yyscan_t yyscanner); @@ -192,10 +193,18 @@ ENDCDATA "]]>" . { yyextra->data += yytext; } } <Prolog>{ + "encoding"\s*=\s*\"[^\"]*\" { + std::string encoding=yytext; + size_t i=encoding.find('"'); + encoding=encoding.substr(i+1,yyleng-i-2); + if (encoding!="UTF-8") // need to transcode to UTF-8 + { + yyextra->encoding=encoding; + } + } {CLOSESPECIAL} { countLines(yyscanner,yytext,yyleng); BEGIN(Initial); } - [^?\n]+ { } \n { yyextra->lineNr++; } . { } } @@ -218,12 +227,12 @@ ENDCDATA "]]>" //---------------------------------------------------------------------------------------- -static yy_size_t yyread(yyscan_t yyscanner,char *buf,size_t max_size) +static int yyread(yyscan_t yyscanner,char *buf,int max_size) { struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; - yy_size_t inputPosition = yyextra->inputPosition; + int inputPosition = yyextra->inputPosition; const char *s = yyextra->inputString + inputPosition; - yy_size_t c=0; + int c=0; while( c < max_size && *s) { *buf++ = *s++; @@ -321,6 +330,10 @@ static void addCharacters(yyscan_t yyscanner) { struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; std::string data = trimSpaces(yyextra->data); + if (!yyextra->encoding.empty() && !yyextra->transcodeFunc(data,yyextra->encoding.c_str())) + { + reportError(yyscanner,"failed to transcode string '"+data+"' from encoding '"+yyextra->encoding+"' to UTF-8"); + } if (yyextra->handlers.characters) { yyextra->handlers.characters(data); @@ -337,7 +350,12 @@ static void addCharacters(yyscan_t yyscanner) static void addAttribute(yyscan_t yyscanner) { struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; - yyextra->attrs.insert(std::make_pair(yyextra->attrName,yyextra->attrValue)); + std::string val = yyextra->attrValue; + if (!yyextra->encoding.empty() && !yyextra->transcodeFunc(val,yyextra->encoding.c_str())) + { + reportError(yyscanner,"failed to transcode string '"+val+"' from encoding '"+yyextra->encoding+"' to UTF-8"); + } + yyextra->attrs.insert(std::make_pair(yyextra->attrName,val)); } static void reportError(yyscan_t yyscanner,const std::string &msg) @@ -426,34 +444,37 @@ XMLParser::~XMLParser() xmlYYlex_destroy(p->yyscanner); } -void XMLParser::parse(const char *fileName,const char *inputStr, bool debugEnabled) +void XMLParser::parse(const char *fileName, + const char *inputStr, + bool debugEnabled, + std::function<void()> debugStart, + std::function<void()> debugEnd, + std::function<Transcode> transcodeFunc) { yyscan_t yyscanner = p->yyscanner; struct yyguts_t *yyg = (struct yyguts_t*)yyscanner; #ifdef FLEX_DEBUG - xmlYYset_debug(1,p->yyscanner); + xmlYYset_debug(debugEnabled?1:0,p->yyscanner); #endif if (inputStr==nullptr || inputStr[0]=='\0') return; // empty input - FILE *output = 0; - const char *enter_txt = 0; - const char *finished_txt = 0; - const char *pre_txt = 0; - if (yy_flex_debug) { output=stderr; pre_txt="--"; enter_txt="entering"; finished_txt="finished"; } - else if (debugEnabled) { output=stdout; pre_txt=""; enter_txt="Entering"; finished_txt="Finished"; } - - if (output) - { - fprintf(output,"%s%s lexical analyzer: %s (for: %s)\n",pre_txt,enter_txt, __FILE__, fileName); - } + debugStart(); BEGIN(Initial); yyextra->fileName = fileName; yyextra->lineNr = 1; yyextra->inputString = inputStr; yyextra->inputPosition = 0; + yyextra->transcodeFunc = transcodeFunc; + + if (static_cast<unsigned char>(inputStr[0])==0xEF && + static_cast<unsigned char>(inputStr[1])==0xBB && + static_cast<unsigned char>(inputStr[2])==0xBF) + { + yyextra->inputPosition = 3; // remove UTF-8 BOM + } xmlYYrestart( 0, yyscanner ); @@ -474,10 +495,7 @@ void XMLParser::parse(const char *fileName,const char *inputStr, bool debugEnabl reportError(yyscanner,msg); } - if (output) - { - fprintf(output,"%s%s lexical analyzer: %s (for: %s)\n",pre_txt,finished_txt, __FILE__, fileName); - } + debugEnd(); } int XMLParser::lineNr() const |