diff options
Diffstat (limited to 'qtools/qregexp.cpp')
-rw-r--r-- | qtools/qregexp.cpp | 1092 |
1 files changed, 1092 insertions, 0 deletions
diff --git a/qtools/qregexp.cpp b/qtools/qregexp.cpp new file mode 100644 index 0000000..2b3770c --- /dev/null +++ b/qtools/qregexp.cpp @@ -0,0 +1,1092 @@ +/**************************************************************************** +** +** +** Implementation of QRegExp class +** +** Created : 950126 +** +** Copyright (C) 1992-2000 Trolltech AS. All rights reserved. +** +** This file is part of the tools module of the Qt GUI Toolkit. +** +** This file may be distributed under the terms of the Q Public License +** as defined by Trolltech AS of Norway and appearing in the file +** LICENSE.QPL included in the packaging of this file. +** +** This file may be distributed and/or modified under the terms of the +** GNU General Public License version 2 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. +** +** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition +** licenses may use this file in accordance with the Qt Commercial License +** Agreement provided with the Software. +** +** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE +** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. +** +** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for +** information about Qt Commercial License Agreements. +** See http://www.trolltech.com/qpl/ for QPL licensing information. +** See http://www.trolltech.com/gpl/ for GPL licensing information. +** +** Contact info@trolltech.com if any conditions of this licensing are +** not clear to you. +** +**********************************************************************/ + +#include "qregexp.h" +#include <ctype.h> +#include <stdlib.h> + +// NOT REVISED +/*! + \class QRegExp qregexp.h + \ingroup tools + \ingroup misc + \brief The QRegExp class provides pattern matching using regular + expressions or wildcards. + + QRegExp knows these regexp primitives: + <ul plain> + <li><dfn>c</dfn> matches the character 'c' + <li><dfn>.</dfn> matches any character + <li><dfn>^</dfn> matches start of input + <li><dfn>$</dfn> matches end of input + <li><dfn>[]</dfn> matches a defined set of characters - see below. + <li><dfn>a*</dfn> matches a sequence of zero or more a's + <li><dfn>a+</dfn> matches a sequence of one or more a's + <li><dfn>a?</dfn> matches an optional a + <li><dfn>\c</dfn> escape code for matching special characters such + as \, [, *, +, . etc. + <li><dfn>\t</dfn> matches the TAB character (9) + <li><dfn>\n</dfn> matches newline (10) + <li><dfn>\r</dfn> matches return (13) + <li><dfn>\s</dfn> matches a white space (defined as any character + for which QChar::isSpace() returns TRUE. This includes at least + ASCII characters 9 (TAB), 10 (LF), 11 (VT), 12(FF), 13 (CR) and 32 + (Space)). + <li><dfn>\d</dfn> matches a digit (defined as any character for + which QChar::isDigit() returns TRUE. This includes at least ASCII + characters '0'-'9'). + <li><dfn>\x1f6b</dfn> matches the character with unicode point U1f6b + (hexadecimal 1f6b). \x0012 will match the ASCII/Latin1 character + 0x12 (18 decimal, 12 hexadecimal). + <li><dfn>\022</dfn> matches the ASCII/Latin1 character 022 (18 + decimal, 22 octal). + </ul> + + In wildcard mode, it only knows four primitives: + <ul plain> + <li><dfn>c</dfn> matches the character 'c' + <li><dfn>?</dfn> matches any character + <li><dfn>*</dfn> matches any sequence of characters + <li><dfn>[]</dfn> matches a defined set of characters - see below. + </ul> + + QRegExp supports Unicode both in the pattern strings and in the + strings to be matched. + + When writing regular expressions in C++ code, remember that C++ + processes \ characters. So in order to match e.g. a "." character, + you must write "\\." in C++ source, not "\.". + + A character set matches a defined set of characters. For example, + [BSD] matches any of 'B', 'D' and 'S'. Within a character set, the + special characters '.', '*', '?', '^', '$', '+' and '[' lose their + special meanings. The following special characters apply: + <ul plain> + <li><dfn>^</dfn> When placed first in the list, changes the + character set to match any character \e not in the list. To include + the character '^' itself in the set, escape it or place it anywhere + but first. + <li><dfn>-</dfn> Defines a range of characters. To include the + character '-' itself in the set, escape it or place it last. + <li><dfn>]</dfn> Ends the character set definition. To include the + character ']' itself in the set, escape it or place it first (but + after the negation operator '^', if present) + </ul> + Thus, [a-zA-Z0-9.] matches upper and lower case ASCII letters, + digits and dot; and [^\s] matches everything except white space. + + \bug Case insensitive matching is not supported for non-ASCII/Latin1 + (non-8bit) characters. Any character with a non-zero QChar.row() is + matched case sensitively even if the QRegExp is in case insensitive + mode. + + \note In Qt 3.0, the language of regular expressions will contain + five more special characters, namely '(', ')', '{', '|' and '}'. To + ease porting, it's a good idea to escape these characters with a + backslash in all the regular expressions you'll write from now on. +*/ + + +// +// The regexp pattern is internally represented as an array of uints, +// each element containing an 16-bit character or a 32-bit code +// (listed below). User-defined character classes (e.g. [a-zA-Z]) +// are encoded as this: +// uint no: 1 2 3 ... +// value: CCL | n from | to from | to +// +// where n is the (16-bit) number of following range definitions and +// from and to define the ranges inclusive. from <= to is always true, +// otherwise it is a built-in charclass (Pxx, eg \s - PWS). Single +// characters in the class are coded as from==to. Negated classes +// (e.g. [^a-z]) use CCN instead of CCL. + +const uint END = 0x00000000; +const uint PWS = 0x10010000; // predef charclass: whitespace (\s) +const uint PDG = 0x10020000; // predef charclass: digit (\d) +const uint CCL = 0x20010000; // character class [] +const uint CCN = 0x20020000; // neg character class [^] +const uint CHR = 0x40000000; // character +const uint BOL = 0x80010000; // beginning of line ^ +const uint EOL = 0x80020000; // end of line $ +const uint BOW = 0x80030000; // beginning of word \< +const uint EOW = 0x80040000; // end of word \> +const uint ANY = 0x80050000; // any character . +const uint CLO = 0x80070000; // Kleene closure * +const uint OPT = 0x80080000; // Optional closure ? + +const uint MCC = 0x20000000; // character class bitmask +const uint MCD = 0xffff0000; // code mask +const uint MVL = 0x0000ffff; // value mask + +// +// QRegExp::error codes (internal) +// + +const int PatOk = 0; // pattern ok +const int PatNull = 1; // no pattern defined +const int PatSyntax = 2; // pattern syntax error +const int PatOverflow = 4; // pattern too long + + +/***************************************************************************** + QRegExp member functions + *****************************************************************************/ + +/*! + Constructs an empty regular expression. +*/ + +QRegExp::QRegExp() +{ + rxdata = 0; + cs = TRUE; + wc = FALSE; + error = PatOk; +} + +/*! + Constructs a regular expression. + + \arg \e pattern is the regular expression pattern string. + \arg \e caseSensitive specifies whether or not to use case sensitive + matching. + \arg \e wildcard specifies whether the pattern string should be used for + wildcard matching (also called globbing expression), normally used for + matching file names. + + \sa setWildcard() +*/ + +QRegExp::QRegExp( const QCString &pattern, bool caseSensitive, bool wildcard ) +{ + rxstring = pattern; + rxdata = 0; + cs = caseSensitive; + wc = wildcard; + compile(); +} + +/*! + Constructs a regular expression which is a copy of \e r. + \sa operator=(const QRegExp&) +*/ + +QRegExp::QRegExp( const QRegExp &r ) +{ + rxstring = r.pattern(); + rxdata = 0; + cs = r.caseSensitive(); + wc = r.wildcard(); + compile(); +} + +/*! + Destructs the regular expression and cleans up its internal data. +*/ + +QRegExp::~QRegExp() +{ + if ( rxdata ) // Avoid purify complaints + delete [] rxdata; +} + +/*! + Copies the regexp \e r and returns a reference to this regexp. + The case sensitivity and wildcard options are copied, as well. +*/ + +QRegExp &QRegExp::operator=( const QRegExp &r ) +{ + rxstring = r.rxstring; + cs = r.cs; + wc = r.wc; + compile(); + return *this; +} + +/*! + \obsolete + Consider using setPattern() instead of this method. + + Sets the pattern string to \e pattern and returns a reference to this regexp. + The case sensitivity or wildcard options do not change. +*/ + +QRegExp &QRegExp::operator=( const QCString &pattern ) +{ + rxstring = pattern; + compile(); + return *this; +} + + +/*! + Returns TRUE if this regexp is equal to \e r. + + Two regexp objects are equal if they have equal pattern strings, + case sensitivity options and wildcard options. +*/ + +bool QRegExp::operator==( const QRegExp &r ) const +{ + return rxstring == r.rxstring && cs == r.cs && wc == r.wc; +} + +/*! + \fn bool QRegExp::operator!=( const QRegExp &r ) const + + Returns TRUE if this regexp is \e not equal to \e r. + + \sa operator==() +*/ + +/*! + \fn bool QRegExp::isEmpty() const + Returns TRUE if the regexp is empty. +*/ + +/*! + \fn bool QRegExp::isValid() const + Returns TRUE if the regexp is valid, or FALSE if it is invalid. + + The pattern "[a-z" is an example of an invalid pattern, since it lacks a + closing bracket. +*/ + + +/*! + \fn bool QRegExp::wildcard() const + Returns TRUE if wildcard mode is on, otherwise FALSE. \sa setWildcard(). +*/ + +/*! + Sets the wildcard option for the regular expression. The default + is FALSE. + + Setting \e wildcard to TRUE makes it convenient to match filenames + instead of plain text. + + For example, "qr*.cpp" matches the string "qregexp.cpp" in wildcard mode, + but not "qicpp" (which would be matched in normal mode). + + \sa wildcard() +*/ + +void QRegExp::setWildcard( bool wildcard ) +{ + if ( wildcard != wc ) { + wc = wildcard; + compile(); + } +} + +/*! + \fn bool QRegExp::caseSensitive() const + + Returns TRUE if case sensitivity is enabled, otherwise FALSE. The + default is TRUE. + + \sa setCaseSensitive() +*/ + +/*! + Enables or disables case sensitive matching. + + In case sensitive mode, "a.e" matches "axe" but not "Axe". + + See also: caseSensitive() +*/ + +void QRegExp::setCaseSensitive( bool enable ) +{ + if ( cs != enable ) { + cs = enable; + compile(); + } +} + + +/*! + \fn QCString QRegExp::pattern() const + Returns the pattern string of the regexp. +*/ + + +/*! + \fn void QRegExp::setPattern(const QCString & pattern) + Sets the pattern string to \a pattern and returns a reference to this regexp. + The case sensitivity or wildcard options do not change. +*/ + +static inline bool iswordchar( int x ) +{ + return isalnum(x) || x == '_'; //# Only 8-bit support +} + + +/*! + \internal + Match character class +*/ + +static bool matchcharclass( uint *rxd, char c ) +{ + uint *d = rxd; + uint clcode = *d & MCD; + bool neg = clcode == CCN; + if ( clcode != CCL && clcode != CCN) + qWarning("QRegExp: Internal error, please report to qt-bugs@trolltech.com"); + uint numFields = *d & MVL; + uint cval = (uint)c; //(((uint)(c.row())) << 8) | ((uint)c.cell()); + bool found = FALSE; + for ( int i = 0; i < (int)numFields; i++ ) { + d++; + if ( *d == PWS && isspace(c) ) { + found = TRUE; + break; + } + if ( *d == PDG && isdigit(c) ) { + found = TRUE; + break; + } + else { + uint from = ( *d & MCD ) >> 16; + uint to = *d & MVL; + if ( (cval >= from) && (cval <= to) ) { + found = TRUE; + break; + } + } + } + return neg ? !found : found; +} + + + +/* + Internal: Recursively match string. +*/ + +static int matchstring( uint *rxd, const char *str, uint strlength, + const char *bol, bool cs ) +{ + const char *p = str; + const char *start = p; + uint pl = strlength; + uint *d = rxd; + + //### in all cases here: handle pl == 0! (don't read past strlen) + while ( *d ) { + if ( *d & CHR ) { // match char + if ( !pl ) + return -1; + char c = *d; + if ( !cs /*&& !c.row()*/ ) { // case insensitive, #Only 8bit + if ( tolower(*p) != c ) + return -1; + p++; + pl--; + } else { // case insensitive + if ( *p != c ) + return -1; + p++; + pl--; + } + d++; + } + else if ( *d & MCC ) { // match char class + if ( !pl ) + return -1; + if ( !matchcharclass( d, *p ) ) + return -1; + p++; + pl--; + d += (*d & MVL) + 1; + } + else switch ( *d++ ) { + case PWS: // match whitespace + if ( !pl || !isspace(*p) ) + return -1; + p++; + pl--; + break; + case PDG: // match digits + if ( !pl || !isdigit(*p) ) + return -1; + p++; + pl--; + break; + case ANY: // match anything + if ( !pl ) + return -1; + p++; + pl--; + break; + case BOL: // match beginning of line + if ( p != bol ) + return -1; + break; + case EOL: // match end of line + if ( pl ) + return -1; + break; + case BOW: // match beginning of word + if ( !iswordchar(*p) || (p > bol && iswordchar(*(p-1)) ) ) + return -1; + break; + case EOW: // match end of word + if ( iswordchar(*p) || p == bol || !iswordchar(*(p-1)) ) + return -1; + break; + case CLO: // Kleene closure + { + const char *first_p = p; + if ( *d & CHR ) { // match char + char c = *d; + if ( !cs /*&& !c.row()*/ ) { // case insensitive, #only 8bit + while ( pl /*&& !p->row()*/ && tolower(*p)==c ) { + p++; + pl--; + } + } + else { // case sensitive + while ( pl && *p == c ) { + p++; + pl--; + } + } + d++; + } + else if ( *d & MCC ) { // match char class + while( pl && matchcharclass( d, *p ) ) { + p++; + pl--; + } + d += (*d & MVL) + 1; + } + else if ( *d == PWS ) { + while ( pl && isspace(*p) ) { + p++; + pl--; + } + d++; + } + else if ( *d == PDG ) { + while ( pl && isdigit(*p) ) { + p++; + pl--; + } + d++; + } + else if ( *d == ANY ) { + p += pl; + pl = 0; + d++; + } + else { + return -1; // error + } + d++; // skip CLO's END + while ( p >= first_p ) { // go backwards + int end = matchstring( d, p, pl, bol, cs ); + if ( end >= 0 ) + return ( p - start ) + end; + if ( !p ) + return -1; + --p; + ++pl; + } + } + return -1; + case OPT: // optional closure + { + const char *first_p = p; + if ( *d & CHR ) { // match char + char c = *d; + if ( !cs /*&& !c.row()*/ ) { // case insensitive, #only 8bit + if ( pl && /*!p->row() &&*/ tolower(*p) == c ) { + p++; + pl--; + } + } + else { // case sensitive + if ( pl && *p == c ) { + p++; + pl--; + } + } + d++; + } + else if ( *d & MCC ) { // match char class + if ( pl && matchcharclass( d, *p ) ) { + p++; + pl--; + } + d += (*d & MVL) + 1; + } + else if ( *d == PWS ) { + if ( pl && isspace(*p) ) { + p++; + pl--; + } + d++; + } + else if ( *d == PDG ) { + if ( pl && isdigit(*p) ) { + p++; + pl--; + } + d++; + } + else if ( *d == ANY ) { + if ( pl ) { + p++; + pl--; + } + d++; + } + else { + return -1; // error + } + d++; // skip OPT's END + while ( p >= first_p ) { // go backwards + int end = matchstring( d, p, pl, bol, cs ); + if ( end >= 0 ) + return ( p - start ) + end; + if ( !p ) + return -1; + --p; + ++pl; + } + } + return -1; + + default: // error + return -1; + } + } + return p - start; +} + + +/*! + \internal + Recursively match string. +*/ + +// This is obsolete now, but since it is protected (not private), it +// is still implemented on the off-chance that somebody has made a +// class derived from QRegExp and calls this directly. +// Qt 3.0: Remove this? + +#if 0 +const char *QRegExp::matchstr( uint *rxd, const QChar *str, uint strlength, + const QChar *bol ) const +{ + int len = matchstring( rxd, str, strlength, bol, cs ); + if ( len < 0 ) + return 0; + return str + len; +} +#endif + +/*! + Attempts to match in \e str, starting from position \e index. + Returns the position of the match, or -1 if there was no match. + + If \e len is not a null pointer, the length of the match is stored in + \e *len. + + If \e indexIsStart is TRUE (the default), the position \e index in + the string will match the start-of-input primitive (^) in the + regexp, if present. Otherwise, position 0 in \e str will match. + + Example: + \code + QRegExp r("[0-9]*\\.[0-9]+"); // matches floating point + int len; + r.match("pi = 3.1416", 0, &len); // returns 5, len == 6 + \endcode + + \note In Qt 3.0, this function will be replaced by find(). +*/ + +int QRegExp::match( const QCString &str, int index, int *len, + bool indexIsStart ) const +{ + if ( !isValid() || isEmpty() ) + return -1; + if ( str.length() < (uint)index ) + return -1; + const char *start = str.data(); + const char *p = start + index; + uint pl = str.length() - index; + uint *d = rxdata; + int ep = -1; + + if ( *d == BOL ) { // match from beginning of line + ep = matchstring( d, p, pl, indexIsStart ? p : start, cs ); + } else { + if ( *d & CHR ) { + char c = *d; + if ( !cs /*&& !c.row()*/ ) { // case sensitive, # only 8bit + while ( pl && ( /*p->row() ||*/ tolower(*p) != c ) ) { + p++; + pl--; + } + } else { // case insensitive + while ( pl && *p != c ) { + p++; + pl--; + } + } + } + while( 1 ) { // regular match + ep = matchstring( d, p, pl, indexIsStart ? start+index : start, cs ); + if ( ep >= 0 ) + break; + if ( !pl ) + break; + p++; + pl--; + } + } + if ( len ) + *len = ep >= 0 ? ep : 0; // No match -> 0, for historical reasons + return ep >= 0 ? (int)(p - start) : -1; // return index; +} + +/*! \fn int QRegExp::find( const QCString& str, int index ) + + Attempts to match in \e str, starting from position \e index. + Returns the position of the match, or -1 if there was no match. + + \sa match() +*/ + +// +// Translate wildcard pattern to standard regexp pattern. +// Ex: *.cpp ==> ^.*\.cpp$ +// + +static QCString wc2rx( const QCString &pattern ) +{ + int patlen = (int)pattern.length(); + QCString wcpattern("^"); + + char c; + for( int i = 0; i < patlen; i++ ) { + c = pattern[i]; + switch ( (char)c ) { + case '*': // '*' ==> '.*' + wcpattern += '.'; + break; + case '?': // '?' ==> '.' + c = '.'; + break; + case '.': // quote special regexp chars + case '+': + case '\\': + case '$': + case '^': + wcpattern += '\\'; + break; + case '[': + if ( (char)pattern[i+1] == '^' ) { // don't quote '^' after '[' + wcpattern += '['; + c = pattern[i+1]; + i++; + } + break; + } + wcpattern += c; + + } + wcpattern += '$'; + return wcpattern; // return new regexp pattern +} + + +// +// Internal: Get char value and increment pointer. +// + +static uint char_val( const char **str, uint *strlength ) // get char value +{ + const char *p = *str; + uint pl = *strlength; + uint len = 1; + uint v = 0; + if ( (char)*p == '\\' ) { // escaped code + p++; + pl--; + if ( !pl ) { // it is just a '\' + (*str)++; + (*strlength)--; + return '\\'; + } + len++; // length at least 2 + int i; + char c; + char ch = tolower((char)*p); + switch ( ch ) { + case 'b': v = '\b'; break; // bell + case 'f': v = '\f'; break; // form feed + case 'n': v = '\n'; break; // newline + case 'r': v = '\r'; break; // return + case 't': v = '\t'; break; // tab + case 's': v = PWS; break; // whitespace charclass + case 'd': v = PDG; break; // digit charclass + case '<': v = BOW; break; // word beginning matcher + case '>': v = EOW; break; // word ending matcher + + case 'x': { // hex code + p++; + pl--; + for ( i = 0; (i < 4) && pl; i++ ) { //up to 4 hex digits + c = tolower((char)*p); + bool a = ( c >= 'a' && c <= 'f' ); + if ( (c >= '0' && c <= '9') || a ) { + v <<= 4; + v += a ? 10 + c - 'a' : c - '0'; + len++; + } + else { + break; + } + p++; + pl--; + } + } + break; + + default: { + if ( ch >= '0' && ch <= '7' ) { //octal code + len--; + for ( i = 0; (i < 3) && pl; i++ ) { // up to 3 oct digits + c = (char)*p; + if ( c >= '0' && c <= '7' ) { + v <<= 3; + v += c - '0'; + len++; + } + else { + break; + } + p++; + pl--; + } + } + else { // not an octal number + v = (uint)*p; //(((uint)(p->row())) << 8) | ((uint)p->cell()); + } + } + } + } else { + v = (uint)*p; //(((uint)(p->row())) << 8) | ((uint)p->cell()); + } + *str += len; + *strlength -= len; + return v; +} + + +#if 0 //defined(DEBUG) +static uint *dump( uint *p ) +{ + while ( *p != END ) { + if ( *p & CHR ) { + uchar uc = (uchar)*p; + char c = (char)uc; + uint u = (uint)uc; //(((uint)(uc.row())) << 8) | ((uint)uc.cell()); + qDebug( "\tCHR\tU%04x (%c)", u, (c ? c : ' ')); + p++; + } + else if ( *p & MCC ) { + uint clcode = *p & MCD; + uint numFields = *p & MVL; + if ( clcode == CCL ) + qDebug( "\tCCL\t%i", numFields ); + else if ( clcode == CCN ) + qDebug( "\tCCN\t%i", numFields ); + else + qDebug("coding error!"); + for ( int i = 0; i < (int)numFields; i++ ) { + p++; + if ( *p == PWS ) + qDebug( "\t\tPWS" ); + else if ( *p == PDG ) + qDebug( "\t\tPDG" ); + else { + uint from = ( *p & MCD ) >> 16; + uint to = *p & MVL; + char fc = (char)from; + char tc = (char)to; + qDebug( "\t\tU%04x (%c) - U%04x (%c)", from, + (fc ? fc : ' '), to, (tc ? tc : ' ') ); + } + } + p++; + } + else switch ( *p++ ) { + case PWS: + qDebug( "\tPWS" ); + break; + case PDG: + qDebug( "\tPDG" ); + break; + case BOL: + qDebug( "\tBOL" ); + break; + case EOL: + qDebug( "\tEOL" ); + break; + case BOW: + qDebug( "\tBOW" ); + break; + case EOW: + qDebug( "\tEOW" ); + break; + case ANY: + qDebug( "\tANY" ); + break; + case CLO: + qDebug( "\tCLO" ); + p = dump( p ); + break; + case OPT: + qDebug( "\tOPT" ); + p = dump( p ); + break; + } + } + qDebug( "\tEND" ); + return p+1; +} +#endif // DEBUG + + +static const int maxlen = 1024; // max length of regexp array +static uint rxarray[ maxlen ]; // tmp regexp array + +/*! + \internal + Compiles the regular expression and stores the result in rxdata. + The 'error' flag is set to non-zero if an error is detected. + NOTE! This function is not reentrant! +*/ + +void QRegExp::compile() +{ + if ( rxdata ) { // delete old data + delete [] rxdata; + rxdata = 0; + } + if ( rxstring.isEmpty() ) { // no regexp pattern set + error = PatNull; + return; + } + + error = PatOk; // assume pattern is ok + + QCString pattern; + if ( wc ) + pattern = wc2rx(rxstring); + else + pattern = rxstring; + const char *start = pattern.data(); // pattern pointer + const char *p = start; // pattern pointer + uint pl = pattern.length(); + uint *d = rxarray; // data pointer + uint *prev_d = 0; + +#define GEN(x) *d++ = (x) + + while ( pl ) { + char ch = (char)*p; + switch ( ch ) { + + case '^': // beginning of line + prev_d = d; + GEN( p == start ? BOL : (CHR | ch) ); + p++; + pl--; + break; + + case '$': // end of line + prev_d = d; + GEN( pl == 1 ? EOL : (CHR | ch) ); + p++; + pl--; + break; + + case '.': // any char + prev_d = d; + GEN( ANY ); + p++; + pl--; + break; + + case '[': // character class + { + prev_d = d; + p++; + pl--; + if ( !pl ) { + error = PatSyntax; + return; + } + bool firstIsEscaped = ( (char)*p == '\\' ); + uint cch = char_val( &p, &pl ); + if ( cch == '^' && !firstIsEscaped ) { // negate! + GEN( CCN ); + if ( !pl ) { + error = PatSyntax; + return; + } + cch = char_val( &p, &pl ); + } else { + GEN( CCL ); + } + uint numFields = 0; + while ( pl ) { + if ((pl>2) && ((char)*p == '-') && ((char)*(p+1) != ']')) { + // Found a range + char_val( &p, &pl ); // Read the '-' + uint cch2 = char_val( &p, &pl ); // Read the range end + if ( cch > cch2 ) { // swap start and stop + int tmp = cch; + cch = cch2; + cch2 = tmp; + } + GEN( (cch << 16) | cch2 ); // from < to + numFields++; + } + else { + // Found a single character + if ( cch & MCD ) // It's a code; will not be mistaken + GEN( cch ); // for a range, since from > to + else + GEN( (cch << 16) | cch ); // from == to range + numFields++; + } + if ( d >= rxarray + maxlen ) { // pattern too long + error = PatOverflow; + return; + } + if ( !pl ) { // At least ']' should be left + error = PatSyntax; + return; + } + bool nextIsEscaped = ( (char)*p == '\\' ); + cch = char_val( &p, &pl ); + if ( cch == (uint)']' && !nextIsEscaped ) + break; + if ( !pl ) { // End, should have seen ']' + error = PatSyntax; + return; + } + } + *prev_d |= numFields; // Store number of fields + } + break; + + case '*': // Kleene closure, or + case '+': // positive closure, or + case '?': // optional closure + { + if ( prev_d == 0 ) { // no previous expression + error = PatSyntax; // empty closure + return; + } + switch ( *prev_d ) { // test if invalid closure + case BOL: + case BOW: + case EOW: + case CLO: + case OPT: + error = PatSyntax; + return; + } + int ddiff = d - prev_d; + if ( *p == '+' ) { // convert to Kleene closure + if ( d + ddiff >= rxarray + maxlen ) { + error = PatOverflow; // pattern too long + return; + } + memcpy( d, prev_d, ddiff*sizeof(uint) ); + d += ddiff; + prev_d += ddiff; + } + memmove( prev_d+1, prev_d, ddiff*sizeof(uint) ); + *prev_d = ch == '?' ? OPT : CLO; + d++; + GEN( END ); + p++; + pl--; + } + break; + + default: + { + prev_d = d; + uint cv = char_val( &p, &pl ); + if ( cv & MCD ) { // It's a code + GEN( cv ); + } + else { + if ( !cs && cv <= 0xff ) // #only 8bit support + cv = tolower( cv ); + GEN( CHR | cv ); + } + } + } + if ( d >= rxarray + maxlen ) { // oops! + error = PatOverflow; // pattern too long + return; + } + } + GEN( END ); + int len = d - rxarray; + rxdata = new uint[ len ]; // copy from rxarray to rxdata + CHECK_PTR( rxdata ); + memcpy( rxdata, rxarray, len*sizeof(uint) ); +#if defined(DEBUG) + //dump( rxdata ); // uncomment this line for debugging +#endif +} |