summaryrefslogtreecommitdiff
path: root/qtools/qregexp.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'qtools/qregexp.cpp')
-rw-r--r--qtools/qregexp.cpp1093
1 files changed, 0 insertions, 1093 deletions
diff --git a/qtools/qregexp.cpp b/qtools/qregexp.cpp
deleted file mode 100644
index 8709858..0000000
--- a/qtools/qregexp.cpp
+++ /dev/null
@@ -1,1093 +0,0 @@
-/****************************************************************************
-**
-**
-** Implementation of QRegExp class
-**
-** Created : 950126
-**
-** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.
-**
-** This file is part of the tools module of the Qt GUI Toolkit.
-**
-** This file may be distributed under the terms of the Q Public License
-** as defined by Trolltech AS of Norway and appearing in the file
-** LICENSE.QPL included in the packaging of this file.
-**
-** This file may be distributed and/or modified under the terms of the
-** GNU General Public License version 2 as published by the Free Software
-** Foundation and appearing in the file LICENSE.GPL included in the
-** packaging of this file.
-**
-** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
-** licenses may use this file in accordance with the Qt Commercial License
-** Agreement provided with the Software.
-**
-** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
-** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
-**
-** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
-** information about Qt Commercial License Agreements.
-** See http://www.trolltech.com/qpl/ for QPL licensing information.
-** See http://www.trolltech.com/gpl/ for GPL licensing information.
-**
-** Contact info@trolltech.com if any conditions of this licensing are
-** not clear to you.
-**
-**********************************************************************/
-
-#include "qregexp.h"
-#include <ctype.h>
-#include <stdlib.h>
-
-// NOT REVISED
-/*!
- \class QRegExp qregexp.h
- \ingroup tools
- \ingroup misc
- \brief The QRegExp class provides pattern matching using regular
- expressions or wildcards.
-
- QRegExp knows these regexp primitives:
- <ul plain>
- <li><dfn>c</dfn> matches the character 'c'
- <li><dfn>.</dfn> matches any character
- <li><dfn>^</dfn> matches start of input
- <li><dfn>$</dfn> matches end of input
- <li><dfn>[]</dfn> matches a defined set of characters - see below.
- <li><dfn>a*</dfn> matches a sequence of zero or more a's
- <li><dfn>a+</dfn> matches a sequence of one or more a's
- <li><dfn>a?</dfn> matches an optional a
- <li><dfn>\c</dfn> escape code for matching special characters such
- as \, [, *, +, . etc.
- <li><dfn>\t</dfn> matches the TAB character (9)
- <li><dfn>\n</dfn> matches newline (10)
- <li><dfn>\r</dfn> matches return (13)
- <li><dfn>\s</dfn> matches a white space (defined as any character
- for which QChar::isSpace() returns TRUE. This includes at least
- ASCII characters 9 (TAB), 10 (LF), 11 (VT), 12(FF), 13 (CR) and 32
- (Space)).
- <li><dfn>\d</dfn> matches a digit (defined as any character for
- which QChar::isDigit() returns TRUE. This includes at least ASCII
- characters '0'-'9').
- <li><dfn>\x1f6b</dfn> matches the character with unicode point U1f6b
- (hexadecimal 1f6b). \x0012 will match the ASCII/Latin1 character
- 0x12 (18 decimal, 12 hexadecimal).
- <li><dfn>\022</dfn> matches the ASCII/Latin1 character 022 (18
- decimal, 22 octal).
- </ul>
-
- In wildcard mode, it only knows four primitives:
- <ul plain>
- <li><dfn>c</dfn> matches the character 'c'
- <li><dfn>?</dfn> matches any character
- <li><dfn>*</dfn> matches any sequence of characters
- <li><dfn>[]</dfn> matches a defined set of characters - see below.
- </ul>
-
- QRegExp supports Unicode both in the pattern strings and in the
- strings to be matched.
-
- When writing regular expressions in C++ code, remember that C++
- processes \ characters. So in order to match e.g. a "." character,
- you must write "\\." in C++ source, not "\.".
-
- A character set matches a defined set of characters. For example,
- [BSD] matches any of 'B', 'D' and 'S'. Within a character set, the
- special characters '.', '*', '?', '^', '$', '+' and '[' lose their
- special meanings. The following special characters apply:
- <ul plain>
- <li><dfn>^</dfn> When placed first in the list, changes the
- character set to match any character \e not in the list. To include
- the character '^' itself in the set, escape it or place it anywhere
- but first.
- <li><dfn>-</dfn> Defines a range of characters. To include the
- character '-' itself in the set, escape it or place it last.
- <li><dfn>]</dfn> Ends the character set definition. To include the
- character ']' itself in the set, escape it or place it first (but
- after the negation operator '^', if present)
- </ul>
- Thus, [a-zA-Z0-9.] matches upper and lower case ASCII letters,
- digits and dot; and [^\s] matches everything except white space.
-
- \bug Case insensitive matching is not supported for non-ASCII/Latin1
- (non-8bit) characters. Any character with a non-zero QChar.row() is
- matched case sensitively even if the QRegExp is in case insensitive
- mode.
-
- \note In Qt 3.0, the language of regular expressions will contain
- five more special characters, namely '(', ')', '{', '|' and '}'. To
- ease porting, it's a good idea to escape these characters with a
- backslash in all the regular expressions you'll write from now on.
-*/
-
-
-//
-// The regexp pattern is internally represented as an array of uints,
-// each element containing an 16-bit character or a 32-bit code
-// (listed below). User-defined character classes (e.g. [a-zA-Z])
-// are encoded as this:
-// uint no: 1 2 3 ...
-// value: CCL | n from | to from | to
-//
-// where n is the (16-bit) number of following range definitions and
-// from and to define the ranges inclusive. from <= to is always true,
-// otherwise it is a built-in charclass (Pxx, eg \s - PWS). Single
-// characters in the class are coded as from==to. Negated classes
-// (e.g. [^a-z]) use CCN instead of CCL.
-
-const uint END = 0x00000000;
-const uint PWS = 0x10010000; // predef charclass: whitespace (\s)
-const uint PDG = 0x10020000; // predef charclass: digit (\d)
-const uint CCL = 0x20010000; // character class []
-const uint CCN = 0x20020000; // neg character class [^]
-const uint CHR = 0x40000000; // character
-const uint BOL = 0x80010000; // beginning of line ^
-const uint EOL = 0x80020000; // end of line $
-const uint BOW = 0x80030000; // beginning of word \<
-const uint EOW = 0x80040000; // end of word \>
-const uint ANY = 0x80050000; // any character .
-const uint CLO = 0x80070000; // Kleene closure *
-const uint OPT = 0x80080000; // Optional closure ?
-
-const uint MCC = 0x20000000; // character class bitmask
-const uint MCD = 0xffff0000; // code mask
-const uint MVL = 0x0000ffff; // value mask
-
-//
-// QRegExp::error codes (internal)
-//
-
-const int PatOk = 0; // pattern ok
-const int PatNull = 1; // no pattern defined
-const int PatSyntax = 2; // pattern syntax error
-const int PatOverflow = 4; // pattern too long
-
-
-/*****************************************************************************
- QRegExp member functions
- *****************************************************************************/
-
-/*!
- Constructs an empty regular expression.
-*/
-
-QRegExp::QRegExp()
-{
- rxdata = 0;
- cs = TRUE;
- wc = FALSE;
- error = PatOk;
-}
-
-/*!
- Constructs a regular expression.
-
- \arg \e pattern is the regular expression pattern string.
- \arg \e caseSensitive specifies whether or not to use case sensitive
- matching.
- \arg \e wildcard specifies whether the pattern string should be used for
- wildcard matching (also called globbing expression), normally used for
- matching file names.
-
- \sa setWildcard()
-*/
-
-QRegExp::QRegExp( const QCString &pattern, bool caseSensitive, bool wildcard )
-{
- rxstring = pattern;
- rxdata = 0;
- cs = caseSensitive;
- wc = wildcard;
- compile();
-}
-
-/*!
- Constructs a regular expression which is a copy of \e r.
- \sa operator=(const QRegExp&)
-*/
-
-QRegExp::QRegExp( const QRegExp &r )
-{
- rxstring = r.pattern();
- rxdata = 0;
- cs = r.caseSensitive();
- wc = r.wildcard();
- compile();
-}
-
-/*!
- Destructs the regular expression and cleans up its internal data.
-*/
-
-QRegExp::~QRegExp()
-{
- if ( rxdata ) // Avoid purify complaints
- delete [] rxdata;
-}
-
-/*!
- Copies the regexp \e r and returns a reference to this regexp.
- The case sensitivity and wildcard options are copied, as well.
-*/
-
-QRegExp &QRegExp::operator=( const QRegExp &r )
-{
- rxstring = r.rxstring;
- cs = r.cs;
- wc = r.wc;
- compile();
- return *this;
-}
-
-/*!
- \obsolete
- Consider using setPattern() instead of this method.
-
- Sets the pattern string to \e pattern and returns a reference to this regexp.
- The case sensitivity or wildcard options do not change.
-*/
-
-QRegExp &QRegExp::operator=( const QCString &pattern )
-{
- rxstring = pattern;
- compile();
- return *this;
-}
-
-
-/*!
- Returns TRUE if this regexp is equal to \e r.
-
- Two regexp objects are equal if they have equal pattern strings,
- case sensitivity options and wildcard options.
-*/
-
-bool QRegExp::operator==( const QRegExp &r ) const
-{
- return rxstring == r.rxstring && cs == r.cs && wc == r.wc;
-}
-
-/*!
- \fn bool QRegExp::operator!=( const QRegExp &r ) const
-
- Returns TRUE if this regexp is \e not equal to \e r.
-
- \sa operator==()
-*/
-
-/*!
- \fn bool QRegExp::isEmpty() const
- Returns TRUE if the regexp is empty.
-*/
-
-/*!
- \fn bool QRegExp::isValid() const
- Returns TRUE if the regexp is valid, or FALSE if it is invalid.
-
- The pattern "[a-z" is an example of an invalid pattern, since it lacks a
- closing bracket.
-*/
-
-
-/*!
- \fn bool QRegExp::wildcard() const
- Returns TRUE if wildcard mode is on, otherwise FALSE. \sa setWildcard().
-*/
-
-/*!
- Sets the wildcard option for the regular expression. The default
- is FALSE.
-
- Setting \e wildcard to TRUE makes it convenient to match filenames
- instead of plain text.
-
- For example, "qr*.cpp" matches the string "qregexp.cpp" in wildcard mode,
- but not "qicpp" (which would be matched in normal mode).
-
- \sa wildcard()
-*/
-
-void QRegExp::setWildcard( bool wildcard )
-{
- if ( wildcard != wc ) {
- wc = wildcard;
- compile();
- }
-}
-
-/*!
- \fn bool QRegExp::caseSensitive() const
-
- Returns TRUE if case sensitivity is enabled, otherwise FALSE. The
- default is TRUE.
-
- \sa setCaseSensitive()
-*/
-
-/*!
- Enables or disables case sensitive matching.
-
- In case sensitive mode, "a.e" matches "axe" but not "Axe".
-
- See also: caseSensitive()
-*/
-
-void QRegExp::setCaseSensitive( bool enable )
-{
- if ( cs != enable ) {
- cs = enable;
- compile();
- }
-}
-
-
-/*!
- \fn QCString QRegExp::pattern() const
- Returns the pattern string of the regexp.
-*/
-
-
-/*!
- \fn void QRegExp::setPattern(const QCString & pattern)
- Sets the pattern string to \a pattern and returns a reference to this regexp.
- The case sensitivity or wildcard options do not change.
-*/
-
-static inline bool iswordchar( int x )
-{
- return isalnum(x) || x == '_'; //# Only 8-bit support
-}
-
-
-/*!
- \internal
- Match character class
-*/
-
-static bool matchcharclass( uint *rxd, char c )
-{
- uint *d = rxd;
- uint clcode = *d & MCD;
- bool neg = clcode == CCN;
- if ( clcode != CCL && clcode != CCN)
- qWarning("QRegExp: Internal error, please report to qt-bugs@trolltech.com");
- uint numFields = *d & MVL;
- uint cval = (unsigned char)c; //(((uint)(c.row())) << 8) | ((uint)c.cell());
- bool found = FALSE;
- for ( int i = 0; i < (int)numFields; i++ ) {
- d++;
- if ( *d == PWS && isspace(c) ) {
- found = TRUE;
- break;
- }
- if ( *d == PDG && isdigit(c) ) {
- found = TRUE;
- break;
- }
- else {
- uint from = ( *d & MCD ) >> 16;
- uint to = *d & MVL;
- if ( (cval >= from) && (cval <= to) ) {
- found = TRUE;
- break;
- }
- }
- }
- return neg ? !found : found;
-}
-
-
-
-/*
- Internal: Recursively match string.
-*/
-
-static int matchstring( uint *rxd, const char *str, uint strlength,
- const char *bol, bool cs )
-{
- const char *p = str;
- const char *start = p;
- uint pl = strlength;
- uint *d = rxd;
-
- //### in all cases here: handle pl == 0! (don't read past strlen)
- while ( *d ) {
- if ( *d & CHR ) { // match char
- if ( !pl )
- return -1;
- char c = *d;
- if ( !cs /*&& !c.row()*/ ) { // case insensitive, #Only 8bit
- if ( tolower(*p) != c )
- return -1;
- p++;
- pl--;
- } else { // case insensitive
- if ( *p != c )
- return -1;
- p++;
- pl--;
- }
- d++;
- }
- else if ( *d & MCC ) { // match char class
- if ( !pl )
- return -1;
- if ( !matchcharclass( d, *p ) )
- return -1;
- p++;
- pl--;
- d += (*d & MVL) + 1;
- }
- else switch ( *d++ ) {
- case PWS: // match whitespace
- if ( !pl || !isspace(*p) )
- return -1;
- p++;
- pl--;
- break;
- case PDG: // match digits
- if ( !pl || !isdigit(*p) )
- return -1;
- p++;
- pl--;
- break;
- case ANY: // match anything
- if ( !pl )
- return -1;
- p++;
- pl--;
- break;
- case BOL: // match beginning of line
- if ( p != bol )
- return -1;
- break;
- case EOL: // match end of line
- if ( pl )
- return -1;
- break;
- case BOW: // match beginning of word
- if ( !iswordchar(*p) || (p > bol && iswordchar(*(p-1)) ) )
- return -1;
- break;
- case EOW: // match end of word
- if ( iswordchar(*p) || p == bol || !iswordchar(*(p-1)) )
- return -1;
- break;
- case CLO: // Kleene closure
- {
- const char *first_p = p;
- if ( *d & CHR ) { // match char
- char c = *d;
- if ( !cs /*&& !c.row()*/ ) { // case insensitive, #only 8bit
- while ( pl /*&& !p->row()*/ && tolower(*p)==c ) {
- p++;
- pl--;
- }
- }
- else { // case sensitive
- while ( pl && *p == c ) {
- p++;
- pl--;
- }
- }
- d++;
- }
- else if ( *d & MCC ) { // match char class
- while( pl && matchcharclass( d, *p ) ) {
- p++;
- pl--;
- }
- d += (*d & MVL) + 1;
- }
- else if ( *d == PWS ) {
- while ( pl && isspace(*p) ) {
- p++;
- pl--;
- }
- d++;
- }
- else if ( *d == PDG ) {
- while ( pl && isdigit(*p) ) {
- p++;
- pl--;
- }
- d++;
- }
- else if ( *d == ANY ) {
- p += pl;
- pl = 0;
- d++;
- }
- else {
- return -1; // error
- }
- d++; // skip CLO's END
- while ( p >= first_p ) { // go backwards
- int end = matchstring( d, p, pl, bol, cs );
- if ( end >= 0 )
- return ( (int)(p - start) ) + end;
- if ( !p )
- return -1;
- --p;
- ++pl;
- }
- }
- return -1;
- case OPT: // optional closure
- {
- const char *first_p = p;
- if ( *d & CHR ) { // match char
- char c = *d;
- if ( !cs /*&& !c.row()*/ ) { // case insensitive, #only 8bit
- if ( pl && /*!p->row() &&*/ tolower(*p) == c ) {
- p++;
- pl--;
- }
- }
- else { // case sensitive
- if ( pl && *p == c ) {
- p++;
- pl--;
- }
- }
- d++;
- }
- else if ( *d & MCC ) { // match char class
- if ( pl && matchcharclass( d, *p ) ) {
- p++;
- pl--;
- }
- d += (*d & MVL) + 1;
- }
- else if ( *d == PWS ) {
- if ( pl && isspace(*p) ) {
- p++;
- pl--;
- }
- d++;
- }
- else if ( *d == PDG ) {
- if ( pl && isdigit(*p) ) {
- p++;
- pl--;
- }
- d++;
- }
- else if ( *d == ANY ) {
- if ( pl ) {
- p++;
- pl--;
- }
- d++;
- }
- else {
- return -1; // error
- }
- d++; // skip OPT's END
- while ( p >= first_p ) { // go backwards
- int end = matchstring( d, p, pl, bol, cs );
- if ( end >= 0 )
- return ( (int)(p - start) ) + end;
- if ( !p )
- return -1;
- --p;
- ++pl;
- }
- }
- return -1;
-
- default: // error
- return -1;
- }
- }
- return (int)(p - start);
-}
-
-
-/*!
- \internal
- Recursively match string.
-*/
-
-// This is obsolete now, but since it is protected (not private), it
-// is still implemented on the off-chance that somebody has made a
-// class derived from QRegExp and calls this directly.
-// Qt 3.0: Remove this?
-
-#if 0
-const char *QRegExp::matchstr( uint *rxd, const QChar *str, uint strlength,
- const QChar *bol ) const
-{
- int len = matchstring( rxd, str, strlength, bol, cs );
- if ( len < 0 )
- return 0;
- return str + len;
-}
-#endif
-
-/*!
- Attempts to match in \e str, starting from position \e index.
- Returns the position of the match, or -1 if there was no match.
-
- If \e len is not a null pointer, the length of the match is stored in
- \e *len.
-
- If \e indexIsStart is TRUE (the default), the position \e index in
- the string will match the start-of-input primitive (^) in the
- regexp, if present. Otherwise, position 0 in \e str will match.
-
- Example:
- \code
- QRegExp r("[0-9]*\\.[0-9]+"); // matches floating point
- int len;
- r.match("pi = 3.1416", 0, &len); // returns 5, len == 6
- \endcode
-
- \note In Qt 3.0, this function will be replaced by find().
-*/
-
-int QRegExp::match( const QCString &str, int index, int *len,
- bool indexIsStart ) const
-{
- if ( !isValid() || isEmpty() )
- return -1;
- if ( str.length() < (uint)index )
- return -1;
- const char *start = str.data();
- const char *p = start + index;
- uint pl = str.length() - index;
- uint *d = rxdata;
- int ep = -1;
-
- if ( *d == BOL ) { // match from beginning of line
- ep = matchstring( d, p, pl, indexIsStart ? p : start, cs );
- } else {
- if ( *d & CHR ) {
- char c = *d;
- if ( !cs /*&& !c.row()*/ ) { // case sensitive, # only 8bit
- while ( pl && ( /*p->row() ||*/ tolower(*p) != c ) ) {
- p++;
- pl--;
- }
- } else { // case insensitive
- while ( pl && *p != c ) {
- p++;
- pl--;
- }
- }
- }
- while( 1 ) { // regular match
- ep = matchstring( d, p, pl, indexIsStart ? start+index : start, cs );
- if ( ep >= 0 )
- break;
- if ( !pl )
- break;
- p++;
- pl--;
- }
- }
- if ( len )
- *len = ep >= 0 ? ep : 0; // No match -> 0, for historical reasons
- return ep >= 0 ? (int)(p - start) : -1; // return index;
-}
-
-/*! \fn int QRegExp::find( const QCString& str, int index )
-
- Attempts to match in \e str, starting from position \e index.
- Returns the position of the match, or -1 if there was no match.
-
- \sa match()
-*/
-
-//
-// Translate wildcard pattern to standard regexp pattern.
-// Ex: *.cpp ==> ^.*\.cpp$
-//
-
-static QCString wc2rx( const QCString &pattern )
-{
- int patlen = (int)pattern.length();
- QCString wcpattern("^");
-
- char c;
- for( int i = 0; i < patlen; i++ ) {
- c = pattern[i];
- switch ( (char)c ) {
- case '*': // '*' ==> '.*'
- wcpattern += '.';
- break;
- case '?': // '?' ==> '.'
- c = '.';
- break;
- case '.': // quote special regexp chars
- case '+':
- case '\\':
- case '$':
- case '^':
- wcpattern += '\\';
- break;
- case '[':
- if ( (char)pattern[i+1] == '^' ) { // don't quote '^' after '['
- wcpattern += '[';
- c = pattern[i+1];
- i++;
- }
- break;
- }
- wcpattern += c;
-
- }
- wcpattern += '$';
- return wcpattern; // return new regexp pattern
-}
-
-
-//
-// Internal: Get char value and increment pointer.
-//
-
-static uint char_val( const char **str, uint *strlength ) // get char value
-{
- const char *p = *str;
- uint pl = *strlength;
- uint len = 1;
- uint v = 0;
- if ( (char)*p == '\\' ) { // escaped code
- p++;
- pl--;
- if ( !pl ) { // it is just a '\'
- (*str)++;
- (*strlength)--;
- return '\\';
- }
- len++; // length at least 2
- int i;
- char c;
- char ch = tolower((char)*p);
- switch ( ch ) {
- case 'b': v = '\b'; break; // bell
- case 'f': v = '\f'; break; // form feed
- case 'n': v = '\n'; break; // newline
- case 'r': v = '\r'; break; // return
- case 't': v = '\t'; break; // tab
- case 's': v = PWS; break; // whitespace charclass
- case 'd': v = PDG; break; // digit charclass
- case '<': v = BOW; break; // word beginning matcher
- case '>': v = EOW; break; // word ending matcher
-
- case 'x': { // hex code
- p++;
- pl--;
- for ( i = 0; (i < 4) && pl; i++ ) { //up to 4 hex digits
- c = tolower((char)*p);
- bool a = ( c >= 'a' && c <= 'f' );
- if ( (c >= '0' && c <= '9') || a ) {
- v <<= 4;
- v += a ? 10 + c - 'a' : c - '0';
- len++;
- }
- else {
- break;
- }
- p++;
- pl--;
- }
- }
- break;
-
- default: {
- if ( ch >= '0' && ch <= '7' ) { //octal code
- len--;
- for ( i = 0; (i < 3) && pl; i++ ) { // up to 3 oct digits
- c = (char)*p;
- if ( c >= '0' && c <= '7' ) {
- v <<= 3;
- v += c - '0';
- len++;
- }
- else {
- break;
- }
- p++;
- pl--;
- }
- }
- else { // not an octal number
- v = (uint)*p; //(((uint)(p->row())) << 8) | ((uint)p->cell());
- }
- }
- }
- } else {
- v = (uint)*p; //(((uint)(p->row())) << 8) | ((uint)p->cell());
- }
- *str += len;
- *strlength -= len;
- return v;
-}
-
-
-#if 0 //defined(DEBUG)
-static uint *dump( uint *p )
-{
- while ( *p != END ) {
- if ( *p & CHR ) {
- uchar uc = (uchar)*p;
- char c = (char)uc;
- uint u = (uint)uc; //(((uint)(uc.row())) << 8) | ((uint)uc.cell());
- qDebug( "\tCHR\tU%04x (%c)", u, (c ? c : ' '));
- p++;
- }
- else if ( *p & MCC ) {
- uint clcode = *p & MCD;
- uint numFields = *p & MVL;
- if ( clcode == CCL )
- qDebug( "\tCCL\t%i", numFields );
- else if ( clcode == CCN )
- qDebug( "\tCCN\t%i", numFields );
- else
- qDebug("coding error!");
- for ( int i = 0; i < (int)numFields; i++ ) {
- p++;
- if ( *p == PWS )
- qDebug( "\t\tPWS" );
- else if ( *p == PDG )
- qDebug( "\t\tPDG" );
- else {
- uint from = ( *p & MCD ) >> 16;
- uint to = *p & MVL;
- char fc = (char)from;
- char tc = (char)to;
- qDebug( "\t\tU%04x (%c) - U%04x (%c)", from,
- (fc ? fc : ' '), to, (tc ? tc : ' ') );
- }
- }
- p++;
- }
- else switch ( *p++ ) {
- case PWS:
- qDebug( "\tPWS" );
- break;
- case PDG:
- qDebug( "\tPDG" );
- break;
- case BOL:
- qDebug( "\tBOL" );
- break;
- case EOL:
- qDebug( "\tEOL" );
- break;
- case BOW:
- qDebug( "\tBOW" );
- break;
- case EOW:
- qDebug( "\tEOW" );
- break;
- case ANY:
- qDebug( "\tANY" );
- break;
- case CLO:
- qDebug( "\tCLO" );
- p = dump( p );
- break;
- case OPT:
- qDebug( "\tOPT" );
- p = dump( p );
- break;
- }
- }
- qDebug( "\tEND" );
- return p+1;
-}
-#endif // DEBUG
-
-
-
-/*!
- \internal
- Compiles the regular expression and stores the result in rxdata.
- The 'error' flag is set to non-zero if an error is detected.
- NOTE! This function is not reentrant!
-*/
-
-void QRegExp::compile()
-{
- const int maxlen = 1024; // max length of regexp array
- uint rxarray[ maxlen ]; // tmp regexp array
-
- if ( rxdata ) { // delete old data
- delete [] rxdata;
- rxdata = 0;
- }
- if ( rxstring.isEmpty() ) { // no regexp pattern set
- error = PatNull;
- return;
- }
-
- error = PatOk; // assume pattern is ok
-
- QCString pattern;
- if ( wc )
- pattern = wc2rx(rxstring);
- else
- pattern = rxstring;
- const char *start = pattern.data(); // pattern pointer
- const char *p = start; // pattern pointer
- uint pl = pattern.length();
- uint *d = rxarray; // data pointer
- uint *prev_d = 0;
-
-#define GEN(x) *d++ = (x)
-
- while ( pl ) {
- char ch = (char)*p;
- switch ( ch ) {
-
- case '^': // beginning of line
- prev_d = d;
- GEN( p == start ? BOL : (CHR | ch) );
- p++;
- pl--;
- break;
-
- case '$': // end of line
- prev_d = d;
- GEN( pl == 1 ? EOL : (CHR | ch) );
- p++;
- pl--;
- break;
-
- case '.': // any char
- prev_d = d;
- GEN( ANY );
- p++;
- pl--;
- break;
-
- case '[': // character class
- {
- prev_d = d;
- p++;
- pl--;
- if ( !pl ) {
- error = PatSyntax;
- return;
- }
- bool firstIsEscaped = ( (char)*p == '\\' );
- uint cch = char_val( &p, &pl );
- if ( cch == '^' && !firstIsEscaped ) { // negate!
- GEN( CCN );
- if ( !pl ) {
- error = PatSyntax;
- return;
- }
- cch = char_val( &p, &pl );
- } else {
- GEN( CCL );
- }
- uint numFields = 0;
- while ( pl ) {
- if ((pl>2) && ((char)*p == '-') && ((char)*(p+1) != ']')) {
- // Found a range
- char_val( &p, &pl ); // Read the '-'
- uint cch2 = char_val( &p, &pl ); // Read the range end
- if ( cch > cch2 ) { // swap start and stop
- int tmp = cch;
- cch = cch2;
- cch2 = tmp;
- }
- GEN( (cch << 16) | cch2 ); // from < to
- numFields++;
- }
- else {
- // Found a single character
- if ( cch & MCD ) // It's a code; will not be mistaken
- GEN( cch ); // for a range, since from > to
- else
- GEN( (cch << 16) | cch ); // from == to range
- numFields++;
- }
- if ( d >= rxarray + maxlen ) { // pattern too long
- error = PatOverflow;
- return;
- }
- if ( !pl ) { // At least ']' should be left
- error = PatSyntax;
- return;
- }
- bool nextIsEscaped = ( (char)*p == '\\' );
- cch = char_val( &p, &pl );
- if ( cch == (uint)']' && !nextIsEscaped )
- break;
- if ( !pl ) { // End, should have seen ']'
- error = PatSyntax;
- return;
- }
- }
- *prev_d |= numFields; // Store number of fields
- }
- break;
-
- case '*': // Kleene closure, or
- case '+': // positive closure, or
- case '?': // optional closure
- {
- if ( prev_d == 0 ) { // no previous expression
- error = PatSyntax; // empty closure
- return;
- }
- switch ( *prev_d ) { // test if invalid closure
- case BOL:
- case BOW:
- case EOW:
- case CLO:
- case OPT:
- error = PatSyntax;
- return;
- }
- int ddiff = (int)(d - prev_d);
- if ( *p == '+' ) { // convert to Kleene closure
- if ( d + ddiff >= rxarray + maxlen ) {
- error = PatOverflow; // pattern too long
- return;
- }
- memcpy( d, prev_d, ddiff*sizeof(uint) );
- d += ddiff;
- prev_d += ddiff;
- }
- memmove( prev_d+1, prev_d, ddiff*sizeof(uint) );
- *prev_d = ch == '?' ? OPT : CLO;
- d++;
- GEN( END );
- p++;
- pl--;
- }
- break;
-
- default:
- {
- prev_d = d;
- uint cv = char_val( &p, &pl );
- if ( cv & MCD ) { // It's a code
- GEN( cv );
- }
- else {
- if ( !cs && cv <= 0xff ) // #only 8bit support
- cv = tolower( cv );
- GEN( CHR | cv );
- }
- }
- }
- if ( d >= rxarray + maxlen ) { // oops!
- error = PatOverflow; // pattern too long
- return;
- }
- }
- GEN( END );
- int len = (int)(d - rxarray);
- rxdata = new uint[ len ]; // copy from rxarray to rxdata
- CHECK_PTR( rxdata );
- memcpy( rxdata, rxarray, len*sizeof(uint) );
-#if defined(DEBUG)
- //dump( rxdata ); // uncomment this line for debugging
-#endif
-}