/* * re2glob - C implementation * (c) 2007 ActiveState Software Inc. */ #include #define DEBUG 0 static void ExpChopNested _ANSI_ARGS_ ((Tcl_UniChar** xstr, int* xstrlen, Tcl_UniChar open, Tcl_UniChar close)); static Tcl_UniChar* ExpLiteral _ANSI_ARGS_ ((Tcl_UniChar* nexto, Tcl_UniChar* str, int strlen)); static Tcl_UniChar* ExpCollapseStar _ANSI_ARGS_ ((Tcl_UniChar* src, Tcl_UniChar* last)); static Tcl_UniChar* ExpCollapseQForward _ANSI_ARGS_ ((Tcl_UniChar* src, Tcl_UniChar* last)); static Tcl_UniChar* ExpCollapseQBack _ANSI_ARGS_ ((Tcl_UniChar* src, Tcl_UniChar* last)); static Tcl_UniChar ExpBackslash _ANSI_ARGS_ ((char prefix, Tcl_UniChar* str, int strlen)); static int ExpCountStar _ANSI_ARGS_ ((Tcl_UniChar* src, Tcl_UniChar* last)); static char* xxx (Tcl_UniChar* x, int xl) { static Tcl_DString ds; Tcl_DStringInit (&ds); return Tcl_UniCharToUtfDString (x,xl,&ds); } Tcl_Obj* exp_retoglob ( Tcl_UniChar* str, int strlen) { /* * Output: x2 size of input (literal where every character has to be * quoted. * Location: For next translated unit, in output. * Size of last generated unit, in characters. * Stack of output locations at opening parens. x1 size of input. * Location for next location on stack. */ static Tcl_UniChar litprefix [] = {'*','*','*','='}; static Tcl_UniChar areprefix [] = {'*','*','*',':'}; static Tcl_UniChar areopts [] = {'(','?'}; static Tcl_UniChar nocapture [] = {'?',':'}; static Tcl_UniChar lookhas [] = {'?','='}; static Tcl_UniChar looknot [] = {'?','!'}; static Tcl_UniChar xcomment [] = {'?','#'}; static Tcl_UniChar classa [] = {'[','.'}; static Tcl_UniChar classb [] = {'[','='}; static Tcl_UniChar classc [] = {'[',':'}; int lastsz, expanded; Tcl_UniChar* out; Tcl_UniChar* nexto; Tcl_UniChar** paren; Tcl_UniChar** nextp; Tcl_Obj* glob = NULL; Tcl_UniChar* mark; Tcl_UniChar ch; /* * Set things up. */ out = nexto = (Tcl_UniChar*) Tcl_Alloc (strlen*2*sizeof (Tcl_UniChar)); paren = nextp = (Tcl_UniChar**) Tcl_Alloc (strlen* sizeof (Tcl_UniChar*)); lastsz = -1; expanded = 0; /* * Start processing ... */ #define CHOP(n) {str += (n); strlen -= (n);} #define CHOPC(c) {while (*str != (c) && strlen) CHOP(1) ;} #define EMIT(c) {lastsz = 1; *nexto++ = (c);} #define EMITX(c) {lastsz++; *nexto++ = (c);} #define MATCH(lit) ((strlen >= (sizeof (lit)/sizeof (Tcl_UniChar))) && (0 == Tcl_UniCharNcmp (str,(lit),sizeof(lit)/sizeof (Tcl_UniChar)))) #define MATCHC(c) (strlen && (*str == (c))) #define PUSHPAREN {*nextp++ = nexto;} #define UNEMIT {nexto -= lastsz; lastsz = -1;} /* Tcl_UniCharIsDigit ? */ #define MATCH_DIGIT (MATCHC ('0') || MATCHC ('1') || \ MATCHC ('2') || MATCHC ('3') || \ MATCHC ('4') || MATCHC ('5') || \ MATCHC ('6') || MATCHC ('7') || \ MATCHC ('8') || MATCHC ('9')) #define MATCH_HEXDIGIT (MATCH_DIGIT || \ MATCHC ('a') || MATCHC ('A') || \ MATCHC ('b') || MATCHC ('B') || \ MATCHC ('c') || MATCHC ('C') || \ MATCHC ('d') || MATCHC ('D') || \ MATCHC ('e') || MATCHC ('E') || \ MATCHC ('f') || MATCHC ('F')) #define EMITC(c) {if (((c) == '\\') || \ ((c) == '*') || \ ((c) == '?') || \ ((c) == '$') || \ ((c) == '^') || \ ((c) == '[')) { \ EMIT ('\\'); EMITX ((c)); \ } else { \ EMIT ((c));}} #define MATCH_AREOPTS(c) (c == 'b' || c == 'c' || \ c == 'e' || c == 'i' || c == 'm' || c == 'n' || \ c == 'p' || c == 'q' || c == 's' || c == 't' || \ c == 'w' || c == 'x') #if DEBUG #define LOG if (1) fprintf #define FF fflush (stderr) #define MARK(s) LOG (stderr,#s "\n"); FF; #else #define LOG if (0) fprintf #define FF #define MARK(s) #endif /* ***= -> literal string follows */ LOG (stderr,"RE-2-GLOB '%s'\n", xxx(str,strlen)); FF; if (MATCH (litprefix)) { CHOP (4); nexto = ExpLiteral (nexto, str, strlen); goto done; } /* ***: -> RE is ARE. Always for Expect. Therefore ignore */ if (MATCH (areprefix)) { CHOP (4); LOG (stderr,"ARE '%s'\n", xxx(str,strlen)); FF; } /* (?xyz) ARE options, in {bceimnpqstwx}. Not validating that the * options are legal. We assume that the RE is valid. */ if (MATCH (areopts)) { /* "(?" */ Tcl_UniChar* save = str; Tcl_UniChar* stop; int stoplen; int save_strlen = strlen; int all_ARE_opts = 1; /* First, ensure that this is actually an ARE opts string. * It could be something else (e.g., a non-capturing block). */ CHOP (2); mark = str; CHOPC (')'); stop = str; /* Remember closing parens location, allows */ stoplen = strlen; /* us to avoid a second CHOPC run later */ while (mark < str) { if (MATCH_AREOPTS(*mark)) { mark++; } else { all_ARE_opts = 0; break; } } /* Reset back to our entry point. */ str = save; strlen = save_strlen; if (all_ARE_opts) { /* Now actually perform the ARE option processing */ LOG (stderr, "%s\n", "Processing AREOPTS"); FF; CHOP (2); mark = str; /* Equivalent to CHOPC (')') */ str = stop; strlen = stoplen; while (mark < str) { if (*mark == 'q') { CHOP (1); nexto = ExpLiteral (nexto, str, strlen); goto done; } else if (*mark == 'x') { expanded = 1; LOG (stderr,"EXPANDED\n"); FF; } mark++; } CHOP (1); } } while (strlen) { LOG (stderr,"'%s' <-- ",xxx(out,nexto-out)); FF; LOG (stderr,"'%s'\n", xxx(str,strlen)); FF; if (expanded) { /* Expanded syntax, whitespace and comments, ignore. */ while (MATCHC (' ') || MATCHC (0x9) || MATCHC (0xa)) CHOP (1); if (MATCHC ('#')) { CHOPC (0xa); if (strlen) CHOP (1); continue; } } if (MATCHC ('|')) { /* branching is too complex */ goto error; } else if (MATCHC ('(')) { /* open parens */ CHOP (1); if (MATCH (nocapture)) { /* "?:" */ /* non capturing -save location */ PUSHPAREN; CHOP (2); } else if (MATCH (lookhas) || /* "?=" */ MATCH (looknot)) { /* "?!" */ /* lookahead - ignore */ CHOP (2); ExpChopNested (&str, &strlen, '(', ')'); } else if (MATCH (xcomment)) { /* "?#" */ /* comment - ignore */ CHOPC (')'); CHOP (1); } else { /* plain capturing */ PUSHPAREN; } } else if (MATCHC (')')) { /* Closing parens. */ CHOP (1); /* Everything coming after the saved result is new, and * collapsed into a single entry for a possible coming operator * to handle. */ nextp --; /* Back to last save */ mark = *nextp; /* Location where generation for this parens started */ lastsz = (nexto - mark); /* This many chars generated */ /* Now lastsz has the correct value for a possibly following * UNEMIT */ } else if (MATCHC ('$') || MATCHC ('^')) { /* anchor constraints - ignore */ CHOP (1); } else if (MATCHC ('[')) { /* Classes - reduce to any char [[=chars=]] [[.chars.]] * [[:name:]] [chars] Count brackets to find end. * These are a bit complicated ... [= =], [. .], [: {] sequences * always have to be complete. '[' does NOT nest otherwise. And * a ']' after the opening '[' (with only '^' allowed to * intervene is a character, not the closing bracket. We have to * process the class in pieces to handle all this. The Tcl level * implementations (0-2 all have bugs one way or other, all * different. */ int first = 1; int allowed = 1; CHOP (1); while (strlen) { if (first && MATCHC ('^')) { /* ^ as first keeps allowed ok for one more cycle */ CHOP (1); first = 0; continue; } else if (allowed && MATCHC (']')) { /* Not a closing bracket! */ CHOP (1); } else if (MATCHC (']')) { /* Closing bracket found */ CHOP (1); break; } else if (MATCH (classa) || MATCH (classb) || MATCH (classc)) { Tcl_UniChar delim[2]; delim[0] = str [1]; delim[1] = ']'; CHOP (2); while (!MATCH (delim)) CHOP (1); CHOP (2); } else { /* Any char in class */ CHOP (1); } /* Reset flags handling start of class */ allowed = first = 0; } EMIT ('?'); } else if (MATCHC ('\\')) { /* Escapes */ CHOP (1); if (MATCHC ('d') || MATCHC ('D') || MATCHC ('s') || MATCHC ('S') || MATCHC ('w') || MATCHC ('W')) { /* Class shorthands - reduce to any char */ EMIT ('?'); CHOP (1); } else if (MATCHC ('m') || MATCHC ('M') || MATCHC ('y') || MATCHC ('Y') || MATCHC ('A') || MATCHC ('Z')) { /* constraint escapes - ignore */ CHOP (1); } else if (MATCHC ('B')) { /* Backslash */ EMIT ('\\'); EMITX ('\\'); CHOP (1); } else if (MATCHC ('0')) { /* Escape NULL */ EMIT ('\0'); CHOP (1); } else if (MATCHC ('e')) { /* Escape ESC */ EMIT ('\033'); CHOP (1); } else if (MATCHC ('a')) { /* Escape \a */ EMIT (0x7); CHOP (1); } else if (MATCHC ('b')) { /* Escape \b */ EMIT (0x8); CHOP (1); } else if (MATCHC ('f')) { /* Escape \f */ EMIT (0xc); CHOP (1); } else if (MATCHC ('n')) { /* Escape \n */ EMIT (0xa); CHOP (1); } else if (MATCHC ('r')) { /* Escape \r */ EMIT (0xd); CHOP (1); } else if (MATCHC ('t')) { /* Escape \t */ EMIT (0x9); CHOP (1); } else if (MATCHC ('v')) { /* Escape \v */ EMIT (0xb); CHOP (1); } else if (MATCHC ('c') && (strlen >= 2)) { /* Escape \cX - reduce to (.) */ EMIT ('?'); CHOP (2); } else if (MATCHC ('x')) { CHOP (1); if (MATCH_HEXDIGIT) { /* Escape hex character */ mark = str; while (MATCH_HEXDIGIT) CHOP (1); if ((str - mark) > 2) { mark = str - 2; } ch = ExpBackslash ('x',mark,str-mark); EMITC (ch); } else { /* Without hex digits following this is a plain char */ EMIT ('x'); } } else if (MATCHC ('u')) { /* Escapes unicode short. */ CHOP (1); mark = str; CHOP (4); ch = ExpBackslash ('u',mark,str-mark); EMITC (ch); } else if (MATCHC ('U')) { /* Escapes unicode long. */ CHOP (1); mark = str; CHOP (8); ch = ExpBackslash ('U',mark,str-mark); EMITC (ch); } else if (MATCH_DIGIT) { /* Escapes, octal, and backreferences - reduce (.*) */ CHOP (1); while (MATCH_DIGIT) CHOP (1); EMIT ('*'); } else { /* Plain escaped characters - copy over, requote */ EMITC (*str); CHOP (1); } } else if (MATCHC ('{')) { /* Non-greedy and greedy bounds - reduce to (*) */ CHOP (1); if (MATCH_DIGIT) { /* Locate closing brace and remove operator */ CHOPC ('}'); CHOP (1); /* Remove optional greedy quantifier */ if (MATCHC ('?')) { CHOP (1);} UNEMIT; EMIT ('*'); } else { /* Brace is plain character, copy over */ EMIT ('{'); /* CHOP already done */ } } else if (MATCHC ('*') || MATCHC ('+') || MATCHC ('?')) { /* (Non-)greedy operators - reduce to (*) */ CHOP (1); /* Remove optional greedy quantifier */ if (MATCHC ('?')) { CHOP (1);} UNEMIT; EMIT ('*'); } else if (MATCHC ('.')) { /* anychar - copy over */ EMIT ('?'); CHOP (1); } else { /* Plain char, copy over. */ EMIT (*str); CHOP (1); } } LOG (stderr,"'%s' <-- ",xxx(out,nexto-out)); FF; LOG (stderr,"'%s'\n", xxx(str,strlen)); FF; /* * Clean up the output a bit (collapse *-sequences and absorb ?'s * into adjacent *'s. */ MARK (QF) nexto = ExpCollapseQForward (out,nexto); LOG (stderr,"QF '%s'\n",xxx(out,nexto-out)); FF; MARK (QB) nexto = ExpCollapseQBack (out,nexto); LOG (stderr,"QB '%s'\n",xxx(out,nexto-out)); FF; MARK (QS) nexto = ExpCollapseStar (out,nexto); LOG (stderr,"ST '%s'\n",xxx(out,nexto-out)); FF; /* * Heuristic: if there are more than two *s, the risk is far too * large that the result actually is slower than the normal re * matching. So bail out. */ if (ExpCountStar (out,nexto) > 2) { goto error; } /* * Check if the result is actually useful. * Empty or just a *, or ? are not. A series * of ?'s is borderline, as they semi-count * the buffer. */ if ((nexto == out) || (((nexto-out) == 1) && ((*out == '*') || (*out == '?')))) { goto error; } /* * Result generation and cleanup. */ done: LOG (stderr,"RESULT_ '%s'\n", xxx(out,nexto-out)); FF; glob = Tcl_NewUnicodeObj (out,(nexto-out)); goto cleanup; error: LOG (stderr,"RESULT_ ERROR\n"); FF; cleanup: Tcl_Free ((char*)out); Tcl_Free ((char*)paren); return glob; } static void #ifdef _AIX ExpChopNested (Tcl_UniChar** xstr, int* xstrlen, Tcl_UniChar open, Tcl_UniChar close) #else ExpChopNested (xstr,xstrlen, open, close) Tcl_UniChar** xstr; int* xstrlen; Tcl_UniChar open; Tcl_UniChar close; #endif { Tcl_UniChar* str = *xstr; int strlen = *xstrlen; int level = 0; while (strlen) { if (MATCHC (open)) { level ++; } else if (MATCHC (close)) { level --; if (level < 0) { CHOP (1); break; } } CHOP (1); } *xstr = str; *xstrlen = strlen; } static Tcl_UniChar* ExpLiteral (nexto, str, strlen) Tcl_UniChar* nexto; Tcl_UniChar* str; int strlen; { int lastsz; LOG (stderr,"LITERAL '%s'\n", xxx(str,strlen)); FF; while (strlen) { EMITC (*str); CHOP (1); } return nexto; } static Tcl_UniChar #ifdef _AIX ExpBackslash (char prefix, Tcl_UniChar* str, int strlen) #else ExpBackslash (prefix, str, strlen) char prefix; Tcl_UniChar* str; int strlen; #endif { /* strlen <= 8 */ char buf[20]; char dst[TCL_UTF_MAX+1]; Tcl_UniChar ch; int at = 0; /* Construct an utf backslash sequence we can throw to Tcl */ buf [at++] = '\\'; buf [at++] = prefix; while (strlen) { buf [at++] = *str++; strlen --; } Tcl_UtfBackslash (buf, NULL, dst); TclUtfToUniChar (dst, &ch); return ch; } static Tcl_UniChar* ExpCollapseStar (src, last) Tcl_UniChar* src; Tcl_UniChar* last; { Tcl_UniChar* dst, *base; int skip = 0; int star = 0; /* Collapses series of *'s into a single *. State machine. The * complexity is due to the need of handling escaped characters. */ LOG (stderr,"Q-STAR\n"); FF; for (dst = base = src; src < last;) { LOG (stderr,"@%1d /%1d '%s' <-- ", star,skip,xxx(base,dst-base)); FF; LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; if (skip) { skip = 0; star = 0; } else if (*src == '\\') { skip = 1; /* Copy next char, whatever its value */ star = 0; } else if (*src == '*') { if (star) { /* Previous char was *, do not copy the current * to collapse * the sequence */ src++; continue; } star = 1; /* *-series starts here */ } else { star = 0; } *dst++ = *src++; } LOG (stderr,"@%1d /%1d '%s' <-- ", star,skip,xxx(base,dst-base)); FF; LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; return dst; } static Tcl_UniChar* ExpCollapseQForward (src, last) Tcl_UniChar* src; Tcl_UniChar* last; { Tcl_UniChar* dst, *base; int skip = 0; int quest = 0; /* Collapses series of ?'s coming after a *. State machine. The * complexity is due to the need of handling escaped characters. */ LOG (stderr,"Q-Forward\n"); FF; for (dst = base = src; src < last;) { LOG (stderr,"?%1d /%1d '%s' <-- ", quest,skip,xxx(base,dst-base)); FF; LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; if (skip) { skip = 0; quest = 0; } else if (*src == '\\') { skip = 1; quest = 0; /* Copy next char, whatever its value */ } else if (*src == '?') { if (quest) { /* Previous char was *, do not copy the current ? to collapse * the sequence */ src++; continue; } } else if (*src == '*') { quest = 1; } else { quest = 0; } *dst++ = *src++; } LOG (stderr,"?%1d /%1d '%s' <-- ", quest,skip,xxx(base,dst-base)); FF; LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; return dst; } static Tcl_UniChar* ExpCollapseQBack (src, last) Tcl_UniChar* src; Tcl_UniChar* last; { Tcl_UniChar* dst, *base; int skip = 0; /* Collapses series of ?'s coming before a *. State machine. The * complexity is due to the need of handling escaped characters. */ LOG (stderr,"Q-Backward\n"); FF; for (dst = base = src; src < last;) { LOG (stderr,"/%1d '%s' <-- ",skip,xxx(base,dst-base)); FF; LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; if (skip) { skip = 0; } else if (*src == '\\') { skip = 1; /* Copy next char, whatever its value */ } else if (*src == '*') { /* Move backward in the output while the previous character is * an unescaped question mark. If there is a previous character, * or a character before that.. */ while ((((dst-base) > 2) && (dst[-1] == '?') && (dst[-2] != '\\')) || (((dst-base) == 1) && (dst[-1] == '?'))) { dst --; } } *dst++ = *src++; } LOG (stderr,"/%1d '%s' <-- \n",skip,xxx(base,dst-base)); FF; LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; return dst; } static int ExpCountStar (src, last) Tcl_UniChar* src; Tcl_UniChar* last; { int skip = 0; int stars = 0; /* Count number of *'s. State machine. The complexity is due to the * need of handling escaped characters. */ for (; src < last; src++) { if (skip) { skip = 0; } else if (*src == '\\') { skip = 1; } else if (*src == '*') { stars++; } } return stars; } #undef CHOP #undef CHOPC #undef EMIT #undef EMITX #undef MATCH #undef MATCHC #undef MATCH_DIGIT #undef MATCH_HEXDIGIT #undef PUSHPAREN #undef UNEMIT