diff options
Diffstat (limited to 'retoglob.c')
-rw-r--r-- | retoglob.c | 767 |
1 files changed, 767 insertions, 0 deletions
diff --git a/retoglob.c b/retoglob.c new file mode 100644 index 0000000..521d0ae --- /dev/null +++ b/retoglob.c @@ -0,0 +1,767 @@ +/* + * re2glob - C implementation + * (c) 2007 ActiveState Software Inc. + */ + +#include <tcl.h> + +#define DEBUG 0 + +static void +ExpChopNested _ANSI_ARGS_ ((Tcl_UniChar** xstr, + int* xstrlen, + Tcl_UniChar open, + Tcl_UniChar close)); + +static Tcl_UniChar* +ExpLiteral _ANSI_ARGS_ ((Tcl_UniChar* nexto, + Tcl_UniChar* str, + int strlen)); + +static Tcl_UniChar* +ExpCollapseStar _ANSI_ARGS_ ((Tcl_UniChar* src, + Tcl_UniChar* last)); +static Tcl_UniChar* +ExpCollapseQForward _ANSI_ARGS_ ((Tcl_UniChar* src, + Tcl_UniChar* last)); + +static Tcl_UniChar* +ExpCollapseQBack _ANSI_ARGS_ ((Tcl_UniChar* src, + Tcl_UniChar* last)); + +static Tcl_UniChar +ExpBackslash _ANSI_ARGS_ ((char prefix, + Tcl_UniChar* str, + int strlen)); + +static int +ExpCountStar _ANSI_ARGS_ ((Tcl_UniChar* src, Tcl_UniChar* last)); + + +static char* +xxx (Tcl_UniChar* x, int xl) +{ + static Tcl_DString ds; + Tcl_DStringInit (&ds); + return Tcl_UniCharToUtfDString (x,xl,&ds); +} + + +Tcl_Obj* +exp_retoglob ( + Tcl_UniChar* str, + int strlen) +{ + /* + * Output: x2 size of input (literal where every character has to be + * quoted. + * Location: For next translated unit, in output. + * Size of last generated unit, in characters. + * Stack of output locations at opening parens. x1 size of input. + * Location for next location on stack. + */ + + static Tcl_UniChar litprefix [] = {'*','*','*','='}; + static Tcl_UniChar areprefix [] = {'*','*','*',':'}; + static Tcl_UniChar areopts [] = {'(','?'}; + static Tcl_UniChar nocapture [] = {'?',':'}; + static Tcl_UniChar lookhas [] = {'?','='}; + static Tcl_UniChar looknot [] = {'?','!'}; + static Tcl_UniChar xcomment [] = {'?','#'}; + + static Tcl_UniChar classa [] = {'[','.'}; + static Tcl_UniChar classb [] = {'[','='}; + static Tcl_UniChar classc [] = {'[',':'}; + + + int lastsz, expanded; + Tcl_UniChar* out; + Tcl_UniChar* nexto; + Tcl_UniChar** paren; + Tcl_UniChar** nextp; + Tcl_Obj* glob = NULL; + Tcl_UniChar* mark; + Tcl_UniChar ch; + + /* + * Set things up. + */ + + out = nexto = (Tcl_UniChar*) Tcl_Alloc (strlen*2*sizeof (Tcl_UniChar)); + paren = nextp = (Tcl_UniChar**) Tcl_Alloc (strlen* sizeof (Tcl_UniChar*)); + lastsz = -1; + expanded = 0; + + /* + * Start processing ... + */ + +#define CHOP(n) {str += (n); strlen -= (n);} +#define CHOPC(c) {while (*str != (c) && strlen) CHOP(1) ;} +#define EMIT(c) {lastsz = 1; *nexto++ = (c);} +#define EMITX(c) {lastsz++; *nexto++ = (c);} +#define MATCH(lit) ((strlen >= (sizeof (lit)/sizeof (Tcl_UniChar))) && (0 == Tcl_UniCharNcmp (str,(lit),sizeof(lit)/sizeof (Tcl_UniChar)))) +#define MATCHC(c) (strlen && (*str == (c))) +#define PUSHPAREN {*nextp++ = nexto;} +#define UNEMIT {nexto -= lastsz; lastsz = -1;} + /* Tcl_UniCharIsDigit ? */ +#define MATCH_DIGIT (MATCHC ('0') || MATCHC ('1') || \ + MATCHC ('2') || MATCHC ('3') || \ + MATCHC ('4') || MATCHC ('5') || \ + MATCHC ('6') || MATCHC ('7') || \ + MATCHC ('8') || MATCHC ('9')) +#define MATCH_HEXDIGIT (MATCH_DIGIT || \ + MATCHC ('a') || MATCHC ('A') || \ + MATCHC ('b') || MATCHC ('B') || \ + MATCHC ('c') || MATCHC ('C') || \ + MATCHC ('d') || MATCHC ('D') || \ + MATCHC ('e') || MATCHC ('E') || \ + MATCHC ('f') || MATCHC ('F')) +#define EMITC(c) {if (((c) == '\\') || \ + ((c) == '*') || \ + ((c) == '?') || \ + ((c) == '$') || \ + ((c) == '^') || \ + ((c) == '[')) { \ + EMIT ('\\'); EMITX ((c)); \ + } else { \ + EMIT ((c));}} +#define MATCH_AREOPTS(c) (c == 'b' || c == 'c' || \ + c == 'e' || c == 'i' || c == 'm' || c == 'n' || \ + c == 'p' || c == 'q' || c == 's' || c == 't' || \ + c == 'w' || c == 'x') + +#if DEBUG +#define LOG if (1) fprintf +#define FF fflush (stderr) +#define MARK(s) LOG (stderr,#s "\n"); FF; +#else +#define LOG if (0) fprintf +#define FF +#define MARK(s) +#endif + + /* ***= -> literal string follows */ + + LOG (stderr,"RE-2-GLOB '%s'\n", xxx(str,strlen)); FF; + + if (MATCH (litprefix)) { + CHOP (4); + nexto = ExpLiteral (nexto, str, strlen); + goto done; + } + + /* ***: -> RE is ARE. Always for Expect. Therefore ignore */ + + if (MATCH (areprefix)) { + CHOP (4); + LOG (stderr,"ARE '%s'\n", xxx(str,strlen)); FF; + } + + /* (?xyz) ARE options, in {bceimnpqstwx}. Not validating that the + * options are legal. We assume that the RE is valid. + */ + + if (MATCH (areopts)) { /* "(?" */ + Tcl_UniChar* save = str; + Tcl_UniChar* stop; + int stoplen; + int save_strlen = strlen; + int all_ARE_opts = 1; + + /* First, ensure that this is actually an ARE opts string. + * It could be something else (e.g., a non-capturing block). + */ + CHOP (2); + mark = str; CHOPC (')'); + stop = str; /* Remember closing parens location, allows */ + stoplen = strlen; /* us to avoid a second CHOPC run later */ + + while (mark < str) { + if (MATCH_AREOPTS(*mark)) { + mark++; + } else { + all_ARE_opts = 0; + break; + } + } + + /* Reset back to our entry point. */ + str = save; + strlen = save_strlen; + + if (all_ARE_opts) { + /* Now actually perform the ARE option processing */ + LOG (stderr, "%s\n", "Processing AREOPTS"); FF; + + CHOP (2); + mark = str; + /* Equivalent to CHOPC (')') */ + str = stop; + strlen = stoplen; + + while (mark < str) { + if (*mark == 'q') { + CHOP (1); + nexto = ExpLiteral (nexto, str, strlen); + goto done; + } else if (*mark == 'x') { + expanded = 1; + LOG (stderr,"EXPANDED\n"); FF; + } + mark++; + } + CHOP (1); + } + } + + while (strlen) { + + LOG (stderr,"'%s' <-- ",xxx(out,nexto-out)); FF; + LOG (stderr,"'%s'\n", xxx(str,strlen)); FF; + + if (expanded) { + /* Expanded syntax, whitespace and comments, ignore. */ + while (MATCHC (' ') || + MATCHC (0x9) || + MATCHC (0xa)) CHOP (1); + if (MATCHC ('#')) { + CHOPC (0xa); + if (strlen) CHOP (1); + continue; + } + } + + if (MATCHC ('|')) { + /* branching is too complex */ + goto error; + } else if (MATCHC ('(')) { + /* open parens */ + CHOP (1); + if (MATCH (nocapture)) { /* "?:" */ + /* non capturing -save location */ + PUSHPAREN; + CHOP (2); + } else if (MATCH (lookhas) || /* "?=" */ + MATCH (looknot)) { /* "?!" */ + /* lookahead - ignore */ + CHOP (2); + ExpChopNested (&str, &strlen, '(', ')'); + } else if (MATCH (xcomment)) { /* "?#" */ + /* comment - ignore */ + CHOPC (')'); CHOP (1); + } else { + /* plain capturing */ + PUSHPAREN; + } + } else if (MATCHC (')')) { + /* Closing parens. */ + CHOP (1); + /* Everything coming after the saved result is new, and + * collapsed into a single entry for a possible coming operator + * to handle. + */ + nextp --; /* Back to last save */ + mark = *nextp; /* Location where generation for this parens started */ + lastsz = (nexto - mark); /* This many chars generated */ + /* Now lastsz has the correct value for a possibly following + * UNEMIT + */ + } else if (MATCHC ('$') || MATCHC ('^')) { + /* anchor constraints - ignore */ + CHOP (1); + } else if (MATCHC ('[')) { + /* Classes - reduce to any char [[=chars=]] [[.chars.]] + * [[:name:]] [chars] Count brackets to find end. + + * These are a bit complicated ... [= =], [. .], [: {] sequences + * always have to be complete. '[' does NOT nest otherwise. And + * a ']' after the opening '[' (with only '^' allowed to + * intervene is a character, not the closing bracket. We have to + * process the class in pieces to handle all this. The Tcl level + * implementations (0-2 all have bugs one way or other, all + * different. + */ + + int first = 1; + int allowed = 1; + CHOP (1); + while (strlen) { + if (first && MATCHC ('^')) { + /* ^ as first keeps allowed ok for one more cycle */ + CHOP (1); + first = 0; + continue; + } else if (allowed && MATCHC (']')) { + /* Not a closing bracket! */ + CHOP (1); + } else if (MATCHC (']')) { + /* Closing bracket found */ + CHOP (1); + break; + } else if (MATCH (classa) || + MATCH (classb) || + MATCH (classc)) { + Tcl_UniChar delim[2]; + delim[0] = str [1]; + delim[1] = ']'; + CHOP (2); + while (!MATCH (delim)) CHOP (1); + CHOP (2); + } else { + /* Any char in class */ + CHOP (1); + } + /* Reset flags handling start of class */ + allowed = first = 0; + } + + EMIT ('?'); + } else if (MATCHC ('\\')) { + /* Escapes */ + CHOP (1); + if (MATCHC ('d') || MATCHC ('D') || + MATCHC ('s') || MATCHC ('S') || + MATCHC ('w') || MATCHC ('W')) { + /* Class shorthands - reduce to any char */ + EMIT ('?'); + CHOP (1); + } else if (MATCHC ('m') || MATCHC ('M') || + MATCHC ('y') || MATCHC ('Y') || + MATCHC ('A') || MATCHC ('Z')) { + /* constraint escapes - ignore */ + CHOP (1); + } else if (MATCHC ('B')) { + /* Backslash */ + EMIT ('\\'); + EMITX ('\\'); + CHOP (1); + } else if (MATCHC ('0')) { + /* Escape NULL */ + EMIT ('\0'); + CHOP (1); + } else if (MATCHC ('e')) { + /* Escape ESC */ + EMIT ('\033'); + CHOP (1); + } else if (MATCHC ('a')) { + /* Escape \a */ + EMIT (0x7); + CHOP (1); + } else if (MATCHC ('b')) { + /* Escape \b */ + EMIT (0x8); + CHOP (1); + } else if (MATCHC ('f')) { + /* Escape \f */ + EMIT (0xc); + CHOP (1); + } else if (MATCHC ('n')) { + /* Escape \n */ + EMIT (0xa); + CHOP (1); + } else if (MATCHC ('r')) { + /* Escape \r */ + EMIT (0xd); + CHOP (1); + } else if (MATCHC ('t')) { + /* Escape \t */ + EMIT (0x9); + CHOP (1); + } else if (MATCHC ('v')) { + /* Escape \v */ + EMIT (0xb); + CHOP (1); + } else if (MATCHC ('c') && (strlen >= 2)) { + /* Escape \cX - reduce to (.) */ + EMIT ('?'); + CHOP (2); + } else if (MATCHC ('x')) { + CHOP (1); + if (MATCH_HEXDIGIT) { + /* Escape hex character */ + mark = str; + while (MATCH_HEXDIGIT) CHOP (1); + if ((str - mark) > 2) { mark = str - 2; } + ch = ExpBackslash ('x',mark,str-mark); + EMITC (ch); + } else { + /* Without hex digits following this is a plain char */ + EMIT ('x'); + } + } else if (MATCHC ('u')) { + /* Escapes unicode short. */ + CHOP (1); + mark = str; + CHOP (4); + ch = ExpBackslash ('u',mark,str-mark); + EMITC (ch); + } else if (MATCHC ('U')) { + /* Escapes unicode long. */ + CHOP (1); + mark = str; + CHOP (8); + ch = ExpBackslash ('U',mark,str-mark); + EMITC (ch); + } else if (MATCH_DIGIT) { + /* Escapes, octal, and backreferences - reduce (.*) */ + CHOP (1); + while (MATCH_DIGIT) CHOP (1); + EMIT ('*'); + } else { + /* Plain escaped characters - copy over, requote */ + EMITC (*str); + CHOP (1); + } + } else if (MATCHC ('{')) { + /* Non-greedy and greedy bounds - reduce to (*) */ + CHOP (1); + if (MATCH_DIGIT) { + /* Locate closing brace and remove operator */ + CHOPC ('}'); CHOP (1); + /* Remove optional greedy quantifier */ + if (MATCHC ('?')) { CHOP (1);} + UNEMIT; + EMIT ('*'); + } else { + /* Brace is plain character, copy over */ + EMIT ('{'); + /* CHOP already done */ + } + } else if (MATCHC ('*') || + MATCHC ('+') || + MATCHC ('?')) { + /* (Non-)greedy operators - reduce to (*) */ + CHOP (1); + /* Remove optional greedy quantifier */ + if (MATCHC ('?')) { CHOP (1);} + UNEMIT; + EMIT ('*'); + } else if (MATCHC ('.')) { + /* anychar - copy over */ + EMIT ('?'); + CHOP (1); + } else { + /* Plain char, copy over. */ + EMIT (*str); + CHOP (1); + } + } + + LOG (stderr,"'%s' <-- ",xxx(out,nexto-out)); FF; + LOG (stderr,"'%s'\n", xxx(str,strlen)); FF; + + /* + * Clean up the output a bit (collapse *-sequences and absorb ?'s + * into adjacent *'s. + */ + + MARK (QF) + nexto = ExpCollapseQForward (out,nexto); + LOG (stderr,"QF '%s'\n",xxx(out,nexto-out)); FF; + + MARK (QB) + nexto = ExpCollapseQBack (out,nexto); + LOG (stderr,"QB '%s'\n",xxx(out,nexto-out)); FF; + + MARK (QS) + nexto = ExpCollapseStar (out,nexto); + LOG (stderr,"ST '%s'\n",xxx(out,nexto-out)); FF; + + /* + * Heuristic: if there are more than two *s, the risk is far too + * large that the result actually is slower than the normal re + * matching. So bail out. + */ + if (ExpCountStar (out,nexto) > 2) { + goto error; + } + + /* + * Check if the result is actually useful. + * Empty or just a *, or ? are not. A series + * of ?'s is borderline, as they semi-count + * the buffer. + */ + + if ((nexto == out) || + (((nexto-out) == 1) && + ((*out == '*') || + (*out == '?')))) { + goto error; + } + + /* + * Result generation and cleanup. + */ + done: + LOG (stderr,"RESULT_ '%s'\n", xxx(out,nexto-out)); FF; + glob = Tcl_NewUnicodeObj (out,(nexto-out)); + goto cleanup; + + error: + LOG (stderr,"RESULT_ ERROR\n"); FF; + + cleanup: + Tcl_Free ((char*)out); + Tcl_Free ((char*)paren); + + return glob; +} + +static void +#ifdef _AIX +ExpChopNested (Tcl_UniChar** xstr, + int* xstrlen, + Tcl_UniChar open, + Tcl_UniChar close) +#else +ExpChopNested (xstr,xstrlen, open, close) + Tcl_UniChar** xstr; + int* xstrlen; + Tcl_UniChar open; + Tcl_UniChar close; +#endif +{ + Tcl_UniChar* str = *xstr; + int strlen = *xstrlen; + int level = 0; + + while (strlen) { + if (MATCHC (open)) { + level ++; + } else if (MATCHC (close)) { + level --; + if (level < 0) { + CHOP (1); + break; + } + } + CHOP (1); + } + + *xstr = str; + *xstrlen = strlen; +} + +static Tcl_UniChar* +ExpLiteral (nexto, str, strlen) + Tcl_UniChar* nexto; + Tcl_UniChar* str; + int strlen; +{ + int lastsz; + + LOG (stderr,"LITERAL '%s'\n", xxx(str,strlen)); FF; + + while (strlen) { + EMITC (*str); + CHOP (1); + } + return nexto; +} + +static Tcl_UniChar +#ifdef _AIX +ExpBackslash (char prefix, + Tcl_UniChar* str, + int strlen) +#else +ExpBackslash (prefix, str, strlen) + char prefix; + Tcl_UniChar* str; + int strlen; +#endif +{ + /* strlen <= 8 */ + char buf[20]; + char dst[TCL_UTF_MAX+1]; + Tcl_UniChar ch; + int at = 0; + + /* Construct an utf backslash sequence we can throw to Tcl */ + + buf [at++] = '\\'; + buf [at++] = prefix; + while (strlen) { + buf [at++] = *str++; + strlen --; + } + + Tcl_UtfBackslash (buf, NULL, dst); + TclUtfToUniChar (dst, &ch); + return ch; +} + +static Tcl_UniChar* +ExpCollapseStar (src, last) + Tcl_UniChar* src; + Tcl_UniChar* last; +{ + Tcl_UniChar* dst, *base; + int skip = 0; + int star = 0; + + /* Collapses series of *'s into a single *. State machine. The + * complexity is due to the need of handling escaped characters. + */ + + LOG (stderr,"Q-STAR\n"); FF; + + for (dst = base = src; src < last;) { + + LOG (stderr,"@%1d /%1d '%s' <-- ", star,skip,xxx(base,dst-base)); FF; + LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; + + if (skip) { + skip = 0; + star = 0; + } else if (*src == '\\') { + skip = 1; /* Copy next char, whatever its value */ + star = 0; + } else if (*src == '*') { + if (star) { + /* Previous char was *, do not copy the current * to collapse + * the sequence + */ + src++; + continue; + } + star = 1; /* *-series starts here */ + } else { + star = 0; + } + *dst++ = *src++; + } + + LOG (stderr,"@%1d /%1d '%s' <-- ", star,skip,xxx(base,dst-base)); FF; + LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; + + return dst; +} + +static Tcl_UniChar* +ExpCollapseQForward (src, last) + Tcl_UniChar* src; + Tcl_UniChar* last; +{ + Tcl_UniChar* dst, *base; + int skip = 0; + int quest = 0; + + /* Collapses series of ?'s coming after a *. State machine. The + * complexity is due to the need of handling escaped characters. + */ + + LOG (stderr,"Q-Forward\n"); FF; + + for (dst = base = src; src < last;) { + + LOG (stderr,"?%1d /%1d '%s' <-- ", quest,skip,xxx(base,dst-base)); FF; + LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; + + if (skip) { + skip = 0; + quest = 0; + } else if (*src == '\\') { + skip = 1; + quest = 0; + /* Copy next char, whatever its value */ + } else if (*src == '?') { + if (quest) { + /* Previous char was *, do not copy the current ? to collapse + * the sequence + */ + src++; + continue; + } + } else if (*src == '*') { + quest = 1; + } else { + quest = 0; + } + *dst++ = *src++; + } + + LOG (stderr,"?%1d /%1d '%s' <-- ", quest,skip,xxx(base,dst-base)); FF; + LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; + return dst; +} + +static Tcl_UniChar* +ExpCollapseQBack (src, last) + Tcl_UniChar* src; + Tcl_UniChar* last; +{ + Tcl_UniChar* dst, *base; + int skip = 0; + + /* Collapses series of ?'s coming before a *. State machine. The + * complexity is due to the need of handling escaped characters. + */ + + LOG (stderr,"Q-Backward\n"); FF; + + for (dst = base = src; src < last;) { + LOG (stderr,"/%1d '%s' <-- ",skip,xxx(base,dst-base)); FF; + LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; + + if (skip) { + skip = 0; + } else if (*src == '\\') { + skip = 1; + /* Copy next char, whatever its value */ + } else if (*src == '*') { + /* Move backward in the output while the previous character is + * an unescaped question mark. If there is a previous character, + * or a character before that.. + */ + + while ((((dst-base) > 2) && (dst[-1] == '?') && (dst[-2] != '\\')) || + (((dst-base) == 1) && (dst[-1] == '?'))) { + dst --; + } + } + *dst++ = *src++; + } + + LOG (stderr,"/%1d '%s' <-- \n",skip,xxx(base,dst-base)); FF; + LOG (stderr,"'%s'\n", xxx(src,last-src)); FF; + return dst; +} + +static int +ExpCountStar (src, last) + Tcl_UniChar* src; + Tcl_UniChar* last; +{ + int skip = 0; + int stars = 0; + + /* Count number of *'s. State machine. The complexity is due to the + * need of handling escaped characters. + */ + + for (; src < last; src++) { + if (skip) { + skip = 0; + } else if (*src == '\\') { + skip = 1; + } else if (*src == '*') { + stars++; + } + } + + return stars; +} + +#undef CHOP +#undef CHOPC +#undef EMIT +#undef EMITX +#undef MATCH +#undef MATCHC +#undef MATCH_DIGIT +#undef MATCH_HEXDIGIT +#undef PUSHPAREN +#undef UNEMIT |