Main Page | Class List | Directories | File List | Class Members | File Members

re.c

Go to the documentation of this file.
00001 /*
00002  * re.c - compile regular expressions.
00003  */
00004 
00005 /* 
00006  * Copyright (C) 1991-2003 the Free Software Foundation, Inc.
00007  * 
00008  * This file is part of GAWK, the GNU implementation of the
00009  * AWK Programming Language.
00010  * 
00011  * GAWK is free software; you can redistribute it and/or modify
00012  * it under the terms of the GNU General Public License as published by
00013  * the Free Software Foundation; either version 2 of the License, or
00014  * (at your option) any later version.
00015  * 
00016  * GAWK is distributed in the hope that it will be useful,
00017  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00019  * GNU General Public License for more details.
00020  * 
00021  * You should have received a copy of the GNU General Public License
00022  * along with this program; if not, write to the Free Software
00023  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
00024  */
00025 
00026 #include "awk.h"
00027 
00028 static reg_syntax_t syn;        /* regex syntax option bits; assigned in resetup() and passed to re_set_syntax() */
00029 
00030 /* make_regexp --- generate compiled regular expressions */
00031 
00032 Regexp *
00033 make_regexp(const char *s, size_t len, int ignorecase)
00034 {
00035         Regexp *rp;
00036         const char *rerr;
00037         const char *src = s;
00038         char *temp;
00039         const char *end = s + len;
00040         register char *dest;
00041         register int c, c2;
00042 #ifdef MBS_SUPPORT
00043         /* The number of bytes in the current multbyte character.
00044            It is 0, when the current character is a singlebyte character.  */
00045         size_t is_multibyte = 0;
00046         mbstate_t mbs;
00047 
00048         if (gawk_mb_cur_max > 1)
00049                 memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
00050 #endif
00051 
00052         /* Handle escaped characters first. */
00053 
00054         /*
00055          * Build a copy of the string (in dest) with the
00056          * escaped characters translated, and generate the regex
00057          * from that.  
00058          */
00059         emalloc(dest, char *, len + 2, "make_regexp");
00060         temp = dest;
00061 
00062         while (src < end) {
00063 #ifdef MBS_SUPPORT
00064                 if (gawk_mb_cur_max > 1 && !is_multibyte) {
00065                         /* The previous byte is a singlebyte character, or last byte
00066                            of a multibyte character.  We check the next character.  */
00067                         is_multibyte = mbrlen(src, end - src, &mbs);
00068                         if ((is_multibyte == 1) || (is_multibyte == (size_t) -1)
00069                                 || (is_multibyte == (size_t) -2 || (is_multibyte == 0))) {
00070                                 /* We treat it as a singlebyte character.  */
00071                                 is_multibyte = 0;
00072                         }
00073                 }
00074 #endif
00075 
00076                 if (
00077 #ifdef MBS_SUPPORT
00078                 /* We skip multibyte character, since it must not be a special
00079                    character.  */
00080                     (gawk_mb_cur_max == 1 || ! is_multibyte) &&
00081 #endif
00082                     (*src == '\\')) {
00083                         c = *++src;
00084                         switch (c) {
00085                         case 'a':
00086                         case 'b':
00087                         case 'f':
00088                         case 'n':
00089                         case 'r':
00090                         case 't':
00091                         case 'v':
00092                         case 'x':
00093                         case '0':
00094                         case '1':
00095                         case '2':
00096                         case '3':
00097                         case '4':
00098                         case '5':
00099                         case '6':
00100                         case '7':
00101                                 c2 = parse_escape(&src);
00102                                 if (c2 < 0)
00103                                         cant_happen();
00104                                 /*
00105                                  * Unix awk treats octal (and hex?) chars
00106                                  * literally in re's, so escape regexp
00107                                  * metacharacters.
00108                                  */
00109                                 if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x')
00110                                     && strchr("()|*+?.^$\\[]", c2) != NULL)
00111                                         *dest++ = '\\';
00112                                 *dest++ = (char) c2;
00113                                 break;
00114                         case '8':
00115                         case '9':       /* a\9b not valid */
00116                                 *dest++ = c;
00117                                 src++;
00118                                 break;
00119                         case 'y':       /* normally \b */
00120                                 /* gnu regex op */
00121                                 if (! do_traditional) {
00122                                         *dest++ = '\\';
00123                                         *dest++ = 'b';
00124                                         src++;
00125                                         break;
00126                                 }
00127                                 /* else, fall through */
00128                         default:
00129                                 *dest++ = '\\';
00130                                 *dest++ = (char) c;
00131                                 src++;
00132                                 break;
00133                         } /* switch */
00134                 } else
00135                         *dest++ = *src++;       /* not '\\' */
00136 #ifdef MBS_SUPPORT
00137                 if (gawk_mb_cur_max > 1 && is_multibyte)
00138                         is_multibyte--;
00139 #endif
00140         } /* while */
00141 
00142         *dest = '\0' ;  /* Only necessary if we print dest ? */
00143         emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
00144         memset((char *) rp, 0, sizeof(*rp));
00145         rp->pat.allocated = 0;  /* regex will allocate the buffer */
00146         emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
00147 
00148         if (ignorecase)
00149                 rp->pat.translate = casetable;
00150         else
00151                 rp->pat.translate = NULL;
00152         len = dest - temp;
00153         if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
00154                 fatal("%s: /%s/", rerr, temp);  /* rerr already gettextized inside regex routines */
00155 
00156         /* gack. this must be done *after* re_compile_pattern */
00157         rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
00158 
00159         free(temp);
00160         return rp;
00161 }
00162 
00163 /* research --- do a regexp search */
00164 
00165 int
00166 research(Regexp *rp, register const char *str, int start,
00167         register size_t len, int need_start)
00168 {
00169         const char *ret = str;
00170 
00171         if (ret) {
00172                 /*
00173                  * Passing NULL as last arg speeds up search for cases
00174                  * where we don't need the start/end info.
00175                  */
00176                 int res = re_search(&(rp->pat), str, start+len,
00177                                 start, len, need_start ? &(rp->regs) : NULL);
00178 
00179                 /*
00180                  * A return of -2 indicates that a heuristic in
00181                  * regex decided it might allocate too much memory
00182                  * on the C stack. This doesn't apply to gawk, which
00183                  * uses REGEX_MALLOC. This is dealt with by the
00184                  * assignment to re_max_failures in resetup().
00185                  * Naetheless, we keep this code here as a fallback.
00186                  *
00187                  * XXX: The above comment is obsolete; the new regex
00188                  * doesn't have an re_max_failures variable. But we
00189                  * keep the code here just in case.
00190                  */
00191                 if (res == -2) {
00192                         /* the 10 here is arbitrary */
00193                         fatal(_("regex match failed, not enough memory to match string \"%.*s%s\""),
00194                                         (int) (len > 10 ? 10 : len), str + start,
00195                                         len > 10 ? "..." : "");
00196                 }
00197                 return res;
00198         } else
00199                 return -1;
00200 }
00201 
00202 /* refree --- free up the dynamic memory used by a compiled regexp */
00203 
00204 void
00205 refree(Regexp *rp)
00206 {
00207         /*
00208          * This isn't malloced, don't let regfree free it.
00209          * (This is strictly necessary only for the old
00210          * version of regex, but it's a good idea to keep it
00211          * here in case regex internals change in the future.)
00212          */
00213         rp->pat.translate = NULL;
00214 
00215         regfree(& rp->pat);
00216         if (rp->regs.start)
00217                 free(rp->regs.start);
00218         if (rp->regs.end)
00219                 free(rp->regs.end);
00220         free(rp);
00221 }
00222 
00223 /* re_update --- recompile a dynamic regexp */
00224 
00225 Regexp *
00226 re_update(NODE *t)
00227 {
00228         NODE *t1;
00229 
00230         if ((t->re_flags & CASE) == IGNORECASE) {
00231                 if ((t->re_flags & CONST) != 0) {
00232                         assert(t->type == Node_regex);
00233                         return t->re_reg;
00234                 }
00235                 t1 = force_string(tree_eval(t->re_exp));
00236                 if (t->re_text != NULL) {
00237                         if (cmp_nodes(t->re_text, t1) == 0) {
00238                                 free_temp(t1);
00239                                 return t->re_reg;
00240                         }
00241                         unref(t->re_text);
00242                 }
00243                 t->re_text = dupnode(t1);
00244                 free_temp(t1);
00245         }
00246         if (t->re_reg != NULL)
00247                 refree(t->re_reg);
00248         if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
00249                 t1 = force_string(tree_eval(t->re_exp));
00250                 unref(t->re_text);
00251                 t->re_text = dupnode(t1);
00252                 free_temp(t1);
00253         }
00254         t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
00255                                 IGNORECASE);
00256         t->re_flags &= ~CASE;
00257         t->re_flags |= IGNORECASE;
00258         return t->re_reg;
00259 }
00260 
00261 /* resetup --- choose what kind of regexps we match */
00262 
00263 void
00264 resetup()
00265 {
00266         if (do_posix)
00267                 syn = RE_SYNTAX_POSIX_AWK;      /* strict POSIX re's */
00268         else if (do_traditional)
00269                 syn = RE_SYNTAX_AWK;            /* traditional Unix awk re's */
00270         else
00271                 syn = RE_SYNTAX_GNU_AWK;        /* POSIX re's + GNU ops */
00272 
00273         /*
00274          * Interval expressions are off by default, since it's likely to
00275          * break too many old programs to have them on.
00276          */
00277         if (do_intervals)
00278                 syn |= RE_INTERVALS;
00279 
00280         (void) re_set_syntax(syn);
00281 }
00282 
00283 /* reisstring --- return TRUE if the RE match is a simple string match */
00284 
00285 int
00286 reisstring(const char *text, size_t len, Regexp *re, const char *buf)
00287 {
00288         static char metas[] = ".*+(){}[]|?^$\\";
00289         int i;
00290         int res;
00291         const char *matched;
00292 
00293         /* simple checking for has meta characters in re */
00294         for (i = 0; i < len; i++) {
00295                 if (strchr(metas, text[i]) != NULL) {
00296                         return FALSE;   /* give up early, can't be string match */
00297                 }
00298         }
00299 
00300         /* make accessable to gdb */
00301         matched = &buf[RESTART(re, buf)];
00302 
00303         res = STREQN(text, matched, len);
00304 
00305         return res;
00306 }
00307 
00308 /* remaybelong --- return TRUE if the RE contains * ? | + */
00309 
00310 int
00311 remaybelong(const char *text, size_t len)
00312 {
00313         while (len--) {
00314                 if (strchr("*+|?", *text++) != NULL) {
00315                         return TRUE;
00316                 }
00317         }
00318 
00319         return FALSE;
00320 }
00321 
00322 /* reflags2str --- make a regex flags value readable */
00323 
00324 const char *
00325 reflags2str(int flagval)
00326 {
00327         static const struct flagtab values[] = {
00328                 { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
00329                 { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
00330                 { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
00331                 { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
00332                 { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
00333                 { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
00334                 { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
00335                 { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
00336                 { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
00337                 { RE_INTERVALS, "RE_INTERVALS" },
00338                 { RE_LIMITED_OPS, "RE_LIMITED_OPS" },
00339                 { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
00340                 { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
00341                 { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
00342                 { RE_NO_BK_REFS, "RE_NO_BK_REFS" },
00343                 { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
00344                 { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
00345                 { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
00346                 { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
00347                 { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
00348                 { RE_DEBUG, "RE_DEBUG" },
00349                 { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
00350                 { RE_ICASE, "RE_ICASE" },
00351                 { 0,    NULL },
00352         };
00353 
00354         return genflags2str(flagval, values);
00355 }

© sourcejam.com 2005-2008