Main Page | Class List | Directories | File List | Class Members | File Members

re.c File Reference

#include "awk.h"

Go to the source code of this file.

Functions

Regexp * make_regexp (const char *s, size_t len, int ignorecase)
int research (Regexp *rp, register const char *str, int start, register size_t len, int need_start)
void refree (Regexp *rp)
Regexp * re_update (NODE *t)
void resetup ()
int reisstring (const char *text, size_t len, Regexp *re, const char *buf)
int remaybelong (const char *text, size_t len)
const char * reflags2str (int flagval)

Variables

static reg_syntax_t syn


Function Documentation

Regexp* make_regexp (const char *s,
                     size_t len,
                     int ignorecase)
 

Definition at line 33 of file re.c.

References cant_happen, casetable, do_posix, do_traditional, emalloc, FALSE, fatal, free(), ISDIGIT, memset(), NULL, parse_escape(), re_compile_pattern(), and strchr().

Referenced by re_update(), set_FS(), and set_RS().

00034 {
00035         Regexp *rp;
00036         const char *rerr;
00037         const char *src = s;
00038         char *temp;
00039         const char *end = s + len;
00040         register char *dest;
00041         register int c, c2;
00042 #ifdef MBS_SUPPORT
00043         /* The number of bytes in the current multibyte character.
00044            It is 0, when the current character is a singlebyte character.  */
00045         size_t is_multibyte = 0;
00046         mbstate_t mbs;
00047 
00048         if (gawk_mb_cur_max > 1)
00049                 memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize.  */
00050 #endif
00051 
00052         /* Handle escaped characters first. */
00053 
00054         /*
00055          * Build a copy of the string (in dest) with the
00056          * escaped characters translated, and generate the regex
00057          * from that.  
00058          */
00059         emalloc(dest, char *, len + 2, "make_regexp");
00060         temp = dest;
00061 
00062         while (src < end) {
00063 #ifdef MBS_SUPPORT
00064                 if (gawk_mb_cur_max > 1 && !is_multibyte) {
00065                         /* The previous byte is a singlebyte character, or last byte
00066                            of a multibyte character.  We check the next character.  */
00067                         is_multibyte = mbrlen(src, end - src, &mbs);
00068                         if ((is_multibyte == 1) || (is_multibyte == (size_t) -1)
00069                                 || (is_multibyte == (size_t) -2 || (is_multibyte == 0))) {
00070                                 /* We treat it as a singlebyte character.  */
00071                                 is_multibyte = 0;
00072                         }
00073                 }
00074 #endif
00075 
00076                 if (
00077 #ifdef MBS_SUPPORT
00078                 /* We skip multibyte character, since it must not be a special
00079                    character.  */
00080                     (gawk_mb_cur_max == 1 || ! is_multibyte) &&
00081 #endif
00082                     (*src == '\\')) {
00083                         c = *++src;
00084                         switch (c) {
00085                         case 'a':
00086                         case 'b':
00087                         case 'f':
00088                         case 'n':
00089                         case 'r':
00090                         case 't':
00091                         case 'v':
00092                         case 'x':
00093                         case '0':
00094                         case '1':
00095                         case '2':
00096                         case '3':
00097                         case '4':
00098                         case '5':
00099                         case '6':
00100                         case '7':
00101                                 c2 = parse_escape(&src);
00102                                 if (c2 < 0)
00103                                         cant_happen();
00104                                 /*
00105                                  * Unix awk treats octal (and hex?) chars
00106                                  * literally in re's, so escape regexp
00107                                  * metacharacters.
00108                                  */
00109                                 if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x')
00110                                     && strchr("()|*+?.^$\\[]", c2) != NULL)
00111                                         *dest++ = '\\';
00112                                 *dest++ = (char) c2;
00113                                 break;
00114                         case '8':
00115                         case '9':       /* a\9b not valid */
00116                                 *dest++ = c;
00117                                 src++;
00118                                 break;
00119                         case 'y':       /* normally \b */
00120                                 /* gnu regex op */
00121                                 if (! do_traditional) {
00122                                         *dest++ = '\\';
00123                                         *dest++ = 'b';
00124                                         src++;
00125                                         break;
00126                                 }
00127                                 /* else, fall through */
00128                         default:
00129                                 *dest++ = '\\';
00130                                 *dest++ = (char) c;
00131                                 src++;
00132                                 break;
00133                         } /* switch */
00134                 } else
00135                         *dest++ = *src++;       /* not '\\' */
00136 #ifdef MBS_SUPPORT
00137                 if (gawk_mb_cur_max > 1 && is_multibyte)
00138                         is_multibyte--;
00139 #endif
00140         } /* while */
00141 
00142         *dest = '\0' ;  /* Only necessary if we print dest ? */
00143         emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
00144         memset((char *) rp, 0, sizeof(*rp));
00145         rp->pat.allocated = 0;  /* regex will allocate the buffer */
00146         emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
00147 
00148         if (ignorecase)
00149                 rp->pat.translate = casetable;
00150         else
00151                 rp->pat.translate = NULL;
00152         len = dest - temp;
00153         if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
00154                 fatal("%s: /%s/", rerr, temp);  /* rerr already gettextized inside regex routines */
00155 
00156         /* gack. this must be done *after* re_compile_pattern */
00157         rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
00158 
00159         free(temp);
00160         return rp;
00161 }

Regexp* re_update (NODE *t)
 

Definition at line 226 of file re.c.

References CASE, cmp_nodes(), CONST, dupnode, force_string, free_temp, IGNORECASE, make_regexp(), Node_regex, NULL, refree(), tree_eval, exp_node::type, and unref().

Referenced by do_match(), do_split(), interpret(), match_op(), and sub_common().

00227 {
00228         NODE *t1;
00229 
00230         if ((t->re_flags & CASE) == IGNORECASE) {
00231                 if ((t->re_flags & CONST) != 0) {
00232                         assert(t->type == Node_regex);
00233                         return t->re_reg;
00234                 }
00235                 t1 = force_string(tree_eval(t->re_exp));
00236                 if (t->re_text != NULL) {
00237                         if (cmp_nodes(t->re_text, t1) == 0) {
00238                                 free_temp(t1);
00239                                 return t->re_reg;
00240                         }
00241                         unref(t->re_text);
00242                 }
00243                 t->re_text = dupnode(t1);
00244                 free_temp(t1);
00245         }
00246         if (t->re_reg != NULL)
00247                 refree(t->re_reg);
00248         if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
00249                 t1 = force_string(tree_eval(t->re_exp));
00250                 unref(t->re_text);
00251                 t->re_text = dupnode(t1);
00252                 free_temp(t1);
00253         }
00254         t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
00255                                 IGNORECASE);
00256         t->re_flags &= ~CASE;
00257         t->re_flags |= IGNORECASE;
00258         return t->re_reg;
00259 }

const char* reflags2str (int flagval)
 

Definition at line 325 of file re.c.

References genflags2str(), NULL, RE_BACKSLASH_ESCAPE_IN_LISTS, RE_BK_PLUS_QM, RE_CHAR_CLASSES, RE_CONTEXT_INDEP_ANCHORS, RE_CONTEXT_INDEP_OPS, RE_CONTEXT_INVALID_OPS, RE_DEBUG, RE_DOT_NEWLINE, RE_DOT_NOT_NULL, RE_HAT_LISTS_NOT_NEWLINE, RE_ICASE, RE_INTERVALS, RE_INVALID_INTERVAL_ORD, RE_LIMITED_OPS, RE_NEWLINE_ALT, RE_NO_BK_BRACES, RE_NO_BK_PARENS, RE_NO_BK_REFS, RE_NO_BK_VBAR, RE_NO_EMPTY_RANGES, RE_NO_GNU_OPS, RE_NO_POSIX_BACKTRACKING, and RE_UNMATCHED_RIGHT_PAREN_ORD.

00326 {
00327         static const struct flagtab values[] = {
00328                 { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
00329                 { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
00330                 { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
00331                 { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
00332                 { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
00333                 { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
00334                 { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
00335                 { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
00336                 { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
00337                 { RE_INTERVALS, "RE_INTERVALS" },
00338                 { RE_LIMITED_OPS, "RE_LIMITED_OPS" },
00339                 { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
00340                 { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
00341                 { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
00342                 { RE_NO_BK_REFS, "RE_NO_BK_REFS" },
00343                 { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
00344                 { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
00345                 { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
00346                 { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
00347                 { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
00348                 { RE_DEBUG, "RE_DEBUG" },
00349                 { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
00350                 { RE_ICASE, "RE_ICASE" },
00351                 { 0,    NULL },
00352         };
00353 
00354         return genflags2str(flagval, values);
00355 }

void refree (Regexp *rp)
 

Definition at line 205 of file re.c.

References re_registers::end, free(), NULL, Regexp::pat, regfree(), Regexp::regs, re_registers::start, and re_pattern_buffer::translate.

Referenced by re_update(), set_FS(), and set_RS().

00206 {
00207         /*
00208          * This isn't malloced, don't let regfree free it.
00209          * (This is strictly necessary only for the old
00210          * version of regex, but it's a good idea to keep it
00211          * here in case regex internals change in the future.)
00212          */
00213         rp->pat.translate = NULL;
00214 
00215         regfree(& rp->pat);
00216         if (rp->regs.start)
00217                 free(rp->regs.start);
00218         if (rp->regs.end)
00219                 free(rp->regs.end);
00220         free(rp);
00221 }

int reisstring (const char *text,
                size_t len,
                Regexp *re,
                const char *buf)
 

Definition at line 286 of file re.c.

References FALSE, i, NULL, RESTART, strchr(), and STREQN.

Referenced by rsrescan().

00287 {
00288         static char metas[] = ".*+(){}[]|?^$\\";
00289         int i;
00290         int res;
00291         const char *matched;
00292 
00293         /* simple check for meta characters in re */
00294         for (i = 0; i < len; i++) {
00295                 if (strchr(metas, text[i]) != NULL) {
00296                         return FALSE;   /* give up early, can't be string match */
00297                 }
00298         }
00299 
00300         /* make accessible to gdb */
00301         matched = &buf[RESTART(re, buf)];
00302 
00303         res = STREQN(text, matched, len);
00304 
00305         return res;
00306 }

int remaybelong (const char *text,
                 size_t len)
 

Definition at line 311 of file re.c.

References FALSE, NULL, strchr(), and TRUE.

Referenced by rsrescan().

00312 {
00313         while (len--) {
00314                 if (strchr("*+|?", *text++) != NULL) {
00315                         return TRUE;
00316                 }
00317         }
00318 
00319         return FALSE;
00320 }

int research (Regexp *rp,
              register const char *str,
              int start,
              register size_t len,
              int need_start)
 

Definition at line 166 of file re.c.

References _, fatal, NULL, Regexp::pat, re_search(), and Regexp::regs.

Referenced by do_match(), interpret(), match_op(), re_parse_field(), rsrescan(), and sub_common().

00168 {
00169         const char *ret = str;
00170 
00171         if (ret) {
00172                 /*
00173                  * Passing NULL as last arg speeds up search for cases
00174                  * where we don't need the start/end info.
00175                  */
00176                 int res = re_search(&(rp->pat), str, start+len,
00177                                 start, len, need_start ? &(rp->regs) : NULL);
00178 
00179                 /*
00180                  * A return of -2 indicates that a heuristic in
00181                  * regex decided it might allocate too much memory
00182                  * on the C stack. This doesn't apply to gawk, which
00183                  * uses REGEX_MALLOC. This is dealt with by the
00184                  * assignment to re_max_failures in resetup().
00185                  * Naetheless, we keep this code here as a fallback.
00186                  *
00187                  * XXX: The above comment is obsolete; the new regex
00188                  * doesn't have an re_max_failures variable. But we
00189                  * keep the code here just in case.
00190                  */
00191                 if (res == -2) {
00192                         /* the 10 here is arbitrary */
00193                         fatal(_("regex match failed, not enough memory to match string \"%.*s%s\""),
00194                                         (int) (len > 10 ? 10 : len), str + start,
00195                                         len > 10 ? "..." : "");
00196                 }
00197                 return res;
00198         } else
00199                 return -1;
00200 }

void resetup ()
 

Definition at line 264 of file re.c.

References do_intervals, do_posix, do_traditional, RE_INTERVALS, re_set_syntax(), RE_SYNTAX_AWK, RE_SYNTAX_GNU_AWK, RE_SYNTAX_POSIX_AWK, and syn.

Referenced by main().

00265 {
00266         if (do_posix)
00267                 syn = RE_SYNTAX_POSIX_AWK;      /* strict POSIX re's */
00268         else if (do_traditional)
00269                 syn = RE_SYNTAX_AWK;            /* traditional Unix awk re's */
00270         else
00271                 syn = RE_SYNTAX_GNU_AWK;        /* POSIX re's + GNU ops */
00272 
00273         /*
00274          * Interval expressions are off by default, since it's likely to
00275          * break too many old programs to have them on.
00276          */
00277         if (do_intervals)
00278                 syn |= RE_INTERVALS;
00279 
00280         (void) re_set_syntax(syn);
00281 }


Variable Documentation

reg_syntax_t syn [static]
 

Definition at line 28 of file re.c.

Referenced by resetup().


© sourcejam.com 2005-2008