#include "awk.h"
Go to the source code of this file.
Functions | |
| Regexp * | make_regexp (const char *s, size_t len, int ignorecase) |
| int | research (Regexp *rp, register const char *str, int start, register size_t len, int need_start) |
| void | refree (Regexp *rp) |
| Regexp * | re_update (NODE *t) |
| void | resetup () |
| int | reisstring (const char *text, size_t len, Regexp *re, const char *buf) |
| int | remaybelong (const char *text, size_t len) |
| const char * | reflags2str (int flagval) |
Variables | |
| static reg_syntax_t | syn |
|
| Regexp * | make_regexp (const char *s, size_t len, int ignorecase) |
|
Definition at line 33 of file re.c. References cant_happen, casetable, do_posix, do_traditional, emalloc, FALSE, fatal, free(), ISDIGIT, memset(), NULL, parse_escape(), re_compile_pattern(), and strchr(). Referenced by re_update(), set_FS(), and set_RS(). 00034 { 00035 Regexp *rp; 00036 const char *rerr; 00037 const char *src = s; 00038 char *temp; 00039 const char *end = s + len; 00040 register char *dest; 00041 register int c, c2; 00042 #ifdef MBS_SUPPORT 00043 /* The number of bytes in the current multbyte character. 00044 It is 0, when the current character is a singlebyte character. */ 00045 size_t is_multibyte = 0; 00046 mbstate_t mbs; 00047 00048 if (gawk_mb_cur_max > 1) 00049 memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */ 00050 #endif 00051 00052 /* Handle escaped characters first. */ 00053 00054 /* 00055 * Build a copy of the string (in dest) with the 00056 * escaped characters translated, and generate the regex 00057 * from that. 00058 */ 00059 emalloc(dest, char *, len + 2, "make_regexp"); 00060 temp = dest; 00061 00062 while (src < end) { 00063 #ifdef MBS_SUPPORT 00064 if (gawk_mb_cur_max > 1 && !is_multibyte) { 00065 /* The previous byte is a singlebyte character, or last byte 00066 of a multibyte character. We check the next character. */ 00067 is_multibyte = mbrlen(src, end - src, &mbs); 00068 if ((is_multibyte == 1) || (is_multibyte == (size_t) -1) 00069 || (is_multibyte == (size_t) -2 || (is_multibyte == 0))) { 00070 /* We treat it as a singlebyte character. */ 00071 is_multibyte = 0; 00072 } 00073 } 00074 #endif 00075 00076 if ( 00077 #ifdef MBS_SUPPORT 00078 /* We skip multibyte character, since it must not be a special 00079 character. */ 00080 (gawk_mb_cur_max == 1 || ! 
is_multibyte) && 00081 #endif 00082 (*src == '\\')) { 00083 c = *++src; 00084 switch (c) { 00085 case 'a': 00086 case 'b': 00087 case 'f': 00088 case 'n': 00089 case 'r': 00090 case 't': 00091 case 'v': 00092 case 'x': 00093 case '0': 00094 case '1': 00095 case '2': 00096 case '3': 00097 case '4': 00098 case '5': 00099 case '6': 00100 case '7': 00101 c2 = parse_escape(&src); 00102 if (c2 < 0) 00103 cant_happen(); 00104 /* 00105 * Unix awk treats octal (and hex?) chars 00106 * literally in re's, so escape regexp 00107 * metacharacters. 00108 */ 00109 if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x') 00110 && strchr("()|*+?.^$\\[]", c2) != NULL) 00111 *dest++ = '\\'; 00112 *dest++ = (char) c2; 00113 break; 00114 case '8': 00115 case '9': /* a\9b not valid */ 00116 *dest++ = c; 00117 src++; 00118 break; 00119 case 'y': /* normally \b */ 00120 /* gnu regex op */ 00121 if (! do_traditional) { 00122 *dest++ = '\\'; 00123 *dest++ = 'b'; 00124 src++; 00125 break; 00126 } 00127 /* else, fall through */ 00128 default: 00129 *dest++ = '\\'; 00130 *dest++ = (char) c; 00131 src++; 00132 break; 00133 } /* switch */ 00134 } else 00135 *dest++ = *src++; /* not '\\' */ 00136 #ifdef MBS_SUPPORT 00137 if (gawk_mb_cur_max > 1 && is_multibyte) 00138 is_multibyte--; 00139 #endif 00140 } /* while */ 00141 00142 *dest = '\0' ; /* Only necessary if we print dest ? */ 00143 emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); 00144 memset((char *) rp, 0, sizeof(*rp)); 00145 rp->pat.allocated = 0; /* regex will allocate the buffer */ 00146 emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); 00147 00148 if (ignorecase) 00149 rp->pat.translate = casetable; 00150 else 00151 rp->pat.translate = NULL; 00152 len = dest - temp; 00153 if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL) 00154 fatal("%s: /%s/", rerr, temp); /* rerr already gettextized inside regex routines */ 00155 00156 /* gack. 
this must be done *after* re_compile_pattern */ 00157 rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */ 00158 00159 free(temp); 00160 return rp; 00161 }
|
| Regexp * | re_update (NODE *t) |
|
Definition at line 226 of file re.c. References CASE, cmp_nodes(), CONST, dupnode, force_string, free_temp, IGNORECASE, make_regexp(), Node_regex, NULL, refree(), tree_eval, exp_node::type, and unref(). Referenced by do_match(), do_split(), interpret(), match_op(), and sub_common(). 00227 { 00228 NODE *t1; 00229 00230 if ((t->re_flags & CASE) == IGNORECASE) { 00231 if ((t->re_flags & CONST) != 0) { 00232 assert(t->type == Node_regex); 00233 return t->re_reg; 00234 } 00235 t1 = force_string(tree_eval(t->re_exp)); 00236 if (t->re_text != NULL) { 00237 if (cmp_nodes(t->re_text, t1) == 0) { 00238 free_temp(t1); 00239 return t->re_reg; 00240 } 00241 unref(t->re_text); 00242 } 00243 t->re_text = dupnode(t1); 00244 free_temp(t1); 00245 } 00246 if (t->re_reg != NULL) 00247 refree(t->re_reg); 00248 if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) { 00249 t1 = force_string(tree_eval(t->re_exp)); 00250 unref(t->re_text); 00251 t->re_text = dupnode(t1); 00252 free_temp(t1); 00253 } 00254 t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen, 00255 IGNORECASE); 00256 t->re_flags &= ~CASE; 00257 t->re_flags |= IGNORECASE; 00258 return t->re_reg; 00259 }
|
| const char * | reflags2str (int flagval) |
|
Definition at line 325 of file re.c. References genflags2str(), NULL, RE_BACKSLASH_ESCAPE_IN_LISTS, RE_BK_PLUS_QM, RE_CHAR_CLASSES, RE_CONTEXT_INDEP_ANCHORS, RE_CONTEXT_INDEP_OPS, RE_CONTEXT_INVALID_OPS, RE_DEBUG, RE_DOT_NEWLINE, RE_DOT_NOT_NULL, RE_HAT_LISTS_NOT_NEWLINE, RE_ICASE, RE_INTERVALS, RE_INVALID_INTERVAL_ORD, RE_LIMITED_OPS, RE_NEWLINE_ALT, RE_NO_BK_BRACES, RE_NO_BK_PARENS, RE_NO_BK_REFS, RE_NO_BK_VBAR, RE_NO_EMPTY_RANGES, RE_NO_GNU_OPS, RE_NO_POSIX_BACKTRACKING, and RE_UNMATCHED_RIGHT_PAREN_ORD. 00326 { 00327 static const struct flagtab values[] = { 00328 { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" }, 00329 { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" }, 00330 { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" }, 00331 { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" }, 00332 { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" }, 00333 { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" }, 00334 { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" }, 00335 { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" }, 00336 { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" }, 00337 { RE_INTERVALS, "RE_INTERVALS" }, 00338 { RE_LIMITED_OPS, "RE_LIMITED_OPS" }, 00339 { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" }, 00340 { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" }, 00341 { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" }, 00342 { RE_NO_BK_REFS, "RE_NO_BK_REFS" }, 00343 { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" }, 00344 { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" }, 00345 { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" }, 00346 { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" }, 00347 { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" }, 00348 { RE_DEBUG, "RE_DEBUG" }, 00349 { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" }, 00350 { RE_ICASE, "RE_ICASE" }, 00351 { 0, NULL }, 00352 }; 00353 00354 return genflags2str(flagval, values); 00355 }
|
| void | refree (Regexp *rp) |
|
Definition at line 205 of file re.c. References re_registers::end, free(), NULL, Regexp::pat, regfree(), Regexp::regs, re_registers::start, and re_pattern_buffer::translate. Referenced by re_update(), set_FS(), and set_RS(). 00206 { 00207 /* 00208 * This isn't malloced, don't let regfree free it. 00209 * (This is strictly necessary only for the old 00210 * version of regex, but it's a good idea to keep it 00211 * here in case regex internals change in the future.) 00212 */ 00213 rp->pat.translate = NULL; 00214 00215 regfree(& rp->pat); 00216 if (rp->regs.start) 00217 free(rp->regs.start); 00218 if (rp->regs.end) 00219 free(rp->regs.end); 00220 free(rp); 00221 }
|
|
| int | reisstring (const char *text, size_t len, Regexp *re, const char *buf) |
|
Definition at line 286 of file re.c. References FALSE, i, NULL, RESTART, strchr(), and STREQN. Referenced by rsrescan(). 00287 { 00288 static char metas[] = ".*+(){}[]|?^$\\"; 00289 int i; 00290 int res; 00291 const char *matched; 00292 00293 /* simple checking for has meta characters in re */ 00294 for (i = 0; i < len; i++) { 00295 if (strchr(metas, text[i]) != NULL) { 00296 return FALSE; /* give up early, can't be string match */ 00297 } 00298 } 00299 00300 /* make accessable to gdb */ 00301 matched = &buf[RESTART(re, buf)]; 00302 00303 res = STREQN(text, matched, len); 00304 00305 return res; 00306 }
|
|
| int | remaybelong (const char *text, size_t len) |
|
Definition at line 311 of file re.c. References FALSE, NULL, strchr(), and TRUE. Referenced by rsrescan(). 00312 { 00313 while (len--) { 00314 if (strchr("*+|?", *text++) != NULL) { 00315 return TRUE; 00316 } 00317 } 00318 00319 return FALSE; 00320 }
|
|
| int | research (Regexp *rp, register const char *str, int start, register size_t len, int need_start) |
|
Definition at line 166 of file re.c. References _, fatal, NULL, Regexp::pat, re_search(), and Regexp::regs. Referenced by do_match(), interpret(), match_op(), re_parse_field(), rsrescan(), and sub_common(). 00168 { 00169 const char *ret = str; 00170 00171 if (ret) { 00172 /* 00173 * Passing NULL as last arg speeds up search for cases 00174 * where we don't need the start/end info. 00175 */ 00176 int res = re_search(&(rp->pat), str, start+len, 00177 start, len, need_start ? &(rp->regs) : NULL); 00178 00179 /* 00180 * A return of -2 indicates that a heuristic in 00181 * regex decided it might allocate too much memory 00182 * on the C stack. This doesn't apply to gawk, which 00183 * uses REGEX_MALLOC. This is dealt with by the 00184 * assignment to re_max_failures in resetup(). 00185 * Naetheless, we keep this code here as a fallback. 00186 * 00187 * XXX: The above comment is obsolete; the new regex 00188 * doesn't have an re_max_failures variable. But we 00189 * keep the code here just in case. 00190 */ 00191 if (res == -2) { 00192 /* the 10 here is arbitrary */ 00193 fatal(_("regex match failed, not enough memory to match string \"%.*s%s\""), 00194 (int) (len > 10 ? 10 : len), str + start, 00195 len > 10 ? "..." : ""); 00196 } 00197 return res; 00198 } else 00199 return -1; 00200 }
|
| void | resetup () |
|
Definition at line 264 of file re.c. References do_intervals, do_posix, do_traditional, RE_INTERVALS, re_set_syntax(), RE_SYNTAX_AWK, RE_SYNTAX_GNU_AWK, RE_SYNTAX_POSIX_AWK, and syn. Referenced by main(). 00265 { 00266 if (do_posix) 00267 syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */ 00268 else if (do_traditional) 00269 syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */ 00270 else 00271 syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */ 00272 00273 /* 00274 * Interval expressions are off by default, since it's likely to 00275 * break too many old programs to have them on. 00276 */ 00277 if (do_intervals) 00278 syn |= RE_INTERVALS; 00279 00280 (void) re_set_syntax(syn); 00281 }
|
| static reg_syntax_t | syn |
|
Definition at line 28 of file re.c. Referenced by resetup(). |