Main Page | Class List | Directories | File List | Class Members | File Members

regcomp.c

Go to the documentation of this file.
00001 /* Extended regular expression matching and search library.
00002    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 static reg_errcode_t re_compile_internal _RE_ARGS((regex_t *preg, const char * pattern,
00022                                           int length, reg_syntax_t syntax));
00023 static void re_compile_fastmap_iter _RE_ARGS((regex_t *bufp,
00024                                      const re_dfastate_t *init_state,
00025                                      char *fastmap));
00026 static reg_errcode_t init_dfa _RE_ARGS((re_dfa_t *dfa, int pat_len));
00027 static reg_errcode_t init_word_char _RE_ARGS((re_dfa_t *dfa));
00028 #ifdef RE_ENABLE_I18N
00029 static void free_charset _RE_ARGS((re_charset_t *cset));
00030 #endif /* RE_ENABLE_I18N */
00031 static void free_workarea_compile _RE_ARGS((regex_t *preg));
00032 static reg_errcode_t create_initial_state _RE_ARGS((re_dfa_t *dfa));
00033 static reg_errcode_t analyze _RE_ARGS((re_dfa_t *dfa));
00034 static reg_errcode_t analyze_tree _RE_ARGS((re_dfa_t *dfa, bin_tree_t *node));
00035 static void calc_first _RE_ARGS((re_dfa_t *dfa, bin_tree_t *node));
00036 static void calc_next _RE_ARGS((re_dfa_t *dfa, bin_tree_t *node));
00037 static void calc_epsdest _RE_ARGS((re_dfa_t *dfa, bin_tree_t *node));
00038 static reg_errcode_t duplicate_node_closure _RE_ARGS((re_dfa_t *dfa, int top_org_node,
00039                                              int top_clone_node, int root_node,
00040                                              unsigned int constraint));
00041 static reg_errcode_t duplicate_node _RE_ARGS((int *new_idx, re_dfa_t *dfa, int org_idx,
00042                                      unsigned int constraint));
00043 static int search_duplicated_node _RE_ARGS((re_dfa_t *dfa, int org_node,
00044                                    unsigned int constraint));
00045 static reg_errcode_t calc_eclosure _RE_ARGS((re_dfa_t *dfa));
00046 static reg_errcode_t calc_eclosure_iter _RE_ARGS((re_node_set *new_set, re_dfa_t *dfa,
00047                                          int node, int root));
00048 static void calc_inveclosure _RE_ARGS((re_dfa_t *dfa));
00049 static int fetch_number _RE_ARGS((re_string_t *input, re_token_t *token,
00050                          reg_syntax_t syntax));
00051 static re_token_t fetch_token _RE_ARGS((re_string_t *input, reg_syntax_t syntax));
00052 static int peek_token _RE_ARGS((re_token_t *token, re_string_t *input,
00053                         reg_syntax_t syntax));
00054 static int peek_token_bracket _RE_ARGS((re_token_t *token, re_string_t *input,
00055                                reg_syntax_t syntax));
00056 static bin_tree_t *parse _RE_ARGS((re_string_t *regexp, regex_t *preg,
00057                           reg_syntax_t syntax, reg_errcode_t *err));
00058 static bin_tree_t *parse_reg_exp _RE_ARGS((re_string_t *regexp, regex_t *preg,
00059                                   re_token_t *token, reg_syntax_t syntax,
00060                                   int nest, reg_errcode_t *err));
00061 static bin_tree_t *parse_branch _RE_ARGS((re_string_t *regexp, regex_t *preg,
00062                                  re_token_t *token, reg_syntax_t syntax,
00063                                  int nest, reg_errcode_t *err));
00064 static bin_tree_t *parse_expression _RE_ARGS((re_string_t *regexp, regex_t *preg,
00065                                      re_token_t *token, reg_syntax_t syntax,
00066                                      int nest, reg_errcode_t *err));
00067 static bin_tree_t *parse_sub_exp _RE_ARGS((re_string_t *regexp, regex_t *preg,
00068                                   re_token_t *token, reg_syntax_t syntax,
00069                                   int nest, reg_errcode_t *err));
00070 static bin_tree_t *parse_dup_op _RE_ARGS((bin_tree_t *dup_elem, re_string_t *regexp,
00071                                  re_dfa_t *dfa, re_token_t *token,
00072                                  reg_syntax_t syntax, reg_errcode_t *err));
00073 static bin_tree_t *parse_bracket_exp _RE_ARGS((re_string_t *regexp, re_dfa_t *dfa,
00074                                       re_token_t *token, reg_syntax_t syntax,
00075                                       reg_errcode_t *err));
00076 static reg_errcode_t parse_bracket_element _RE_ARGS((bracket_elem_t *elem,
00077                                             re_string_t *regexp,
00078                                             re_token_t *token, int token_len,
00079                                             re_dfa_t *dfa,
00080                                             reg_syntax_t syntax));
00081 static reg_errcode_t parse_bracket_symbol _RE_ARGS((bracket_elem_t *elem,
00082                                           re_string_t *regexp,
00083                                           re_token_t *token));
00084 #ifndef _LIBC
00085 # ifdef RE_ENABLE_I18N
00086 static reg_errcode_t build_range_exp _RE_ARGS((re_bitset_ptr_t sbcset,
00087                                       re_charset_t *mbcset, int *range_alloc,
00088                                       bracket_elem_t *start_elem,
00089                                       bracket_elem_t *end_elem));
00090 static reg_errcode_t build_collating_symbol _RE_ARGS((re_bitset_ptr_t sbcset,
00091                                              re_charset_t *mbcset,
00092                                              int *coll_sym_alloc,
00093                                              const unsigned char *name));
00094 # else /* not RE_ENABLE_I18N */
00095 static reg_errcode_t build_range_exp _RE_ARGS((re_bitset_ptr_t sbcset,
00096                                       bracket_elem_t *start_elem,
00097                                       bracket_elem_t *end_elem));
00098 static reg_errcode_t build_collating_symbol _RE_ARGS((re_bitset_ptr_t sbcset,
00099                                              const unsigned char *name));
00100 # endif /* not RE_ENABLE_I18N */
00101 #endif /* not _LIBC */
00102 #ifdef RE_ENABLE_I18N
00103 static reg_errcode_t build_equiv_class _RE_ARGS((re_bitset_ptr_t sbcset,
00104                                         re_charset_t *mbcset,
00105                                         int *equiv_class_alloc,
00106                                         const unsigned char *name));
00107 static reg_errcode_t build_charclass _RE_ARGS((RE_TRANSLATE_TYPE trans,
00108                                       re_bitset_ptr_t sbcset,
00109                                       re_charset_t *mbcset,
00110                                       int *char_class_alloc,
00111                                       const unsigned char *class_name,
00112                                       reg_syntax_t syntax));
00113 #else  /* not RE_ENABLE_I18N */
00114 static reg_errcode_t build_equiv_class _RE_ARGS((re_bitset_ptr_t sbcset,
00115                                         const unsigned char *name));
00116 static reg_errcode_t build_charclass _RE_ARGS((RE_TRANSLATE_TYPE trans,
00117                                       re_bitset_ptr_t sbcset,
00118                                       const unsigned char *class_name,
00119                                       reg_syntax_t syntax));
00120 #endif /* not RE_ENABLE_I18N */
00121 static bin_tree_t *build_word_op _RE_ARGS((re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
00122                                   int not, reg_errcode_t *err));
00123 static void free_bin_tree _RE_ARGS((bin_tree_t *tree));
00124 static bin_tree_t *create_tree _RE_ARGS((bin_tree_t *left, bin_tree_t *right,
00125                                 re_token_type_t type, int index));
00126 static bin_tree_t *duplicate_tree _RE_ARGS((const bin_tree_t *src, re_dfa_t *dfa));
00127 
00128 /* This table gives an error message for each of the error codes listed
00129    in regex.h.  Obviously the order here has to be same as there.
00130    POSIX doesn't require that we do anything for REG_NOERROR,
00131    but why not be nice?  */
00132 
00133 const char __re_error_msgid[] attribute_hidden =
00134   {
00135 #define REG_NOERROR_IDX 0
00136     gettext_noop ("Success")    /* REG_NOERROR */
00137     "\0"
00138 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
00139     gettext_noop ("No match")   /* REG_NOMATCH */
00140     "\0"
00141 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
00142     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
00143     "\0"
00144 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
00145     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
00146     "\0"
00147 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
00148     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
00149     "\0"
00150 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
00151     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
00152     "\0"
00153 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
00154     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
00155     "\0"
00156 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
00157     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
00158     "\0"
00159 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
00160     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
00161     "\0"
00162 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
00163     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
00164     "\0"
00165 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
00166     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
00167     "\0"
00168 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
00169     gettext_noop ("Invalid range end")  /* REG_ERANGE */
00170     "\0"
00171 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
00172     gettext_noop ("Memory exhausted") /* REG_ESPACE */
00173     "\0"
00174 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
00175     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
00176     "\0"
00177 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
00178     gettext_noop ("Premature end of regular expression") /* REG_EEND */
00179     "\0"
00180 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
00181     gettext_noop ("Regular expression too big") /* REG_ESIZE */
00182     "\0"
00183 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
00184     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
00185   };
00186 
00187 const size_t __re_error_msgid_idx[] attribute_hidden =
00188   {
00189     REG_NOERROR_IDX,
00190     REG_NOMATCH_IDX,
00191     REG_BADPAT_IDX,
00192     REG_ECOLLATE_IDX,
00193     REG_ECTYPE_IDX,
00194     REG_EESCAPE_IDX,
00195     REG_ESUBREG_IDX,
00196     REG_EBRACK_IDX,
00197     REG_EPAREN_IDX,
00198     REG_EBRACE_IDX,
00199     REG_BADBR_IDX,
00200     REG_ERANGE_IDX,
00201     REG_ESPACE_IDX,
00202     REG_BADRPT_IDX,
00203     REG_EEND_IDX,
00204     REG_ESIZE_IDX,
00205     REG_ERPAREN_IDX
00206   };
00207 
00208 /* Entry points for GNU code.  */
00209 
00210 /* re_compile_pattern is the GNU regular expression compiler: it
00211    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
00212    Returns 0 if the pattern was valid, otherwise an error string.
00213 
00214    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
00215    are set in BUFP on entry.  */
00216 
00217 const char *
00218 re_compile_pattern (pattern, length, bufp)
00219     const char *pattern;
00220     size_t length;
00221     struct re_pattern_buffer *bufp;
00222 {
00223   reg_errcode_t ret;
00224 
00225   /* And GNU code determines whether or not to get register information
00226      by passing null for the REGS argument to re_match, etc., not by
00227      setting no_sub.  */
00228   bufp->no_sub = 0;
00229 
00230   /* Match anchors at newline.  */
00231   bufp->newline_anchor = 1;
00232 
00233   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
00234 
00235   if (!ret)
00236     return NULL;
00237   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
00238 }
00239 #ifdef _LIBC
00240 weak_alias (__re_compile_pattern, re_compile_pattern)
00241 #endif
00242 
00243 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
00244    also be assigned to arbitrarily: each pattern buffer stores its own
00245    syntax, so it can be changed between regex compilations.  */
00246 /* This has no initializer because initialized variables in Emacs
00247    become read-only after dumping.  */
00248 reg_syntax_t re_syntax_options;
00249 
00250 
00251 /* Specify the precise syntax of regexps for compilation.  This provides
00252    for compatibility for various utilities which historically have
00253    different, incompatible syntaxes.
00254 
00255    The argument SYNTAX is a bit mask comprised of the various bits
00256    defined in regex.h.  We return the old syntax.  */
00257 
00258 reg_syntax_t
00259 re_set_syntax (syntax)
00260     reg_syntax_t syntax;
00261 {
00262   reg_syntax_t ret = re_syntax_options;
00263 
00264   re_syntax_options = syntax;
00265 #ifdef RE_ENABLE_I18N
00266   re_mb_cur_max = MB_CUR_MAX;
00267 #endif
00268   return ret;
00269 }
00270 #ifdef _LIBC
00271 weak_alias (__re_set_syntax, re_set_syntax)
00272 #endif
00273 
00274 int
00275 re_compile_fastmap (bufp)
00276     struct re_pattern_buffer *bufp;
00277 {
00278   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
00279   char *fastmap = bufp->fastmap;
00280 
00281   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
00282   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
00283   if (dfa->init_state != dfa->init_state_word)
00284     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
00285   if (dfa->init_state != dfa->init_state_nl)
00286     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
00287   if (dfa->init_state != dfa->init_state_begbuf)
00288     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
00289   bufp->fastmap_accurate = 1;
00290   return 0;
00291 }
00292 #ifdef _LIBC
00293 weak_alias (__re_compile_fastmap, re_compile_fastmap)
00294 #endif
00295 
00296 static inline void
00297 re_set_fastmap (char *fastmap, int icase, int ch)
00298 {
00299   fastmap[ch] = 1;
00300   if (icase)
00301     fastmap[tolower (ch)] = 1;
00302 }
00303 
00304 /* Helper function for re_compile_fastmap.
00305    Compile fastmap for the initial_state INIT_STATE.  */
00306 
00307 static void
00308 re_compile_fastmap_iter (bufp, init_state, fastmap)
00309      regex_t *bufp;
00310      const re_dfastate_t *init_state;
00311      char *fastmap;
00312 {
00313   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
00314   int node_cnt;
00315 #ifdef RE_ENABLE_I18N
00316   int icase = (re_mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
00317 #else
00318   int icase = (bufp->syntax & RE_ICASE);
00319 #endif
00320   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
00321     {
00322       int node = init_state->nodes.elems[node_cnt];
00323       re_token_type_t type = dfa->nodes[node].type;
00324 
00325       if (type == CHARACTER)
00326         re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
00327       else if (type == SIMPLE_BRACKET)
00328         {
00329           int i, j, ch;
00330           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
00331             for (j = 0; j < UINT_BITS; ++j, ++ch)
00332               if (dfa->nodes[node].opr.sbcset[i] & (1UL << j))
00333                 re_set_fastmap (fastmap, icase, ch);
00334         }
00335 #ifdef RE_ENABLE_I18N
00336       else if (type == COMPLEX_BRACKET)
00337         {
00338           int i;
00339           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
00340           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
00341               || cset->nranges || cset->nchar_classes)
00342             {
00343 # ifdef _LIBC
00344               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
00345                 {
00346                   /* In this case we want to catch the bytes which are
00347                      the first byte of any collation elements.
00348                      e.g. In da_DK, we want to catch 'a' since "aa"
00349                           is a valid collation element, and don't catch
00350                           'b' since 'b' is the only collation element
00351                           which starts from 'b'.  */
00352                   int j, ch;
00353                   const int32_t *table = (const int32_t *)
00354                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
00355                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
00356                     for (j = 0; j < UINT_BITS; ++j, ++ch)
00357                       if (table[ch] < 0)
00358                         re_set_fastmap (fastmap, icase, ch);
00359                 }
00360 # else
00361               if (re_mb_cur_max > 1)
00362                 for (i = 0; i < SBC_MAX; ++i)
00363                   if (__btowc (i) == WEOF)
00364                     re_set_fastmap (fastmap, icase, i);
00365 # endif /* not _LIBC */
00366             }
00367           for (i = 0; i < cset->nmbchars; ++i)
00368             {
00369               char buf[256];
00370               mbstate_t state;
00371               memset (&state, '\0', sizeof (state));
00372               __wcrtomb (buf, cset->mbchars[i], &state);
00373               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
00374             }
00375         }
00376 #endif /* RE_ENABLE_I18N */
00377       else if (type == END_OF_RE || type == OP_PERIOD)
00378         {
00379           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
00380           if (type == END_OF_RE)
00381             bufp->can_be_null = 1;
00382           return;
00383         }
00384     }
00385 }
00386 
00387 /* Entry point for POSIX code.  */
00388 /* regcomp takes a regular expression as a string and compiles it.
00389 
00390    PREG is a regex_t *.  We do not expect any fields to be initialized,
00391    since POSIX says we shouldn't.  Thus, we set
00392 
00393      `buffer' to the compiled pattern;
00394      `used' to the length of the compiled pattern;
00395      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
00396        REG_EXTENDED bit in CFLAGS is set; otherwise, to
00397        RE_SYNTAX_POSIX_BASIC;
00398      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
00399      `fastmap' to an allocated space for the fastmap;
00400      `fastmap_accurate' to zero;
00401      `re_nsub' to the number of subexpressions in PATTERN.
00402 
00403    PATTERN is the address of the pattern string.
00404 
00405    CFLAGS is a series of bits which affect compilation.
00406 
00407      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
00408      use POSIX basic syntax.
00409 
00410      If REG_NEWLINE is set, then . and [^...] don't match newline.
00411      Also, regexec will try a match beginning after every newline.
00412 
00413      If REG_ICASE is set, then we considers upper- and lowercase
00414      versions of letters to be equivalent when matching.
00415 
00416      If REG_NOSUB is set, then when PREG is passed to regexec, that
00417      routine will report only success or failure, and nothing about the
00418      registers.
00419 
00420    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
00421    the return codes and their meanings.)  */
00422 
00423 int
00424 regcomp (preg, pattern, cflags)
00425     regex_t *__restrict preg;
00426     const char *__restrict pattern;
00427     int cflags;
00428 {
00429   reg_errcode_t ret;
00430   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
00431                          : RE_SYNTAX_POSIX_BASIC);
00432 
00433   preg->buffer = NULL;
00434   preg->allocated = 0;
00435   preg->used = 0;
00436 
00437   /* Try to allocate space for the fastmap.  */
00438   preg->fastmap = re_malloc (char, SBC_MAX);
00439   if (BE (preg->fastmap == NULL, 0))
00440     return REG_ESPACE;
00441 
00442   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
00443 
00444   /* If REG_NEWLINE is set, newlines are treated differently.  */
00445   if (cflags & REG_NEWLINE)
00446     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
00447       syntax &= ~RE_DOT_NEWLINE;
00448       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
00449       /* It also changes the matching behavior.  */
00450       preg->newline_anchor = 1;
00451     }
00452   else
00453     preg->newline_anchor = 0;
00454   preg->no_sub = !!(cflags & REG_NOSUB);
00455   preg->translate = NULL;
00456 
00457   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
00458 
00459   /* POSIX doesn't distinguish between an unmatched open-group and an
00460      unmatched close-group: both are REG_EPAREN.  */
00461   if (ret == REG_ERPAREN)
00462     ret = REG_EPAREN;
00463 
00464   /* We have already checked preg->fastmap != NULL.  */
00465   if (BE (ret == REG_NOERROR, 1))
00466     /* Compute the fastmap now, since regexec cannot modify the pattern
00467        buffer.  This function nevers fails in this implementation.  */
00468     (void) re_compile_fastmap (preg);
00469   else
00470     {
00471       /* Some error occurred while compiling the expression.  */
00472       re_free (preg->fastmap);
00473       preg->fastmap = NULL;
00474     }
00475 
00476   return (int) ret;
00477 }
00478 #ifdef _LIBC
00479 weak_alias (__regcomp, regcomp)
00480 #endif
00481 
00482 /* Returns a message corresponding to an error code, ERRCODE, returned
00483    from either regcomp or regexec.   We don't use PREG here.  */
00484 
00485 size_t
00486 regerror (errcode, preg, errbuf, errbuf_size)
00487     int errcode;
00488     const regex_t *preg;
00489     char *errbuf;
00490     size_t errbuf_size;
00491 {
00492   const char *msg;
00493   size_t msg_size;
00494 
00495   if (BE (errcode < 0
00496           || errcode >= (int) (sizeof (__re_error_msgid_idx)
00497                                / sizeof (__re_error_msgid_idx[0])), 0))
00498     /* Only error codes returned by the rest of the code should be passed
00499        to this routine.  If we are given anything else, or if other regex
00500        code generates an invalid error code, then the program has a bug.
00501        Dump core so we can fix it.  */
00502     abort ();
00503 
00504   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
00505 
00506   msg_size = strlen (msg) + 1; /* Includes the null.  */
00507 
00508   if (BE (errbuf_size != 0, 1))
00509     {
00510       if (BE (msg_size > errbuf_size, 0))
00511         {
00512           memcpy (errbuf, msg, errbuf_size - 1);
00513           errbuf[errbuf_size - 1] = 0;
00514         }
00515       else
00516         memcpy (errbuf, msg, msg_size);
00517     }
00518 
00519   return msg_size;
00520 }
00521 #ifdef _LIBC
00522 weak_alias (__regerror, regerror)
00523 #endif
00524 
00525 
00526 static void
00527 free_dfa_content (re_dfa_t *dfa)
00528 {
00529   int i, j;
00530 
00531   re_free (dfa->subexps);
00532 
00533   for (i = 0; i < dfa->nodes_len; ++i)
00534     {
00535       re_token_t *node = dfa->nodes + i;
00536 #ifdef RE_ENABLE_I18N
00537       if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
00538         free_charset (node->opr.mbcset);
00539       else
00540 #endif /* RE_ENABLE_I18N */
00541         if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
00542           re_free (node->opr.sbcset);
00543     }
00544   re_free (dfa->nexts);
00545   for (i = 0; i < dfa->nodes_len; ++i)
00546     {
00547       if (dfa->eclosures != NULL)
00548         re_node_set_free (dfa->eclosures + i);
00549       if (dfa->inveclosures != NULL)
00550         re_node_set_free (dfa->inveclosures + i);
00551       if (dfa->edests != NULL)
00552         re_node_set_free (dfa->edests + i);
00553     }
00554   re_free (dfa->edests);
00555   re_free (dfa->eclosures);
00556   re_free (dfa->inveclosures);
00557   re_free (dfa->nodes);
00558 
00559   for (i = 0; i <= dfa->state_hash_mask; ++i)
00560     {
00561       struct re_state_table_entry *entry = dfa->state_table + i;
00562       for (j = 0; j < entry->num; ++j)
00563         {
00564           re_dfastate_t *state = entry->array[j];
00565           free_state (state);
00566         }
00567       re_free (entry->array);
00568     }
00569   re_free (dfa->state_table);
00570 
00571   if (dfa->word_char != NULL)
00572     re_free (dfa->word_char);
00573 #ifdef DEBUG
00574   re_free (dfa->re_str);
00575 #endif
00576 
00577   re_free (dfa);
00578 }
00579 
00580 
00581 /* Free dynamically allocated space used by PREG.  */
00582 
00583 void
00584 regfree (preg)
00585     regex_t *preg;
00586 {
00587   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
00588   if (BE (dfa != NULL, 1))
00589     free_dfa_content (dfa);
00590 
00591   re_free (preg->fastmap);
00592 }
00593 #ifdef _LIBC
00594 weak_alias (__regfree, regfree)
00595 #endif
00596 
00597 /* Entry points compatible with 4.2 BSD regex library.  We don't define
00598    them unless specifically requested.  */
00599 
00600 #if defined _REGEX_RE_COMP || defined _LIBC
00601 
00602 /* BSD has one and only one pattern buffer.  */
00603 static struct re_pattern_buffer re_comp_buf;
00604 
00605 char *
00606 # ifdef _LIBC
00607 /* Make these definitions weak in libc, so POSIX programs can redefine
00608    these names if they don't use our functions, and still use
00609    regcomp/regexec above without link errors.  */
00610 weak_function
00611 # endif
00612 re_comp (s)
00613      const char *s;
00614 {
00615   reg_errcode_t ret;
00616   char *fastmap;
00617 
00618   if (!s)
00619     {
00620       if (!re_comp_buf.buffer)
00621         return gettext ("No previous regular expression");
00622       return 0;
00623     }
00624 
00625   if (re_comp_buf.buffer)
00626     {
00627       fastmap = re_comp_buf.fastmap;
00628       re_comp_buf.fastmap = NULL;
00629       __regfree (&re_comp_buf);
00630       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
00631       re_comp_buf.fastmap = fastmap;
00632     }
00633 
00634   if (re_comp_buf.fastmap == NULL)
00635     {
00636       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
00637       if (re_comp_buf.fastmap == NULL)
00638         return (char *) gettext (__re_error_msgid
00639                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
00640     }
00641 
00642   /* Since `re_exec' always passes NULL for the `regs' argument, we
00643      don't need to initialize the pattern buffer fields which affect it.  */
00644 
00645   /* Match anchors at newlines.  */
00646   re_comp_buf.newline_anchor = 1;
00647 
00648   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
00649 
00650   if (!ret)
00651     return NULL;
00652 
00653   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
00654   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
00655 }
00656 
00657 #ifdef _LIBC
00658 libc_freeres_fn (free_mem)
00659 {
00660   __regfree (&re_comp_buf);
00661 }
00662 #endif
00663 
00664 #endif /* _REGEX_RE_COMP */
00665 
00666 /* Internal entry point.
00667    Compile the regular expression PATTERN, whose length is LENGTH.
00668    SYNTAX indicate regular expression's syntax.  */
00669 
00670 static reg_errcode_t
00671 re_compile_internal (preg, pattern, length, syntax)
00672      regex_t *preg;
00673      const char * pattern;
00674      int length;
00675      reg_syntax_t syntax;
00676 {
00677   reg_errcode_t err = REG_NOERROR;
00678   re_dfa_t *dfa;
00679   re_string_t regexp;
00680 
00681   /* Initialize the pattern buffer.  */
00682   preg->fastmap_accurate = 0;
00683   preg->syntax = syntax;
00684   preg->not_bol = preg->not_eol = 0;
00685   preg->used = 0;
00686   preg->re_nsub = 0;
00687   preg->can_be_null = 0;
00688   preg->regs_allocated = REGS_UNALLOCATED;
00689 
00690   /* Initialize the dfa.  */
00691   dfa = (re_dfa_t *) preg->buffer;
00692   if (preg->allocated < sizeof (re_dfa_t))
00693     {
00694       /* If zero allocated, but buffer is non-null, try to realloc
00695          enough space.  This loses if buffer's address is bogus, but
00696          that is the user's responsibility.  If ->buffer is NULL this
00697          is a simple allocation.  */
00698       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
00699       if (dfa == NULL)
00700         return REG_ESPACE;
00701       preg->allocated = sizeof (re_dfa_t);
00702     }
00703   preg->buffer = (unsigned char *) dfa;
00704   preg->used = sizeof (re_dfa_t);
00705 
00706   err = init_dfa (dfa, length);
00707   if (BE (err != REG_NOERROR, 0))
00708     {
00709       re_free (dfa);
00710       preg->buffer = NULL;
00711       preg->allocated = 0;
00712       return err;
00713     }
00714 #ifdef DEBUG
00715   dfa->re_str = re_malloc (char, length + 1);
00716   strncpy (dfa->re_str, pattern, length + 1);
00717 #endif
00718 
00719   err = re_string_construct (&regexp, pattern, length, preg->translate,
00720                              syntax & RE_ICASE);
00721   if (BE (err != REG_NOERROR, 0))
00722     {
00723       re_free (dfa);
00724       preg->buffer = NULL;
00725       preg->allocated = 0;
00726       return err;
00727     }
00728 
00729   /* Parse the regular expression, and build a structure tree.  */
00730   preg->re_nsub = 0;
00731   dfa->str_tree = parse (&regexp, preg, syntax, &err);
00732   if (BE (dfa->str_tree == NULL, 0))
00733     goto re_compile_internal_free_return;
00734 
00735   /* Analyze the tree and collect information which is necessary to
00736      create the dfa.  */
00737   err = analyze (dfa);
00738   if (BE (err != REG_NOERROR, 0))
00739     goto re_compile_internal_free_return;
00740 
00741   /* Then create the initial state of the dfa.  */
00742   err = create_initial_state (dfa);
00743 
00744   /* Release work areas.  */
00745   free_workarea_compile (preg);
00746   re_string_destruct (&regexp);
00747 
00748   if (BE (err != REG_NOERROR, 0))
00749     {
00750     re_compile_internal_free_return:
00751       free_dfa_content (dfa);
00752       preg->buffer = NULL;
00753       preg->allocated = 0;
00754     }
00755 
00756   return err;
00757 }
00758 
00759 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
00760    as the initial length of some arrays.  */
00761 
00762 static reg_errcode_t
00763 init_dfa (dfa, pat_len)
00764      re_dfa_t *dfa;
00765      int pat_len;
00766 {
00767   int table_size;
00768 
00769   memset (dfa, '\0', sizeof (re_dfa_t));
00770 
00771   dfa->nodes_alloc = pat_len + 1;
00772   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
00773 
00774   dfa->states_alloc = pat_len + 1;
00775 
00776   /*  table_size = 2 ^ ceil(log pat_len) */
00777   for (table_size = 1; table_size > 0; table_size <<= 1)
00778     if (table_size > pat_len)
00779       break;
00780 
00781   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
00782   dfa->state_hash_mask = table_size - 1;
00783 
00784   dfa->subexps_alloc = 1;
00785   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
00786   dfa->word_char = NULL;
00787 
00788   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
00789           || dfa->subexps == NULL, 0))
00790     {
00791       /* We don't bother to free anything which was allocated.  Very
00792          soon the process will go down anyway.  */
00793       dfa->subexps = NULL;
00794       dfa->state_table = NULL;
00795       dfa->nodes = NULL;
00796       return REG_ESPACE;
00797     }
00798   return REG_NOERROR;
00799 }
00800 
00801 /* Initialize WORD_CHAR table, which indicate which character is
00802    "word".  In this case "word" means that it is the word construction
00803    character used by some operators like "<", ">", etc.  */
00804 
00805 static reg_errcode_t
00806 init_word_char (dfa)
00807      re_dfa_t *dfa;
00808 {
00809   int i, j, ch;
00810   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
00811   if (BE (dfa->word_char == NULL, 0))
00812     return REG_ESPACE;
00813   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
00814     for (j = 0; j < UINT_BITS; ++j, ++ch)
00815       if (isalnum (ch) || ch == '_')
00816         dfa->word_char[i] |= 1UL << j;
00817   return REG_NOERROR;
00818 }
00819 
00820 /* Free the work area which are only used while compiling.  */
00821 
00822 static void
00823 free_workarea_compile (preg)
00824      regex_t *preg;
00825 {
00826   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
00827   free_bin_tree (dfa->str_tree);
00828   dfa->str_tree = NULL;
00829   re_free (dfa->org_indices);
00830   dfa->org_indices = NULL;
00831 }
00832 
00833 /* Create initial states for all contexts.  */
00834 
00835 static reg_errcode_t
00836 create_initial_state (dfa)
00837      re_dfa_t *dfa;
00838 {
00839   int first, i;
00840   reg_errcode_t err;
00841   re_node_set init_nodes;
00842 
00843   /* Initial states have the epsilon closure of the node which is
00844      the first node of the regular expression.  */
00845   first = dfa->str_tree->first;
00846   dfa->init_node = first;
00847   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
00848   if (BE (err != REG_NOERROR, 0))
00849     return err;
00850 
00851   /* The back-references which are in initial states can epsilon transit,
00852      since in this case all of the subexpressions can be null.
00853      Then we add epsilon closures of the nodes which are the next nodes of
00854      the back-references.  */
00855   if (dfa->nbackref > 0)
00856     for (i = 0; i < init_nodes.nelem; ++i)
00857       {
00858         int node_idx = init_nodes.elems[i];
00859         re_token_type_t type = dfa->nodes[node_idx].type;
00860 
00861         int clexp_idx;
00862         if (type != OP_BACK_REF)
00863           continue;
00864         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
00865           {
00866             re_token_t *clexp_node;
00867             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
00868             if (clexp_node->type == OP_CLOSE_SUBEXP
00869                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
00870               break;
00871           }
00872         if (clexp_idx == init_nodes.nelem)
00873           continue;
00874 
00875         if (type == OP_BACK_REF)
00876           {
00877             int dest_idx = dfa->edests[node_idx].elems[0];
00878             if (!re_node_set_contains (&init_nodes, dest_idx))
00879               {
00880                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
00881                 i = 0;
00882               }
00883           }
00884       }
00885 
00886   /* It must be the first time to invoke acquire_state.  */
00887   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
00888   /* We don't check ERR here, since the initial state must not be NULL.  */
00889   if (BE (dfa->init_state == NULL, 0))
00890     return err;
00891   if (dfa->init_state->has_constraint)
00892     {
00893       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
00894                                                        CONTEXT_WORD);
00895       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
00896                                                      CONTEXT_NEWLINE);
00897       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
00898                                                          &init_nodes,
00899                                                          CONTEXT_NEWLINE
00900                                                          | CONTEXT_BEGBUF);
00901       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
00902               || dfa->init_state_begbuf == NULL, 0))
00903         return err;
00904     }
00905   else
00906     dfa->init_state_word = dfa->init_state_nl
00907       = dfa->init_state_begbuf = dfa->init_state;
00908 
00909   re_node_set_free (&init_nodes);
00910   return REG_NOERROR;
00911 }
00912 
00913 /* Analyze the structure tree, and calculate "first", "next", "edest",
00914    "eclosure", and "inveclosure".  */
00915 
00916 static reg_errcode_t
00917 analyze (dfa)
00918      re_dfa_t *dfa;
00919 {
00920   int i;
00921   reg_errcode_t ret;
00922 
00923   /* Allocate arrays.  */
00924   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
00925   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
00926   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
00927   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
00928   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
00929   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
00930           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
00931     return REG_ESPACE;
00932   /* Initialize them.  */
00933   for (i = 0; i < dfa->nodes_len; ++i)
00934     {
00935       dfa->nexts[i] = -1;
00936       re_node_set_init_empty (dfa->edests + i);
00937       re_node_set_init_empty (dfa->eclosures + i);
00938       re_node_set_init_empty (dfa->inveclosures + i);
00939     }
00940 
00941   ret = analyze_tree (dfa, dfa->str_tree);
00942   if (BE (ret == REG_NOERROR, 1))
00943     {
00944       ret = calc_eclosure (dfa);
00945       if (ret == REG_NOERROR)
00946         calc_inveclosure (dfa);
00947     }
00948   return ret;
00949 }
00950 
00951 /* Helper functions for analyze.
00952    This function calculate "first", "next", and "edest" for the subtree
00953    whose root is NODE.  */
00954 
00955 static reg_errcode_t
00956 analyze_tree (dfa, node)
00957      re_dfa_t *dfa;
00958      bin_tree_t *node;
00959 {
00960   reg_errcode_t ret;
00961   if (node->first == -1)
00962     calc_first (dfa, node);
00963   if (node->next == -1)
00964     calc_next (dfa, node);
00965   if (node->eclosure.nelem == 0)
00966     calc_epsdest (dfa, node);
00967   /* Calculate "first" etc. for the left child.  */
00968   if (node->left != NULL)
00969     {
00970       ret = analyze_tree (dfa, node->left);
00971       if (BE (ret != REG_NOERROR, 0))
00972         return ret;
00973     }
00974   /* Calculate "first" etc. for the right child.  */
00975   if (node->right != NULL)
00976     {
00977       ret = analyze_tree (dfa, node->right);
00978       if (BE (ret != REG_NOERROR, 0))
00979         return ret;
00980     }
00981   return REG_NOERROR;
00982 }
00983 
00984 /* Calculate "first" for the node NODE.  */
00985 static void
00986 calc_first (dfa, node)
00987      re_dfa_t *dfa;
00988      bin_tree_t *node;
00989 {
00990   int idx, type;
00991   idx = node->node_idx;
00992   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
00993 
00994   switch (type)
00995     {
00996 #ifdef DEBUG
00997     case OP_OPEN_BRACKET:
00998     case OP_CLOSE_BRACKET:
00999     case OP_OPEN_DUP_NUM:
01000     case OP_CLOSE_DUP_NUM:
01001     case OP_NON_MATCH_LIST:
01002     case OP_OPEN_COLL_ELEM:
01003     case OP_CLOSE_COLL_ELEM:
01004     case OP_OPEN_EQUIV_CLASS:
01005     case OP_CLOSE_EQUIV_CLASS:
01006     case OP_OPEN_CHAR_CLASS:
01007     case OP_CLOSE_CHAR_CLASS:
01008       /* These must not be appeared here.  */
01009       assert (0);
01010 #endif
01011     case END_OF_RE:
01012     case CHARACTER:
01013     case OP_PERIOD:
01014     case OP_DUP_ASTERISK:
01015     case OP_DUP_QUESTION:
01016 #ifdef RE_ENABLE_I18N
01017     case COMPLEX_BRACKET:
01018 #endif /* RE_ENABLE_I18N */
01019     case SIMPLE_BRACKET:
01020     case OP_BACK_REF:
01021     case ANCHOR:
01022     case OP_OPEN_SUBEXP:
01023     case OP_CLOSE_SUBEXP:
01024       node->first = idx;
01025       break;
01026     case OP_DUP_PLUS:
01027 #ifdef DEBUG
01028       assert (node->left != NULL);
01029 #endif
01030       if (node->left->first == -1)
01031         calc_first (dfa, node->left);
01032       node->first = node->left->first;
01033       break;
01034     case OP_ALT:
01035       node->first = idx;
01036       break;
01037       /* else fall through */
01038     default:
01039 #ifdef DEBUG
01040       assert (node->left != NULL);
01041 #endif
01042       if (node->left->first == -1)
01043         calc_first (dfa, node->left);
01044       node->first = node->left->first;
01045       break;
01046     }
01047 }
01048 
01049 /* Calculate "next" for the node NODE.  */
01050 
01051 static void
01052 calc_next (dfa, node)
01053      re_dfa_t *dfa;
01054      bin_tree_t *node;
01055 {
01056   int idx, type;
01057   bin_tree_t *parent = node->parent;
01058   if (parent == NULL)
01059     {
01060       node->next = -1;
01061       idx = node->node_idx;
01062       if (node->type == 0)
01063         dfa->nexts[idx] = node->next;
01064       return;
01065     }
01066 
01067   idx = parent->node_idx;
01068   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
01069 
01070   switch (type)
01071     {
01072     case OP_DUP_ASTERISK:
01073     case OP_DUP_PLUS:
01074       node->next = idx;
01075       break;
01076     case CONCAT:
01077       if (parent->left == node)
01078         {
01079           if (parent->right->first == -1)
01080             calc_first (dfa, parent->right);
01081           node->next = parent->right->first;