Main Page | Class List | Directories | File List | Class Members | File Members

regex_internal.h

Go to the documentation of this file.
00001 /* Extended regular expression matching and search library.
00002    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 #ifndef _REGEX_INTERNAL_H
00022 #define _REGEX_INTERNAL_H 1
00023 
00024 #ifdef HAVE_CONFIG_H
00025 #include "config.h"
00026 #endif
00027 
00028 #include <assert.h>
00029 #include <ctype.h>
00030 #if 0
00031 /* Don't include this here. On some systems it sets RE_DUP_MAX to a
00032  * lower value than GNU regex allows.  Instead, include it in
00033  * regex.c, before include of <regex.h>, which correctly
00034  * #undefs RE_DUP_MAX and sets it to the right value.
00035  */
00036 #include <limits.h>
00037 #endif
00038 #include <stdio.h>
00039 #include <stdlib.h>
00040 #include <string.h>
00041 
00042 #if defined HAVE_LOCALE_H || defined _LIBC
00043 # include <locale.h>
00044 #endif
00045 #if defined HAVE_WCHAR_H || defined _LIBC
00046 # include <wchar.h>
00047 #endif /* HAVE_WCHAR_H || _LIBC */
00048 #if defined HAVE_WCTYPE_H || defined _LIBC
00049 # include <wctype.h>
00050 #endif /* HAVE_WCTYPE_H || _LIBC */
00051 
00052 /* In case that the system doesn't have isblank().  */
00053 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
00054 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
00055 #endif
00056 
00057 #ifdef _LIBC
00058 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
00059 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
00060 #   include <locale/localeinfo.h>
00061 #   include <locale/elem-hash.h>
00062 #   include <locale/coll-lookup.h>
00063 # endif
00064 #endif
00065 
00066 /* This is for other GNU distributions with internationalized messages.  */
00067 #if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
00068 # include <libintl.h>
00069 # ifdef _LIBC
00070 #  undef gettext
00071 #  define gettext(msgid) \
00072   INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
00073 # endif
00074 #else
00075 # define gettext(msgid) (msgid)
00076 #endif
00077 
00078 #ifndef gettext_noop
00079 /* This define is so xgettext can find the internationalizable
00080    strings.  */
00081 # define gettext_noop(String) String
00082 #endif
00083 
00084 #if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC
00085 # define RE_ENABLE_I18N
00086 #endif
00087 
00088 #if __GNUC__ >= 3
00089 # define BE(expr, val) __builtin_expect (expr, val)
00090 #else
00091 # define BE(expr, val) (expr)
00092 # define inline
00093 #endif
00094 
00095 /* Number of bits in a byte.  */
00096 #define BYTE_BITS 8
00097 /* Number of single byte character.  */
00098 #define SBC_MAX 256
00099 
00100 #define COLL_ELEM_LEN_MAX 8
00101 
00102 /* The character which represents newline.  */
00103 #define NEWLINE_CHAR '\n'
00104 #define WIDE_NEWLINE_CHAR L'\n'
00105 
00106 /* Rename to standard API for using out of glibc.  */
00107 #ifndef _LIBC
00108 # define __wctype wctype
00109 # define __iswctype iswctype
00110 # define __btowc btowc
00111 # define __wcrtomb wcrtomb
00112 # define attribute_hidden
00113 # define __thread
00114 #endif /* not _LIBC */
00115 
00116 #if _LIBC || __GNUC__ >= 3
00117 # define BE(expr, val) __builtin_expect (expr, val)
00118 #else
00119 # define BE(expr, val) (expr)
00120 # define inline
00121 #endif
00122 
00123 #ifdef RE_ENABLE_I18N
00124 __thread int re_mb_cur_max = 1;
00125 #endif
00126 
00127 extern const char __re_error_msgid[] attribute_hidden;
00128 extern const size_t __re_error_msgid_idx[] attribute_hidden;
00129 
00130 /* Number of bits in an unsinged int.  */
00131 #define UINT_BITS (sizeof (unsigned int) * BYTE_BITS)
00132 /* Number of unsigned int in an bit_set.  */
00133 #define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS)
00134 typedef unsigned int bitset[BITSET_UINTS];
00135 typedef unsigned int *re_bitset_ptr_t;
00136 
00137 #define bitset_set(set,i) (set[i / UINT_BITS] |= 1UL << i % UINT_BITS)
00138 #define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1UL << i % UINT_BITS))
00139 #define bitset_contain(set,i) (set[i / UINT_BITS] & (1UL << i % UINT_BITS))
00140 #define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS)
00141 #define bitset_set_all(set) \
00142   memset (set, 255, sizeof (unsigned int) * BITSET_UINTS)
00143 #define bitset_copy(dest,src) \
00144   memcpy (dest, src, sizeof (unsigned int) * BITSET_UINTS)
00145 static inline void bitset_not _RE_ARGS((bitset set));
00146 static inline void bitset_merge _RE_ARGS((bitset dest, const bitset src));
00147 static inline void bitset_not_merge _RE_ARGS((bitset dest, const bitset src));
00148 
00149 #define PREV_WORD_CONSTRAINT 0x0001
00150 #define PREV_NOTWORD_CONSTRAINT 0x0002
00151 #define NEXT_WORD_CONSTRAINT 0x0004
00152 #define NEXT_NOTWORD_CONSTRAINT 0x0008
00153 #define PREV_NEWLINE_CONSTRAINT 0x0010
00154 #define NEXT_NEWLINE_CONSTRAINT 0x0020
00155 #define PREV_BEGBUF_CONSTRAINT 0x0040
00156 #define NEXT_ENDBUF_CONSTRAINT 0x0080
00157 #define DUMMY_CONSTRAINT 0x0100
00158 
00159 typedef enum
00160 {
00161   INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
00162   WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
00163   WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
00164   LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
00165   LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
00166   BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
00167   BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
00168   WORD_DELIM = DUMMY_CONSTRAINT
00169 } re_context_type;
00170 
00171 typedef struct
00172 {
00173   int alloc;
00174   int nelem;
00175   int *elems;
00176 } re_node_set;
00177 
00178 typedef enum
00179 {
00180   NON_TYPE = 0,
00181 
00182   /* Token type, these are used only by token.  */
00183   OP_OPEN_BRACKET,
00184   OP_CLOSE_BRACKET,
00185   OP_CHARSET_RANGE,
00186   OP_OPEN_DUP_NUM,
00187   OP_CLOSE_DUP_NUM,
00188   OP_NON_MATCH_LIST,
00189   OP_OPEN_COLL_ELEM,
00190   OP_CLOSE_COLL_ELEM,
00191   OP_OPEN_EQUIV_CLASS,
00192   OP_CLOSE_EQUIV_CLASS,
00193   OP_OPEN_CHAR_CLASS,
00194   OP_CLOSE_CHAR_CLASS,
00195   OP_WORD,
00196   OP_NOTWORD,
00197   BACK_SLASH,
00198 
00199   /* Tree type, these are used only by tree. */
00200   CONCAT,
00201   ALT,
00202   SUBEXP,
00203   SIMPLE_BRACKET,
00204 #ifdef RE_ENABLE_I18N
00205   COMPLEX_BRACKET,
00206 #endif /* RE_ENABLE_I18N */
00207 
00208   /* Node type, These are used by token, node, tree.  */
00209   OP_OPEN_SUBEXP,
00210   OP_CLOSE_SUBEXP,
00211   OP_PERIOD,
00212   CHARACTER,
00213   END_OF_RE,
00214   OP_ALT,
00215   OP_DUP_ASTERISK,
00216   OP_DUP_PLUS,
00217   OP_DUP_QUESTION,
00218   OP_BACK_REF,
00219   ANCHOR,
00220 
00221   /* Dummy marker.  */
00222   END_OF_RE_TOKEN_T
00223 } re_token_type_t;
00224 
00225 #ifdef RE_ENABLE_I18N
00226 typedef struct
00227 {
00228   /* Multibyte characters.  */
00229   wchar_t *mbchars;
00230 
00231   /* Collating symbols.  */
00232 # ifdef _LIBC
00233   int32_t *coll_syms;
00234 # endif
00235 
00236   /* Equivalence classes. */
00237 # ifdef _LIBC
00238   int32_t *equiv_classes;
00239 # endif
00240 
00241   /* Range expressions. */
00242 # ifdef _LIBC
00243   uint32_t *range_starts;
00244   uint32_t *range_ends;
00245 # else /* not _LIBC */
00246   wchar_t *range_starts;
00247   wchar_t *range_ends;
00248 # endif /* not _LIBC */
00249 
00250   /* Character classes. */
00251   wctype_t *char_classes;
00252 
00253   /* If this character set is the non-matching list.  */
00254   unsigned int non_match : 1;
00255 
00256   /* # of multibyte characters.  */
00257   int nmbchars;
00258 
00259   /* # of collating symbols.  */
00260   int ncoll_syms;
00261 
00262   /* # of equivalence classes. */
00263   int nequiv_classes;
00264 
00265   /* # of range expressions. */
00266   int nranges;
00267 
00268   /* # of character classes. */
00269   int nchar_classes;
00270 } re_charset_t;
00271 #endif /* RE_ENABLE_I18N */
00272 
00273 typedef struct
00274 {
00275   union
00276   {
00277     unsigned char c;            /* for CHARACTER */
00278     re_bitset_ptr_t sbcset;     /* for SIMPLE_BRACKET */
00279 #ifdef RE_ENABLE_I18N
00280     re_charset_t *mbcset;       /* for COMPLEX_BRACKET */
00281 #endif /* RE_ENABLE_I18N */
00282     int idx;                    /* for BACK_REF */
00283     re_context_type ctx_type;   /* for ANCHOR */
00284   } opr;
00285 #if __GNUC__ >= 2
00286   re_token_type_t type : 8;
00287 #else
00288   re_token_type_t type;
00289 #endif
00290   unsigned int constraint : 10; /* context constraint */
00291   unsigned int duplicated : 1;
00292 #ifdef RE_ENABLE_I18N
00293   unsigned int mb_partial : 1;
00294 #endif
00295 } re_token_t;
00296 
00297 #define IS_EPSILON_NODE(type) \
00298   ((type) == OP_ALT || (type) == OP_DUP_ASTERISK || (type) == OP_DUP_PLUS \
00299    || (type) == OP_DUP_QUESTION || (type) == ANCHOR \
00300    || (type) == OP_OPEN_SUBEXP || (type) == OP_CLOSE_SUBEXP)
00301 
00302 #define ACCEPT_MB_NODE(type) \
00303   ((type) == COMPLEX_BRACKET || (type) == OP_PERIOD)
00304 
00305 struct re_string_t
00306 {
00307   /* Indicate the raw buffer which is the original string passed as an
00308      argument of regexec(), re_search(), etc..  */
00309   const unsigned char *raw_mbs;
00310   /* Store the multibyte string.  In case of "case insensitive mode" like
00311      REG_ICASE, upper cases of the string are stored, otherwise MBS points
00312      the same address that RAW_MBS points.  */
00313   unsigned char *mbs;
00314   /* Store the case sensitive multibyte string.  In case of
00315      "case insensitive mode", the original string are stored,
00316      otherwise MBS_CASE points the same address that MBS points.  */
00317   unsigned char *mbs_case;
00318 #ifdef RE_ENABLE_I18N
00319   /* Store the wide character string which is corresponding to MBS.  */
00320   wint_t *wcs;
00321   mbstate_t cur_state;
00322 #endif
00323   /* Index in RAW_MBS.  Each character mbs[i] corresponds to
00324      raw_mbs[raw_mbs_idx + i].  */
00325   int raw_mbs_idx;
00326   /* The length of the valid characters in the buffers.  */
00327   int valid_len;
00328   /* The length of the buffers MBS, MBS_CASE, and WCS.  */
00329   int bufs_len;
00330   /* The index in MBS, which is updated by re_string_fetch_byte.  */
00331   int cur_idx;
00332   /* This is length_of_RAW_MBS - RAW_MBS_IDX.  */
00333   int len;
00334   /* End of the buffer may be shorter than its length in the cases such
00335      as re_match_2, re_search_2.  Then, we use STOP for end of the buffer
00336      instead of LEN.  */
00337   int stop;
00338 
00339   /* The context of mbs[0].  We store the context independently, since
00340      the context of mbs[0] may be different from raw_mbs[0], which is
00341      the beginning of the input string.  */
00342   unsigned int tip_context;
00343   /* The translation passed as a part of an argument of re_compile_pattern.  */
00344   RE_TRANSLATE_TYPE trans;
00345   /* 1 if REG_ICASE.  */
00346   unsigned int icase : 1;
00347 };
00348 typedef struct re_string_t re_string_t;
00349 /* In case of REG_ICASE, we allocate the buffer dynamically for mbs.  */
00350 #define MBS_ALLOCATED(pstr) (pstr->icase)
00351 /* In case that we need translation, we allocate the buffer dynamically
00352    for mbs_case.  Note that mbs == mbs_case if not REG_ICASE.  */
00353 #define MBS_CASE_ALLOCATED(pstr) (pstr->trans != NULL)
00354 
00355 static reg_errcode_t re_string_allocate _RE_ARGS((re_string_t *pstr, const char *str,
00356                                          int len, int init_len,
00357                                   RE_TRANSLATE_TYPE trans, int icase));
00358 static reg_errcode_t re_string_construct _RE_ARGS((re_string_t *pstr, const char *str, int len,
00359                                    RE_TRANSLATE_TYPE trans, int icase));
00360 static reg_errcode_t re_string_reconstruct _RE_ARGS((re_string_t *pstr, int idx,
00361                                      int eflags, int newline));
00362 static reg_errcode_t re_string_realloc_buffers _RE_ARGS((re_string_t *pstr, int new_buf_len));
00363 #ifdef RE_ENABLE_I18N
00364 static void build_wcs_buffer _RE_ARGS((re_string_t *pstr));
00365 static void build_wcs_upper_buffer _RE_ARGS((re_string_t *pstr));
00366 #endif /* RE_ENABLE_I18N */
00367 static void build_upper_buffer _RE_ARGS((re_string_t *pstr));
00368 static void re_string_translate_buffer _RE_ARGS((re_string_t *pstr));
00369 static void re_string_destruct _RE_ARGS((re_string_t *pstr));
00370 #ifdef RE_ENABLE_I18N
00371 static int re_string_elem_size_at _RE_ARGS((const re_string_t *pstr, int idx));
00372 static inline int re_string_char_size_at _RE_ARGS((const re_string_t *pstr, int idx));
00373 static inline wint_t re_string_wchar_at _RE_ARGS((const re_string_t *pstr, int idx));
00374 #endif /* RE_ENABLE_I18N */
00375 static unsigned int re_string_context_at _RE_ARGS((const re_string_t *input, int idx,
00376                                    int eflags, int newline_anchor));
00377 #define re_string_peek_byte(pstr, offset) \
00378   ((pstr)->mbs[(pstr)->cur_idx + offset])
00379 #define re_string_peek_byte_case(pstr, offset) \
00380   ((pstr)->mbs_case[(pstr)->cur_idx + offset])
00381 #define re_string_fetch_byte(pstr) \
00382   ((pstr)->mbs[(pstr)->cur_idx++])
00383 #define re_string_fetch_byte_case(pstr) \
00384   ((pstr)->mbs_case[(pstr)->cur_idx++])
00385 #define re_string_first_byte(pstr, idx) \
00386   ((idx) == (pstr)->len || (pstr)->wcs[idx] != WEOF)
00387 #define re_string_is_single_byte_char(pstr, idx) \
00388   ((pstr)->wcs[idx] != WEOF && ((pstr)->len == (idx) \
00389                                 || (pstr)->wcs[(idx) + 1] != WEOF))
00390 #define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
00391 #define re_string_cur_idx(pstr) ((pstr)->cur_idx)
00392 #define re_string_get_buffer(pstr) ((pstr)->mbs)
00393 #define re_string_length(pstr) ((pstr)->len)
00394 #define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])
00395 #define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
00396 #define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
00397 
00398 #define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
00399 #define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
00400 #define re_free(p) free (p)
00401 
00402 struct bin_tree_t
00403 {
00404   struct bin_tree_t *parent;
00405   struct bin_tree_t *left;
00406   struct bin_tree_t *right;
00407 
00408   /* `node_idx' is the index in dfa->nodes, if `type' == 0.
00409      Otherwise `type' indicate the type of this node.  */
00410   re_token_type_t type;
00411   int node_idx;
00412 
00413   int first;
00414   int next;
00415   re_node_set eclosure;
00416 };
00417 typedef struct bin_tree_t bin_tree_t;
00418 
00419 
00420 #define CONTEXT_WORD 1
00421 #define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
00422 #define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
00423 #define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)
00424 
00425 #define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
00426 #define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
00427 #define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
00428 #define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
00429 #define IS_ORDINARY_CONTEXT(c) ((c) == 0)
00430 
00431 #define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
00432 #define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
00433 #define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
00434 #define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)
00435 
00436 #define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
00437  ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
00438   || ((constraint & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
00439   || ((constraint & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\
00440   || ((constraint & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))
00441 
00442 #define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
00443  ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
00444   || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
00445   || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
00446   || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
00447 
00448 struct re_dfastate_t
00449 {
00450   unsigned int hash;
00451   re_node_set nodes;
00452   re_node_set *entrance_nodes;
00453   struct re_dfastate_t **trtable;
00454   struct re_dfastate_t **trtable_search;
00455   /* If this state is a special state.
00456      A state is a special state if the state is the halt state, or
00457      a anchor.  */
00458   unsigned int context : 2;
00459   unsigned int halt : 1;
00460   /* If this state can accept `multi byte'.
00461      Note that we refer to multibyte characters, and multi character
00462      collating elements as `multi byte'.  */
00463   unsigned int accept_mb : 1;
00464   /* If this state has backreference node(s).  */
00465   unsigned int has_backref : 1;
00466   unsigned int has_constraint : 1;
00467 };
00468 typedef struct re_dfastate_t re_dfastate_t;
00469 
00470 typedef struct
00471 {
00472   /* start <= node < end  */
00473   int start;
00474   int end;
00475 } re_subexp_t;
00476 
00477 struct re_state_table_entry
00478 {
00479   int num;
00480   int alloc;
00481   re_dfastate_t **array;
00482 };
00483 
00484 /* Array type used in re_sub_match_last_t and re_sub_match_top_t.  */
00485 
00486 typedef struct
00487 {
00488   int next_idx;
00489   int alloc;
00490   re_dfastate_t **array;
00491 } state_array_t;
00492 
00493 /* Store information about the node NODE whose type is OP_CLOSE_SUBEXP.  */
00494 
00495 typedef struct
00496 {
00497   int node;
00498   int str_idx; /* The position NODE match at.  */
00499   state_array_t path;
00500 } re_sub_match_last_t;
00501 
00502 /* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
00503    And information about the node, whose type is OP_CLOSE_SUBEXP,
00504    corresponding to NODE is stored in LASTS.  */
00505 
00506 typedef struct
00507 {
00508   int str_idx;
00509   int node;
00510   int next_last_offset;
00511   state_array_t *path;
00512   int alasts; /* Allocation size of LASTS.  */
00513   int nlasts; /* The number of LASTS.  */
00514   re_sub_match_last_t **lasts;
00515 } re_sub_match_top_t;
00516 
00517 struct re_backref_cache_entry
00518 {
00519   int node;
00520   int str_idx;
00521   int subexp_from;
00522   int subexp_to;
00523   int flag;
00524 };
00525 
00526 typedef struct
00527 {
00528   /* EFLAGS of the argument of regexec.  */
00529   int eflags;
00530   /* Where the matching ends.  */
00531   int match_last;
00532   int last_node;
00533   /* The string object corresponding to the input string.  */
00534   re_string_t *input;
00535   /* The state log used by the matcher.  */
00536   re_dfastate_t **state_log;
00537   int state_log_top;
00538   /* Back reference cache.  */
00539   int nbkref_ents;
00540   int abkref_ents;
00541   struct re_backref_cache_entry *bkref_ents;
00542   int max_mb_elem_len;
00543   int nsub_tops;
00544   int asub_tops;
00545   re_sub_match_top_t **sub_tops;
00546 } re_match_context_t;
00547 
00548 typedef struct
00549 {
00550   int cur_bkref;
00551   int cls_subexp_idx;
00552 
00553   re_dfastate_t **sifted_states;
00554   re_dfastate_t **limited_states;
00555 
00556   re_node_set limits;
00557 
00558   int last_node;
00559   int last_str_idx;
00560   int check_subexp;
00561 } re_sift_context_t;
00562 
00563 struct re_fail_stack_ent_t
00564 {
00565   int idx;
00566   int node;
00567   regmatch_t *regs;
00568   re_node_set eps_via_nodes;
00569 };
00570 
00571 struct re_fail_stack_t
00572 {
00573   int num;
00574   int alloc;
00575   struct re_fail_stack_ent_t *stack;
00576 };
00577 
00578 struct re_dfa_t
00579 {
00580   re_bitset_ptr_t word_char;
00581 
00582   /* number of subexpressions `re_nsub' is in regex_t.  */
00583   int subexps_alloc;
00584   re_subexp_t *subexps;
00585 
00586   re_token_t *nodes;
00587   int nodes_alloc;
00588   int nodes_len;
00589   bin_tree_t *str_tree;
00590   int *nexts;
00591   int *org_indices;
00592   re_node_set *edests;
00593   re_node_set *eclosures;
00594   re_node_set *inveclosures;
00595   struct re_state_table_entry *state_table;
00596   unsigned int state_hash_mask;
00597   re_dfastate_t *init_state;
00598   re_dfastate_t *init_state_word;
00599   re_dfastate_t *init_state_nl;
00600   re_dfastate_t *init_state_begbuf;
00601   int states_alloc;
00602   int init_node;
00603   int nbackref; /* The number of backreference in this dfa.  */
00604   /* Bitmap expressing which backreference is used.  */
00605   unsigned int used_bkref_map;
00606 #ifdef DEBUG
00607   char* re_str;
00608 #endif
00609   unsigned int has_plural_match : 1;
00610   /* If this dfa has "multibyte node", which is a backreference or
00611      a node which can accept multibyte character or multi character
00612      collating element.  */
00613   unsigned int has_mb_node : 1;
00614 };
00615 typedef struct re_dfa_t re_dfa_t;
00616 
00617 static reg_errcode_t re_node_set_alloc _RE_ARGS((re_node_set *set, int size));
00618 static reg_errcode_t re_node_set_init_1 _RE_ARGS((re_node_set *set, int elem));
00619 static reg_errcode_t re_node_set_init_2 _RE_ARGS((re_node_set *set, int elem1, int elem2));
00620 #define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
00621 static reg_errcode_t re_node_set_init_copy _RE_ARGS((re_node_set *dest,
00622                                             const re_node_set *src));
00623 static reg_errcode_t re_node_set_add_intersect _RE_ARGS((re_node_set *dest,
00624                                                 const re_node_set *src1,
00625                                          const re_node_set *src2));
00626 static reg_errcode_t re_node_set_init_union _RE_ARGS((re_node_set *dest,
00627                                              const re_node_set *src1,
00628                                       const re_node_set *src2));
00629 static reg_errcode_t re_node_set_merge _RE_ARGS((re_node_set *dest, const re_node_set *src));
00630 static int re_node_set_insert _RE_ARGS((re_node_set *set, int elem));
00631 static int re_node_set_compare _RE_ARGS((const re_node_set *set1, const re_node_set *set2));
00632 static int re_node_set_contains _RE_ARGS((const re_node_set *set, int elem));
00633 static void re_node_set_remove_at _RE_ARGS((re_node_set *set, int idx));
00634 #define re_node_set_remove(set,id) \
00635   (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
00636 #define re_node_set_empty(p) ((p)->nelem = 0)
00637 #define re_node_set_free(set) re_free ((set)->elems)
00638 static int re_dfa_add_node _RE_ARGS((re_dfa_t *dfa, re_token_t token, int mode));
00639 static re_dfastate_t *re_acquire_state _RE_ARGS((reg_errcode_t *err, re_dfa_t *dfa,
00640                                  const re_node_set *nodes));
00641 static re_dfastate_t *re_acquire_state_context _RE_ARGS((reg_errcode_t *err, re_dfa_t *dfa,
00642                                                 const re_node_set *nodes,
00643                                          unsigned int context));
00644 static void free_state _RE_ARGS((re_dfastate_t *state));
00645 
00646 
00647 typedef enum
00648 {
00649   SB_CHAR,
00650   MB_CHAR,
00651   EQUIV_CLASS,
00652   COLL_SYM,
00653   CHAR_CLASS
00654 } bracket_elem_type;
00655 
00656 typedef struct
00657 {
00658   bracket_elem_type type;
00659   union
00660   {
00661     unsigned char ch;
00662     unsigned char *name;
00663     wchar_t wch;
00664   } opr;
00665 } bracket_elem_t;
00666 
00667 
00668 /* Inline functions for bitset operation.  */
00669 static inline void
00670 bitset_not (set)
00671      bitset set;
00672 {
00673   int bitset_i;
00674   for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i)
00675     set[bitset_i] = ~set[bitset_i];
00676 }
00677 
00678 static inline void
00679 bitset_merge (dest, src)
00680      bitset dest;
00681      const bitset src;
00682 {
00683   int bitset_i;
00684   for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i)
00685     dest[bitset_i] |= src[bitset_i];
00686 }
00687 
00688 static inline void
00689 bitset_not_merge (dest, src)
00690      bitset dest;
00691      const bitset src;
00692 {
00693   int i;
00694   for (i = 0; i < BITSET_UINTS; ++i)
00695     dest[i] |= ~src[i];
00696 }
00697 
00698 #ifdef RE_ENABLE_I18N
00699 /* Inline functions for re_string.  */
00700 static inline int
00701 re_string_char_size_at (pstr, idx)
00702      const re_string_t *pstr;
00703      int idx;
00704 {
00705   int byte_idx;
00706   if (re_mb_cur_max == 1)
00707     return 1;
00708   for (byte_idx = 1; idx + byte_idx < pstr->len; ++byte_idx)
00709     if (pstr->wcs[idx + byte_idx] != WEOF)
00710       break;
00711   return byte_idx;
00712 }
00713 
00714 static inline wint_t
00715 re_string_wchar_at (pstr, idx)
00716      const re_string_t *pstr;
00717      int idx;
00718 {
00719   if (re_mb_cur_max == 1)
00720     return (wint_t) pstr->mbs[idx];
00721   return (wint_t) pstr->wcs[idx];
00722 }
00723 
00724 static int
00725 re_string_elem_size_at (pstr, idx)
00726      const re_string_t *pstr;
00727      int idx;
00728 {
00729 #ifdef _LIBC
00730   const unsigned char *p, *extra;
00731   const int32_t *table, *indirect;
00732   int32_t tmp;
00733 # include <locale/weight.h>
00734   uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
00735 
00736   if (nrules != 0)
00737     {
00738       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
00739       extra = (const unsigned char *)
00740         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
00741       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
00742                                                 _NL_COLLATE_INDIRECTMB);
00743       p = pstr->mbs + idx;
00744       tmp = findidx (&p);
00745       return p - pstr->mbs - idx;
00746     }
00747   else
00748 #endif /* _LIBC */
00749     return 1;
00750 }
00751 #endif /* RE_ENABLE_I18N */
00752 
00753 #endif /*  _REGEX_INTERNAL_H */

© sourcejam.com 2005-2008