Main Page | Class List | Directories | File List | Class Members | File Members

regex_internal.c

Go to the documentation of this file.
00001 /* Extended regular expression matching and search library.
00002    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
00003    This file is part of the GNU C Library.
00004    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
00005 
00006    The GNU C Library is free software; you can redistribute it and/or
00007    modify it under the terms of the GNU Lesser General Public
00008    License as published by the Free Software Foundation; either
00009    version 2.1 of the License, or (at your option) any later version.
00010 
00011    The GNU C Library is distributed in the hope that it will be useful,
00012    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014    Lesser General Public License for more details.
00015 
00016    You should have received a copy of the GNU Lesser General Public
00017    License along with the GNU C Library; if not, write to the Free
00018    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
00019    02111-1307 USA.  */
00020 
00021 static void re_string_construct_common _RE_ARGS((const char *str, int len,
00022                                         re_string_t *pstr,
00023                                         RE_TRANSLATE_TYPE trans, int icase));
00024 #ifdef RE_ENABLE_I18N
00025 static int re_string_skip_chars _RE_ARGS((re_string_t *pstr, int new_raw_idx,
00026                                  wint_t *last_wc));
00027 #endif /* RE_ENABLE_I18N */
00028 static re_dfastate_t *create_newstate_common _RE_ARGS((re_dfa_t *dfa,
00029                                               const re_node_set *nodes,
00030                                               unsigned int hash));
00031 static reg_errcode_t register_state _RE_ARGS((re_dfa_t *dfa, re_dfastate_t *newstate,
00032                                      unsigned int hash));
00033 static re_dfastate_t *create_ci_newstate _RE_ARGS((re_dfa_t *dfa,
00034                                           const re_node_set *nodes,
00035                                           unsigned int hash));
00036 static re_dfastate_t *create_cd_newstate _RE_ARGS((re_dfa_t *dfa,
00037                                           const re_node_set *nodes,
00038                                           unsigned int context,
00039                                           unsigned int hash));
00040 static inline unsigned int calc_state_hash _RE_ARGS((const re_node_set *nodes,
00041                                             unsigned int context));
00042 
00043 /* Functions for string operation.  */
00044 
00045 /* This function allocate the buffers.  It is necessary to call
00046    re_string_reconstruct before using the object.  */
00047 
00048 static reg_errcode_t
00049 re_string_allocate (pstr, str, len, init_len, trans, icase)
00050      re_string_t *pstr;
00051      const char *str;
00052      int len, init_len, icase;
00053      RE_TRANSLATE_TYPE trans;
00054 {
00055   reg_errcode_t ret;
00056   int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
00057   re_string_construct_common (str, len, pstr, trans, icase);
00058   pstr->stop = pstr->len;
00059 
00060   ret = re_string_realloc_buffers (pstr, init_buf_len);
00061   if (BE (ret != REG_NOERROR, 0))
00062     return ret;
00063 
00064   pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
00065                     : (unsigned char *) str);
00066   pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
00067   pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr)
00068 #ifdef RE_ENABLE_I18N
00069                      || re_mb_cur_max > 1
00070 #endif
00071                      ) ? pstr->valid_len : len;
00072   return REG_NOERROR;
00073 }
00074 
00075 /* This function allocate the buffers, and initialize them.  */
00076 
00077 static reg_errcode_t
00078 re_string_construct (pstr, str, len, trans, icase)
00079      re_string_t *pstr;
00080      const char *str;
00081      int len, icase;
00082      RE_TRANSLATE_TYPE trans;
00083 {
00084   reg_errcode_t ret;
00085   re_string_construct_common (str, len, pstr, trans, icase);
00086   pstr->stop = pstr->len;
00087   /* Set 0 so that this function can initialize whole buffers.  */
00088   pstr->valid_len = 0;
00089 
00090   if (len > 0)
00091     {
00092       ret = re_string_realloc_buffers (pstr, len + 1);
00093       if (BE (ret != REG_NOERROR, 0))
00094         return ret;
00095     }
00096   pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case
00097                     : (unsigned char *) str);
00098   pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case;
00099 
00100   if (icase)
00101     {
00102 #ifdef RE_ENABLE_I18N
00103       if (re_mb_cur_max > 1)
00104         build_wcs_upper_buffer (pstr);
00105       else
00106 #endif /* RE_ENABLE_I18N  */
00107         build_upper_buffer (pstr);
00108     }
00109   else
00110     {
00111 #ifdef RE_ENABLE_I18N
00112       if (re_mb_cur_max > 1)
00113         build_wcs_buffer (pstr);
00114       else
00115 #endif /* RE_ENABLE_I18N  */
00116         {
00117           if (trans != NULL)
00118             re_string_translate_buffer (pstr);
00119           else
00120             pstr->valid_len = len;
00121         }
00122     }
00123 
00124   /* Initialized whole buffers, then valid_len == bufs_len.  */
00125   pstr->valid_len = pstr->bufs_len;
00126   return REG_NOERROR;
00127 }
00128 
00129 /* Helper functions for re_string_allocate, and re_string_construct.  */
00130 
00131 static reg_errcode_t
00132 re_string_realloc_buffers (pstr, new_buf_len)
00133      re_string_t *pstr;
00134      int new_buf_len;
00135 {
00136 #ifdef RE_ENABLE_I18N
00137   if (re_mb_cur_max > 1)
00138     {
00139       wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
00140       if (BE (new_array == NULL, 0))
00141         return REG_ESPACE;
00142       pstr->wcs = new_array;
00143     }
00144 #endif /* RE_ENABLE_I18N  */
00145   if (MBS_ALLOCATED (pstr))
00146     {
00147       unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
00148                                              new_buf_len);
00149       if (BE (new_array == NULL, 0))
00150         return REG_ESPACE;
00151       pstr->mbs = new_array;
00152     }
00153   if (MBS_CASE_ALLOCATED (pstr))
00154     {
00155       unsigned char *new_array = re_realloc (pstr->mbs_case, unsigned char,
00156                                              new_buf_len);
00157       if (BE (new_array == NULL, 0))
00158         return REG_ESPACE;
00159       pstr->mbs_case = new_array;
00160       if (!MBS_ALLOCATED (pstr))
00161         pstr->mbs = pstr->mbs_case;
00162     }
00163   pstr->bufs_len = new_buf_len;
00164   return REG_NOERROR;
00165 }
00166 
00167 
00168 static void
00169 re_string_construct_common (str, len, pstr, trans, icase)
00170      const char *str;
00171      int len;
00172      re_string_t *pstr;
00173      RE_TRANSLATE_TYPE trans;
00174      int icase;
00175 {
00176   memset (pstr, '\0', sizeof (re_string_t));
00177   pstr->raw_mbs = (const unsigned char *) str;
00178   pstr->len = len;
00179   pstr->trans = trans;
00180   pstr->icase = icase ? 1 : 0;
00181 }
00182 
00183 #ifdef RE_ENABLE_I18N
00184 
00185 /* Build wide character buffer PSTR->WCS.
00186    If the byte sequence of the string are:
00187      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
00188    Then wide character buffer will be:
00189      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
00190    We use WEOF for padding, they indicate that the position isn't
00191    a first byte of a multibyte character.
00192 
00193    Note that this function assumes PSTR->VALID_LEN elements are already
00194    built and starts from PSTR->VALID_LEN.  */
00195 
00196 static void
00197 build_wcs_buffer (pstr)
00198      re_string_t *pstr;
00199 {
00200   mbstate_t prev_st;
00201   int byte_idx, end_idx, mbclen, remain_len;
00202   /* Build the buffers from pstr->valid_len to either pstr->len or
00203      pstr->bufs_len.  */
00204   end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
00205   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
00206     {
00207       wchar_t wc;
00208       remain_len = end_idx - byte_idx;
00209       prev_st = pstr->cur_state;
00210       mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
00211                               + byte_idx), remain_len, &pstr->cur_state);
00212       if (BE (mbclen == (size_t) -2, 0))
00213         {
00214           /* The buffer doesn't have enough space, finish to build.  */
00215           pstr->cur_state = prev_st;
00216           break;
00217         }
00218       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
00219         {
00220           /* We treat these cases as a singlebyte character.  */
00221           mbclen = 1;
00222           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
00223           pstr->cur_state = prev_st;
00224         }
00225 
00226       /* Apply the translateion if we need.  */
00227       if (pstr->trans != NULL && mbclen == 1)
00228         {
00229           int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]];
00230           pstr->mbs_case[byte_idx] = ch;
00231         }
00232       /* Write wide character and padding.  */
00233       pstr->wcs[byte_idx++] = wc;
00234       /* Write paddings.  */
00235       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
00236         pstr->wcs[byte_idx++] = WEOF;
00237     }
00238   pstr->valid_len = byte_idx;
00239 }
00240 
00241 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
00242    but for REG_ICASE.  */
00243 
00244 static void
00245 build_wcs_upper_buffer (pstr)
00246      re_string_t *pstr;
00247 {
00248   mbstate_t prev_st;
00249   int byte_idx, end_idx, mbclen, remain_len;
00250   /* Build the buffers from pstr->valid_len to either pstr->len or
00251      pstr->bufs_len.  */
00252   end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len;
00253   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
00254     {
00255       wchar_t wc;
00256       remain_len = end_idx - byte_idx;
00257       prev_st = pstr->cur_state;
00258       mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
00259                               + byte_idx), remain_len, &pstr->cur_state);
00260       if (BE (mbclen == (size_t) -2, 0))
00261         {
00262           /* The buffer doesn't have enough space, finish to build.  */
00263           pstr->cur_state = prev_st;
00264           break;
00265         }
00266       else if (mbclen == 1 || mbclen == (size_t) -1 || mbclen == 0)
00267         {
00268           /* In case of a singlebyte character.  */
00269           int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
00270           /* Apply the translateion if we need.  */
00271           if (pstr->trans != NULL && mbclen == 1)
00272             {
00273               ch = pstr->trans[ch];
00274               pstr->mbs_case[byte_idx] = ch;
00275             }
00276           pstr->wcs[byte_idx] = iswlower (wc) ? toupper (wc) : wc;
00277           pstr->mbs[byte_idx++] = islower (ch) ? toupper (ch) : ch;
00278           if (BE (mbclen == (size_t) -1, 0))
00279             pstr->cur_state = prev_st;
00280         }
00281       else /* mbclen > 1 */
00282         {
00283           if (iswlower (wc))
00284             wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st);
00285           else
00286             memcpy (pstr->mbs + byte_idx,
00287                     pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
00288           pstr->wcs[byte_idx++] = iswlower (wc) ? toupper (wc) : wc;
00289           /* Write paddings.  */
00290           for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
00291             pstr->wcs[byte_idx++] = WEOF;
00292         }
00293     }
00294   pstr->valid_len = byte_idx;
00295 }
00296 
00297 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
00298    Return the index.  */
00299 
00300 static int
00301 re_string_skip_chars (pstr, new_raw_idx, last_wc)
00302      re_string_t *pstr;
00303      int new_raw_idx;
00304      wint_t *last_wc;
00305 {
00306   mbstate_t prev_st;
00307   int rawbuf_idx, mbclen;
00308   wchar_t wc = 0;
00309 
00310   /* Skip the characters which are not necessary to check.  */
00311   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len;
00312        rawbuf_idx < new_raw_idx;)
00313     {
00314       int remain_len;
00315       remain_len = pstr->len - rawbuf_idx;
00316       prev_st = pstr->cur_state;
00317       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
00318                         remain_len, &pstr->cur_state);
00319       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
00320         {
00321           /* We treat these cases as a singlebyte character.  */
00322           mbclen = 1;
00323           pstr->cur_state = prev_st;
00324         }
00325       /* Then proceed the next character.  */
00326       rawbuf_idx += mbclen;
00327     }
00328   *last_wc = (wint_t) wc;
00329   return rawbuf_idx;
00330 }
00331 #endif /* RE_ENABLE_I18N  */
00332 
00333 /* Build the buffer PSTR->MBS, and apply the translation if we need.
00334    This function is used in case of REG_ICASE.  */
00335 
00336 static void
00337 build_upper_buffer (pstr)
00338      re_string_t *pstr;
00339 {
00340   int char_idx, end_idx;
00341   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00342 
00343   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
00344     {
00345       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
00346       if (pstr->trans != NULL)
00347         {
00348           ch =  pstr->trans[ch];
00349           pstr->mbs_case[char_idx] = ch;
00350         }
00351       if (islower (ch))
00352         pstr->mbs[char_idx] = toupper (ch);
00353       else
00354         pstr->mbs[char_idx] = ch;
00355     }
00356   pstr->valid_len = char_idx;
00357 }
00358 
00359 /* Apply TRANS to the buffer in PSTR.  */
00360 
00361 static void
00362 re_string_translate_buffer (pstr)
00363      re_string_t *pstr;
00364 {
00365   int buf_idx, end_idx;
00366   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
00367 
00368   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
00369     {
00370       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
00371       pstr->mbs_case[buf_idx] = pstr->trans[ch];
00372     }
00373 
00374   pstr->valid_len = buf_idx;
00375 }
00376 
00377 /* This function re-construct the buffers.
00378    Concretely, convert to wide character in case of re_mb_cur_max > 1,
00379    convert to upper case in case of REG_ICASE, apply translation.  */
00380 
00381 static reg_errcode_t
00382 re_string_reconstruct (pstr, idx, eflags, newline)
00383      re_string_t *pstr;
00384      int idx, eflags, newline;
00385 {
00386   int offset = idx - pstr->raw_mbs_idx;
00387   if (offset < 0)
00388     {
00389       /* Reset buffer.  */
00390 #ifdef RE_ENABLE_I18N
00391       if (re_mb_cur_max > 1)
00392         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
00393 #endif /* RE_ENABLE_I18N */
00394       pstr->len += pstr->raw_mbs_idx;
00395       pstr->stop += pstr->raw_mbs_idx;
00396       pstr->valid_len = pstr->raw_mbs_idx = 0;
00397       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
00398                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
00399       if (!MBS_CASE_ALLOCATED (pstr))
00400         pstr->mbs_case = (unsigned char *) pstr->raw_mbs;
00401       if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr))
00402         pstr->mbs = (unsigned char *) pstr->raw_mbs;
00403       offset = idx;
00404     }
00405 
00406   if (offset != 0)
00407     {
00408       /* Are the characters which are already checked remain?  */
00409       if (offset < pstr->valid_len)
00410         {
00411           /* Yes, move them to the front of the buffer.  */
00412           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
00413                                                     newline);
00414 #ifdef RE_ENABLE_I18N
00415           if (re_mb_cur_max > 1)
00416             memmove (pstr->wcs, pstr->wcs + offset,
00417                      (pstr->valid_len - offset) * sizeof (wint_t));
00418 #endif /* RE_ENABLE_I18N */
00419           if (MBS_ALLOCATED (pstr))
00420             memmove (pstr->mbs, pstr->mbs + offset,
00421                      pstr->valid_len - offset);
00422           if (MBS_CASE_ALLOCATED (pstr))
00423             memmove (pstr->mbs_case, pstr->mbs_case + offset,
00424                      pstr->valid_len - offset);
00425           pstr->valid_len -= offset;
00426 #if DEBUG
00427           assert (pstr->valid_len > 0);
00428 #endif
00429         }
00430       else
00431         {
00432           /* No, skip all characters until IDX.  */
00433           pstr->valid_len = 0;
00434 #ifdef RE_ENABLE_I18N
00435           if (re_mb_cur_max > 1)
00436             {
00437               int wcs_idx;
00438               wint_t wc;
00439               pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
00440               for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
00441                 pstr->wcs[wcs_idx] = WEOF;
00442               if (pstr->trans && wc <= 0xff)
00443                 wc = pstr->trans[wc];
00444               pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
00445                                    : ((newline && IS_WIDE_NEWLINE (wc))
00446                                       ? CONTEXT_NEWLINE : 0));
00447             }
00448           else
00449 #endif /* RE_ENABLE_I18N */
00450             {
00451               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
00452               if (pstr->trans)
00453                 c = pstr->trans[c];
00454               pstr->tip_context = (IS_WORD_CHAR (c) ? CONTEXT_WORD
00455                                    : ((newline && IS_NEWLINE (c))
00456                                       ? CONTEXT_NEWLINE : 0));
00457             }
00458         }
00459       if (!MBS_CASE_ALLOCATED (pstr))
00460         {
00461           pstr->mbs_case += offset;
00462           /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED.  */
00463           if (!MBS_ALLOCATED (pstr))
00464             pstr->mbs += offset;
00465         }
00466     }
00467   pstr->raw_mbs_idx = idx;
00468   pstr->len -= offset;
00469   pstr->stop -= offset;
00470 
00471   /* Then build the buffers.  */
00472 #ifdef RE_ENABLE_I18N
00473   if (re_mb_cur_max > 1)
00474     {
00475       if (pstr->icase)
00476         build_wcs_upper_buffer (pstr);
00477       else
00478         build_wcs_buffer (pstr);
00479     }
00480   else
00481 #endif /* RE_ENABLE_I18N */
00482     {
00483       if (pstr->icase)
00484         build_upper_buffer (pstr);
00485       else if (pstr->trans != NULL)
00486         re_string_translate_buffer (pstr);
00487     }
00488   pstr->cur_idx = 0;
00489 
00490   return REG_NOERROR;
00491 }
00492 
00493 static void
00494 re_string_destruct (pstr)
00495      re_string_t *pstr;
00496 {
00497 #ifdef RE_ENABLE_I18N
00498   re_free (pstr->wcs);
00499 #endif /* RE_ENABLE_I18N  */
00500   if (MBS_ALLOCATED (pstr))
00501     re_free (pstr->mbs);
00502   if (MBS_CASE_ALLOCATED (pstr))
00503     re_free (pstr->mbs_case);
00504 }
00505 
00506 /* Return the context at IDX in INPUT.  */
00507 
00508 static unsigned int
00509 re_string_context_at (input, idx, eflags, newline_anchor)
00510      const re_string_t *input;
00511      int idx, eflags, newline_anchor;
00512 {
00513   int c;
00514   if (idx < 0 || idx == input->len)
00515     {
00516       if (idx < 0)
00517         /* In this case, we use the value stored in input->tip_context,
00518            since we can't know the character in input->mbs[-1] here.  */
00519         return input->tip_context;
00520       else /* (idx == input->len) */
00521         return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
00522                 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
00523     }
00524 #ifdef RE_ENABLE_I18N
00525   if (re_mb_cur_max > 1)
00526     {
00527       wint_t wc;
00528       int wc_idx = idx;
00529       while(input->wcs[wc_idx] == WEOF)
00530         {
00531 #ifdef DEBUG
00532           /* It must not happen.  */
00533           assert (wc_idx >= 0);
00534 #endif
00535           --wc_idx;
00536           if (wc_idx < 0)
00537             return input->tip_context;
00538         }
00539       wc = input->wcs[wc_idx];
00540       if (IS_WIDE_WORD_CHAR (wc))
00541         return CONTEXT_WORD;
00542       return (newline_anchor && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0;
00543     }
00544   else
00545 #endif
00546     {
00547       c = re_string_byte_at (input, idx);
00548       if (IS_WORD_CHAR (c))
00549         return CONTEXT_WORD;
00550       return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
00551     }
00552 }
00553 
00554 /* Functions for set operation.  */
00555 
00556 static reg_errcode_t
00557 re_node_set_alloc (set, size)
00558      re_node_set *set;
00559      int size;
00560 {
00561   set->alloc = size;
00562   set->nelem = 0;
00563   set->elems = re_malloc (int, size);
00564   if (BE (set->elems == NULL, 0))
00565     return REG_ESPACE;
00566   return REG_NOERROR;
00567 }
00568 
00569 static reg_errcode_t
00570 re_node_set_init_1 (set, elem)
00571      re_node_set *set;
00572      int elem;
00573 {
00574   set->alloc = 1;
00575   set->nelem = 1;
00576   set->elems = re_malloc (int, 1);
00577   if (BE (set->elems == NULL, 0))
00578     {
00579       set->alloc = set->nelem = 0;
00580       return REG_ESPACE;
00581     }
00582   set->elems[0] = elem;
00583   return REG_NOERROR;
00584 }
00585 
00586 static reg_errcode_t
00587 re_node_set_init_2 (set, elem1, elem2)
00588      re_node_set *set;
00589      int elem1, elem2;
00590 {
00591   set->alloc = 2;
00592   set->elems = re_malloc (int, 2);
00593   if (BE (set->elems == NULL, 0))
00594     return REG_ESPACE;
00595   if (elem1 == elem2)
00596     {
00597       set->nelem = 1;
00598       set->elems[0] = elem1;
00599     }
00600   else
00601     {
00602       set->nelem = 2;
00603       if (elem1 < elem2)
00604         {
00605           set->elems[0] = elem1;
00606           set->elems[1] = elem2;
00607         }
00608       else
00609         {
00610           set->elems[0] = elem2;
00611           set->elems[1] = elem1;
00612         }
00613     }
00614   return REG_NOERROR;
00615 }
00616 
00617 static reg_errcode_t
00618 re_node_set_init_copy (dest, src)
00619      re_node_set *dest;
00620      const re_node_set *src;
00621 {
00622   dest->nelem = src->nelem;
00623   if (src->nelem > 0)
00624     {
00625       dest->alloc = dest->nelem;
00626       dest->elems = re_malloc (int, dest->alloc);
00627       if (BE (dest->elems == NULL, 0))
00628         {
00629           dest->alloc = dest->nelem = 0;
00630           return REG_ESPACE;
00631         }
00632       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
00633     }
00634   else
00635     re_node_set_init_empty (dest);
00636   return REG_NOERROR;
00637 }
00638 
00639 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
00640    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
00641    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
00642 
00643 static reg_errcode_t
00644 re_node_set_add_intersect (dest, src1, src2)
00645      re_node_set *dest;
00646      const re_node_set *src1, *src2;
00647 {
00648   int i1, i2, id;
00649   if (src1->nelem > 0 && src2->nelem > 0)
00650     {
00651       if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
00652         {
00653           dest->alloc = src1->nelem + src2->nelem + dest->nelem;
00654           dest->elems = re_realloc (dest->elems, int, dest->alloc);
00655           if (BE (dest->elems == NULL, 0))
00656             return REG_ESPACE;
00657         }
00658     }
00659   else
00660     return REG_NOERROR;
00661 
00662   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
00663     {
00664       if (src1->elems[i1] > src2->elems[i2])
00665         {
00666           ++i2;
00667           continue;
00668         }
00669       if (src1->elems[i1] == src2->elems[i2])
00670         {
00671           while (id < dest->nelem && dest->elems[id] < src2->elems[i2])
00672             ++id;
00673           if (id < dest->nelem && dest->elems[id] == src2->elems[i2])
00674             ++id;
00675           else
00676             {
00677               memmove (dest->elems + id + 1, dest->elems + id,
00678                        sizeof (int) * (dest->nelem - id));
00679               dest->elems[id++] = src2->elems[i2++];
00680               ++dest->nelem;
00681             }
00682         }
00683       ++i1;
00684     }
00685   return REG_NOERROR;
00686 }
00687 
00688 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
00689    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
00690 
00691 static reg_errcode_t
00692 re_node_set_init_union (dest, src1, src2)
00693      re_node_set *dest;
00694      const re_node_set *src1, *src2;
00695 {
00696   int i1, i2, id;
00697   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
00698     {
00699       dest->alloc = src1->nelem + src2->nelem;
00700       dest->elems = re_malloc (int, dest->alloc);
00701       if (BE (dest->elems == NULL, 0))
00702         return REG_ESPACE;
00703     }
00704   else
00705     {
00706       if (src1 != NULL && src1->nelem > 0)
00707         return re_node_set_init_copy (dest, src1);
00708       else if (src2 != NULL && src2->nelem > 0)
00709         return re_node_set_init_copy (dest, src2);
00710       else
00711         re_node_set_init_empty (dest);
00712       return REG_NOERROR;
00713     }
00714   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
00715     {
00716       if (src1->elems[i1] > src2->elems[i2])
00717         {
00718           dest->elems[id++] = src2->elems[i2++];
00719           continue;
00720         }
00721       if (src1->elems[i1] == src2->elems[i2])
00722         ++i2;
00723       dest->elems[id++] = src1->elems[i1++];
00724     }
00725   if (i1 < src1->nelem)
00726     {
00727       memcpy (dest->elems + id, src1->elems + i1,
00728              (src1->nelem - i1) * sizeof (int));
00729       id += src1->nelem - i1;
00730     }
00731   else if (i2 < src2->nelem)
00732     {
00733       memcpy (dest->elems + id, src2->elems + i2,
00734              (src2->nelem - i2) * sizeof (int));
00735       id += src2->nelem - i2;
00736     }
00737   dest->nelem = id;
00738   return REG_NOERROR;
00739 }
00740 
00741 /* Calculate the union set of the sets DEST and SRC. And store it to
00742    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
00743 
00744 static reg_errcode_t
00745 re_node_set_merge (dest, src)
00746      re_node_set *dest;
00747      const re_node_set *src;
00748 {
00749   int si, di;
00750   if (src == NULL || src->nelem == 0)
00751     return REG_NOERROR;
00752   if (dest->alloc < src->nelem + dest->nelem)
00753     {
00754       int *new_buffer;
00755       dest->alloc = 2 * (src->nelem + dest->alloc);
00756       new_buffer = re_realloc (dest->elems, int, dest->alloc);
00757       if (BE (new_buffer == NULL, 0))
00758         return REG_ESPACE;
00759       dest->elems = new_buffer;
00760     }
00761 
00762   for (si = 0, di = 0 ; si < src->nelem && di < dest->nelem ;)
00763     {
00764       int cp_from, ncp, mid, right, src_elem = src->elems[si];
00765       /* Binary search the spot we will add the new element.  */
00766       right = dest->nelem;
00767       while (di < right)
00768         {
00769           mid = (di + right) / 2;
00770           if (dest->elems[mid] < src_elem)
00771             di = mid + 1;
00772           else
00773             right = mid;
00774         }
00775       if (di >= dest->nelem)
00776         break;
00777 
00778       if (dest->elems[di] == src_elem)
00779         {
00780           /* Skip since, DEST already has the element.  */
00781           ++di;
00782           ++si;
00783           continue;
00784         }
00785 
00786       /* Skip the src elements which are less than dest->elems[di].  */
00787       cp_from = si;
00788       while (si < src->nelem && src->elems[si] < dest->elems[di])
00789         ++si;
00790       /* Copy these src elements.  */
00791       ncp = si - cp_from;
00792       memmove (dest->elems + di + ncp, dest->elems + di,
00793                sizeof (int) * (dest->nelem - di));
00794       memcpy (dest->elems + di, src->elems + cp_from,
00795               sizeof (int) * ncp);
00796       /* Update counters.  */
00797       di += ncp;
00798       dest->nelem += ncp;
00799     }
00800 
00801   /* Copy remaining src elements.  */
00802   if (si < src->nelem)
00803     {
00804       memcpy (dest->elems + di, src->elems + si,
00805               sizeof (int) * (src->nelem - si));
00806       dest->nelem += src->nelem - si;
00807     }
00808   return REG_NOERROR;
00809 }
00810 
00811 /* Insert the new element ELEM to the re_node_set* SET.
00812    return 0 if SET already has ELEM,
00813    return -1 if an error is occured, return 1 otherwise.  */
00814 
00815 static int
00816 re_node_set_insert (set, elem)
00817      re_node_set *set;
00818      int elem;
00819 {
00820   int idx, right, mid;
00821   /* In case of the set is empty.  */
00822   if (set->elems == NULL || set->alloc == 0)
00823     {
00824       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
00825         return 1;
00826       else
00827         return -1;
00828     }
00829 
00830   /* Binary search the spot we will add the new element.  */
00831   idx = 0;
00832   right = set->nelem;
00833   while (idx < right)
00834     {
00835       mid = (idx + right) / 2;
00836       if (set->elems[mid] < elem)
00837         idx = mid + 1;
00838       else
00839         right = mid;
00840     }
00841 
00842   /* Realloc if we need.  */
00843   if (set->alloc < set->nelem + 1)
00844     {
00845       int *new_array;
00846       set->alloc = set->alloc * 2;
00847       new_array = re_malloc (int, set->alloc);
00848       if (BE (new_array == NULL, 0))
00849         return -1;
00850       /* Copy the elements they are followed by the new element.  */
00851       if (idx > 0)
00852         memcpy (new_array, set->elems, sizeof (int) * (idx));
00853       /* Copy the elements which follows the new element.  */
00854       if (set->nelem - idx > 0)
00855         memcpy (new_array + idx + 1, set->elems + idx,
00856                 sizeof (int) * (set->nelem - idx));
00857       re_free (set->elems);
00858       set->elems = new_array;
00859     }
00860   else
00861     {
00862       /* Move the elements which follows the new element.  */
00863       if (set->nelem - idx > 0)
00864         memmove (set->elems + idx + 1, set->elems + idx,
00865                  sizeof (int) * (set->nelem - idx));
00866     }
00867   /* Insert the new element.  */
00868   set->elems[idx] = elem;
00869   ++set->nelem;
00870   return 1;
00871 }
00872 
00873 /* Compare two node sets SET1 and SET2.
00874    return 1 if SET1 and SET2 are equivalent, retrun 0 otherwise.  */
00875 
00876 static int
00877 re_node_set_compare (set1, set2)
00878      const re_node_set *set1, *set2;
00879 {
00880   int i;
00881   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
00882     return 0;
00883   for (i = 0 ; i < set1->nelem ; i++)
00884     if (set1->elems[i] != set2->elems[i])
00885       return 0;
00886   return 1;
00887 }
00888 
00889 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
00890 
00891 static int
00892 re_node_set_contains (set, elem)
00893      const re_node_set *set;
00894      int elem;
00895 {
00896   int idx, right, mid;
00897   if (set->nelem <= 0)
00898     return 0;
00899 
00900   /* Binary search the element.  */
00901   idx = 0;
00902   right = set->nelem - 1;
00903   while (idx < right)
00904     {
00905       mid = (idx + right) / 2;
00906       if (set->elems[mid] < elem)
00907         idx = mid + 1;
00908       else
00909         right = mid;
00910     }
00911   return set->elems[idx] == elem ? idx + 1 : 0;
00912 }
00913 
00914 static void
00915 re_node_set_remove_at (set, idx)
00916      re_node_set *set;
00917      int idx;
00918 {
00919   if (idx < 0 || idx >= set->nelem)
00920     return;
00921   if (idx < set->nelem - 1)
00922     memmove (set->elems + idx, set->elems + idx + 1,
00923              sizeof (int) * (set->nelem - idx - 1));
00924   --set->nelem;
00925 }
00926 
00927 
00928 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
00929    Or return -1, if an error will be occured.  */
00930 
00931 static int
00932 re_dfa_add_node (dfa, token, mode)
00933      re_dfa_t *dfa;
00934      re_token_t token;
00935      int mode;
00936 {
00937   if (dfa->nodes_len >= dfa->nodes_alloc)
00938     {
00939       re_token_t *new_array;
00940       dfa->nodes_alloc *= 2;
00941       new_array = re_realloc (dfa->nodes, re_token_t, dfa->nodes_alloc);
00942       if (BE (new_array == NULL, 0))
00943         return -1;
00944       else
00945         dfa->nodes = new_array;
00946       if (mode)
00947         {
00948           int *new_nexts, *new_indices;
00949           re_node_set *new_edests, *new_eclosures, *new_inveclosures;
00950 
00951           new_nexts = re_realloc (dfa->nexts, int, dfa->nodes_alloc);
00952           new_indices = re_realloc (dfa->org_indices, int, dfa->nodes_alloc);
00953           new_edests = re_realloc (dfa->edests, re_node_set, dfa->nodes_alloc);
00954           new_eclosures = re_realloc (dfa->eclosures, re_node_set,
00955                                       dfa->nodes_alloc);
00956           new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
00957                                          dfa->nodes_alloc);
00958           if (BE (new_nexts == NULL || new_indices == NULL
00959                   || new_edests == NULL || new_eclosures == NULL
00960                   || new_inveclosures == NULL, 0))
00961             return -1;
00962           dfa->nexts = new_nexts;
00963           dfa->org_indices = new_indices;
00964           dfa->edests = new_edests;
00965           dfa->eclosures = new_eclosures;
00966           dfa->inveclosures = new_inveclosures;
00967         }
00968     }
00969   dfa->nodes[dfa->nodes_len] = token;
00970   dfa->nodes[dfa->nodes_len].duplicated = 0;
00971   dfa->nodes[dfa->nodes_len].constraint = 0;
00972   return dfa->nodes_len++;
00973 }
00974 
00975 static unsigned int inline
00976 calc_state_hash (nodes, context)
00977      const re_node_set *nodes;
00978      unsigned int context;
00979 {
00980   unsigned int hash = nodes->nelem + context;
00981   int i;
00982   for (i = 0 ; i < nodes->nelem ; i++)
00983     hash += nodes->elems[i];
00984   return hash;
00985 }
00986 
00987 /* Search for the state whose node_set is equivalent to NODES.
00988    Return the pointer to the state, if we found it in the DFA.
00989    Otherwise create the new one and return it.  In case of an error
00990    return NULL and set the error code in ERR.
00991    Note: - We assume NULL as the invalid state, then it is possible that
00992            return value is NULL and ERR is REG_NOERROR.
00993          - We never return non-NULL value in case of any errors, it is for
00994            optimization.  */
00995 
00996 static re_dfastate_t*
00997 re_acquire_state (err, dfa, nodes)
00998      reg_errcode_t *err;
00999      re_dfa_t *dfa;
01000      const re_node_set *nodes;
01001 {
01002   unsigned int hash;
01003   re_dfastate_t *new_state;
01004   struct re_state_table_entry *spot;
01005   int i;
01006   if (BE (nodes->nelem == 0, 0))
01007     {
01008       *err = REG_NOERROR;
01009       return NULL;
01010     }
01011   hash = calc_state_hash (nodes, 0);
01012   spot = dfa->state_table + (hash & dfa->state_hash_mask);
01013 
01014   for (i = 0 ; i < spot->num ; i++)
01015     {
01016       re_dfastate_t *state = spot->array[i];
01017       if (hash != state->hash)
01018         continue;
01019       if (re_node_set_compare (&state->nodes, nodes))
01020         return state;
01021     }
01022 
01023   /* There are no appropriate state in the dfa, create the new one.  */
01024   new_state = create_ci_newstate (dfa, nodes, hash);
01025   if (BE (new_state != NULL, 1))
01026     return new_state;
01027   else
01028     {
01029       *err = REG_ESPACE;
01030       return NULL;
01031     }
01032 }
01033 
01034 /* Search for the state whose node_set is equivalent to NODES and
01035    whose context is equivalent to CONTEXT.
01036    Return the pointer to the state, if we found it in the DFA.
01037    Otherwise create the new one and return it.  In case of an error
01038    return NULL and set the error code in ERR.
01039    Note: - We assume NULL as the invalid state, then it is possible that
01040            return value is NULL and ERR is REG_NOERROR.
01041          - We never return non-NULL value in case of any errors, it is for
01042            optimization.  */
01043 
01044 static re_dfastate_t*
01045 re_acquire_state_context (err, dfa, nodes, context)
01046      reg_errcode_t *err;
01047      re_dfa_t *dfa;
01048      const re_node_set *nodes;
01049      unsigned int context;
01050 {
01051   unsigned int hash;
01052   re_dfastate_t *new_state;
01053   struct re_state_table_entry *spot;
01054   int i;
01055   if (nodes->nelem == 0)
01056     {
01057       *err = REG_NOERROR;
01058       return NULL;
01059     }
01060   hash = calc_state_hash (nodes, context);
01061   spot = dfa->state_table + (hash & dfa->state_hash_mask);
01062 
01063   for (i = 0 ; i < spot->num ; i++)
01064     {
01065       re_dfastate_t *state = spot->array[i];
01066       if (hash != state->hash)
01067         continue;
01068       if (re_node_set_compare (state->entrance_nodes, nodes)
01069           && state->context == context)
01070         return state;
01071     }
01072   /* There are no appropriate state in `dfa', create the new one.  */
01073   new_state = create_cd_newstate (dfa, nodes, context, hash);
01074   if (BE (new_state != NULL, 1))
01075     return new_state;
01076   else
01077     {
01078       *err = REG_ESPACE;
01079       return NULL;
01080     }
01081 }
01082 
01083 /* Allocate memory for DFA state and initialize common properties.
01084    Return the new state if succeeded, otherwise return NULL.  */
01085 
01086 static re_dfastate_t *
01087 create_newstate_common (dfa, nodes, hash)
01088      re_dfa_t *dfa;
01089      const re_node_set *nodes;
01090      unsigned int hash;
01091 {
01092   re_dfastate_t *newstate;
01093   reg_errcode_t err;
01094   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
01095   if (BE (newstate == NULL, 0))
01096     return NULL;
01097   err = re_node_set_init_copy (&newstate->nodes, nodes);
01098   if (BE (err != REG_NOERROR, 0))
01099     {
01100       re_free (newstate);
01101       return NULL;
01102     }
01103   newstate->trtable = NULL;
01104   newstate->trtable_search = NULL;
01105   newstate->hash = hash;
01106   return newstate;
01107 }
01108 
01109 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
01110    position.  Return value indicate the error code if failed.  */
01111 
01112 static reg_errcode_t
01113 register_state (dfa, newstate, hash)
01114      re_dfa_t *dfa;
01115      re_dfastate_t *newstate;
01116      unsigned int hash;
01117 {
01118   struct re_state_table_entry *spot;
01119   spot = dfa->state_table + (hash & dfa->state_hash_mask);
01120 
01121   if (spot->alloc <= spot->num)
01122     {
01123       re_dfastate_t **new_array;
01124       spot->alloc = 2 * spot->num + 2;
01125       new_array = re_realloc (spot->array, re_dfastate_t *, spot->alloc);
01126       if (BE (new_array == NULL, 0))
01127         return REG_ESPACE;
01128       spot->array = new_array;
01129     }
01130   spot->array[spot->num++] = newstate;
01131   return REG_NOERROR;
01132 }
01133 
01134 /* Create the new state which is independ of contexts.
01135    Return the new state if succeeded, otherwise return NULL.  */
01136 
01137 static re_dfastate_t *
01138 create_ci_newstate (dfa, nodes, hash)
01139      re_dfa_t *dfa;
01140      const re_node_set *nodes;
01141      unsigned int hash;
01142 {
01143   int i;
01144   reg_errcode_t err;
01145   re_dfastate_t *newstate;
01146   newstate = create_newstate_common (dfa, nodes, hash);
01147   if (BE (newstate == NULL, 0))
01148     return NULL;
01149   newstate->entrance_nodes = &newstate->nodes;
01150 
01151   for (i = 0 ; i < nodes->nelem ; i++)
01152     {
01153       re_token_t *node = dfa->nodes + nodes->elems[i];
01154       re_token_type_t type = node->type;
01155       if (type == CHARACTER && !node->constraint)
01156         continue;
01157 
01158       /* If the state has the halt node, the state is a halt state.  */
01159       else if (type == END_OF_RE)
01160         newstate->halt = 1;
01161 #ifdef RE_ENABLE_I18N
01162       else if (type == COMPLEX_BRACKET
01163                || (type == OP_PERIOD && re_mb_cur_max > 1))
01164         newstate->accept_mb = 1;
01165 #endif /* RE_ENABLE_I18N */
01166       else if (type == OP_BACK_REF)
01167         newstate->has_backref = 1;
01168       else if (type == ANCHOR || node->constraint)
01169         newstate->has_constraint = 1;
01170     }
01171   err = register_state (dfa, newstate, hash);
01172   if (BE (err != REG_NOERROR, 0))
01173     {
01174       free_state (newstate);
01175       newstate = NULL;
01176     }
01177   return newstate;
01178 }
01179 
01180 /* Create the new state which is depend on the context CONTEXT.
01181    Return the new state if succeeded, otherwise return NULL.  */
01182 
01183 static re_dfastate_t *
01184 create_cd_newstate (dfa, nodes, context, hash)
01185      re_dfa_t *dfa;
01186      const re_node_set *nodes;
01187      unsigned int context, hash;
01188 {
01189   int i, nctx_nodes = 0;
01190   reg_errcode_t err;
01191   re_dfastate_t *newstate;
01192 
01193   newstate = create_newstate_common (dfa, nodes, hash);
01194   if (BE (newstate == NULL, 0))
01195     return NULL;
01196   newstate->context = context;
01197   newstate->entrance_nodes = &newstate->nodes;
01198 
01199   for (i = 0 ; i < nodes->nelem ; i++)
01200     {
01201       unsigned int constraint = 0;
01202       re_token_t *node = dfa->nodes + nodes->elems[i];
01203       re_token_type_t type = node->type;
01204       if (node->constraint)
01205         constraint = node->constraint;
01206 
01207       if (type == CHARACTER && !constraint)
01208         continue;
01209       /* If the state has the halt node, the state is a halt state.  */
01210       else if (type == END_OF_RE)
01211         newstate->halt = 1;
01212 #ifdef RE_ENABLE_I18N
01213       else if (type == COMPLEX_BRACKET
01214                || (type == OP_PERIOD && re_mb_cur_max > 1))
01215         newstate->accept_mb = 1;
01216 #endif /* RE_ENABLE_I18N */
01217       else if (type == OP_BACK_REF)
01218         newstate->has_backref = 1;
01219       else if (type == ANCHOR)
01220         constraint = node->opr.ctx_type;
01221 
01222       if (constraint)
01223         {
01224           if (newstate->entrance_nodes == &newstate->nodes)
01225             {
01226               newstate->entrance_nodes = re_malloc (re_node_set, 1);
01227               if (BE (newstate->entrance_nodes == NULL, 0))
01228                 {
01229                   free_state (newstate);
01230                   return NULL;
01231                 }
01232               re_node_set_init_copy (newstate->entrance_nodes, nodes);
01233               nctx_nodes = 0;
01234               newstate->has_constraint = 1;
01235             }
01236 
01237           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
01238             {
01239               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
01240               ++nctx_nodes;
01241             }
01242         }
01243     }
01244   err = register_state (dfa, newstate, hash);
01245   if (BE (err != REG_NOERROR, 0))
01246     {
01247       free_state (newstate);
01248       newstate = NULL;
01249     }
01250   return  newstate;
01251 }
01252 
01253 static void
01254 free_state (state)
01255      re_dfastate_t *state;
01256 {
01257   if (state->entrance_nodes != &state->nodes)
01258     {
01259       re_node_set_free (state->entrance_nodes);
01260       re_free (state->entrance_nodes);
01261     }
01262   re_node_set_free (&state->nodes);
01263   re_free (state->trtable);
01264   re_free (state->trtable_search);
01265   re_free (state);
01266 }

© sourcejam.com 2005-2008