Main Page | Class List | Directories | File List | Class Members | File Members

array.c

Go to the documentation of this file.
00001 /*
00002  * array.c - routines for associative arrays.
00003  */
00004 
00005 /* 
00006  * Copyright (C) 1986, 1988, 1989, 1991-2003 the Free Software Foundation, Inc.
00007  * 
00008  * This file is part of GAWK, the GNU implementation of the
00009  * AWK Programming Language.
00010  * 
00011  * GAWK is free software; you can redistribute it and/or modify
00012  * it under the terms of the GNU General Public License as published by
00013  * the Free Software Foundation; either version 2 of the License, or
00014  * (at your option) any later version.
00015  * 
00016  * GAWK is distributed in the hope that it will be useful,
00017  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00019  * GNU General Public License for more details.
00020  * 
00021  * You should have received a copy of the GNU General Public License
00022  * along with this program; if not, write to the Free Software
00023  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
00024  */
00025 
00026 /*
00027  * Tree walks (``for (iggy in foo)'') and array deletions use expensive
00028  * linear searching.  So what we do is start out with small arrays and
00029  * grow them as needed, so that our arrays are hopefully small enough,
00030  * most of the time, that they're pretty full and we're not looking at
00031  * wasted space.
00032  *
00033  * The decision is made to grow the array if the average chain length is
00034  * ``too big''. This is defined as the total number of entries in the table
00035  * divided by the size of the array being greater than some constant.
00036  *
00037  * 11/2002: We make the constant a variable, so that it can be tweaked
00038  * via environment variable.
00039  */
00040 
00041 static int AVG_CHAIN_MAX = 2;   /* 11/2002: Modern machines are bigger, cut this down from 10. */
00042 
00043 #include "awk.h"
00044 
00045 static NODE *assoc_find P((NODE *symbol, NODE *subs, int hash1));
00046 static void grow_table P((NODE *symbol));
00047 
00048 static unsigned long gst_hash_string P((const char *str, size_t len, unsigned long hsize));
00049 static unsigned long scramble P((unsigned long x));
00050 static unsigned long awk_hash P((const char *s, size_t len, unsigned long hsize));
00051 
00052 unsigned long (*hash)P((const char *s, size_t len, unsigned long hsize)) = awk_hash;
00053 
00054 /* array_init --- possibly temporary function for experimentation purposes */
00055 
00056 void
00057 array_init()
00058 {
00059         const char *val;
00060         int newval;
00061 
00062         if ((val = getenv("AVG_CHAIN_MAX")) != NULL && ISDIGIT(*val)) {
00063                 for (newval = 0; *val && ISDIGIT(*val); val++)
00064                         newval = (newval * 10) + *val - '0';
00065 
00066                 AVG_CHAIN_MAX = newval;
00067         }
00068 
00069         if ((val = getenv("AWK_HASH")) != NULL && strcmp(val, "gst") == 0)
00070                 hash = gst_hash_string; 
00071 }
00072 
00073 /*
00074  * get_actual --- proceed to the actual Node_var_array,
00075  *      change Node_var_new to an array.
00076  *      If canfatal and type isn't good, die fatally,
00077  *      otherwise return the final actual value.
00078  */
00079  
00080 NODE *
00081 get_actual(NODE *symbol, int canfatal)
00082 {
00083         int isparam = (symbol->type == Node_param_list
00084                         && (symbol->flags & FUNC) == 0);
00085         NODE *save_symbol = symbol;
00086 
00087         if (isparam) {
00088                 save_symbol = symbol = stack_ptr[symbol->param_cnt];
00089                 if (symbol->type == Node_array_ref)
00090                         symbol = symbol->orig_array;
00091         }
00092 
00093         switch (symbol->type) {
00094         case Node_var_new:
00095                 symbol->type = Node_var_array;
00096                 symbol->var_array = NULL;
00097                 /* fall through */
00098         case Node_var_array:
00099                 break;
00100 
00101         case Node_array_ref:
00102         case Node_param_list:
00103                 if (canfatal)
00104                         cant_happen();
00105                 /* else
00106                         fall through */
00107 
00108         default:
00109                 /* notably Node_var but catches also e.g. FS[1] = "x" */
00110                 if (canfatal)
00111                         fatal(isparam ?
00112                                 _("attempt to use scalar parameter `%s' as an array") :
00113                                 _("attempt to use scalar `%s' as array"),
00114                                                                 save_symbol->vname);
00115                 else
00116                         break;
00117         }
00118 
00119         return symbol;
00120 }
00121 
00122 /*
00123  * array_vname --- print the name of the array
00124  *
00125  * Returns a pointer to a statically maintained dynamically allocated string.
00126  * It's appropriate for printing the name once; if the caller wants
00127  * to save it, they have to make a copy.
00128  *
00129  * Setting MAX_LEN to a positive value (eg. 140) would limit the length
00130  * of the output to _roughly_ that length.
00131  *
00132  * If MAX_LEN == 0, which is the default, the whole stack is printed.
00133  */
00134 #define MAX_LEN 0
00135 
00136 char *
00137 array_vname(register const NODE *symbol)
00138 {
00139         if (symbol->type == Node_param_list)
00140                 symbol = stack_ptr[symbol->param_cnt];
00141 
00142         if (symbol->type != Node_array_ref || symbol->orig_array->type != Node_var_array)
00143                 return symbol->vname;
00144         else {
00145                 static char *message = NULL;
00146                 static size_t msglen = 0;
00147                 char *s;
00148                 size_t len;
00149                 int n;
00150                 const NODE *save_symbol = symbol;
00151                 const char *from = _("from %s");
00152 
00153 #if (MAX_LEN <= 0) || !defined(HAVE_SNPRINTF)
00154                 /* This is the default branch. */
00155 
00156                 /* First, we have to compute the length of the string: */
00157                 len = strlen(symbol->vname) + 2;        /* "%s (" */
00158                 n = 0;
00159                 do {
00160                         symbol = symbol->prev_array;
00161                         len += strlen(symbol->vname);
00162                         n++;
00163                 } while (symbol->type == Node_array_ref);
00164                 /*
00165                  * Each node contributes by strlen(from) minus the length
00166                  * of "%s" in the translation (which is at least 2)
00167                  * plus 2 for ", " or ")\0"; this adds up to strlen(from).
00168                  */
00169                 len += n * strlen(from);
00170 
00171                 /* (Re)allocate memory: */
00172                 if (message == NULL) {
00173                         emalloc(message, char *, len, "array_vname");
00174                         msglen = len;
00175                 } else if (len > msglen) {
00176                         erealloc(message, char *, len, "array_vname");
00177                         msglen = len;
00178                 } /* else
00179                         current buffer can hold new name */
00180 
00181                 /* We're ready to print: */
00182                 symbol = save_symbol;
00183                 s = message;
00184                 /*
00185                  * Ancient systems have sprintf() returning char *, not int.
00186                  * Thus, `s += sprintf(s, from, name);' is a no-no.
00187                  */
00188                 sprintf(s, "%s (", symbol->vname);
00189                 s += strlen(s);
00190                 for (;;) {
00191                         symbol = symbol->prev_array;
00192                         sprintf(s, from, symbol->vname);
00193                         s += strlen(s);
00194                         if (symbol->type != Node_array_ref)
00195                                 break;
00196                         sprintf(s, ", ");
00197                         s += strlen(s);
00198                 }
00199                 sprintf(s, ")");
00200 
00201 #else /* MAX_LEN > 0 */
00202 
00203                 /*
00204                  * The following check fails only on
00205                  * abnormally_long_variable_name.
00206                  */
00207 #define PRINT_CHECK \
00208                 if (n <= 0 || n >= len) \
00209                         return save_symbol->vname; \
00210                 s += n; len -= n
00211 #define PRINT(str) \
00212                 n = snprintf(s, len, str); \
00213                 PRINT_CHECK
00214 #define PRINT_vname(str) \
00215                 n = snprintf(s, len, str, symbol->vname); \
00216                 PRINT_CHECK
00217 
00218                 if (message == NULL)
00219                         emalloc(message, char *, MAX_LEN, "array_vname");
00220 
00221                 s = message;
00222                 len = MAX_LEN;
00223 
00224                 /* First, print the vname of the node. */
00225                 PRINT_vname("%s (");
00226 
00227                 for (;;) {
00228                         symbol = symbol->prev_array;
00229                         /*
00230                          * When we don't have enough space and this is not
00231                          * the last node, shorten the list.
00232                          */
00233                         if (len < 40 && symbol->type == Node_array_ref) {
00234                                 PRINT("..., ");
00235                                 symbol = symbol->orig_array;
00236                         }
00237                         PRINT_vname(from);
00238                         if (symbol->type != Node_array_ref)
00239                                 break;
00240                         PRINT(", ");
00241                 }
00242                 PRINT(")");
00243 
00244 #undef PRINT_CHECK
00245 #undef PRINT
00246 #undef PRINT_vname
00247 #endif /* MAX_LEN <= 0 */
00248 
00249                 return message;
00250         }
00251 }
00252 #undef MAX_LEN
00253 
00254 /* concat_exp --- concatenate expression list into a single string */
00255 
00256 NODE *
00257 concat_exp(register NODE *tree)
00258 {
00259         register NODE *r;
00260         char *str;
00261         char *s;
00262         size_t len;
00263         int offset;
00264         size_t subseplen;
00265         const char *subsep;
00266 
00267         if (tree->type != Node_expression_list)
00268                 return force_string(tree_eval(tree));
00269         r = force_string(tree_eval(tree->lnode));
00270         if (tree->rnode == NULL)
00271                 return r;
00272         subseplen = SUBSEP_node->var_value->stlen;
00273         subsep = SUBSEP_node->var_value->stptr;
00274         len = r->stlen + subseplen + 2;
00275         emalloc(str, char *, len, "concat_exp");
00276         memcpy(str, r->stptr, r->stlen+1);
00277         s = str + r->stlen;
00278         free_temp(r);
00279         for (tree = tree->rnode; tree != NULL; tree = tree->rnode) {
00280                 if (subseplen == 1)
00281                         *s++ = *subsep;
00282                 else {
00283                         memcpy(s, subsep, subseplen+1);
00284                         s += subseplen;
00285                 }
00286                 r = force_string(tree_eval(tree->lnode));
00287                 len += r->stlen + subseplen;
00288                 offset = s - str;
00289                 erealloc(str, char *, len, "concat_exp");
00290                 s = str + offset;
00291                 memcpy(s, r->stptr, r->stlen+1);
00292                 s += r->stlen;
00293                 free_temp(r);
00294         }
00295         r = make_str_node(str, s - str, ALREADY_MALLOCED);
00296         r->flags |= TEMP;
00297         return r;
00298 }
00299 
00300 /* assoc_clear --- flush all the values in symbol[] before doing a split() */
00301 
00302 void
00303 assoc_clear(NODE *symbol)
00304 {
00305         int i;
00306         NODE *bucket, *next;
00307 
00308         if (symbol->var_array == NULL)
00309                 return;
00310         for (i = 0; i < symbol->array_size; i++) {
00311                 for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
00312                         next = bucket->ahnext;
00313                         unref(bucket->ahvalue);
00314                         unref(bucket);  /* unref() will free the ahname_str */
00315                 }
00316                 symbol->var_array[i] = NULL;
00317         }
00318         free(symbol->var_array);
00319         symbol->var_array = NULL;
00320         symbol->array_size = symbol->table_size = 0;
00321         symbol->flags &= ~ARRAYMAXED;
00322 }
00323 
00324 /* hash --- calculate the hash function of the string in subs */
00325 
00326 static unsigned long
00327 awk_hash(register const char *s, register size_t len, unsigned long hsize)
00328 {
00329         register unsigned long h = 0;
00330 
00331         /*
00332          * This is INCREDIBLY ugly, but fast.  We break the string up into
00333          * 8 byte units.  On the first time through the loop we get the
00334          * "leftover bytes" (strlen % 8).  On every other iteration, we
00335          * perform 8 HASHC's so we handle all 8 bytes.  Essentially, this
00336          * saves us 7 cmp & branch instructions.  If this routine is
00337          * heavily used enough, it's worth the ugly coding.
00338          *
00339          * OZ's original sdbm hash, copied from Margo Seltzers db package.
00340          */
00341 
00342         /*
00343          * Even more speed:
00344          * #define HASHC   h = *s++ + 65599 * h
00345          * Because 65599 = pow(2, 6) + pow(2, 16) - 1 we multiply by shifts
00346          */
00347 #define HASHC   htmp = (h << 6);  \
00348                 h = *s++ + htmp + (htmp << 10) - h
00349 
00350         unsigned long htmp;
00351 
00352         h = 0;
00353 
00354 #if defined(VAXC)
00355         /*      
00356          * This was an implementation of "Duff's Device", but it has been
00357          * redone, separating the switch for extra iterations from the
00358          * loop. This is necessary because the DEC VAX-C compiler is
00359          * STOOPID.
00360          */
00361         switch (len & (8 - 1)) {
00362         case 7:         HASHC;
00363         case 6:         HASHC;
00364         case 5:         HASHC;
00365         case 4:         HASHC;
00366         case 3:         HASHC;
00367         case 2:         HASHC;
00368         case 1:         HASHC;
00369         default:        break;
00370         }
00371 
00372         if (len > (8 - 1)) {
00373                 register size_t loop = len >> 3;
00374                 do {
00375                         HASHC;
00376                         HASHC;
00377                         HASHC;
00378                         HASHC;
00379                         HASHC;
00380                         HASHC;
00381                         HASHC;
00382                         HASHC;
00383                 } while (--loop);
00384         }
00385 #else /* ! VAXC */
00386         /* "Duff's Device" for those who can handle it */
00387         if (len > 0) {
00388                 register size_t loop = (len + 8 - 1) >> 3;
00389 
00390                 switch (len & (8 - 1)) {
00391                 case 0:
00392                         do {    /* All fall throughs */
00393                                 HASHC;
00394                 case 7:         HASHC;
00395                 case 6:         HASHC;
00396                 case 5:         HASHC;
00397                 case 4:         HASHC;
00398                 case 3:         HASHC;
00399                 case 2:         HASHC;
00400                 case 1:         HASHC;
00401                         } while (--loop);
00402                 }
00403         }
00404 #endif /* ! VAXC */
00405 
00406         if (h >= hsize)
00407                 h %= hsize;
00408         return h;
00409 }
00410 
00411 /* assoc_find --- locate symbol[subs] */
00412 
00413 static NODE *                           /* NULL if not found */
00414 assoc_find(NODE *symbol, register NODE *subs, int hash1)
00415 {
00416         register NODE *bucket;
00417         const char *s1_str;
00418         size_t s1_len;
00419         NODE *s2;
00420 
00421         for (bucket = symbol->var_array[hash1]; bucket != NULL;
00422                         bucket = bucket->ahnext) {
00423                 /*
00424                  * This used to use cmp_nodes() here.  That's wrong.
00425                  * Array indexes are strings; compare as such, always!
00426                  */
00427                 s1_str = bucket->ahname_str;
00428                 s1_len = bucket->ahname_len;
00429                 s2 = subs;
00430 
00431                 if (s1_len == s2->stlen) {
00432                         if (s1_len == 0         /* "" is a valid index */
00433                             || STREQN(s1_str, s2->stptr, s1_len))
00434                                 return bucket;
00435                 }
00436         }
00437         return NULL;
00438 }
00439 
00440 /* in_array --- test whether the array element symbol[subs] exists or not,
00441  *              return pointer to value if it does.
00442  */
00443 
00444 NODE *
00445 in_array(NODE *symbol, NODE *subs)
00446 {
00447         register int hash1;
00448         NODE *ret;
00449 
00450         symbol = get_array(symbol);
00451 
00452         /*
00453          * Evaluate subscript first, it could have side effects.
00454          */
00455         subs = concat_exp(subs);        /* concat_exp returns a string node */
00456         if (symbol->var_array == NULL) {
00457                 free_temp(subs);
00458                 return NULL;
00459         }
00460         hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size);
00461         ret = assoc_find(symbol, subs, hash1);
00462         free_temp(subs);
00463         if (ret)
00464                 return ret->ahvalue;
00465         else
00466                 return NULL;
00467 }
00468 
00469 /*
00470  * assoc_lookup:
00471  * Find SYMBOL[SUBS] in the assoc array.  Install it with value "" if it
00472  * isn't there. Returns a pointer ala get_lhs to where its value is stored.
00473  *
00474  * SYMBOL is the address of the node (or other pointer) being dereferenced.
00475  * SUBS is a number or string used as the subscript. 
00476  */
00477 
00478 NODE **
00479 assoc_lookup(NODE *symbol, NODE *subs, int reference)
00480 {
00481         register int hash1;
00482         register NODE *bucket;
00483 
00484         assert(symbol->type == Node_var_array);
00485 
00486         (void) force_string(subs);
00487 
00488         if (symbol->var_array == NULL) {
00489                 symbol->array_size = symbol->table_size = 0;    /* sanity */
00490                 symbol->flags &= ~ARRAYMAXED;
00491                 grow_table(symbol);
00492                 hash1 = hash(subs->stptr, subs->stlen,
00493                                 (unsigned long) symbol->array_size);
00494         } else {
00495                 hash1 = hash(subs->stptr, subs->stlen,
00496                                 (unsigned long) symbol->array_size);
00497                 bucket = assoc_find(symbol, subs, hash1);
00498                 if (bucket != NULL) {
00499                         free_temp(subs);
00500                         return &(bucket->ahvalue);
00501                 }
00502         }
00503 
00504         if (do_lint && reference) {
00505                 subs->stptr[subs->stlen] = '\0';
00506                 lintwarn(_("reference to uninitialized element `%s[\"%s\"]'"),
00507                       array_vname(symbol), subs->stptr);
00508         }
00509 
00510         /* It's not there, install it. */
00511         if (do_lint && subs->stlen == 0)
00512                 lintwarn(_("subscript of array `%s' is null string"),
00513                         array_vname(symbol));
00514 
00515         /* first see if we would need to grow the array, before installing */
00516         symbol->table_size++;
00517         if ((symbol->flags & ARRAYMAXED) == 0
00518             && (symbol->table_size / symbol->array_size) > AVG_CHAIN_MAX) {
00519                 grow_table(symbol);
00520                 /* have to recompute hash value for new size */
00521                 hash1 = hash(subs->stptr, subs->stlen,
00522                                 (unsigned long) symbol->array_size);
00523         }
00524 
00525         getnode(bucket);
00526         bucket->type = Node_ahash;
00527 
00528         /*
00529          * Freeze this string value --- it must never
00530          * change, no matter what happens to the value
00531          * that created it or to CONVFMT, etc.
00532          *
00533          * One day: Use an atom table to track array indices,
00534          * and avoid the extra memory overhead.
00535          */
00536         bucket->flags |= MALLOC;
00537         bucket->ahname_ref = 1;
00538         emalloc(bucket->ahname_str, char *, subs->stlen + 2, "assoc_lookup");
00539         bucket->ahname_len = subs->stlen;
00540 
00541         memcpy(bucket->ahname_str, subs->stptr, subs->stlen);
00542         bucket->ahname_str[bucket->ahname_len] = '\0';
00543 
00544         free_temp(subs);
00545 
00546         bucket->ahvalue = Nnull_string;
00547         bucket->ahnext = symbol->var_array[hash1];
00548         symbol->var_array[hash1] = bucket;
00549         return &(bucket->ahvalue);
00550 }
00551 
00552 /* do_delete --- perform `delete array[s]' */
00553 
00554 /*
00555  * `symbol' is array
00556  * `tree' is subscript
00557  */
00558 
00559 void
00560 do_delete(NODE *sym, NODE *tree)
00561 {
00562         register int hash1;
00563         register NODE *bucket, *last;
00564         NODE *subs;
00565         register NODE *symbol = get_array(sym);
00566 
00567         if (tree == NULL) {     /* delete array */
00568                 assoc_clear(symbol);
00569                 return;
00570         }
00571 
00572         last = NULL;    /* shut up gcc -Wall */
00573         hash1 = 0;      /* ditto */
00574 
00575         /*
00576          * Always evaluate subscript, it could have side effects.
00577          */
00578         subs = concat_exp(tree);        /* concat_exp returns string node */
00579 
00580         if (symbol->var_array != NULL) {
00581                 hash1 = hash(subs->stptr, subs->stlen,
00582                                 (unsigned long) symbol->array_size);
00583                 last = NULL;
00584                 for (bucket = symbol->var_array[hash1]; bucket != NULL;
00585                                 last = bucket, bucket = bucket->ahnext) {
00586                         /*
00587                          * This used to use cmp_nodes() here.  That's wrong.
00588                          * Array indexes are strings; compare as such, always!
00589                          */
00590                         const char *s1_str;
00591                         size_t s1_len;
00592                         NODE *s2;
00593 
00594                         s1_str = bucket->ahname_str;
00595                         s1_len = bucket->ahname_len;
00596                         s2 = subs;
00597         
00598                         if (s1_len == s2->stlen) {
00599                                 if (s1_len == 0         /* "" is a valid index */
00600                                     || STREQN(s1_str, s2->stptr, s1_len))
00601                                         break;
00602                         }
00603                 }
00604         } else
00605                 bucket = NULL;  /* The array is empty.  */
00606 
00607         if (bucket == NULL) {
00608                 if (do_lint)
00609                         lintwarn(_("delete: index `%s' not in array `%s'"),
00610                                 subs->stptr, array_vname(sym));
00611                 free_temp(subs);
00612                 return;
00613         }
00614 
00615         free_temp(subs);
00616 
00617         if (last != NULL)
00618                 last->ahnext = bucket->ahnext;
00619         else
00620                 symbol->var_array[hash1] = bucket->ahnext;
00621         unref(bucket->ahvalue);
00622         unref(bucket);  /* unref() will free the ahname_str */
00623         symbol->table_size--;
00624         if (symbol->table_size <= 0) {
00625                 memset(symbol->var_array, '\0',
00626                         sizeof(NODE *) * symbol->array_size);
00627                 symbol->table_size = symbol->array_size = 0;
00628                 symbol->flags &= ~ARRAYMAXED;
00629                 free((char *) symbol->var_array);
00630                 symbol->var_array = NULL;
00631         }
00632 }
00633 
00634 /* do_delete_loop --- simulate ``for (iggy in foo) delete foo[iggy]'' */
00635 
00636 /*
00637  * The primary hassle here is that `iggy' needs to have some arbitrary
00638  * array index put in it before we can clear the array, we can't
00639  * just replace the loop with `delete foo'.
00640  */
00641 
00642 void
00643 do_delete_loop(NODE *symbol, NODE *tree)
00644 {
00645         size_t i;
00646         NODE **lhs;
00647         Func_ptr after_assign = NULL;
00648 
00649         symbol = get_array(symbol);
00650 
00651         if (symbol->var_array == NULL)
00652                 return;
00653 
00654         /* get first index value */
00655         for (i = 0; i < symbol->array_size; i++) {
00656                 if (symbol->var_array[i] != NULL) {
00657                         lhs = get_lhs(tree->lnode, & after_assign, FALSE);
00658                         unref(*lhs);
00659                         *lhs = make_string(symbol->var_array[i]->ahname_str,
00660                                         symbol->var_array[i]->ahname_len);
00661                         if (after_assign)
00662                                 (*after_assign)();
00663                         break;
00664                 }
00665         }
00666 
00667         /* blast the array in one shot */
00668         assoc_clear(symbol);
00669 }
00670 
00671 /* grow_table --- grow a hash table */
00672 
00673 static void
00674 grow_table(NODE *symbol)
00675 {
00676         NODE **old, **new, *chain, *next;
00677         int i, j;
00678         unsigned long hash1;
00679         unsigned long oldsize, newsize;
00680         /*
00681          * This is an array of primes. We grow the table by an order of
00682          * magnitude each time (not just doubling) so that growing is a
00683          * rare operation. We expect, on average, that it won't happen
00684          * more than twice.  The final size is also chosen to be small
00685          * enough so that MS-DOG mallocs can handle it. When things are
00686          * very large (> 8K), we just double more or less, instead of
00687          * just jumping from 8K to 64K.
00688          */
00689         static const long sizes[] = { 13, 127, 1021, 8191, 16381, 32749, 65497,
00690 #if ! defined(MSDOS) && ! defined(OS2) && ! defined(atarist)
00691                                 131101, 262147, 524309, 1048583, 2097169,
00692                                 4194319, 8388617, 16777259, 33554467, 
00693                                 67108879, 134217757, 268435459, 536870923,
00694                                 1073741827
00695 #endif
00696         };
00697 
00698         /* find next biggest hash size */
00699         newsize = oldsize = symbol->array_size;
00700         for (i = 0, j = sizeof(sizes)/sizeof(sizes[0]); i < j; i++) {
00701                 if (oldsize < sizes[i]) {
00702                         newsize = sizes[i];
00703                         break;
00704                 }
00705         }
00706 
00707         if (newsize == oldsize) {       /* table already at max (!) */
00708                 symbol->flags |= ARRAYMAXED;
00709                 return;
00710         }
00711 
00712         /* allocate new table */
00713         emalloc(new, NODE **, newsize * sizeof(NODE *), "grow_table");
00714         memset(new, '\0', newsize * sizeof(NODE *));
00715 
00716         /* brand new hash table, set things up and return */
00717         if (symbol->var_array == NULL) {
00718                 symbol->table_size = 0;
00719                 goto done;
00720         }
00721 
00722         /* old hash table there, move stuff to new, free old */
00723         old = symbol->var_array;
00724         for (i = 0; i < oldsize; i++) {
00725                 if (old[i] == NULL)
00726                         continue;
00727 
00728                 for (chain = old[i]; chain != NULL; chain = next) {
00729                         next = chain->ahnext;
00730                         hash1 = hash(chain->ahname_str,
00731                                         chain->ahname_len, newsize);
00732 
00733                         /* remove from old list, add to new */
00734                         chain->ahnext = new[hash1];
00735                         new[hash1] = chain;
00736                 }
00737         }
00738         free(old);
00739 
00740 done:
00741         /*
00742          * note that symbol->table_size does not change if an old array,
00743          * and is explicitly set to 0 if a new one.
00744          */
00745         symbol->var_array = new;
00746         symbol->array_size = newsize;
00747 }
00748 
00749 /* pr_node --- print simple node info */
00750 
00751 static void
00752 pr_node(NODE *n)
00753 {
00754         if ((n->flags & (NUMCUR|NUMBER)) != 0)
00755                 printf("%g", n->numbr);
00756         else
00757                 printf("%.*s", (int) n->stlen, n->stptr);
00758 }
00759 
00760 /* assoc_dump --- dump the contents of an array */
00761 
00762 NODE *
00763 assoc_dump(NODE *symbol)
00764 {
00765         int i;
00766         NODE *bucket;
00767 
00768         if (symbol->var_array == NULL) {
00769                 printf(_("%s: empty (null)\n"), symbol->vname);
00770                 return tmp_number((AWKNUM) 0);
00771         }
00772 
00773         if (symbol->table_size == 0) {
00774                 printf(_("%s: empty (zero)\n"), symbol->vname);
00775                 return tmp_number((AWKNUM) 0);
00776         }
00777 
00778         printf(_("%s: table_size = %d, array_size = %d\n"), symbol->vname,
00779                         (int) symbol->table_size, (int) symbol->array_size);
00780 
00781         for (i = 0; i < symbol->array_size; i++) {
00782                 for (bucket = symbol->var_array[i]; bucket != NULL;
00783                                 bucket = bucket->ahnext) {
00784                         printf("%s: I: [len %d <%.*s>] V: [",
00785                                 symbol->vname,
00786                                 (int) bucket->ahname_len,
00787                                 (int) bucket->ahname_len,
00788                                 bucket->ahname_str);
00789                         pr_node(bucket->ahvalue);
00790                         printf("]\n");
00791                 }
00792         }
00793 
00794         return tmp_number((AWKNUM) 0);
00795 }
00796 
00797 /* do_adump --- dump an array: interface to assoc_dump */
00798 
00799 NODE *
00800 do_adump(NODE *tree)
00801 {
00802         NODE *r, *a;
00803 
00804         a = tree->lnode;
00805 
00806         if (a->type == Node_param_list) {
00807                 printf(_("%s: is parameter\n"), a->vname);
00808                 a = stack_ptr[a->param_cnt];
00809         }
00810 
00811         if (a->type == Node_array_ref) {
00812                 printf(_("%s: array_ref to %s\n"), a->vname,
00813                                         a->orig_array->vname);
00814                 a = a->orig_array;
00815         }
00816 
00817         r = assoc_dump(a);
00818 
00819         return r;
00820 }
00821 
00822 /*
00823  * The following functions implement the builtin
00824  * asort function.  Initial work by Alan J. Broder,
00825  * ajb@woti.com.
00826  */
00827 
00828 /* dup_table --- duplicate input symbol table "symbol" */
00829 
00830 static void
00831 dup_table(NODE *symbol, NODE *newsymb)
00832 {
00833         NODE **old, **new, *chain, *bucket;
00834         int i;
00835         unsigned long cursize;
00836 
00837         /* find the current hash size */
00838         cursize = symbol->array_size;
00839 
00840         new = NULL;
00841 
00842         /* input is a brand new hash table, so there's nothing to copy */
00843         if (symbol->var_array == NULL)
00844                 newsymb->table_size = 0;
00845         else {
00846                 /* old hash table there, dupnode stuff into a new table */
00847 
00848                 /* allocate new table */
00849                 emalloc(new, NODE **, cursize * sizeof(NODE *), "dup_table");
00850                 memset(new, '\0', cursize * sizeof(NODE *));
00851 
00852                 /* do the copying/dupnode'ing */
00853                 old = symbol->var_array;
00854                 for (i = 0; i < cursize; i++) {
00855                         if (old[i] != NULL) {
00856                                 for (chain = old[i]; chain != NULL;
00857                                                 chain = chain->ahnext) {
00858                                         /* get a node for the linked list */
00859                                         getnode(bucket);
00860                                         bucket->type = Node_ahash;
00861                                         bucket->flags |= MALLOC;
00862                                         bucket->ahname_ref = 1;
00863 
00864                                         /*
00865                                          * copy the corresponding name and
00866                                          * value from the original input list
00867                                          */
00868                                         emalloc(bucket->ahname_str, char *, chain->ahname_len + 2, "dup_table");
00869                                         bucket->ahname_len = chain->ahname_len;
00870 
00871                                         memcpy(bucket->ahname_str, chain->ahname_str, chain->ahname_len);
00872                                         bucket->ahname_str[bucket->ahname_len] = '\0';
00873 
00874                                         bucket->ahvalue = dupnode(chain->ahvalue);
00875 
00876                                         /*
00877                                          * put the node on the corresponding
00878                                          * linked list in the new table
00879                                          */
00880                                         bucket->ahnext = new[i];
00881                                         new[i] = bucket;
00882                                 }
00883                         }
00884                 }
00885                 newsymb->table_size = symbol->table_size;
00886         }
00887 
00888         newsymb->var_array = new;
00889         newsymb->array_size = cursize;
00890 }
00891 
00892 /* merge --- do a merge of two sorted lists */
00893 
00894 static NODE *
00895 merge(NODE *left, NODE *right)
00896 {
00897         NODE *ans, *cur;
00898 
00899         /*
00900          * The use of cmp_nodes() here means that IGNORECASE influences the
00901          * comparison.  This is OK, but it may be surprising.  This comment
00902          * serves to remind us that we know about this and that it's OK.
00903          */
00904         if (cmp_nodes(left->ahvalue, right->ahvalue) <= 0) {
00905                 ans = cur = left;
00906                 left = left->ahnext;
00907         } else {
00908                 ans = cur = right;
00909                 right = right->ahnext;
00910         }
00911 
00912         while (left != NULL && right != NULL) {
00913                 if (cmp_nodes(left->ahvalue, right->ahvalue) <= 0) {
00914                         cur->ahnext = left;
00915                         cur = left;
00916                         left  = left->ahnext;
00917                 } else {
00918                         cur->ahnext = right;
00919                         cur = right;
00920                         right = right->ahnext;
00921                 }
00922         }
00923 
00924         cur->ahnext = (left != NULL ? left : right);
00925 
00926         return ans;
00927 }
00928 
00929 /* merge_sort --- recursively sort the left and right sides of a list */
00930 
00931 static NODE *
00932 merge_sort(NODE *left, int size)
00933 {
00934         NODE *right, *tmp;
00935         int i, half;
00936 
00937         if (size <= 1)
00938                 return left;
00939 
00940         /* walk down the list, till just one before the midpoint */
00941         tmp = left;
00942         half = size / 2;
00943         for (i = 0; i < half-1; i++)
00944                 tmp = tmp->ahnext;
00945 
00946         /* split the list into two parts */
00947         right = tmp->ahnext;
00948         tmp->ahnext = NULL;
00949 
00950         /* sort the left and right parts of the list */
00951         left  = merge_sort(left,       half);
00952         right = merge_sort(right, size-half);
00953 
00954         /* merge the two sorted parts of the list */
00955         return merge(left, right);
00956 }
00957 
00958 
00959 /*
00960  * assoc_from_list -- Populate an array with the contents of a list of NODEs, 
00961  * using increasing integers as the key.
00962  */
00963 
00964 static void
00965 assoc_from_list(NODE *symbol, NODE *list)
00966 {
00967         NODE *next;
00968         unsigned long i = 0;
00969         register int hash1;
00970         char buf[100];
00971 
00972         for (; list != NULL; list = next) {
00973                 next = list->ahnext;
00974 
00975                 /* make an int out of i++ */
00976                 i++;
00977                 sprintf(buf, "%lu", i);
00978                 assert(list->ahname_str == NULL);
00979                 assert(list->ahname_ref == 1);
00980                 emalloc(list->ahname_str, char *, strlen(buf) + 2, "assoc_from_list");
00981                 list->ahname_len = strlen(buf);
00982                 strcpy(list->ahname_str, buf);
00983 
00984                 /* find the bucket where it belongs */
00985                 hash1 = hash(list->ahname_str, list->ahname_len,
00986                                 symbol->array_size);
00987 
00988                 /* link the node into the chain at that bucket */
00989                 list->ahnext = symbol->var_array[hash1];
00990                 symbol->var_array[hash1] = list;
00991         }
00992 }
00993 
00994 /*
00995  * assoc_sort_inplace --- sort all the values in symbol[], replacing
00996  * the sorted values back into symbol[], indexed by integers starting with 1.
00997  */
00998 
00999 typedef enum asort_how { VALUE, INDEX } ASORT_TYPE;
01000 
01001 static NODE *
01002 assoc_sort_inplace(NODE *symbol, ASORT_TYPE how)
01003 {
01004         int i, num;
01005         NODE *bucket, *next, *list;
01006 
01007         if (symbol->var_array == NULL
01008             || symbol->array_size <= 0
01009             || symbol->table_size <= 0)
01010                 return tmp_number((AWKNUM) 0);
01011 
01012         /* build a linked list out of all the entries in the table */
01013         if (how == VALUE) {
01014                 list = NULL;
01015                 num = 0;
01016                 for (i = 0; i < symbol->array_size; i++) {
01017                         for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
01018                                 next = bucket->ahnext;
01019                                 if (bucket->ahname_ref == 1) {
01020                                         free(bucket->ahname_str);
01021                                         bucket->ahname_str = NULL;
01022                                         bucket->ahname_len = 0;
01023                                 } else {
01024                                         NODE *r;
01025 
01026                                         getnode(r);
01027                                         *r = *bucket;
01028                                         unref(bucket);
01029                                         bucket = r;
01030                                         bucket->flags |= MALLOC;
01031                                         bucket->ahname_ref = 1;
01032                                         bucket->ahname_str = NULL;
01033                                         bucket->ahname_len = 0;
01034                                 }
01035                                 bucket->ahnext = list;
01036                                 list = bucket;
01037                                 num++;
01038                         }
01039                         symbol->var_array[i] = NULL;
01040                 }
01041         } else {        /* how == INDEX */
01042                 list = NULL;
01043                 num = 0;
01044                 for (i = 0; i < symbol->array_size; i++) {
01045                         for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
01046                                 next = bucket->ahnext;
01047 
01048                                 /* toss old value */
01049                                 unref(bucket->ahvalue);
01050 
01051                                 /* move index into value */
01052                                 if (bucket->ahname_ref == 1) {
01053                                         bucket->ahvalue = make_str_node(bucket->ahname_str,
01054                                                                 bucket->ahname_len, ALREADY_MALLOCED);
01055                                         bucket->ahname_str = NULL;
01056                                         bucket->ahname_len = 0;
01057                                 } else {
01058                                         NODE *r;
01059 
01060                                         bucket->ahvalue = make_string(bucket->ahname_str, bucket->ahname_len);
01061                                         getnode(r);
01062                                         *r = *bucket;
01063                                         unref(bucket);
01064                                         bucket = r;
01065                                         bucket->flags |= MALLOC;
01066                                         bucket->ahname_ref = 1;
01067                                         bucket->ahname_str = NULL;
01068                                         bucket->ahname_len = 0;
01069                                 }
01070 
01071                                 bucket->ahnext = list;
01072                                 list = bucket;
01073                                 num++;
01074                         }
01075                         symbol->var_array[i] = NULL;
01076                 }
01077         }
01078 
01079         /*
01080          * Sort the linked list of NODEs.
01081          * (The especially nice thing about using a merge sort here is that
01082          * we require absolutely no additional storage. This is handy if the
01083          * array has grown to be very large.)
01084          */
01085         list = merge_sort(list, num);
01086 
01087         /*
01088          * now repopulate the original array, using increasing
01089          * integers as the key
01090          */
01091         assoc_from_list(symbol, list);
01092 
01093         return tmp_number((AWKNUM) num);
01094 }
01095 
01096 /* asort_actual --- do the actual work to sort the input array */
01097 
01098 static NODE *
01099 asort_actual(NODE *tree, ASORT_TYPE how)
01100 {
01101         NODE *array = get_array(tree->lnode);
01102 
01103         if (tree->rnode != NULL) {  /* 2nd optional arg */
01104                 NODE *dest = get_array(tree->rnode->lnode);
01105 
01106                 assoc_clear(dest);
01107                 dup_table(array, dest);
01108                 array = dest;
01109         }
01110 
01111         return assoc_sort_inplace(array, how);
01112 }
01113 
01114 /* do_asort --- sort array by value */
01115 
01116 NODE *
01117 do_asort(NODE *tree)
01118 {
01119         return asort_actual(tree, VALUE);
01120 }
01121 
01122 /* do_asorti --- sort array by index */
01123 
01124 NODE *
01125 do_asorti(NODE *tree)
01126 {
01127         return asort_actual(tree, INDEX);
01128 }
01129 
01130 /*
01131 From bonzini@gnu.org  Mon Oct 28 16:05:26 2002
01132 Date: Mon, 28 Oct 2002 13:33:03 +0100
01133 From: Paolo Bonzini <bonzini@gnu.org>
01134 To: arnold@skeeve.com
01135 Subject: Hash function
01136 Message-ID: <20021028123303.GA6832@biancaneve>
01137 
01138 Here is the hash function I'm using in GNU Smalltalk.  The scrambling is
01139 needed if you use powers of two as the table sizes.  If you use primes it
01140 is not needed.
01141 
01142 To use double-hashing with power-of-two size, you should use the
01143 _gst_hash_string(str, len) as the primary hash and
01144 scramble(_gst_hash_string (str, len)) | 1 as the secondary hash.
01145 
01146 Paolo
01147 
01148 */
01149 /*
01150  * ADR: Slightly modified to work w/in the context of gawk.
01151  */
01152 
01153 static unsigned long
01154 gst_hash_string(const char *str, size_t len, unsigned long hsize)
01155 {
01156         unsigned long hashVal = 1497032417;    /* arbitrary value */
01157         unsigned long ret;
01158 
01159         while (len--) {
01160                 hashVal += *str++;
01161                 hashVal += (hashVal << 10);
01162                 hashVal ^= (hashVal >> 6);
01163         }
01164 
01165         ret = scramble(hashVal);
01166         if (ret >= hsize)
01167                 ret %= hsize;
01168 
01169         return ret;
01170 }
01171 
01172 static unsigned long
01173 scramble(unsigned long x)
01174 {
01175         if (sizeof(long) == 4) {
01176                 int y = ~x;
01177 
01178                 x += (y << 10) | (y >> 22);
01179                 x += (x << 6)  | (x >> 26);
01180                 x -= (x << 16) | (x >> 16);
01181         } else {
01182                 x ^= (~x) >> 31;
01183                 x += (x << 21) | (x >> 11);
01184                 x += (x << 5) | (x >> 27);
01185                 x += (x << 27) | (x >> 5);
01186                 x += (x << 31);
01187         }
01188 
01189         return x;
01190 }

© sourcejam.com 2005-2008