Main Page | Class List | Directories | File List | Class Members | File Members

field.c

Go to the documentation of this file.
00001 /*
00002  * field.c - routines for dealing with fields and record parsing
00003  */
00004 
00005 /* 
00006  * Copyright (C) 1986, 1988, 1989, 1991-2003 the Free Software Foundation, Inc.
00007  * 
00008  * This file is part of GAWK, the GNU implementation of the
00009  * AWK Programming Language.
00010  * 
00011  * GAWK is free software; you can redistribute it and/or modify
00012  * it under the terms of the GNU General Public License as published by
00013  * the Free Software Foundation; either version 2 of the License, or
00014  * (at your option) any later version.
00015  * 
00016  * GAWK is distributed in the hope that it will be useful,
00017  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00019  * GNU General Public License for more details.
00020  * 
00021  * You should have received a copy of the GNU General Public License
00022  * along with this program; if not, write to the Free Software
00023  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
00024  */
00025 
00026 #include "awk.h"
00027 
00028 typedef void (* Setfunc) P((long, char *, long, NODE *));
00029 
00030 static long (*parse_field) P((long, char **, int, NODE *,
00031                              Regexp *, Setfunc, NODE *));
00032 static void rebuild_record P((void));
00033 static long re_parse_field P((long, char **, int, NODE *,
00034                              Regexp *, Setfunc, NODE *));
00035 static long def_parse_field P((long, char **, int, NODE *,
00036                               Regexp *, Setfunc, NODE *));
00037 static long posix_def_parse_field P((long, char **, int, NODE *,
00038                               Regexp *, Setfunc, NODE *));
00039 static long null_parse_field P((long, char **, int, NODE *,
00040                              Regexp *, Setfunc, NODE *));
00041 static long sc_parse_field P((long, char **, int, NODE *,
00042                              Regexp *, Setfunc, NODE *));
00043 static long fw_parse_field P((long, char **, int, NODE *,
00044                              Regexp *, Setfunc, NODE *));
00045 static void set_element P((long num, char * str, long len, NODE *arr));
00046 static void grow_fields_arr P((long num));
00047 static void set_field P((long num, char *str, long len, NODE *dummy));
00048 static void update_PROCINFO P((char *subscript, char *str));
00049 
00050 
00051 static char *parse_extent;      /* marks where to restart parse of record */
00052 static long parse_high_water = 0; /* field number that we have parsed so far */
00053 static long nf_high_water = 0;  /* size of fields_arr */
00054 static int resave_fs;
00055 static NODE *save_FS;           /* save current value of FS when line is read,
00056                                  * to be used in deferred parsing
00057                                  */
00058 static int *FIELDWIDTHS = NULL;
00059 
00060 NODE **fields_arr;              /* array of pointers to the field nodes */
00061 int field0_valid;               /* $(>0) has not been changed yet */
00062 int default_FS;                 /* TRUE when FS == " " */
00063 Regexp *FS_re_yes_case = NULL;
00064 Regexp *FS_re_no_case = NULL;
00065 Regexp *FS_regexp = NULL;
00066 NODE *Null_field = NULL;
00067 
00068 /* using_FIELDWIDTHS --- static function, macro to avoid overhead */
00069 #define using_FIELDWIDTHS()     (parse_field == fw_parse_field)
00070 
00071 /* init_fields --- set up the fields array to start with */
00072 
00073 void
00074 init_fields()
00075 {
00076         emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
00077         fields_arr[0] = Nnull_string;
00078         parse_extent = fields_arr[0]->stptr;
00079         save_FS = dupnode(FS_node->var_value);
00080         getnode(Null_field);
00081         *Null_field = *Nnull_string;
00082         Null_field->flags |= FIELD;
00083         Null_field->flags &= ~(NUMCUR|NUMBER|MAYBE_NUM|PERM);
00084         field0_valid = TRUE;
00085 }
00086 
00087 /* grow_fields --- acquire new fields as needed */
00088 
00089 static void
00090 grow_fields_arr(long num)
00091 {
00092         register int t;
00093         register NODE *n;
00094 
00095         erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
00096         for (t = nf_high_water + 1; t <= num; t++) {
00097                 getnode(n);
00098                 *n = *Null_field;
00099                 fields_arr[t] = n;
00100         }
00101         nf_high_water = num;
00102 }
00103 
00104 /* set_field --- set the value of a particular field */
00105 
00106 /*ARGSUSED*/
00107 static void
00108 set_field(long num,
00109         char *str,
00110         long len,
00111         NODE *dummy ATTRIBUTE_UNUSED)   /* just to make interface same as set_element */
00112 {
00113         register NODE *n;
00114 
00115         if (num > nf_high_water)
00116                 grow_fields_arr(num);
00117         n = fields_arr[num];
00118         n->stptr = str;
00119         n->stlen = len;
00120         n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
00121 }
00122 
00123 /* rebuild_record --- Someone assigned a value to $(something).
00124                         Fix up $0 to be right */
00125 
00126 static void
00127 rebuild_record()
00128 {
00129         /*
00130          * use explicit unsigned longs for lengths, in case
00131          * a size_t isn't big enough.
00132          */
00133         register unsigned long tlen;
00134         register unsigned long ofslen;
00135         register NODE *tmp;
00136         NODE *ofs;
00137         char *ops;
00138         register char *cops;
00139         long i;
00140 
00141         assert(NF != -1);
00142 
00143         tlen = 0;
00144         ofs = force_string(OFS_node->var_value);
00145         ofslen = ofs->stlen;
00146         for (i = NF; i > 0; i--) {
00147                 tmp = fields_arr[i];
00148                 tmp = force_string(tmp);
00149                 tlen += tmp->stlen;
00150         }
00151         tlen += (NF - 1) * ofslen;
00152         if ((long) tlen < 0)
00153                 tlen = 0;
00154         emalloc(ops, char *, tlen + 2, "rebuild_record");
00155         cops = ops;
00156         ops[0] = '\0';
00157         for (i = 1;  i <= NF; i++) {
00158                 tmp = fields_arr[i];
00159                 /* copy field */
00160                 if (tmp->stlen == 1)
00161                         *cops++ = tmp->stptr[0];
00162                 else if (tmp->stlen != 0) {
00163                         memcpy(cops, tmp->stptr, tmp->stlen);
00164                         cops += tmp->stlen;
00165                 }
00166                 /* copy OFS */
00167                 if (i != NF) {
00168                         if (ofslen == 1)
00169                                 *cops++ = ofs->stptr[0];
00170                         else if (ofslen != 0) {
00171                                 memcpy(cops, ofs->stptr, ofslen);
00172                                 cops += ofslen;
00173                         }
00174                 }
00175         }
00176         tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);
00177 
00178         /*
00179          * Since we are about to unref fields_arr[0], we want to find
00180          * any fields that still point into it, and have them point
00181          * into the new field zero.  This has to be done intelligently,
00182          * so that unrefing a field doesn't try to unref into the old $0.
00183          */
00184         for (cops = ops, i = 1; i <= NF; i++) {
00185                 if (fields_arr[i]->stlen > 0) {
00186                         NODE *n;
00187                         getnode(n);
00188 
00189                         if ((fields_arr[i]->flags & FIELD) == 0) {
00190                                 *n = *Null_field;
00191                                 n->stlen = fields_arr[i]->stlen;
00192                                 if ((fields_arr[i]->flags & (NUMCUR|NUMBER)) != 0) {
00193                                         n->flags |= (fields_arr[i]->flags & (NUMCUR|NUMBER));
00194                                         n->numbr = fields_arr[i]->numbr;
00195                                 }
00196                         } else {
00197                                 *n = *(fields_arr[i]);
00198                                 n->flags &= ~(MALLOC|TEMP|PERM|STRING);
00199                         }
00200 
00201                         n->stptr = cops;
00202                         unref(fields_arr[i]);
00203                         fields_arr[i] = n;
00204                 }
00205                 cops += fields_arr[i]->stlen + ofslen;
00206         }
00207 
00208         unref(fields_arr[0]);
00209 
00210         fields_arr[0] = tmp;
00211         field0_valid = TRUE;
00212 }
00213 
00214 /*
00215  * set_record:
00216  * setup $0, but defer parsing rest of line until reference is made to $(>0)
00217  * or to NF.  At that point, parse only as much as necessary.
00218  *
00219  * Manage a private buffer for the contents of $0.  Doing so keeps us safe
00220  * if `getline var' decides to rearrange the contents of the IOBUF that
00221  * $0 might have been pointing into.  The cost is the copying of the buffer;
00222  * but better correct than fast.
00223  */
00224 void
00225 set_record(const char *buf, int cnt)
00226 {
00227         NODE *n;
00228         static char *databuf;
00229         static unsigned long databuf_size;
00230 #define INITIAL_SIZE    512
00231 #define MAX_SIZE        ((unsigned long) ~0)    /* maximally portable ... */
00232 
00233         reset_record();
00234 
00235         /* buffer management: */
00236         if (databuf_size == 0) {        /* first time */
00237                 emalloc(databuf, char *, INITIAL_SIZE, "set_record");
00238                 databuf_size = INITIAL_SIZE;
00239                 memset(databuf, '\0', INITIAL_SIZE);
00240 
00241         }
00242         /*
00243          * Make sure there's enough room. Since we sometimes need
00244          * to place a sentinel at the end, we make sure
00245          * databuf_size is > cnt after allocation.
00246          */
00247         if (cnt >= databuf_size) {
00248                 while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
00249                         databuf_size *= 2;
00250                 erealloc(databuf, char *, databuf_size, "set_record");
00251                 memset(databuf, '\0', databuf_size);
00252         }
00253         /* copy the data */
00254         memcpy(databuf, buf, cnt);
00255 
00256         /* manage field 0: */
00257         unref(fields_arr[0]);
00258         getnode(n);
00259         n->stptr = databuf;
00260         n->stlen = cnt;
00261         n->stref = 1;
00262         n->type = Node_val;
00263         n->stfmt = -1;
00264         n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
00265         fields_arr[0] = n;
00266 
00267 #undef INITIAL_SIZE
00268 #undef MAX_SIZE
00269 }
00270 
00271 /* reset_record --- start over again with current $0 */
00272 
00273 void
00274 reset_record()
00275 {
00276         register int i;
00277         NODE *n;
00278 
00279         (void) force_string(fields_arr[0]);
00280 
00281         NF = -1;
00282         for (i = 1; i <= parse_high_water; i++) {
00283                 unref(fields_arr[i]);
00284                 getnode(n);
00285                 *n = *Null_field;
00286                 fields_arr[i] = n;
00287         }
00288 
00289         parse_high_water = 0;
00290         /*
00291          * $0 = $0 should resplit using the current value of FS.
00292          */
00293         if (resave_fs) {
00294                 resave_fs = FALSE;
00295                 unref(save_FS);
00296                 save_FS = dupnode(FS_node->var_value);
00297         }
00298 
00299         field0_valid = TRUE;
00300 }
00301 
00302 /* set_NF --- handle what happens to $0 and fields when NF is changed */
00303 
00304 void
00305 set_NF()
00306 {
00307         register int i;
00308         NODE *n;
00309 
00310         assert(NF != -1);
00311 
00312         NF = (long) force_number(NF_node->var_value);
00313 
00314         if (NF < 0)
00315                 fatal(_("NF set to negative value"));
00316 
00317         if (NF > nf_high_water)
00318                 grow_fields_arr(NF);
00319         if (parse_high_water < NF) {
00320                 for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
00321                         unref(fields_arr[i]);
00322                         getnode(n);
00323                         *n = *Null_field;
00324                         fields_arr[i] = n;
00325                 }
00326         } else if (parse_high_water > 0) {
00327                 for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
00328                         unref(fields_arr[i]);
00329                         getnode(n);
00330                         *n = *Null_field;
00331                         fields_arr[i] = n;
00332                 }
00333                 parse_high_water = NF;
00334         }
00335         field0_valid = FALSE;
00336 }
00337 
00338 /*
00339  * re_parse_field --- parse fields using a regexp.
00340  *
00341  * This is called both from get_field() and from do_split()
00342  * via (*parse_field)().  This variation is for when FS is a regular
00343  * expression -- either user-defined or because RS=="" and FS==" "
00344  */
00345 static long
00346 re_parse_field(long up_to,      /* parse only up to this field number */
00347         char **buf,     /* on input: string to parse; on output: point to start next */
00348         int len,
00349         NODE *fs ATTRIBUTE_UNUSED,
00350         Regexp *rp,
00351         Setfunc set,    /* routine to set the value of the parsed field */
00352         NODE *n)
00353 {
00354         register char *scan = *buf;
00355         register long nf = parse_high_water;
00356         register char *field;
00357         register char *end = scan + len;
00358 #ifdef MBS_SUPPORT
00359         size_t mbclen = 0;
00360         mbstate_t mbs;
00361         if (gawk_mb_cur_max > 1)
00362                 memset(&mbs, 0, sizeof(mbstate_t));
00363 #endif
00364 
00365         if (up_to == HUGE)
00366                 nf = 0;
00367         if (len == 0)
00368                 return nf;
00369 
00370         if (RS_is_null && default_FS)
00371                 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
00372                         scan++;
00373         field = scan;
00374         while (scan < end
00375                && research(rp, scan, 0, (end - scan), TRUE) != -1
00376                && nf < up_to) {
00377                 if (REEND(rp, scan) == RESTART(rp, scan)) {   /* null match */
00378 #ifdef MBS_SUPPORT
00379                         if (gawk_mb_cur_max > 1)        {
00380                                 mbclen = mbrlen(scan, end-scan, &mbs);
00381                                 if ((mbclen == 1) || (mbclen == (size_t) -1)
00382                                         || (mbclen == (size_t) -2) || (mbclen == 0)) {
00383                                         /* We treat it as a singlebyte character.  */
00384                                         mbclen = 1;
00385                                 }
00386                                 scan += mbclen;
00387                         } else
00388 #endif
00389                         scan++;
00390                         if (scan == end) {
00391                                 (*set)(++nf, field, (long)(scan - field), n);
00392                                 up_to = nf;
00393                                 break;
00394                         }
00395                         continue;
00396                 }
00397                 (*set)(++nf, field,
00398                        (long)(scan + RESTART(rp, scan) - field), n);
00399                 scan += REEND(rp, scan);
00400                 field = scan;
00401                 if (scan == end)        /* FS at end of record */
00402                         (*set)(++nf, field, 0L, n);
00403         }
00404         if (nf != up_to && scan < end) {
00405                 (*set)(++nf, scan, (long)(end - scan), n);
00406                 scan = end;
00407         }
00408         *buf = scan;
00409         return (nf);
00410 }
00411 
00412 /*
00413  * def_parse_field --- default field parsing.
00414  *
00415  * This is called both from get_field() and from do_split()
00416  * via (*parse_field)().  This variation is for when FS is a single space
00417  * character.
00418  */
00419 
00420 static long
00421 def_parse_field(long up_to,     /* parse only up to this field number */
00422         char **buf,     /* on input: string to parse; on output: point to start next */
00423         int len,
00424         NODE *fs,
00425         Regexp *rp ATTRIBUTE_UNUSED,
00426         Setfunc set,    /* routine to set the value of the parsed field */
00427         NODE *n)
00428 {
00429         register char *scan = *buf;
00430         register long nf = parse_high_water;
00431         register char *field;
00432         register char *end = scan + len;
00433         char sav;
00434 
00435         if (up_to == HUGE)
00436                 nf = 0;
00437         if (len == 0)
00438                 return nf;
00439 
00440         /*
00441          * Nasty special case. If FS set to "", return whole record
00442          * as first field. This is not worth a separate function.
00443          */
00444         if (fs->stlen == 0) {
00445                 (*set)(++nf, *buf, len, n);
00446                 *buf += len;
00447                 return nf;
00448         }
00449 
00450         /* before doing anything save the char at *end */
00451         sav = *end;
00452         /* because it will be destroyed now: */
00453 
00454         *end = ' ';     /* sentinel character */
00455         for (; nf < up_to; scan++) {
00456                 /*
00457                  * special case:  fs is single space, strip leading whitespace 
00458                  */
00459                 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
00460                         scan++;
00461                 if (scan >= end)
00462                         break;
00463                 field = scan;
00464                 while (*scan != ' ' && *scan != '\t' && *scan != '\n')
00465                         scan++;
00466                 (*set)(++nf, field, (long)(scan - field), n);
00467                 if (scan == end)
00468                         break;
00469         }
00470 
00471         /* everything done, restore original char at *end */
00472         *end = sav;
00473 
00474         *buf = scan;
00475         return nf;
00476 }
00477 
00478 /*
00479  * posix_def_parse_field --- default field parsing.
00480  *
00481  * This is called both from get_field() and from do_split()
00482  * via (*parse_field)().  This variation is for when FS is a single space
00483  * character.  The only difference between this and def_parse_field()
00484  * is that this one does not allow newlines to separate fields.
00485  */
00486 
00487 static long
00488 posix_def_parse_field(long up_to,       /* parse only up to this field number */
00489         char **buf,     /* on input: string to parse; on output: point to start next */
00490         int len,
00491         NODE *fs,
00492         Regexp *rp ATTRIBUTE_UNUSED,
00493         Setfunc set,    /* routine to set the value of the parsed field */
00494         NODE *n)
00495 {
00496         register char *scan = *buf;
00497         register long nf = parse_high_water;
00498         register char *field;
00499         register char *end = scan + len;
00500         char sav;
00501 
00502         if (up_to == HUGE)
00503                 nf = 0;
00504         if (len == 0)
00505                 return nf;
00506 
00507         /*
00508          * Nasty special case. If FS set to "", return whole record
00509          * as first field. This is not worth a separate function.
00510          */
00511         if (fs->stlen == 0) {
00512                 (*set)(++nf, *buf, len, n);
00513                 *buf += len;
00514                 return nf;
00515         }
00516 
00517         /* before doing anything save the char at *end */
00518         sav = *end;
00519         /* because it will be destroyed now: */
00520 
00521         *end = ' ';     /* sentinel character */
00522         for (; nf < up_to; scan++) {
00523                 /*
00524                  * special case:  fs is single space, strip leading whitespace 
00525                  */
00526                 while (scan < end && (*scan == ' ' || *scan == '\t'))
00527                         scan++;
00528                 if (scan >= end)
00529                         break;
00530                 field = scan;
00531                 while (*scan != ' ' && *scan != '\t')
00532                         scan++;
00533                 (*set)(++nf, field, (long)(scan - field), n);
00534                 if (scan == end)
00535                         break;
00536         }
00537 
00538         /* everything done, restore original char at *end */
00539         *end = sav;
00540 
00541         *buf = scan;
00542         return nf;
00543 }
00544 
00545 /*
00546  * null_parse_field --- each character is a separate field
00547  *
00548  * This is called both from get_field() and from do_split()
00549  * via (*parse_field)().  This variation is for when FS is the null string.
00550  */
00551 static long
00552 null_parse_field(long up_to,    /* parse only up to this field number */
00553         char **buf,     /* on input: string to parse; on output: point to start next */
00554         int len,
00555         NODE *fs ATTRIBUTE_UNUSED,
00556         Regexp *rp ATTRIBUTE_UNUSED,
00557         Setfunc set,    /* routine to set the value of the parsed field */
00558         NODE *n)
00559 {
00560         register char *scan = *buf;
00561         register long nf = parse_high_water;
00562         register char *end = scan + len;
00563 
00564         if (up_to == HUGE)
00565                 nf = 0;
00566         if (len == 0)
00567                 return nf;
00568 
00569 #ifdef MBS_SUPPORT
00570         if (gawk_mb_cur_max > 1) {
00571                 mbstate_t mbs;
00572                 memset(&mbs, 0, sizeof(mbstate_t));
00573                 for (; nf < up_to && scan < end;) {
00574                         size_t mbclen = mbrlen(scan, end-scan, &mbs);
00575                         if ((mbclen == 1) || (mbclen == (size_t) -1)
00576                                 || (mbclen == (size_t) -2) || (mbclen == 0)) {
00577                                 /* We treat it as a singlebyte character.  */
00578                                 mbclen = 1;
00579                         }
00580                         (*set)(++nf, scan, mbclen, n);
00581                         scan += mbclen;
00582                 }
00583         } else
00584 #endif
00585         for (; nf < up_to && scan < end; scan++)
00586                 (*set)(++nf, scan, 1L, n);
00587 
00588         *buf = scan;
00589         return nf;
00590 }
00591 
00592 /*
00593  * sc_parse_field --- single character field separator
00594  *
00595  * This is called both from get_field() and from do_split()
00596  * via (*parse_field)().  This variation is for when FS is a single character
00597  * other than space.
00598  */
00599 static long
00600 sc_parse_field(long up_to,      /* parse only up to this field number */
00601         char **buf,     /* on input: string to parse; on output: point to start next */
00602         int len,
00603         NODE *fs,
00604         Regexp *rp ATTRIBUTE_UNUSED,
00605         Setfunc set,    /* routine to set the value of the parsed field */
00606         NODE *n)
00607 {
00608         register char *scan = *buf;
00609         register char fschar;
00610         register long nf = parse_high_water;
00611         register char *field;
00612         register char *end = scan + len;
00613         char sav;
00614 #ifdef MBS_SUPPORT
00615         size_t mbclen = 0;
00616         mbstate_t mbs;
00617         if (gawk_mb_cur_max > 1)
00618                 memset(&mbs, 0, sizeof(mbstate_t));
00619 #endif
00620 
00621         if (up_to == HUGE)
00622                 nf = 0;
00623         if (len == 0)
00624                 return nf;
00625 
00626         if (RS_is_null && fs->stlen == 0)
00627                 fschar = '\n';
00628         else
00629                 fschar = fs->stptr[0];
00630 
00631         /* before doing anything save the char at *end */
00632         sav = *end;
00633         /* because it will be destroyed now: */
00634         *end = fschar;  /* sentinel character */
00635 
00636         for (; nf < up_to;) {
00637                 field = scan;
00638 #ifdef MBS_SUPPORT
00639                 if (gawk_mb_cur_max > 1) {
00640                         while (*scan != fschar) {
00641                                 mbclen = mbrlen(scan, end-scan, &mbs);
00642                                 if ((mbclen == 1) || (mbclen == (size_t) -1)
00643                                         || (mbclen == (size_t) -2) || (mbclen == 0)) {
00644                                         /* We treat it as a singlebyte character.  */
00645                                         mbclen = 1;
00646                                 }
00647                                 scan += mbclen;
00648                         }
00649                 } else
00650 #endif
00651                 while (*scan != fschar)
00652                         scan++;
00653                 (*set)(++nf, field, (long)(scan - field), n);
00654                 if (scan == end)
00655                         break;
00656                 scan++;
00657                 if (scan == end) {      /* FS at end of record */
00658                         (*set)(++nf, field, 0L, n);
00659                         break;
00660                 }
00661         }
00662 
00663         /* everything done, restore original char at *end */
00664         *end = sav;
00665 
00666         *buf = scan;
00667         return nf;
00668 }
00669 
00670 /*
00671  * fw_parse_field --- field parsing using FIELDWIDTHS spec
00672  *
00673  * This is called from get_field() via (*parse_field)().
00674  * This variation is for fields are fixed widths.
00675  */
00676 static long
00677 fw_parse_field(long up_to,      /* parse only up to this field number */
00678         char **buf,     /* on input: string to parse; on output: point to start next */
00679         int len,
00680         NODE *fs ATTRIBUTE_UNUSED,
00681         Regexp *rp ATTRIBUTE_UNUSED,
00682         Setfunc set,    /* routine to set the value of the parsed field */
00683         NODE *n)
00684 {
00685         register char *scan = *buf;
00686         register long nf = parse_high_water;
00687         register char *end = scan + len;
00688 
00689         if (up_to == HUGE)
00690                 nf = 0;
00691         if (len == 0)
00692                 return nf;
00693         for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
00694                 if (len > end - scan)
00695                         len = end - scan;
00696                 (*set)(++nf, scan, (long) len, n);
00697                 scan += len;
00698         }
00699         if (len == -1)
00700                 *buf = end;
00701         else
00702                 *buf = scan;
00703         return nf;
00704 }
00705 
00706 /* get_field --- return a particular $n */
00707 
00708 /* assign is not NULL if this field is on the LHS of an assign */
00709 
00710 NODE **
00711 get_field(register long requested, Func_ptr *assign)
00712 {
00713         /*
00714          * if requesting whole line but some other field has been altered,
00715          * then the whole line must be rebuilt
00716          */
00717         if (requested == 0) {
00718                 if (! field0_valid) {
00719                         /* first, parse remainder of input record */
00720                         if (NF == -1) {
00721                                 NF = (*parse_field)(HUGE-1, &parse_extent,
00722                                         fields_arr[0]->stlen -
00723                                         (parse_extent - fields_arr[0]->stptr),
00724                                         save_FS, FS_regexp, set_field,
00725                                         (NODE *) NULL);
00726                                 parse_high_water = NF;
00727                         }
00728                         rebuild_record();
00729                 }
00730                 if (assign != NULL)
00731                         *assign = reset_record;
00732                 return &fields_arr[0];
00733         }
00734 
00735         /* assert(requested > 0); */
00736 
00737         if (assign != NULL)
00738                 field0_valid = FALSE;           /* $0 needs reconstruction */
00739 
00740         if (requested <= parse_high_water)      /* already parsed this field */
00741                 return &fields_arr[requested];
00742 
00743         if (NF == -1) { /* have not yet parsed to end of record */
00744                 /*
00745                  * parse up to requested fields, calling set_field() for each,
00746                  * saving in parse_extent the point where the parse left off
00747                  */
00748                 if (parse_high_water == 0)      /* starting at the beginning */
00749                         parse_extent = fields_arr[0]->stptr;
00750                 parse_high_water = (*parse_field)(requested, &parse_extent,
00751                      fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
00752                      save_FS, FS_regexp, set_field, (NODE *) NULL);
00753 
00754                 /*
00755                  * if we reached the end of the record, set NF to the number of
00756                  * fields so far.  Note that requested might actually refer to
00757                  * a field that is beyond the end of the record, but we won't
00758                  * set NF to that value at this point, since this is only a
00759                  * reference to the field and NF only gets set if the field
00760                  * is assigned to -- this case is handled below
00761                  */
00762                 if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
00763                         NF = parse_high_water;
00764                 if (requested == HUGE-1)        /* HUGE-1 means set NF */
00765                         requested = parse_high_water;
00766         }
00767         if (parse_high_water < requested) { /* requested beyond end of record */
00768                 if (assign != NULL) {   /* expand record */
00769                         if (requested > nf_high_water)
00770                                 grow_fields_arr(requested);
00771 
00772                         NF = requested;
00773                         parse_high_water = requested;
00774                 } else
00775                         return &Null_field;
00776         }
00777 
00778         return &fields_arr[requested];
00779 }
00780 
00781 /* set_element --- set an array element, used by do_split() */
00782 
00783 static void
00784 set_element(long num, char *s, long len, NODE *n)
00785 {
00786         register NODE *it;
00787 
00788         it = make_string(s, len);
00789         it->flags |= MAYBE_NUM;
00790         *assoc_lookup(n, tmp_number((AWKNUM) (num)), FALSE) = it;
00791 }
00792 
00793 /* do_split --- implement split(), semantics are same as for field splitting */
00794 
00795 NODE *
00796 do_split(NODE *tree)
00797 {
00798         NODE *src, *arr, *sep, *fs, *src2, *fs2, *tmp;
00799         char *s;
00800         long (*parseit) P((long, char **, int, NODE *,
00801                          Regexp *, Setfunc, NODE *));
00802         Regexp *rp = NULL;
00803 
00804         src = force_string(tree_eval(tree->lnode));
00805 
00806         arr = get_param(tree->rnode->lnode);
00807         if (arr->type != Node_var_array)
00808                 fatal(_("split: second argument is not an array"));
00809 
00810         sep = tree->rnode->rnode->lnode;
00811 
00812         if (src->stlen == 0) {
00813                 /*
00814                  * Skip the work if first arg is the null string.
00815                  */
00816                 free_temp(src);
00817                 /*
00818                  * Evaluate sep if it may have side effects.
00819                  */
00820                 if ((sep->re_flags & (FS_DFLT|CONST)) == 0)
00821                         free_temp(tree_eval(sep->re_exp));
00822                 /*
00823                  * And now we can safely turn off the array.
00824                  */
00825                 assoc_clear(arr);
00826                 return tmp_number((AWKNUM) 0);
00827         }
00828 
00829         if ((sep->re_flags & FS_DFLT) != 0 && ! using_FIELDWIDTHS() && ! RS_is_null) {
00830                 parseit = parse_field;
00831                 fs = force_string(FS_node->var_value);
00832                 rp = FS_regexp;
00833         } else {
00834                 fs = force_string(tree_eval(sep->re_exp));
00835                 if (fs->stlen == 0) {
00836                         static short warned = FALSE;
00837 
00838                         parseit = null_parse_field;
00839 
00840                         if (do_lint && ! warned) {
00841                                 warned = TRUE;
00842                                 lintwarn(_("split: null string for third arg is a gawk extension"));
00843                         }
00844                 } else if (fs->stlen == 1 && (sep->re_flags & CONST) == 0) {
00845                         if (fs->stptr[0] == ' ') {
00846                                 if (do_posix)
00847                                         parseit = posix_def_parse_field;
00848                                 else
00849                                         parseit = def_parse_field;
00850                         } else
00851                                 parseit = sc_parse_field;
00852                 } else {
00853                         parseit = re_parse_field;
00854                         rp = re_update(sep);
00855                 }
00856         }
00857 
00858         /*
00859          * do dupnode(), to avoid problems like
00860          *      x = split(a["LINE"], a, a["FS"])
00861          * since we assoc_clear the array. gack.
00862          * this also gives us complete call by value semantics.
00863          */
00864         src2 = dupnode(src);
00865         free_temp(src);
00866 
00867         fs2 = dupnode(fs);
00868         free_temp(fs);
00869 
00870         assoc_clear(arr);
00871 
00872         s = src2->stptr;
00873         tmp = tmp_number((AWKNUM) (*parseit)(HUGE, &s, (int) src2->stlen,
00874                                              fs2, rp, set_element, arr));
00875         unref(src2);
00876         unref(fs2);
00877         return tmp;
00878 }
00879 
00880 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
00881 
00882 void
00883 set_FIELDWIDTHS()
00884 {
00885         register char *scan;
00886         char *end;
00887         register int i;
00888         static int fw_alloc = 4;
00889         static int warned = FALSE;
00890         extern double strtod();
00891 
00892         if (do_lint && ! warned) {
00893                 warned = TRUE;
00894                 lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
00895         }
00896         if (do_traditional)     /* quick and dirty, does the trick */
00897                 return;
00898 
00899         /*
00900          * If changing the way fields are split, obey least-suprise
00901          * semantics, and force $0 to be split totally.
00902          */
00903         if (fields_arr != NULL)
00904                 (void) get_field(HUGE - 1, 0);
00905 
00906         parse_field = fw_parse_field;
00907         scan = force_string(FIELDWIDTHS_node->var_value)->stptr;
00908         end = scan + 1;
00909         if (FIELDWIDTHS == NULL)
00910                 emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
00911         FIELDWIDTHS[0] = 0;
00912         for (i = 1; ; i++) {
00913                 if (i >= fw_alloc) {
00914                         fw_alloc *= 2;
00915                         erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
00916                 }
00917                 FIELDWIDTHS[i] = (int) strtod(scan, &end);
00918                 if (end == scan)
00919                         break;
00920                 if (FIELDWIDTHS[i] <= 0)
00921                         fatal(_("field %d in FIELDWIDTHS, must be > 0"), i);
00922                 scan = end;
00923         }
00924         FIELDWIDTHS[i] = -1;
00925 
00926         update_PROCINFO("FS", "FIELDWIDTHS");
00927 }
00928 
00929 /* set_FS --- handle things when FS is assigned to */
00930 
00931 void
00932 set_FS()
00933 {
00934         char buf[10];
00935         NODE *fs;
00936         static NODE *save_fs = NULL;
00937         static NODE *save_rs = NULL;
00938         int remake_re = TRUE;
00939 
00940         /*
00941          * If changing the way fields are split, obey least-suprise
00942          * semantics, and force $0 to be split totally.
00943          */
00944         if (fields_arr != NULL)
00945                 (void) get_field(HUGE - 1, 0);
00946 
00947         /* It's possible that only IGNORECASE changed, or FS = FS */
00948         /*
00949          * This comparison can't use cmp_nodes(), which pays attention
00950          * to IGNORECASE, and that's not what we want.
00951          */
00952         if (save_fs
00953                 && FS_node->var_value->stlen == save_fs->stlen
00954                 && STREQN(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen)
00955                 && save_rs
00956                 && RS_node->var_value->stlen == save_rs->stlen
00957                 && STREQN(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen)) {
00958                 if (FS_regexp != NULL)
00959                         FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
00960 
00961                 /* FS = FS */
00962                 if (! using_FIELDWIDTHS())
00963                         return;
00964                 else {
00965                         remake_re = FALSE;
00966                         goto choose_fs_function;
00967                 }
00968         }
00969 
00970         unref(save_fs);
00971         save_fs = dupnode(FS_node->var_value);
00972         unref(save_rs);
00973         save_rs = dupnode(RS_node->var_value);
00974         resave_fs = TRUE;
00975         if (FS_regexp != NULL) {
00976                 refree(FS_re_yes_case);
00977                 refree(FS_re_no_case);
00978                 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
00979         }
00980 
00981 
00982 choose_fs_function:
00983         buf[0] = '\0';
00984         default_FS = FALSE;
00985         fs = force_string(FS_node->var_value);
00986 
00987         if (! do_traditional && fs->stlen == 0) {
00988                 static short warned = FALSE;
00989 
00990                 parse_field = null_parse_field;
00991 
00992                 if (do_lint && ! warned) {
00993                         warned = TRUE;
00994                         lintwarn(_("null string for `FS' is a gawk extension"));
00995                 }
00996         } else if (fs->stlen > 1) {
00997                 parse_field = re_parse_field;
00998         } else if (RS_is_null) {
00999                 /* we know that fs->stlen <= 1 */
01000                 parse_field = sc_parse_field;
01001                 if (fs->stlen == 1) {
01002                         if (fs->stptr[0] == ' ') {
01003                                 default_FS = TRUE;
01004                                 strcpy(buf, "[ \t\n]+");
01005                         } else if (fs->stptr[0] == '\\') {
01006                                 /* yet another special case */
01007                                 strcpy(buf, "[\\\\\n]");
01008                         } else if (fs->stptr[0] != '\n')
01009                                 sprintf(buf, "[%c\n]", fs->stptr[0]);
01010                 }
01011         } else {
01012                 if (do_posix)
01013                         parse_field = posix_def_parse_field;
01014                 else
01015                         parse_field = def_parse_field;
01016 
01017                 if (fs->stlen == 1) {
01018                         if (fs->stptr[0] == ' ')
01019                                 default_FS = TRUE;
01020                         else if (fs->stptr[0] == '\\')
01021                                 /* same special case */
01022                                 strcpy(buf, "[\\\\]");
01023                         else
01024                                 parse_field = sc_parse_field;
01025                 }
01026         }
01027         if (remake_re) {
01028                 if (FS_regexp != NULL) {
01029                         refree(FS_re_yes_case);
01030                         refree(FS_re_no_case);
01031                         FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
01032                 }
01033 
01034                 if (buf[0] != '\0') {
01035                         FS_re_yes_case = make_regexp(buf, strlen(buf), FALSE);
01036                         FS_re_no_case = make_regexp(buf, strlen(buf), TRUE);
01037                         FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
01038                         parse_field = re_parse_field;
01039                 } else if (parse_field == re_parse_field) {
01040                         FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, FALSE);
01041                         FS_re_no_case = make_regexp(fs->stptr, fs->stlen, TRUE);
01042                         FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
01043                 } else
01044                         FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
01045         }
01046 
01047         /*
01048          * For FS = "c", we don't use IGNORECASE. But we must use
01049          * re_parse_field to get the character and the newline as
01050          * field separators.
01051          */
01052         if (fs->stlen == 1 && parse_field == re_parse_field)
01053                 FS_regexp = FS_re_yes_case;
01054 
01055         update_PROCINFO("FS", "FS");
01056 }
01057 
/*
 * using_fieldwidths --- function form of using_FIELDWIDTHS():
 * returns its value, i.e. nonzero when FIELDWIDTHS rather than FS
 * is in use.
 */

int
using_fieldwidths()
{
        int in_use = using_FIELDWIDTHS();

        return in_use;
}
01065 
01066 /* update_PROCINFO --- update PROCINFO[sub] when FS or FIELDWIDTHS set */
01067 
01068 static void
01069 update_PROCINFO(char *subscript, char *str)
01070 {
01071         NODE **aptr;
01072 
01073         if (PROCINFO_node == NULL)
01074                 return;
01075 
01076         aptr = assoc_lookup(PROCINFO_node, tmp_string(subscript, strlen(subscript)), FALSE);
01077         assign_val(aptr, tmp_string(str, strlen(str)));
01078 }

© sourcejam.com 2005-2008