diff options
Diffstat (limited to 'gnu/usr.bin/awk/field.c')
-rw-r--r-- | gnu/usr.bin/awk/field.c | 645 |
1 files changed, 645 insertions, 0 deletions
diff --git a/gnu/usr.bin/awk/field.c b/gnu/usr.bin/awk/field.c new file mode 100644 index 0000000..d8f9a54 --- /dev/null +++ b/gnu/usr.bin/awk/field.c @@ -0,0 +1,645 @@ +/* + * field.c - routines for dealing with fields and record parsing + */ + +/* + * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. + * + * This file is part of GAWK, the GNU implementation of the + * AWK Progamming Language. + * + * GAWK is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GAWK is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GAWK; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "awk.h" + +static int (*parse_field) P((int, char **, int, NODE *, + Regexp *, void (*)(), NODE *)); +static void rebuild_record P((void)); +static int re_parse_field P((int, char **, int, NODE *, + Regexp *, void (*)(), NODE *)); +static int def_parse_field P((int, char **, int, NODE *, + Regexp *, void (*)(), NODE *)); +static int sc_parse_field P((int, char **, int, NODE *, + Regexp *, void (*)(), NODE *)); +static int fw_parse_field P((int, char **, int, NODE *, + Regexp *, void (*)(), NODE *)); +static void set_element P((int, char *, int, NODE *)); +static void grow_fields_arr P((int num)); +static void set_field P((int num, char *str, int len, NODE *dummy)); + + +static Regexp *FS_regexp = NULL; +static char *parse_extent; /* marks where to restart parse of record */ +static int parse_high_water=0; /* field number that we have parsed so far */ +static int nf_high_water = 0; /* size of fields_arr */ +static int resave_fs; +static NODE *save_FS; /* save current value of FS when line is read, + * to be used in deferred parsing + */ + +NODE **fields_arr; /* array of pointers to the field nodes */ +int field0_valid; /* $(>0) has not been changed yet */ +int default_FS; +static NODE **nodes; /* permanent repository of field nodes */ +static int *FIELDWIDTHS = NULL; + +void +init_fields() +{ + NODE *n; + + emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields"); + emalloc(nodes, NODE **, sizeof(NODE *), "init_fields"); + getnode(n); + *n = *Nnull_string; + fields_arr[0] = nodes[0] = n; + parse_extent = fields_arr[0]->stptr; + save_FS = dupnode(FS_node->var_value); + field0_valid = 1; +} + + +static void +grow_fields_arr(num) +int num; +{ + register int t; + register NODE *n; + + erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "set_field"); + erealloc(nodes, NODE **, (num+1) * sizeof(NODE *), "set_field"); + for (t = nf_high_water+1; t <= num; t++) { + getnode(n); + *n = *Nnull_string; + fields_arr[t] = nodes[t] = n; + } + nf_high_water = num; +} + +/*ARGSUSED*/ +static void +set_field(num, str, len, dummy) +int num; +char *str; +int len; +NODE *dummy; /* not used -- just to make interface same as set_element */ +{ + register NODE *n; + + if (num > nf_high_water) + grow_fields_arr(num); + n = nodes[num]; + n->stptr = str; + n->stlen = len; + n->flags = (PERM|STR|STRING|MAYBE_NUM); + fields_arr[num] = n; +} + +/* Someone assigned a value to $(something). Fix up $0 to be right */ +static void +rebuild_record() +{ + register int tlen; + register NODE *tmp; + NODE *ofs; + char *ops; + register char *cops; + register NODE **ptr; + register int ofslen; + + tlen = 0; + ofs = force_string(OFS_node->var_value); + ofslen = ofs->stlen; + ptr = &fields_arr[NF]; + while (ptr > &fields_arr[0]) { + tmp = force_string(*ptr); + tlen += tmp->stlen; + ptr--; + } + tlen += (NF - 1) * ofslen; + if (tlen < 0) + tlen = 0; + emalloc(ops, char *, tlen + 2, "fix_fields"); + cops = ops; + ops[0] = '\0'; + for (ptr = &fields_arr[1]; ptr <= &fields_arr[NF]; ptr++) { + tmp = *ptr; + if (tmp->stlen == 1) + *cops++ = tmp->stptr[0]; + else if (tmp->stlen != 0) { + memcpy(cops, tmp->stptr, tmp->stlen); + cops += tmp->stlen; + } + if (ptr != &fields_arr[NF]) { + if (ofslen == 1) + *cops++ = ofs->stptr[0]; + else if (ofslen != 0) { + memcpy(cops, ofs->stptr, ofslen); + cops += ofslen; + } + } + } + tmp = make_str_node(ops, tlen, ALREADY_MALLOCED); + unref(fields_arr[0]); + fields_arr[0] = tmp; + field0_valid = 1; +} + +/* + * setup $0, but defer parsing rest of line until reference is made to $(>0) + * or to NF. At that point, parse only as much as necessary. + */ +void +set_record(buf, cnt, freeold) +char *buf; +int cnt; +int freeold; +{ + register int i; + + NF = -1; + for (i = 1; i <= parse_high_water; i++) { + unref(fields_arr[i]); + } + parse_high_water = 0; + if (freeold) { + unref(fields_arr[0]); + if (resave_fs) { + resave_fs = 0; + unref(save_FS); + save_FS = dupnode(FS_node->var_value); + } + nodes[0]->stptr = buf; + nodes[0]->stlen = cnt; + nodes[0]->stref = 1; + nodes[0]->flags = (STRING|STR|PERM|MAYBE_NUM); + fields_arr[0] = nodes[0]; + } + fields_arr[0]->flags |= MAYBE_NUM; + field0_valid = 1; +} + +void +reset_record() +{ + (void) force_string(fields_arr[0]); + set_record(fields_arr[0]->stptr, fields_arr[0]->stlen, 0); +} + +void +set_NF() +{ + register int i; + + NF = (int) force_number(NF_node->var_value); + if (NF > nf_high_water) + grow_fields_arr(NF); + for (i = parse_high_water + 1; i <= NF; i++) { + unref(fields_arr[i]); + fields_arr[i] = Nnull_string; + } + field0_valid = 0; +} + +/* + * this is called both from get_field() and from do_split() + * via (*parse_field)(). This variation is for when FS is a regular + * expression -- either user-defined or because RS=="" and FS==" " + */ +static int +re_parse_field(up_to, buf, len, fs, rp, set, n) +int up_to; /* parse only up to this field number */ +char **buf; /* on input: string to parse; on output: point to start next */ +int len; +NODE *fs; +Regexp *rp; +void (*set) (); /* routine to set the value of the parsed field */ +NODE *n; +{ + register char *scan = *buf; + register int nf = parse_high_water; + register char *field; + register char *end = scan + len; + + if (up_to == HUGE) + nf = 0; + if (len == 0) + return nf; + + if (*RS == 0 && default_FS) + while (scan < end && isspace(*scan)) + scan++; + field = scan; + while (scan < end + && research(rp, scan, 0, (int)(end - scan), 1) != -1 + && nf < up_to) { + if (REEND(rp, scan) == RESTART(rp, scan)) { /* null match */ + scan++; + if (scan == end) { + (*set)(++nf, field, scan - field, n); + up_to = nf; + break; + } + continue; + } + (*set)(++nf, field, scan + RESTART(rp, scan) - field, n); + scan += REEND(rp, scan); + field = scan; + if (scan == end) /* FS at end of record */ + (*set)(++nf, field, 0, n); + } + if (nf != up_to && scan < end) { + (*set)(++nf, scan, (int)(end - scan), n); + scan = end; + } + *buf = scan; + return (nf); +} + +/* + * this is called both from get_field() and from do_split() + * via (*parse_field)(). This variation is for when FS is a single space + * character. + */ +static int +def_parse_field(up_to, buf, len, fs, rp, set, n) +int up_to; /* parse only up to this field number */ +char **buf; /* on input: string to parse; on output: point to start next */ +int len; +NODE *fs; +Regexp *rp; +void (*set) (); /* routine to set the value of the parsed field */ +NODE *n; +{ + register char *scan = *buf; + register int nf = parse_high_water; + register char *field; + register char *end = scan + len; + char sav; + + if (up_to == HUGE) + nf = 0; + if (len == 0) + return nf; + + /* before doing anything save the char at *end */ + sav = *end; + /* because it will be destroyed now: */ + + *end = ' '; /* sentinel character */ + for (; nf < up_to; scan++) { + /* + * special case: fs is single space, strip leading whitespace + */ + while (scan < end && (*scan == ' ' || *scan == '\t')) + scan++; + if (scan >= end) + break; + field = scan; + while (*scan != ' ' && *scan != '\t') + scan++; + (*set)(++nf, field, (int)(scan - field), n); + if (scan == end) + break; + } + + /* everything done, restore original char at *end */ + *end = sav; + + *buf = scan; + return nf; +} + +/* + * this is called both from get_field() and from do_split() + * via (*parse_field)(). This variation is for when FS is a single character + * other than space. + */ +static int +sc_parse_field(up_to, buf, len, fs, rp, set, n) +int up_to; /* parse only up to this field number */ +char **buf; /* on input: string to parse; on output: point to start next */ +int len; +NODE *fs; +Regexp *rp; +void (*set) (); /* routine to set the value of the parsed field */ +NODE *n; +{ + register char *scan = *buf; + register char fschar; + register int nf = parse_high_water; + register char *field; + register char *end = scan + len; + char sav; + + if (up_to == HUGE) + nf = 0; + if (len == 0) + return nf; + + if (*RS == 0 && fs->stlen == 0) + fschar = '\n'; + else + fschar = fs->stptr[0]; + + /* before doing anything save the char at *end */ + sav = *end; + /* because it will be destroyed now: */ + *end = fschar; /* sentinel character */ + + for (; nf < up_to; scan++) { + field = scan; + while (*scan++ != fschar) + ; + scan--; + (*set)(++nf, field, (int)(scan - field), n); + if (scan == end) + break; + } + + /* everything done, restore original char at *end */ + *end = sav; + + *buf = scan; + return nf; +} + +/* + * this is called both from get_field() and from do_split() + * via (*parse_field)(). This variation is for fields are fixed widths. + */ +static int +fw_parse_field(up_to, buf, len, fs, rp, set, n) +int up_to; /* parse only up to this field number */ +char **buf; /* on input: string to parse; on output: point to start next */ +int len; +NODE *fs; +Regexp *rp; +void (*set) (); /* routine to set the value of the parsed field */ +NODE *n; +{ + register char *scan = *buf; + register int nf = parse_high_water; + register char *end = scan + len; + + if (up_to == HUGE) + nf = 0; + if (len == 0) + return nf; + for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) { + if (len > end - scan) + len = end - scan; + (*set)(++nf, scan, len, n); + scan += len; + } + if (len == -1) + *buf = end; + else + *buf = scan; + return nf; +} + +NODE ** +get_field(requested, assign) +register int requested; +Func_ptr *assign; /* this field is on the LHS of an assign */ +{ + /* + * if requesting whole line but some other field has been altered, + * then the whole line must be rebuilt + */ + if (requested == 0) { + if (!field0_valid) { + /* first, parse remainder of input record */ + if (NF == -1) { + NF = (*parse_field)(HUGE-1, &parse_extent, + fields_arr[0]->stlen - + (parse_extent - fields_arr[0]->stptr), + save_FS, FS_regexp, set_field, + (NODE *)NULL); + parse_high_water = NF; + } + rebuild_record(); + } + if (assign) + *assign = reset_record; + return &fields_arr[0]; + } + + /* assert(requested > 0); */ + + if (assign) + field0_valid = 0; /* $0 needs reconstruction */ + + if (requested <= parse_high_water) /* already parsed this field */ + return &fields_arr[requested]; + + if (NF == -1) { /* have not yet parsed to end of record */ + /* + * parse up to requested fields, calling set_field() for each, + * saving in parse_extent the point where the parse left off + */ + if (parse_high_water == 0) /* starting at the beginning */ + parse_extent = fields_arr[0]->stptr; + parse_high_water = (*parse_field)(requested, &parse_extent, + fields_arr[0]->stlen - (parse_extent-fields_arr[0]->stptr), + save_FS, FS_regexp, set_field, (NODE *)NULL); + + /* + * if we reached the end of the record, set NF to the number of + * fields so far. Note that requested might actually refer to + * a field that is beyond the end of the record, but we won't + * set NF to that value at this point, since this is only a + * reference to the field and NF only gets set if the field + * is assigned to -- this case is handled below + */ + if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen) + NF = parse_high_water; + if (requested == HUGE-1) /* HUGE-1 means set NF */ + requested = parse_high_water; + } + if (parse_high_water < requested) { /* requested beyond end of record */ + if (assign) { /* expand record */ + register int i; + + if (requested > nf_high_water) + grow_fields_arr(requested); + + /* fill in fields that don't exist */ + for (i = parse_high_water + 1; i <= requested; i++) + fields_arr[i] = Nnull_string; + + NF = requested; + parse_high_water = requested; + } else + return &Nnull_string; + } + + return &fields_arr[requested]; +} + +static void +set_element(num, s, len, n) +int num; +char *s; +int len; +NODE *n; +{ + register NODE *it; + + it = make_string(s, len); + it->flags |= MAYBE_NUM; + *assoc_lookup(n, tmp_number((AWKNUM) (num))) = it; +} + +NODE * +do_split(tree) +NODE *tree; +{ + NODE *t1, *t2, *t3, *tmp; + NODE *fs; + char *s; + int (*parseit)P((int, char **, int, NODE *, + Regexp *, void (*)(), NODE *)); + Regexp *rp = NULL; + + t1 = tree_eval(tree->lnode); + t2 = tree->rnode->lnode; + t3 = tree->rnode->rnode->lnode; + + (void) force_string(t1); + + if (t2->type == Node_param_list) + t2 = stack_ptr[t2->param_cnt]; + if (t2->type != Node_var && t2->type != Node_var_array) + fatal("second argument of split is not a variable"); + assoc_clear(t2); + + if (t3->re_flags & FS_DFLT) { + parseit = parse_field; + fs = force_string(FS_node->var_value); + rp = FS_regexp; + } else { + tmp = force_string(tree_eval(t3->re_exp)); + if (tmp->stlen == 1) { + if (tmp->stptr[0] == ' ') + parseit = def_parse_field; + else + parseit = sc_parse_field; + } else { + parseit = re_parse_field; + rp = re_update(t3); + } + fs = tmp; + } + + s = t1->stptr; + tmp = tmp_number((AWKNUM) (*parseit)(HUGE, &s, (int)t1->stlen, + fs, rp, set_element, t2)); + free_temp(t1); + free_temp(t3); + return tmp; +} + +void +set_FS() +{ + NODE *tmp = NULL; + char buf[10]; + NODE *fs; + + buf[0] = '\0'; + default_FS = 0; + if (FS_regexp) { + refree(FS_regexp); + FS_regexp = NULL; + } + fs = force_string(FS_node->var_value); + if (fs->stlen > 1) + parse_field = re_parse_field; + else if (*RS == 0) { + parse_field = sc_parse_field; + if (fs->stlen == 1) { + if (fs->stptr[0] == ' ') { + default_FS = 1; + strcpy(buf, "[ \t\n]+"); + } else if (fs->stptr[0] != '\n') + sprintf(buf, "[%c\n]", fs->stptr[0]); + } + } else { + parse_field = def_parse_field; + if (fs->stptr[0] == ' ' && fs->stlen == 1) + default_FS = 1; + else if (fs->stptr[0] != ' ' && fs->stlen == 1) { + if (IGNORECASE == 0) + parse_field = sc_parse_field; + else + sprintf(buf, "[%c]", fs->stptr[0]); + } + } + if (buf[0]) { + FS_regexp = make_regexp(buf, strlen(buf), IGNORECASE, 1); + parse_field = re_parse_field; + } else if (parse_field == re_parse_field) { + FS_regexp = make_regexp(fs->stptr, fs->stlen, IGNORECASE, 1); + } else + FS_regexp = NULL; + resave_fs = 1; +} + +void +set_RS() +{ + (void) force_string(RS_node->var_value); + RS = RS_node->var_value->stptr; + set_FS(); +} + +void +set_FIELDWIDTHS() +{ + register char *scan; + char *end; + register int i; + static int fw_alloc = 1; + static int warned = 0; + extern double strtod(); + + if (do_lint && ! warned) { + warned = 1; + warning("use of FIELDWIDTHS is a gawk extension"); + } + if (do_unix) /* quick and dirty, does the trick */ + return; + + parse_field = fw_parse_field; + scan = force_string(FIELDWIDTHS_node->var_value)->stptr; + end = scan + 1; + if (FIELDWIDTHS == NULL) + emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS"); + FIELDWIDTHS[0] = 0; + for (i = 1; ; i++) { + if (i >= fw_alloc) { + fw_alloc *= 2; + erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS"); + } + FIELDWIDTHS[i] = (int) strtod(scan, &end); + if (end == scan) + break; + scan = end; + } + FIELDWIDTHS[i] = -1; +} |