From 0e0a0e6429c7113acf15c4c94bd5fe94c45f9e99 Mon Sep 17 00:00:00 2001 From: ru Date: Tue, 17 Apr 2001 12:12:05 +0000 Subject: Virgin import of FSF groff v1.17 --- contrib/groff/src/preproc/refer/Makefile.sub | 23 + contrib/groff/src/preproc/refer/TODO | 124 ++ contrib/groff/src/preproc/refer/command.cc | 807 +++++++++++++ contrib/groff/src/preproc/refer/command.h | 36 + contrib/groff/src/preproc/refer/label.cc | 1602 ++++++++++++++++++++++++++ contrib/groff/src/preproc/refer/label.y | 1177 +++++++++++++++++++ contrib/groff/src/preproc/refer/ref.cc | 1160 +++++++++++++++++++ contrib/groff/src/preproc/refer/ref.h | 120 ++ contrib/groff/src/preproc/refer/refer.cc | 1234 ++++++++++++++++++++ contrib/groff/src/preproc/refer/refer.h | 78 ++ contrib/groff/src/preproc/refer/refer.man | 1302 +++++++++++++++++++++ contrib/groff/src/preproc/refer/token.cc | 378 ++++++ contrib/groff/src/preproc/refer/token.h | 88 ++ 13 files changed, 8129 insertions(+) create mode 100644 contrib/groff/src/preproc/refer/Makefile.sub create mode 100644 contrib/groff/src/preproc/refer/TODO create mode 100644 contrib/groff/src/preproc/refer/command.cc create mode 100644 contrib/groff/src/preproc/refer/command.h create mode 100644 contrib/groff/src/preproc/refer/label.cc create mode 100644 contrib/groff/src/preproc/refer/label.y create mode 100644 contrib/groff/src/preproc/refer/ref.cc create mode 100644 contrib/groff/src/preproc/refer/ref.h create mode 100644 contrib/groff/src/preproc/refer/refer.cc create mode 100644 contrib/groff/src/preproc/refer/refer.h create mode 100644 contrib/groff/src/preproc/refer/refer.man create mode 100644 contrib/groff/src/preproc/refer/token.cc create mode 100644 contrib/groff/src/preproc/refer/token.h (limited to 'contrib/groff/src/preproc/refer') diff --git a/contrib/groff/src/preproc/refer/Makefile.sub b/contrib/groff/src/preproc/refer/Makefile.sub new file mode 100644 index 0000000..1631b5e --- /dev/null +++ b/contrib/groff/src/preproc/refer/Makefile.sub @@ 
-0,0 +1,23 @@ +PROG=refer +MAN1=refer.n +XLIBS=$(LIBBIB) $(LIBGROFF) +MLIB=$(LIBM) +OBJS=\ + command.o \ + label.o \ + ref.o \ + refer.o \ + token.o +CCSRCS=\ + $(srcdir)/command.cc \ + $(srcdir)/ref.cc \ + $(srcdir)/refer.cc \ + $(srcdir)/token.cc +HDRS=\ + $(srcdir)/refer.h \ + $(srcdir)/token.h \ + $(srcdir)/command.h \ + $(srcdir)/ref.h +GRAM=$(srcdir)/label.y +YTABC=$(srcdir)/label.cc +NAMEPREFIX=$(g) diff --git a/contrib/groff/src/preproc/refer/TODO b/contrib/groff/src/preproc/refer/TODO new file mode 100644 index 0000000..5bbd9bf --- /dev/null +++ b/contrib/groff/src/preproc/refer/TODO @@ -0,0 +1,124 @@ +inline references + +Some sort of macro/subroutine that can cover several references. + +move-punctuation should ignore multiple punctuation characters. + +Make the index files machine independent. + +Allow search keys to be negated (with !) to indicate that the +reference should not contain the key. Ignore negated keys during +indexed searching. + +Provide an option with lkbib and lookbib that prints the location +(filename, position) of each reference. Need to map filename_id's +back to filenames. + +Rename join-authors to join-fields. Have a separate label-join-fields +command used by @ and #. + +Have some sort of quantifier: eg $.n#A means execute `$.n' for each +instance of an A field, setting $ to that field, and then join the +results using the join-authors command. + +no-text-in-bracket command which says not to allow post_text and +pre_text when the [] flags has been given. Useful for superscripted +footnotes. + +Make it possible to translate - to \(en in page ranges. + +Trim eign a bit. + +In indexed searching discard all numeric keys except dates. + +Allow `\ ' to separate article from first word. + +%also + +Option automatically to supply [] flags in every reference. + +See if we can avoid requiring a comma before jr. and so on +in find_last_name(). + +Cache sortified authors in authors string during tentative evaluation of +label specification. 
+ +Possibly don't allow * and % expressions in the first part of ?:, | or +& expressions. + +Handle better the case where <> occurs inside functions and in the +first operand of ~. Or perhaps implement <> using some magic character +in the string. + +Should special treatment be given to lines beginning with . in +references? (Unix refer seems to treat them like `%'). + +Add global flag to control whether all files should be stat-ed after +loading, and whether they should be stat-ed before each search. +Perhaps make this dependent on the number of files there are. + +Option to truncate keys to truncate_len in linear searching. + +Allow multiple -f options in indxbib. + +In indxbib, possibly store common words rather than common words +filename. In this case store only words that are actually present in +the file. + +Perhaps we should put out an obnoxious copyright message when lookbib +starts up. + +Provide an option that writes a file containing just the references +actually used. Useful if you want to distribute a document. + +Have a magic token such that +%A +will print as though it were +%A +but sort as though it were +%A +Do we need this if we can specify author alternatives for sorting? +No, provided we have separate alternatives for @. + +In consider_authors when last names are ambiguous we might be able to +use just the first name and not Jr. bit. Or we might be able to +abbreviate the author. + +It ought to be possible to specify an alternative field to sort on +instead of date. (ie if there's a field giving the type of document -- +these references should sort after any years) + +Provide a way to execute a command using a command-line option. + +Option to set the label-spec as a command-line option (-L). 
+ +Command to specify which fields can occur multiple times: +multiple AE + +Command to specify how various fields sort: +sort-as-name A +sort-as-date D +sort-as-title T +sort-as-other O + +Command to specify which fields are author fields: +# if we don't have A use field Q +author-fields AQ + +Commands to set properties of tokens. +sortify-token \(ae ae +uppercase-token \[ae] \[AE] + +Command to set the names of months: +months january february march april may ... + +Perhaps provide some sort of macro capability: +# perhaps a macro capability +defmacro foo +annotation-field $1 +endef + +Command to control strings used in capitalization +capitalize-start \s+2 +capitalize-end \s-2 +(perhaps make these arguments to the capitalize command.) diff --git a/contrib/groff/src/preproc/refer/command.cc b/contrib/groff/src/preproc/refer/command.cc new file mode 100644 index 0000000..004189e --- /dev/null +++ b/contrib/groff/src/preproc/refer/command.cc @@ -0,0 +1,807 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ + +#include "refer.h" +#include "refid.h" +#include "search.h" +#include "command.h" + +cset cs_field_name = csalpha; + +class input_item { + input_item *next; + char *filename; + int first_lineno; + string buffer; + const char *ptr; + const char *end; +public: + input_item(string &, const char *, int = 1); + ~input_item(); + int get_char(); + int peek_char(); + void skip_char(); + int get_location(const char **, int *); + + friend class input_stack; +}; + +input_item::input_item(string &s, const char *fn, int ln) +: filename(strsave(fn)), first_lineno(ln) +{ + buffer.move(s); + ptr = buffer.contents(); + end = ptr + buffer.length(); +} + +input_item::~input_item() +{ + a_delete filename; +} + +inline int input_item::peek_char() +{ + if (ptr >= end) + return EOF; + else + return (unsigned char)*ptr; +} + +inline int input_item::get_char() +{ + if (ptr >= end) + return EOF; + else + return (unsigned char)*ptr++; +} + +inline void input_item::skip_char() +{ + ptr++; +} + +int input_item::get_location(const char **filenamep, int *linenop) +{ + *filenamep = filename; + if (ptr == buffer.contents()) + *linenop = first_lineno; + else { + int ln = first_lineno; + const char *e = ptr - 1; + for (const char *p = buffer.contents(); p < e; p++) + if (*p == '\n') + ln++; + *linenop = ln; + } + return 1; +} + +class input_stack { + static input_item *top; +public: + static void init(); + static int get_char(); + static int peek_char(); + static void skip_char() { top->skip_char(); } + static void push_file(const char *); + static void push_string(string &, const char *, int); + static void error(const char *format, + const errarg &arg1 = empty_errarg, + const errarg &arg2 = empty_errarg, + const errarg &arg3 = empty_errarg); +}; + +input_item *input_stack::top = 0; + +void input_stack::init() +{ + while (top) { + input_item *tem = top; + top = top->next; + delete tem; + } +} + +int input_stack::get_char() +{ + while (top) { + int c = top->get_char(); + if (c >= 0) + return 
c; + input_item *tem = top; + top = top->next; + delete tem; + } + return -1; +} + +int input_stack::peek_char() +{ + while (top) { + int c = top->peek_char(); + if (c >= 0) + return c; + input_item *tem = top; + top = top->next; + delete tem; + } + return -1; +} + +void input_stack::push_file(const char *fn) +{ + FILE *fp; + if (strcmp(fn, "-") == 0) { + fp = stdin; + fn = ""; + } + else { + errno = 0; + fp = fopen(fn, "r"); + if (fp == 0) { + error("can't open `%1': %2", fn, strerror(errno)); + return; + } + } + string buf; + int bol = 1; + int lineno = 1; + for (;;) { + int c = getc(fp); + if (bol && c == '.') { + // replace lines beginning with .R1 or .R2 with a blank line + c = getc(fp); + if (c == 'R') { + c = getc(fp); + if (c == '1' || c == '2') { + int cc = c; + c = getc(fp); + if (compatible_flag || c == ' ' || c == '\n' || c == EOF) { + while (c != '\n' && c != EOF) + c = getc(fp); + } + else { + buf += '.'; + buf += 'R'; + buf += cc; + } + } + else { + buf += '.'; + buf += 'R'; + } + } + else + buf += '.'; + } + if (c == EOF) + break; + if (illegal_input_char(c)) + error_with_file_and_line(fn, lineno, + "illegal input character code %1", int(c)); + else { + buf += c; + if (c == '\n') { + bol = 1; + lineno++; + } + else + bol = 0; + } + } + if (fp != stdin) + fclose(fp); + if (buf.length() > 0 && buf[buf.length() - 1] != '\n') + buf += '\n'; + input_item *it = new input_item(buf, fn); + it->next = top; + top = it; +} + +void input_stack::push_string(string &s, const char *filename, int lineno) +{ + input_item *it = new input_item(s, filename, lineno); + it->next = top; + top = it; +} + +void input_stack::error(const char *format, const errarg &arg1, + const errarg &arg2, const errarg &arg3) +{ + const char *filename; + int lineno; + for (input_item *it = top; it; it = it->next) + if (it->get_location(&filename, &lineno)) { + error_with_file_and_line(filename, lineno, format, arg1, arg2, arg3); + return; + } + ::error(format, arg1, arg2, arg3); +} + +void 
command_error(const char *format, const errarg &arg1, + const errarg &arg2, const errarg &arg3) +{ + input_stack::error(format, arg1, arg2, arg3); +} + +// # not recognized in "" +// \ is recognized in "" +// # does not conceal newline +// if missing closing quote, word extends to end of line +// no special treatment of \ other than before newline +// \ not recognized after # +// ; allowed as alternative to newline +// ; not recognized in "" +// don't clear word_buffer; just append on +// return -1 for EOF, 0 for newline, 1 for word + +int get_word(string &word_buffer) +{ + int c = input_stack::get_char(); + for (;;) { + if (c == '#') { + do { + c = input_stack::get_char(); + } while (c != '\n' && c != EOF); + break; + } + if (c == '\\' && input_stack::peek_char() == '\n') + input_stack::skip_char(); + else if (c != ' ' && c != '\t') + break; + c = input_stack::get_char(); + } + if (c == EOF) + return -1; + if (c == '\n' || c == ';') + return 0; + if (c == '"') { + for (;;) { + c = input_stack::peek_char(); + if (c == EOF || c == '\n') + break; + input_stack::skip_char(); + if (c == '"') { + int d = input_stack::peek_char(); + if (d == '"') + input_stack::skip_char(); + else + break; + } + else if (c == '\\') { + int d = input_stack::peek_char(); + if (d == '\n') + input_stack::skip_char(); + else + word_buffer += '\\'; + } + else + word_buffer += c; + } + return 1; + } + word_buffer += c; + for (;;) { + c = input_stack::peek_char(); + if (c == ' ' || c == '\t' || c == '\n' || c == '#' || c == ';') + break; + input_stack::skip_char(); + if (c == '\\') { + int d = input_stack::peek_char(); + if (d == '\n') + input_stack::skip_char(); + else + word_buffer += '\\'; + } + else + word_buffer += c; + } + return 1; +} + +union argument { + const char *s; + int n; +}; + +// This is for debugging. 
+ +static void echo_command(int argc, argument *argv) +{ + for (int i = 0; i < argc; i++) + fprintf(stderr, "%s\n", argv[i].s); +} + +static void include_command(int argc, argument *argv) +{ + assert(argc == 1); + input_stack::push_file(argv[0].s); +} + +static void capitalize_command(int argc, argument *argv) +{ + if (argc > 0) + capitalize_fields = argv[0].s; + else + capitalize_fields.clear(); +} + +static void accumulate_command(int, argument *) +{ + accumulate = 1; +} + +static void no_accumulate_command(int, argument *) +{ + accumulate = 0; +} + +static void move_punctuation_command(int, argument *) +{ + move_punctuation = 1; +} + +static void no_move_punctuation_command(int, argument *) +{ + move_punctuation = 0; +} + +static void sort_command(int argc, argument *argv) +{ + if (argc == 0) + sort_fields = "AD"; + else + sort_fields = argv[0].s; + accumulate = 1; +} + +static void no_sort_command(int, argument *) +{ + sort_fields.clear(); +} + +static void articles_command(int argc, argument *argv) +{ + articles.clear(); + int i; + for (i = 0; i < argc; i++) { + articles += argv[i].s; + articles += '\0'; + } + int len = articles.length(); + for (i = 0; i < len; i++) + articles[i] = cmlower(articles[i]); +} + +static void database_command(int argc, argument *argv) +{ + for (int i = 0; i < argc; i++) + database_list.add_file(argv[i].s); +} + +static void default_database_command(int, argument *) +{ + search_default = 1; +} + +static void no_default_database_command(int, argument *) +{ + search_default = 0; +} + +static void bibliography_command(int argc, argument *argv) +{ + const char *saved_filename = current_filename; + int saved_lineno = current_lineno; + int saved_label_in_text = label_in_text; + label_in_text = 0; + if (!accumulate) + fputs(".]<\n", stdout); + for (int i = 0; i < argc; i++) + do_bib(argv[i].s); + if (accumulate) + output_references(); + else + fputs(".]>\n", stdout); + current_filename = saved_filename; + current_lineno = saved_lineno; + 
label_in_text = saved_label_in_text; +} + +static void annotate_command(int argc, argument *argv) +{ + if (argc > 0) + annotation_field = argv[0].s[0]; + else + annotation_field = 'X'; + if (argc == 2) + annotation_macro = argv[1].s; + else + annotation_macro = "AP"; +} + +static void no_annotate_command(int, argument *) +{ + annotation_macro.clear(); + annotation_field = -1; +} + +static void reverse_command(int, argument *argv) +{ + reverse_fields = argv[0].s; +} + +static void no_reverse_command(int, argument *) +{ + reverse_fields.clear(); +} + +static void abbreviate_command(int argc, argument *argv) +{ + abbreviate_fields = argv[0].s; + period_before_initial = argc > 1 ? argv[1].s : ". "; + period_before_last_name = argc > 2 ? argv[2].s : ". "; + period_before_other = argc > 3 ? argv[3].s : ". "; + period_before_hyphen = argc > 4 ? argv[4].s : "."; +} + +static void no_abbreviate_command(int, argument *) +{ + abbreviate_fields.clear(); +} + +string search_ignore_fields; + +static void search_ignore_command(int argc, argument *argv) +{ + if (argc > 0) + search_ignore_fields = argv[0].s; + else + search_ignore_fields = "XYZ"; + search_ignore_fields += '\0'; + linear_ignore_fields = search_ignore_fields.contents(); +} + +static void no_search_ignore_command(int, argument *) +{ + linear_ignore_fields = ""; +} + +static void search_truncate_command(int argc, argument *argv) +{ + if (argc > 0) + linear_truncate_len = argv[0].n; + else + linear_truncate_len = 6; +} + +static void no_search_truncate_command(int, argument *) +{ + linear_truncate_len = -1; +} + +static void discard_command(int argc, argument *argv) +{ + if (argc == 0) + discard_fields = "XYZ"; + else + discard_fields = argv[0].s; + accumulate = 1; +} + +static void no_discard_command(int, argument *) +{ + discard_fields.clear(); +} + +static void label_command(int, argument *argv) +{ + set_label_spec(argv[0].s); +} + +static void abbreviate_label_ranges_command(int argc, argument *argv) +{ + 
abbreviate_label_ranges = 1; + label_range_indicator = argc > 0 ? argv[0].s : "-"; +} + +static void no_abbreviate_label_ranges_command(int, argument *) +{ + abbreviate_label_ranges = 0; +} + +static void label_in_reference_command(int, argument *) +{ + label_in_reference = 1; +} + +static void no_label_in_reference_command(int, argument *) +{ + label_in_reference = 0; +} + +static void label_in_text_command(int, argument *) +{ + label_in_text = 1; +} + +static void no_label_in_text_command(int, argument *) +{ + label_in_text = 0; +} + +static void sort_adjacent_labels_command(int, argument *) +{ + sort_adjacent_labels = 1; +} + +static void no_sort_adjacent_labels_command(int, argument *) +{ + sort_adjacent_labels = 0; +} + +static void date_as_label_command(int argc, argument *argv) +{ + if (set_date_label_spec(argc > 0 ? argv[0].s : "D%a*")) + date_as_label = 1; +} + +static void no_date_as_label_command(int, argument *) +{ + date_as_label = 0; +} + +static void short_label_command(int, argument *argv) +{ + if (set_short_label_spec(argv[0].s)) + short_label_flag = 1; +} + +static void no_short_label_command(int, argument *) +{ + short_label_flag = 0; +} + +static void compatible_command(int, argument *) +{ + compatible_flag = 1; +} + +static void no_compatible_command(int, argument *) +{ + compatible_flag = 0; +} + +static void join_authors_command(int argc, argument *argv) +{ + join_authors_exactly_two = argv[0].s; + join_authors_default = argc > 1 ? argv[1].s : argv[0].s; + join_authors_last_two = argc == 3 ? 
argv[2].s : argv[0].s; +} + +static void bracket_label_command(int, argument *argv) +{ + pre_label = argv[0].s; + post_label = argv[1].s; + sep_label = argv[2].s; +} + +static void separate_label_second_parts_command(int, argument *argv) +{ + separate_label_second_parts = argv[0].s; +} + +static void et_al_command(int argc, argument *argv) +{ + et_al = argv[0].s; + et_al_min_elide = argv[1].n; + if (et_al_min_elide < 1) + et_al_min_elide = 1; + et_al_min_total = argc >= 3 ? argv[2].n : 0; +} + +static void no_et_al_command(int, argument *) +{ + et_al.clear(); + et_al_min_elide = 0; +} + +typedef void (*command_t)(int, argument *); + +/* arg_types is a string describing the numbers and types of arguments. +s means a string, i means an integer, f is a list of fields, F is +a single field, +? means that the previous argument is optional, * means that the +previous argument can occur any number of times. */ + +struct { + const char *name; + command_t func; + const char *arg_types; +} command_table[] = { + { "include", include_command, "s" }, + { "echo", echo_command, "s*" }, + { "capitalize", capitalize_command, "f?" }, + { "accumulate", accumulate_command, "" }, + { "no-accumulate", no_accumulate_command, "" }, + { "move-punctuation", move_punctuation_command, "" }, + { "no-move-punctuation", no_move_punctuation_command, "" }, + { "sort", sort_command, "s?" }, + { "no-sort", no_sort_command, "" }, + { "articles", articles_command, "s*" }, + { "database", database_command, "ss*" }, + { "default-database", default_database_command, "" }, + { "no-default-database", no_default_database_command, "" }, + { "bibliography", bibliography_command, "ss*" }, + { "annotate", annotate_command, "F?s?" }, + { "no-annotate", no_annotate_command, "" }, + { "reverse", reverse_command, "s" }, + { "no-reverse", no_reverse_command, "" }, + { "abbreviate", abbreviate_command, "ss?s?s?s?" }, + { "no-abbreviate", no_abbreviate_command, "" }, + { "search-ignore", search_ignore_command, "f?" 
}, + { "no-search-ignore", no_search_ignore_command, "" }, + { "search-truncate", search_truncate_command, "i?" }, + { "no-search-truncate", no_search_truncate_command, "" }, + { "discard", discard_command, "f?" }, + { "no-discard", no_discard_command, "" }, + { "label", label_command, "s" }, + { "abbreviate-label-ranges", abbreviate_label_ranges_command, "s?" }, + { "no-abbreviate-label-ranges", no_abbreviate_label_ranges_command, "" }, + { "label-in-reference", label_in_reference_command, "" }, + { "no-label-in-reference", no_label_in_reference_command, "" }, + { "label-in-text", label_in_text_command, "" }, + { "no-label-in-text", no_label_in_text_command, "" }, + { "sort-adjacent-labels", sort_adjacent_labels_command, "" }, + { "no-sort-adjacent-labels", no_sort_adjacent_labels_command, "" }, + { "date-as-label", date_as_label_command, "s?" }, + { "no-date-as-label", no_date_as_label_command, "" }, + { "short-label", short_label_command, "s" }, + { "no-short-label", no_short_label_command, "" }, + { "compatible", compatible_command, "" }, + { "no-compatible", no_compatible_command, "" }, + { "join-authors", join_authors_command, "sss?" }, + { "bracket-label", bracket_label_command, "sss" }, + { "separate-label-second-parts", separate_label_second_parts_command, "s" }, + { "et-al", et_al_command, "sii?" 
}, + { "no-et-al", no_et_al_command, "" }, +}; + +static int check_args(const char *types, const char *name, + int argc, argument *argv) +{ + int argno = 0; + while (*types) { + if (argc == 0) { + if (types[1] == '?') + break; + else if (types[1] == '*') { + assert(types[2] == '\0'); + break; + } + else { + input_stack::error("missing argument for command `%1'", name); + return 0; + } + } + switch (*types) { + case 's': + break; + case 'i': + { + char *ptr; + long n = strtol(argv->s, &ptr, 10); + if ((n == 0 && ptr == argv->s) + || *ptr != '\0') { + input_stack::error("argument %1 for command `%2' must be an integer", + argno + 1, name); + return 0; + } + argv->n = (int)n; + break; + } + case 'f': + { + for (const char *ptr = argv->s; *ptr != '\0'; ptr++) + if (!cs_field_name(*ptr)) { + input_stack::error("argument %1 for command `%2' must be a list of fields", + argno + 1, name); + return 0; + } + break; + } + case 'F': + if (argv->s[0] == '\0' || argv->s[1] != '\0' + || !cs_field_name(argv->s[0])) { + input_stack::error("argument %1 for command `%2' must be a field name", + argno + 1, name); + return 0; + } + break; + default: + assert(0); + } + if (types[1] == '?') + types += 2; + else if (types[1] != '*') + types += 1; + --argc; + ++argv; + ++argno; + } + if (argc > 0) { + input_stack::error("too many arguments for command `%1'", name); + return 0; + } + return 1; +} + +static void execute_command(const char *name, int argc, argument *argv) +{ + for (int i = 0; i < sizeof(command_table)/sizeof(command_table[0]); i++) + if (strcmp(name, command_table[i].name) == 0) { + if (check_args(command_table[i].arg_types, name, argc, argv)) + (*command_table[i].func)(argc, argv); + return; + } + input_stack::error("unknown command `%1'", name); +} + +static void command_loop() +{ + string command; + for (;;) { + command.clear(); + int res = get_word(command); + if (res != 1) { + if (res == 0) + continue; + break; + } + int argc = 0; + command += '\0'; + while ((res = 
get_word(command)) == 1) { + argc++; + command += '\0'; + } + argument *argv = new argument[argc]; + const char *ptr = command.contents(); + for (int i = 0; i < argc; i++) + argv[i].s = ptr = strchr(ptr, '\0') + 1; + execute_command(command.contents(), argc, argv); + a_delete argv; + if (res == -1) + break; + } +} + +void process_commands(const char *file) +{ + input_stack::init(); + input_stack::push_file(file); + command_loop(); +} + +void process_commands(string &s, const char *file, int lineno) +{ + input_stack::init(); + input_stack::push_string(s, file, lineno); + command_loop(); +} diff --git a/contrib/groff/src/preproc/refer/command.h b/contrib/groff/src/preproc/refer/command.h new file mode 100644 index 0000000..c7085db --- /dev/null +++ b/contrib/groff/src/preproc/refer/command.h @@ -0,0 +1,36 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ + +void process_commands(const char *file); +void process_commands(string &s, const char *file, int lineno); + +extern int accumulate; +extern int move_punctuation; +extern int search_default; +extern search_list database_list; +extern int label_in_text; +extern int label_in_reference; +extern int sort_adjacent_labels; +extern string pre_label; +extern string post_label; +extern string sep_label; + +extern void do_bib(const char *); +extern void output_references(); diff --git a/contrib/groff/src/preproc/refer/label.cc b/contrib/groff/src/preproc/refer/label.cc new file mode 100644 index 0000000..c6dc07c --- /dev/null +++ b/contrib/groff/src/preproc/refer/label.cc @@ -0,0 +1,1602 @@ +#ifndef lint +/*static char yysccsid[] = "from: @(#)yaccpar 1.9 (Berkeley) 02/21/93";*/ +static char yyrcsid[] = "$Id: label.cc,v 1.2 2000/02/28 11:02:12 wlemb Exp $"; +#endif +#define YYBYACC 1 +#define YYMAJOR 1 +#define YYMINOR 9 +#define yyclearin (yychar=(-1)) +#define yyerrok (yyerrflag=0) +#define YYRECOVERING (yyerrflag!=0) +#define YYPREFIX "yy" +#line 22 "label.y" + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" + +int yylex(); +void yyerror(const char *); +int yyparse(); + +static const char *format_serial(char c, int n); + +struct label_info { + int start; + int length; + int count; + int total; + label_info(const string &); +}; + +label_info *lookup_label(const string &label); + +struct expression { + enum { + /* Does the tentative label depend on the reference?*/ + CONTAINS_VARIABLE = 01, + CONTAINS_STAR = 02, + CONTAINS_FORMAT = 04, + CONTAINS_AT = 010 + }; + virtual ~expression() { } + virtual void evaluate(int, const reference &, string &, + substring_position &) = 0; + virtual unsigned analyze() { return 0; } +}; + +class at_expr : public expression { +public: + at_expr() { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; } +}; + +class format_expr 
: public expression { + char type; + int width; + int first_number; +public: + format_expr(char c, int w = 0, int f = 1) + : type(c), width(w), first_number(f) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_FORMAT; } +}; + +class field_expr : public expression { + int number; + char name; +public: + field_expr(char nm, int num) : number(num), name(nm) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE; } +}; + +class literal_expr : public expression { + string s; +public: + literal_expr(const char *ptr, int len) : s(ptr, len) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class unary_expr : public expression { +protected: + expression *expr; +public: + unary_expr(expression *e) : expr(e) { } + ~unary_expr() { delete expr; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { return expr ? expr->analyze() : 0; } +}; + +/* This caches the analysis of an expression.*/ + +class analyzed_expr : public unary_expr { + unsigned flags; +public: + analyzed_expr(expression *); + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return flags; } +}; + +class star_expr : public unary_expr { +public: + star_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { + return ((expr ? 
(expr->analyze() & ~CONTAINS_VARIABLE) : 0) + | CONTAINS_STAR); + } +}; + +typedef void map_func(const char *, const char *, string &); + +class map_expr : public unary_expr { + map_func *func; +public: + map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +typedef const char *extractor_func(const char *, const char *, const char **); + +class extractor_expr : public unary_expr { + int part; + extractor_func *func; +public: + enum { BEFORE = +1, MATCH = 0, AFTER = -1 }; + extractor_expr(expression *e, extractor_func *f, int pt) + : unary_expr(e), part(pt), func(f) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class truncate_expr : public unary_expr { + int n; +public: + truncate_expr(expression *e, int i) : unary_expr(e), n(i) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class separator_expr : public unary_expr { +public: + separator_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class binary_expr : public expression { +protected: + expression *expr1; + expression *expr2; +public: + binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { } + ~binary_expr() { delete expr1; delete expr2; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return (expr1 ? expr1->analyze() : 0) | (expr2 ? 
expr2->analyze() : 0); + } +}; + +class alternative_expr : public binary_expr { +public: + alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class list_expr : public binary_expr { +public: + list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class substitute_expr : public binary_expr { +public: + substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class ternary_expr : public expression { +protected: + expression *expr1; + expression *expr2; + expression *expr3; +public: + ternary_expr(expression *e1, expression *e2, expression *e3) + : expr1(e1), expr2(e2), expr3(e3) { } + ~ternary_expr() { delete expr1; delete expr2; delete expr3; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return ((expr1 ? expr1->analyze() : 0) + | (expr2 ? expr2->analyze() : 0) + | (expr3 ? 
expr3->analyze() : 0)); + } +}; + +class conditional_expr : public ternary_expr { +public: + conditional_expr(expression *e1, expression *e2, expression *e3) + : ternary_expr(e1, e2, e3) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +static expression *parsed_label = 0; +static expression *parsed_date_label = 0; +static expression *parsed_short_label = 0; + +static expression *parse_result; + +string literals; + +#line 221 "label.y" +typedef union { + int num; + expression *expr; + struct { int ndigits; int val; } dig; + struct { int start; int len; } str; +} YYSTYPE; +#line 218 "y.tab.c" +#define TOKEN_LETTER 257 +#define TOKEN_LITERAL 258 +#define TOKEN_DIGIT 259 +#define YYERRCODE 256 +short yylhs[] = { -1, + 0, 1, 1, 6, 6, 2, 2, 2, 3, 3, + 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 9, 9, 7, 7, 8, 8, + 10, 10, 10, +}; +short yylen[] = { 2, + 1, 1, 5, 0, 1, 1, 3, 3, 1, 2, + 1, 3, 1, 1, 1, 2, 2, 2, 5, 3, + 3, 2, 3, 3, 0, 1, 1, 2, 1, 2, + 0, 1, 1, +}; +short yydefred[] = { 0, + 0, 14, 13, 0, 0, 0, 0, 5, 0, 0, + 0, 0, 1, 27, 0, 17, 29, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 22, 0, 28, + 30, 23, 24, 0, 0, 0, 32, 33, 0, 0, + 0, 0, 0, 0, 3, 0, 19, +}; +short yydgoto[] = { 7, + 8, 9, 10, 11, 12, 13, 15, 18, 47, 39, +}; +short yysindex[] = { -32, + -257, 0, 0, -240, -32, -32, 0, 0, -18, -32, + -36, -114, 0, 0, -246, 0, 0, -241, -14, -39, + -32, -32, -32, -114, -21, -257, -257, 0, -32, 0, + 0, 0, 0, -25, -32, -32, 0, 0, -223, -246, + -246, -36, -32, -257, 0, -246, 0, +}; +short yyrindex[] = { 35, + 1, 0, 0, 0, -5, -4, 0, 0, 14, 208, + 159, 224, 0, 0, 11, 0, 0, 40, 0, 0, + 2, 0, 0, 253, -220, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 263, 281, 0, 0, 0, 50, + 105, 214, 0, 115, 0, 149, 0, +}; +short yygindex[] = { 0, + 19, 0, 7, 37, -10, 10, -23, 0, 0, 0, +}; +#define YYTABLESIZE 511 +short yytable[] = { 24, + 15, 14, 40, 41, 4, 28, 26, 5, 27, 25, + 16, 29, 30, 2, 19, 20, 16, 31, 17, 23, + 46, 37, 33, 38, 24, 24, 32, 6, 35, 36, + 34, 3, 43, 44, 4, 4, 31, 
15, 15, 18, + 15, 15, 15, 15, 21, 15, 15, 16, 16, 20, + 16, 16, 16, 16, 2, 16, 16, 4, 15, 4, + 15, 45, 15, 15, 15, 42, 0, 0, 16, 0, + 16, 2, 16, 16, 16, 2, 18, 18, 0, 18, + 18, 18, 18, 0, 18, 18, 20, 20, 0, 20, + 20, 20, 20, 0, 20, 20, 0, 18, 0, 18, + 0, 18, 18, 18, 21, 22, 0, 20, 0, 20, + 0, 20, 20, 20, 25, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 15, 0, 15, 0, 0, 0, + 0, 0, 0, 0, 16, 0, 16, 0, 0, 0, + 0, 21, 21, 0, 21, 21, 21, 21, 26, 21, + 21, 25, 25, 0, 25, 25, 25, 25, 11, 25, + 25, 0, 21, 18, 21, 18, 21, 21, 21, 0, + 0, 0, 25, 20, 25, 20, 25, 25, 25, 0, + 0, 0, 0, 0, 0, 26, 26, 0, 26, 26, + 26, 26, 0, 26, 26, 11, 11, 0, 11, 11, + 0, 0, 0, 0, 0, 0, 26, 6, 26, 0, + 26, 26, 26, 12, 0, 0, 11, 0, 11, 0, + 11, 11, 11, 9, 1, 2, 0, 0, 21, 0, + 21, 0, 0, 0, 0, 0, 0, 0, 25, 0, + 25, 0, 0, 0, 0, 6, 0, 0, 6, 0, + 12, 12, 10, 12, 12, 0, 0, 15, 15, 0, + 9, 9, 7, 9, 9, 6, 0, 16, 16, 6, + 6, 12, 26, 12, 26, 12, 12, 12, 0, 0, + 8, 9, 11, 9, 11, 9, 9, 9, 0, 10, + 10, 0, 10, 10, 0, 0, 18, 18, 0, 0, + 7, 0, 0, 7, 0, 0, 20, 20, 0, 0, + 10, 0, 10, 0, 10, 10, 10, 0, 8, 0, + 7, 8, 0, 0, 7, 7, 0, 0, 0, 0, + 0, 6, 0, 0, 0, 0, 0, 12, 8, 12, + 0, 0, 8, 8, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 21, 21, 0, 0, 0, 0, 0, 0, 0, + 0, 25, 25, 0, 0, 0, 10, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 26, 26, 0, 0, 0, + 0, 0, 0, 0, 0, 11, 11, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, + 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, + 10, +}; +short yycheck[] = { 10, + 0, 259, 26, 27, 37, 42, 43, 40, 45, 46, + 0, 126, 259, 0, 5, 6, 257, 259, 259, 38, + 44, 43, 62, 45, 35, 36, 41, 60, 22, 23, + 21, 64, 58, 257, 0, 41, 257, 37, 38, 0, + 40, 41, 42, 43, 63, 45, 46, 37, 38, 0, + 40, 41, 42, 43, 41, 45, 46, 62, 58, 58, + 60, 43, 62, 63, 64, 29, -1, 
-1, 58, -1, + 60, 58, 62, 63, 64, 62, 37, 38, -1, 40, + 41, 42, 43, -1, 45, 46, 37, 38, -1, 40, + 41, 42, 43, -1, 45, 46, -1, 58, -1, 60, + -1, 62, 63, 64, 0, 124, -1, 58, -1, 60, + -1, 62, 63, 64, 0, -1, -1, -1, -1, -1, + -1, -1, -1, -1, 124, -1, 126, -1, -1, -1, + -1, -1, -1, -1, 124, -1, 126, -1, -1, -1, + -1, 37, 38, -1, 40, 41, 42, 43, 0, 45, + 46, 37, 38, -1, 40, 41, 42, 43, 0, 45, + 46, -1, 58, 124, 60, 126, 62, 63, 64, -1, + -1, -1, 58, 124, 60, 126, 62, 63, 64, -1, + -1, -1, -1, -1, -1, 37, 38, -1, 40, 41, + 42, 43, -1, 45, 46, 37, 38, -1, 40, 41, + -1, -1, -1, -1, -1, -1, 58, 0, 60, -1, + 62, 63, 64, 0, -1, -1, 58, -1, 60, -1, + 62, 63, 64, 0, 257, 258, -1, -1, 124, -1, + 126, -1, -1, -1, -1, -1, -1, -1, 124, -1, + 126, -1, -1, -1, -1, 38, -1, -1, 41, -1, + 37, 38, 0, 40, 41, -1, -1, 257, 258, -1, + 37, 38, 0, 40, 41, 58, -1, 257, 258, 62, + 63, 58, 124, 60, 126, 62, 63, 64, -1, -1, + 0, 58, 124, 60, 126, 62, 63, 64, -1, 37, + 38, -1, 40, 41, -1, -1, 257, 258, -1, -1, + 38, -1, -1, 41, -1, -1, 257, 258, -1, -1, + 58, -1, 60, -1, 62, 63, 64, -1, 38, -1, + 58, 41, -1, -1, 62, 63, -1, -1, -1, -1, + -1, 124, -1, -1, -1, -1, -1, 124, 58, 126, + -1, -1, 62, 63, -1, -1, -1, 124, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 257, 258, -1, -1, -1, -1, -1, -1, -1, + -1, 257, 258, -1, -1, -1, 124, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 124, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, 124, 257, 258, -1, -1, -1, + -1, -1, -1, -1, -1, 257, 258, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 257, 258, -1, -1, -1, -1, -1, -1, -1, -1, + 257, 258, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, 257, + 258, +}; +#define YYFINAL 7 +#ifndef YYDEBUG +#define YYDEBUG 0 +#endif +#define YYMAXTOKEN 259 +#if 
YYDEBUG +char *yyname[] = { +"end-of-file",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,"'%'","'&'",0,"'('","')'","'*'","'+'",0,"'-'","'.'",0,0,0,0,0,0,0,0,0,0,0, +"':'",0,"'<'",0,"'>'","'?'","'@'",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"'|'",0, +"'~'",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,"TOKEN_LETTER","TOKEN_LITERAL","TOKEN_DIGIT", +}; +char *yyrule[] = { +"$accept : expr", +"expr : optional_conditional", +"conditional : alternative", +"conditional : alternative '?' optional_conditional ':' conditional", +"optional_conditional :", +"optional_conditional : conditional", +"alternative : list", +"alternative : alternative '|' list", +"alternative : alternative '&' list", +"list : substitute", +"list : list substitute", +"substitute : string", +"substitute : substitute '~' string", +"string : '@'", +"string : TOKEN_LITERAL", +"string : TOKEN_LETTER", +"string : TOKEN_LETTER number", +"string : '%' TOKEN_LETTER", +"string : '%' digits", +"string : string '.' 
flag TOKEN_LETTER optional_number", +"string : string '+' number", +"string : string '-' number", +"string : string '*'", +"string : '(' optional_conditional ')'", +"string : '<' optional_conditional '>'", +"optional_number :", +"optional_number : number", +"number : TOKEN_DIGIT", +"number : number TOKEN_DIGIT", +"digits : TOKEN_DIGIT", +"digits : digits TOKEN_DIGIT", +"flag :", +"flag : '+'", +"flag : '-'", +}; +#endif +#ifdef YYSTACKSIZE +#undef YYMAXDEPTH +#define YYMAXDEPTH YYSTACKSIZE +#else +#ifdef YYMAXDEPTH +#define YYSTACKSIZE YYMAXDEPTH +#else +#define YYSTACKSIZE 500 +#define YYMAXDEPTH 500 +#endif +#endif +int yydebug; +int yynerrs; +int yyerrflag; +int yychar; +short *yyssp; +YYSTYPE *yyvsp; +YYSTYPE yyval; +YYSTYPE yylval; +short yyss[YYSTACKSIZE]; +YYSTYPE yyvs[YYSTACKSIZE]; +#define yystacksize YYSTACKSIZE +#line 397 "label.y" + +/* bison defines const to be empty unless __STDC__ is defined, which it +isn't under cfront */ + +#ifdef const +#undef const +#endif + +const char *spec_ptr; +const char *spec_end; +const char *spec_cur; + +int yylex() +{ + while (spec_ptr < spec_end && csspace(*spec_ptr)) + spec_ptr++; + spec_cur = spec_ptr; + if (spec_ptr >= spec_end) + return 0; + unsigned char c = *spec_ptr++; + if (csalpha(c)) { + yylval.num = c; + return TOKEN_LETTER; + } + if (csdigit(c)) { + yylval.num = c - '0'; + return TOKEN_DIGIT; + } + if (c == '\'') { + yylval.str.start = literals.length(); + for (; spec_ptr < spec_end; spec_ptr++) { + if (*spec_ptr == '\'') { + if (++spec_ptr < spec_end && *spec_ptr == '\'') + literals += '\''; + else { + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + } + else + literals += *spec_ptr; + } + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + return c; +} + +int set_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete 
parsed_label; + parsed_label = parse_result; + return 1; +} + +int set_date_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_date_label; + parsed_date_label = parse_result; + return 1; +} + +int set_short_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_short_label; + parsed_short_label = parse_result; + return 1; +} + +void yyerror(const char *message) +{ + if (spec_cur < spec_end) + command_error("label specification %1 before `%2'", message, spec_cur); + else + command_error("label specification %1 at end of string", + message, spec_cur); +} + +void at_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + ref.canonicalize_authors(result); + else { + const char *end, *start = ref.get_authors(&end); + if (start) + result.append(start, end - start); + } +} + +void format_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + return; + const label_info *lp = ref.get_label_ptr(); + int num = lp == 0 ? ref.get_number() : lp->count; + if (type != '0') + result += format_serial(type, num + 1); + else { + const char *ptr = i_to_a(num + first_number); + int pad = width - strlen(ptr); + while (--pad >= 0) + result += '0'; + result += ptr; + } +} + +static const char *format_serial(char c, int n) +{ + assert(n > 0); + static char buf[128]; // more than enough. + switch (c) { + case 'i': + case 'I': + { + char *p = buf; + // troff uses z and w to represent 10000 and 5000 in Roman + // numerals; I can find no historical basis for this usage + const char *s = c == 'i' ? 
"zwmdclxvi" : "ZWMDCLXVI"; + if (n >= 40000) + return i_to_a(n); + while (n >= 10000) { + *p++ = s[0]; + n -= 10000; + } + for (int i = 1000; i > 0; i /= 10, s += 2) { + int m = n/i; + n -= m*i; + switch (m) { + case 3: + *p++ = s[2]; + /* falls through */ + case 2: + *p++ = s[2]; + /* falls through */ + case 1: + *p++ = s[2]; + break; + case 4: + *p++ = s[2]; + *p++ = s[1]; + break; + case 8: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 7: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 6: + *p++ = s[1]; + *p++ = s[2]; + break; + case 5: + *p++ = s[1]; + break; + case 9: + *p++ = s[2]; + *p++ = s[0]; + } + } + *p = 0; + break; + } + case 'a': + case 'A': + { + char *p = buf; + // this is derived from troff/reg.c + while (n > 0) { + int d = n % 26; + if (d == 0) + d = 26; + n -= d; + n /= 26; + *p++ = c + d - 1; // ASCII dependent + } + *p-- = 0; + // Reverse it. + char *q = buf; + while (q < p) { + char temp = *q; + *q = *p; + *p = temp; + --p; + ++q; + } + break; + } + default: + assert(0); + } + return buf; +} + +void field_expr::evaluate(int, const reference &ref, + string &result, substring_position &) +{ + const char *end; + const char *start = ref.get_field(name, &end); + if (start) { + start = nth_field(number, start, &end); + if (start) + result.append(start, end - start); + } +} + +void literal_expr::evaluate(int, const reference &, + string &result, substring_position &) +{ + result += s; +} + +analyzed_expr::analyzed_expr(expression *e) +: unary_expr(e), flags(e ? 
e->analyze() : 0) +{ +} + +void analyzed_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr) + expr->evaluate(tentative, ref, result, pos); +} + +void star_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + const label_info *lp = ref.get_label_ptr(); + if (!tentative + && (lp == 0 || lp->total > 1) + && expr) + expr->evaluate(tentative, ref, result, pos); +} + +void separator_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + int is_first = pos.start < 0; + if (expr) + expr->evaluate(tentative, ref, result, pos); + if (is_first) { + pos.start = start_length; + pos.length = result.length() - start_length; + } +} + +void map_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + (*func)(temp.contents(), temp.contents() + temp.length(), result); + } +} + +void extractor_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *end, *start = (*func)(temp.contents(), + temp.contents() + temp.length(), + &end); + switch (part) { + case BEFORE: + if (start) + result.append(temp.contents(), start - temp.contents()); + else + result += temp; + break; + case MATCH: + if (start) + result.append(start, end - start); + break; + case AFTER: + if (start) + result.append(end, temp.contents() + temp.length() - end); + break; + default: + assert(0); + } + } +} + +static void first_part(int len, const char *ptr, const char *end, + string &result) +{ + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, 
ptr); + int counts = ti->sortify_non_empty(token_start, ptr); + if (counts && --len < 0) + break; + if (counts || ti->is_accent()) + result.append(token_start, ptr - token_start); + } +} + +static void last_part(int len, const char *ptr, const char *end, + string &result) +{ + const char *start = ptr; + int count = 0; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr)) + count++; + } + ptr = start; + int skip = count - len; + if (skip > 0) { + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + assert(0); + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) { + ptr = token_start; + break; + } + } + } + first_part(len, ptr, end, result); +} + +void truncate_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *start = temp.contents(); + const char *end = start + temp.length(); + if (n > 0) + first_part(n, start, end, result); + else if (n < 0) + last_part(-n, start, end, result); + } +} + +void alternative_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() == start_length && expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void list_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void substitute_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + if 
(expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() > start_length && result[result.length() - 1] == '-') { + // ought to see if pos covers the - + result.set_length(result.length() - 1); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } +} + +void conditional_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + string temp; + substring_position temp_pos; + if (expr1) + expr1->evaluate(tentative, ref, temp, temp_pos); + if (temp.length() > 0) { + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } + else { + if (expr3) + expr3->evaluate(tentative, ref, result, pos); + } +} + +void reference::pre_compute_label() +{ + if (parsed_label != 0 + && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) { + label.clear(); + substring_position temp_pos; + parsed_label->evaluate(1, *this, label, temp_pos); + label_ptr = lookup_label(label); + } +} + +void reference::compute_label() +{ + label.clear(); + if (parsed_label) + parsed_label->evaluate(0, *this, label, separator_pos); + if (short_label_flag && parsed_short_label) + parsed_short_label->evaluate(0, *this, short_label, short_separator_pos); + if (date_as_label) { + string new_date; + if (parsed_date_label) { + substring_position temp_pos; + parsed_date_label->evaluate(0, *this, new_date, temp_pos); + } + set_date(new_date); + } + if (label_ptr) + label_ptr->count += 1; +} + +void reference::immediate_compute_label() +{ + if (label_ptr) + label_ptr->total = 2; // force use of disambiguator + compute_label(); +} + +int reference::merge_labels(reference **v, int n, label_type type, + string &result) +{ + if (abbreviate_label_ranges) + return merge_labels_by_number(v, n, type, result); + else + return merge_labels_by_parts(v, n, type, result); +} + +int reference::merge_labels_by_number(reference **v, int n, label_type type, + string &result) +{ + if (n <= 1) + return 0; + int num = get_number(); + // Only merge three 
or more labels. + if (v[0]->get_number() != num + 1 + || v[1]->get_number() != num + 2) + return 0; + int i; + for (i = 2; i < n; i++) + if (v[i]->get_number() != num + i + 1) + break; + result = get_label(type); + result += label_range_indicator; + result += v[i - 1]->get_label(type); + return i; +} + +const substring_position &reference::get_separator_pos(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_separator_pos; + else + return separator_pos; +} + +const string &reference::get_label(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_label; + else + return label; +} + +int reference::merge_labels_by_parts(reference **v, int n, label_type type, + string &result) +{ + if (n <= 0) + return 0; + const string &lb = get_label(type); + const substring_position &sp = get_separator_pos(type); + if (sp.start < 0 + || sp.start != v[0]->get_separator_pos(type).start + || memcmp(lb.contents(), v[0]->get_label(type).contents(), + sp.start) != 0) + return 0; + result = lb; + int i = 0; + do { + result += separate_label_second_parts; + const substring_position &s = v[i]->get_separator_pos(type); + int sep_end_pos = s.start + s.length; + result.append(v[i]->get_label(type).contents() + sep_end_pos, + v[i]->get_label(type).length() - sep_end_pos); + } while (++i < n + && sp.start == v[i]->get_separator_pos(type).start + && memcmp(lb.contents(), v[i]->get_label(type).contents(), + sp.start) == 0); + return i; +} + +string label_pool; + +label_info::label_info(const string &s) +: start(label_pool.length()), length(s.length()), count(0), total(1) +{ + label_pool += s; +} + +static label_info **label_table = 0; +static int label_table_size = 0; +static int label_table_used = 0; + +label_info *lookup_label(const string &label) +{ + if (label_table == 0) { + label_table = new label_info *[17]; + label_table_size = 17; + for (int i = 0; i < 17; i++) + label_table[i] = 0; + } + unsigned h = 
hash_string(label.contents(), label.length()) % label_table_size; + label_info **ptr; + for (ptr = label_table + h; + *ptr != 0; + (ptr == label_table) + ? (ptr = label_table + label_table_size - 1) + : ptr--) + if ((*ptr)->length == label.length() + && memcmp(label_pool.contents() + (*ptr)->start, label.contents(), + label.length()) == 0) { + (*ptr)->total += 1; + return *ptr; + } + label_info *result = *ptr = new label_info(label); + if (++label_table_used * 2 > label_table_size) { + // Rehash the table. + label_info **old_table = label_table; + int old_size = label_table_size; + label_table_size = next_size(label_table_size); + label_table = new label_info *[label_table_size]; + int i; + for (i = 0; i < label_table_size; i++) + label_table[i] = 0; + for (i = 0; i < old_size; i++) + if (old_table[i]) { + unsigned h = hash_string(label_pool.contents() + old_table[i]->start, + old_table[i]->length); + label_info **p; + for (p = label_table + (h % label_table_size); + *p != 0; + (p == label_table) + ? (p = label_table + label_table_size - 1) + : --p) + ; + *p = old_table[i]; + } + a_delete old_table; + } + return result; +} + +void clear_labels() +{ + for (int i = 0; i < label_table_size; i++) { + delete label_table[i]; + label_table[i] = 0; + } + label_table_used = 0; + label_pool.clear(); +} + +static void consider_authors(reference **start, reference **end, int i); + +void compute_labels(reference **v, int n) +{ + if (parsed_label + && (parsed_label->analyze() & expression::CONTAINS_AT) + && sort_fields.length() >= 2 + && sort_fields[0] == 'A' + && sort_fields[1] == '+') + consider_authors(v, v + n, 0); + for (int i = 0; i < n; i++) + v[i]->compute_label(); +} + + +/* A reference with a list of authors <A0,A1,...,AN> _needs_ author i +where 0 <= i <= N if there exists a reference with a list of authors +<B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i +and Aj = Bj for 0 <= j < i. In this case we can't say ``A0, +A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and +<B0,B1,...,BM>. 
If a reference needs author i we only have to call +need_author(j) for some j >= i such that the reference also needs +author j. */ + +/* This function handles 2 tasks: +determine which authors are needed (cannot be elided with et al.); +determine which authors can have only last names in the labels. + +References >= start and < end have the same first i author names. +Also they're sorted by A+. */ + +static void consider_authors(reference **start, reference **end, int i) +{ + if (start >= end) + return; + reference **p = start; + if (i >= (*p)->get_nauthors()) { + for (++p; p < end && i >= (*p)->get_nauthors(); p++) + ; + if (p < end && i > 0) { + // If we have an author list <A B C> and an author list <A B C D>, + // then both lists need C. + for (reference **q = start; q < end; q++) + (*q)->need_author(i - 1); + } + start = p; + } + while (p < end) { + reference **last_name_start = p; + reference **name_start = p; + for (++p; + p < end && i < (*p)->get_nauthors() + && same_author_last_name(**last_name_start, **p, i); + p++) { + if (!same_author_name(**name_start, **p, i)) { + consider_authors(name_start, p, i + 1); + name_start = p; + } + } + consider_authors(name_start, p, i + 1); + if (last_name_start == name_start) { + for (reference **q = last_name_start; q < p; q++) + (*q)->set_last_name_unambiguous(i); + } + // If we have an author list <A B C D> and <A B C E>, then the lists + // need author D and E respectively. 
+ if (name_start > start || p < end) { + for (reference **q = last_name_start; q < p; q++) + (*q)->need_author(i); + } + } +} + +int same_author_last_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, 0, &ae1); + assert(as1 != 0); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, 0, &ae2); + assert(as2 != 0); + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + +int same_author_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, -1, &ae1); + assert(as1 != 0); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, -1, &ae2); + assert(as2 != 0); + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + + +void int_set::set(int i) +{ + assert(i >= 0); + int bytei = i >> 3; + if (bytei >= v.length()) { + int old_length = v.length(); + v.set_length(bytei + 1); + for (int j = old_length; j <= bytei; j++) + v[j] = 0; + } + v[bytei] |= 1 << (i & 7); +} + +int int_set::get(int i) const +{ + assert(i >= 0); + int bytei = i >> 3; + return bytei >= v.length() ? 
0 : (v[bytei] & (1 << (i & 7))) != 0; +} + +void reference::set_last_name_unambiguous(int i) +{ + last_name_unambiguous.set(i); +} + +void reference::need_author(int n) +{ + if (n > last_needed_author) + last_needed_author = n; +} + +const char *reference::get_authors(const char **end) const +{ + if (!computed_authors) { + ((reference *)this)->computed_authors = 1; + string &result = ((reference *)this)->authors; + int na = get_nauthors(); + result.clear(); + for (int i = 0; i < na; i++) { + if (last_name_unambiguous.get(i)) { + const char *e, *start = get_author_last_name(i, &e); + assert(start != 0); + result.append(start, e - start); + } + else { + const char *e, *start = get_author(i, &e); + assert(start != 0); + result.append(start, e - start); + } + if (i == last_needed_author + && et_al.length() > 0 + && et_al_min_elide > 0 + && last_needed_author + et_al_min_elide < na + && na >= et_al_min_total) { + result += et_al; + break; + } + if (i < na - 1) { + if (na == 2) + result += join_authors_exactly_two; + else if (i < na - 2) + result += join_authors_default; + else + result += join_authors_last_two; + } + } + } + const char *start = authors.contents(); + *end = start + authors.length(); + return start; +} + +int reference::get_nauthors() const +{ + if (nauthors < 0) { + const char *dummy; + int na; + for (na = 0; get_author(na, &dummy) != 0; na++) + ; + ((reference *)this)->nauthors = na; + } + return nauthors; +} +#line 1228 "y.tab.c" +#define YYABORT goto yyabort +#define YYREJECT goto yyabort +#define YYACCEPT goto yyaccept +#define YYERROR goto yyerrlab +int +#if defined(__STDC__) +yyparse(void) +#else +yyparse() +#endif +{ + register int yym, yyn, yystate; +#if YYDEBUG + register char *yys; + extern char *getenv(); + + if (yys = getenv("YYDEBUG")) + { + yyn = *yys; + if (yyn >= '0' && yyn <= '9') + yydebug = yyn - '0'; + } +#endif + + yynerrs = 0; + yyerrflag = 0; + yychar = (-1); + + yyssp = yyss; + yyvsp = yyvs; + *yyssp = yystate = 0; + +yyloop: + if 
((yyn = yydefred[yystate]) != 0) goto yyreduce; + if (yychar < 0) + { + if ((yychar = yylex()) < 0) yychar = 0; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, reading %d (%s)\n", + YYPREFIX, yystate, yychar, yys); + } +#endif + } + if ((yyn = yysindex[yystate]) && (yyn += yychar) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yychar) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, shifting to state %d\n", + YYPREFIX, yystate, yytable[yyn]); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate = yytable[yyn]; + *++yyvsp = yylval; + yychar = (-1); + if (yyerrflag > 0) --yyerrflag; + goto yyloop; + } + if ((yyn = yyrindex[yystate]) && (yyn += yychar) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yychar) + { + yyn = yytable[yyn]; + goto yyreduce; + } + if (yyerrflag) goto yyinrecovery; + yyerror("syntax error"); +#ifdef lint + goto yyerrlab; +#endif +yyerrlab: + ++yynerrs; +yyinrecovery: + if (yyerrflag < 3) + { + yyerrflag = 3; + for (;;) + { + if ((yyn = yysindex[*yyssp]) && (yyn += YYERRCODE) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == YYERRCODE) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, error recovery shifting\ + to state %d\n", YYPREFIX, *yyssp, yytable[yyn]); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate = yytable[yyn]; + *++yyvsp = yylval; + goto yyloop; + } + else + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: error recovery discarding state %d\n", + YYPREFIX, *yyssp); +#endif + if (yyssp <= yyss) goto yyabort; + --yyssp; + --yyvsp; + } + } + } + else + { + if (yychar == 0) goto yyabort; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, error recovery discards token %d (%s)\n", + YYPREFIX, yystate, yychar, yys); + 
} +#endif + yychar = (-1); + goto yyloop; + } +yyreduce: +#if YYDEBUG + if (yydebug) + printf("%sdebug: state %d, reducing by rule %d (%s)\n", + YYPREFIX, yystate, yyn, yyrule[yyn]); +#endif + yym = yylen[yyn]; + yyval = yyvsp[1-yym]; + switch (yyn) + { +case 1: +#line 250 "label.y" +{ parse_result = (yyvsp[0].expr ? new analyzed_expr(yyvsp[0].expr) : 0); } +break; +case 2: +#line 255 "label.y" +{ yyval.expr = yyvsp[0].expr; } +break; +case 3: +#line 257 "label.y" +{ yyval.expr = new conditional_expr(yyvsp[-4].expr, yyvsp[-2].expr, yyvsp[0].expr); } +break; +case 4: +#line 262 "label.y" +{ yyval.expr = 0; } +break; +case 5: +#line 264 "label.y" +{ yyval.expr = yyvsp[0].expr; } +break; +case 6: +#line 269 "label.y" +{ yyval.expr = yyvsp[0].expr; } +break; +case 7: +#line 271 "label.y" +{ yyval.expr = new alternative_expr(yyvsp[-2].expr, yyvsp[0].expr); } +break; +case 8: +#line 273 "label.y" +{ yyval.expr = new conditional_expr(yyvsp[-2].expr, yyvsp[0].expr, 0); } +break; +case 9: +#line 278 "label.y" +{ yyval.expr = yyvsp[0].expr; } +break; +case 10: +#line 280 "label.y" +{ yyval.expr = new list_expr(yyvsp[-1].expr, yyvsp[0].expr); } +break; +case 11: +#line 285 "label.y" +{ yyval.expr = yyvsp[0].expr; } +break; +case 12: +#line 287 "label.y" +{ yyval.expr = new substitute_expr(yyvsp[-2].expr, yyvsp[0].expr); } +break; +case 13: +#line 292 "label.y" +{ yyval.expr = new at_expr; } +break; +case 14: +#line 294 "label.y" +{ + yyval.expr = new literal_expr(literals.contents() + yyvsp[0].str.start, + yyvsp[0].str.len); + } +break; +case 15: +#line 299 "label.y" +{ yyval.expr = new field_expr(yyvsp[0].num, 0); } +break; +case 16: +#line 301 "label.y" +{ yyval.expr = new field_expr(yyvsp[-1].num, yyvsp[0].num - 1); } +break; +case 17: +#line 303 "label.y" +{ + switch (yyvsp[0].num) { + case 'I': + case 'i': + case 'A': + case 'a': + yyval.expr = new format_expr(yyvsp[0].num); + break; + default: + command_error("unrecognized format `%1'", char(yyvsp[0].num)); + yyval.expr 
= new format_expr('a'); + break; + } + } +break; +case 18: +#line 319 "label.y" +{ + yyval.expr = new format_expr('0', yyvsp[0].dig.ndigits, yyvsp[0].dig.val); + } +break; +case 19: +#line 323 "label.y" +{ + switch (yyvsp[-1].num) { + case 'l': + yyval.expr = new map_expr(yyvsp[-4].expr, lowercase); + break; + case 'u': + yyval.expr = new map_expr(yyvsp[-4].expr, uppercase); + break; + case 'c': + yyval.expr = new map_expr(yyvsp[-4].expr, capitalize); + break; + case 'r': + yyval.expr = new map_expr(yyvsp[-4].expr, reverse_name); + break; + case 'a': + yyval.expr = new map_expr(yyvsp[-4].expr, abbreviate_name); + break; + case 'y': + yyval.expr = new extractor_expr(yyvsp[-4].expr, find_year, yyvsp[-2].num); + break; + case 'n': + yyval.expr = new extractor_expr(yyvsp[-4].expr, find_last_name, yyvsp[-2].num); + break; + default: + yyval.expr = yyvsp[-4].expr; + command_error("unknown function `%1'", char(yyvsp[-1].num)); + break; + } + } +break; +case 20: +#line 354 "label.y" +{ yyval.expr = new truncate_expr(yyvsp[-2].expr, yyvsp[0].num); } +break; +case 21: +#line 356 "label.y" +{ yyval.expr = new truncate_expr(yyvsp[-2].expr, -yyvsp[0].num); } +break; +case 22: +#line 358 "label.y" +{ yyval.expr = new star_expr(yyvsp[-1].expr); } +break; +case 23: +#line 360 "label.y" +{ yyval.expr = yyvsp[-1].expr; } +break; +case 24: +#line 362 "label.y" +{ yyval.expr = new separator_expr(yyvsp[-1].expr); } +break; +case 25: +#line 367 "label.y" +{ yyval.num = -1; } +break; +case 26: +#line 369 "label.y" +{ yyval.num = yyvsp[0].num; } +break; +case 27: +#line 374 "label.y" +{ yyval.num = yyvsp[0].num; } +break; +case 28: +#line 376 "label.y" +{ yyval.num = yyvsp[-1].num*10 + yyvsp[0].num; } +break; +case 29: +#line 381 "label.y" +{ yyval.dig.ndigits = 1; yyval.dig.val = yyvsp[0].num; } +break; +case 30: +#line 383 "label.y" +{ yyval.dig.ndigits = yyvsp[-1].dig.ndigits + 1; yyval.dig.val = yyvsp[-1].dig.val*10 + yyvsp[0].num; } +break; +case 31: +#line 389 "label.y" +{ yyval.num 
= 0; } +break; +case 32: +#line 391 "label.y" +{ yyval.num = 1; } +break; +case 33: +#line 393 "label.y" +{ yyval.num = -1; } +break; +#line 1547 "y.tab.c" + } + yyssp -= yym; + yystate = *yyssp; + yyvsp -= yym; + yym = yylhs[yyn]; + if (yystate == 0 && yym == 0) + { +#if YYDEBUG + if (yydebug) + printf("%sdebug: after reduction, shifting from state 0 to\ + state %d\n", YYPREFIX, YYFINAL); +#endif + yystate = YYFINAL; + *++yyssp = YYFINAL; + *++yyvsp = yyval; + if (yychar < 0) + { + if ((yychar = yylex()) < 0) yychar = 0; +#if YYDEBUG + if (yydebug) + { + yys = 0; + if (yychar <= YYMAXTOKEN) yys = yyname[yychar]; + if (!yys) yys = "illegal-symbol"; + printf("%sdebug: state %d, reading %d (%s)\n", + YYPREFIX, YYFINAL, yychar, yys); + } +#endif + } + if (yychar == 0) goto yyaccept; + goto yyloop; + } + if ((yyn = yygindex[yym]) && (yyn += yystate) >= 0 && + yyn <= YYTABLESIZE && yycheck[yyn] == yystate) + yystate = yytable[yyn]; + else + yystate = yydgoto[yym]; +#if YYDEBUG + if (yydebug) + printf("%sdebug: after reduction, shifting from state %d \ +to state %d\n", YYPREFIX, *yyssp, yystate); +#endif + if (yyssp >= yyss + yystacksize - 1) + { + goto yyoverflow; + } + *++yyssp = yystate; + *++yyvsp = yyval; + goto yyloop; +yyoverflow: + yyerror("yacc stack overflow"); +yyabort: + return (1); +yyaccept: + return (0); +} diff --git a/contrib/groff/src/preproc/refer/label.y b/contrib/groff/src/preproc/refer/label.y new file mode 100644 index 0000000..2648b98 --- /dev/null +++ b/contrib/groff/src/preproc/refer/label.y @@ -0,0 +1,1177 @@ +/* -*- C++ -*- + Copyright (C) 1989, 1990, 1991, 1992, 2000 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. 
+ +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +%{ + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" + +int yylex(); +void yyerror(const char *); +int yyparse(); + +static const char *format_serial(char c, int n); + +struct label_info { + int start; + int length; + int count; + int total; + label_info(const string &); +}; + +label_info *lookup_label(const string &label); + +struct expression { + enum { + // Does the tentative label depend on the reference? + CONTAINS_VARIABLE = 01, + CONTAINS_STAR = 02, + CONTAINS_FORMAT = 04, + CONTAINS_AT = 010 + }; + virtual ~expression() { } + virtual void evaluate(int, const reference &, string &, + substring_position &) = 0; + virtual unsigned analyze() { return 0; } +}; + +class at_expr : public expression { +public: + at_expr() { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; } +}; + +class format_expr : public expression { + char type; + int width; + int first_number; +public: + format_expr(char c, int w = 0, int f = 1) + : type(c), width(w), first_number(f) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_FORMAT; } +}; + +class field_expr : public expression { + int number; + char name; +public: + field_expr(char nm, int num) : number(num), name(nm) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return CONTAINS_VARIABLE; } +}; + +class literal_expr : public expression { + string s; +public: 
+ literal_expr(const char *ptr, int len) : s(ptr, len) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class unary_expr : public expression { +protected: + expression *expr; +public: + unary_expr(expression *e) : expr(e) { } + ~unary_expr() { delete expr; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { return expr ? expr->analyze() : 0; } +}; + +// This caches the analysis of an expression. + +class analyzed_expr : public unary_expr { + unsigned flags; +public: + analyzed_expr(expression *); + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { return flags; } +}; + +class star_expr : public unary_expr { +public: + star_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const reference &, string &, substring_position &); + unsigned analyze() { + return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0) + | CONTAINS_STAR); + } +}; + +typedef void map_func(const char *, const char *, string &); + +class map_expr : public unary_expr { + map_func *func; +public: + map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +typedef const char *extractor_func(const char *, const char *, const char **); + +class extractor_expr : public unary_expr { + int part; + extractor_func *func; +public: + enum { BEFORE = +1, MATCH = 0, AFTER = -1 }; + extractor_expr(expression *e, extractor_func *f, int pt) + : unary_expr(e), part(pt), func(f) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class truncate_expr : public unary_expr { + int n; +public: + truncate_expr(expression *e, int i) : unary_expr(e), n(i) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class separator_expr : public unary_expr { +public: + separator_expr(expression *e) : unary_expr(e) { } + void evaluate(int, const 
reference &, string &, substring_position &); +}; + +class binary_expr : public expression { +protected: + expression *expr1; + expression *expr2; +public: + binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { } + ~binary_expr() { delete expr1; delete expr2; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0); + } +}; + +class alternative_expr : public binary_expr { +public: + alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class list_expr : public binary_expr { +public: + list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class substitute_expr : public binary_expr { +public: + substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +class ternary_expr : public expression { +protected: + expression *expr1; + expression *expr2; + expression *expr3; +public: + ternary_expr(expression *e1, expression *e2, expression *e3) + : expr1(e1), expr2(e2), expr3(e3) { } + ~ternary_expr() { delete expr1; delete expr2; delete expr3; } + void evaluate(int, const reference &, string &, substring_position &) = 0; + unsigned analyze() { + return ((expr1 ? expr1->analyze() : 0) + | (expr2 ? expr2->analyze() : 0) + | (expr3 ? 
expr3->analyze() : 0)); + } +}; + +class conditional_expr : public ternary_expr { +public: + conditional_expr(expression *e1, expression *e2, expression *e3) + : ternary_expr(e1, e2, e3) { } + void evaluate(int, const reference &, string &, substring_position &); +}; + +static expression *parsed_label = 0; +static expression *parsed_date_label = 0; +static expression *parsed_short_label = 0; + +static expression *parse_result; + +string literals; + +%} + +%union { + int num; + expression *expr; + struct { int ndigits; int val; } dig; + struct { int start; int len; } str; +} + +/* uppercase or lowercase letter */ +%token TOKEN_LETTER +/* literal characters */ +%token TOKEN_LITERAL +/* digit */ +%token TOKEN_DIGIT + +%type conditional +%type alternative +%type list +%type string +%type substitute +%type optional_conditional +%type number +%type digits +%type optional_number +%type flag + +%% + +expr: + optional_conditional + { parse_result = ($1 ? new analyzed_expr($1) : 0); } + ; + +conditional: + alternative + { $$ = $1; } + | alternative '?' 
optional_conditional ':' conditional + { $$ = new conditional_expr($1, $3, $5); } + ; + +optional_conditional: + /* empty */ + { $$ = 0; } + | conditional + { $$ = $1; } + ; + +alternative: + list + { $$ = $1; } + | alternative '|' list + { $$ = new alternative_expr($1, $3); } + | alternative '&' list + { $$ = new conditional_expr($1, $3, 0); } + ; + +list: + substitute + { $$ = $1; } + | list substitute + { $$ = new list_expr($1, $2); } + ; + +substitute: + string + { $$ = $1; } + | substitute '~' string + { $$ = new substitute_expr($1, $3); } + ; + +string: + '@' + { $$ = new at_expr; } + | TOKEN_LITERAL + { + $$ = new literal_expr(literals.contents() + $1.start, + $1.len); + } + | TOKEN_LETTER + { $$ = new field_expr($1, 0); } + | TOKEN_LETTER number + { $$ = new field_expr($1, $2 - 1); } + | '%' TOKEN_LETTER + { + switch ($2) { + case 'I': + case 'i': + case 'A': + case 'a': + $$ = new format_expr($2); + break; + default: + command_error("unrecognized format `%1'", char($2)); + $$ = new format_expr('a'); + break; + } + } + + | '%' digits + { + $$ = new format_expr('0', $2.ndigits, $2.val); + } + | string '.' 
flag TOKEN_LETTER optional_number + { + switch ($4) { + case 'l': + $$ = new map_expr($1, lowercase); + break; + case 'u': + $$ = new map_expr($1, uppercase); + break; + case 'c': + $$ = new map_expr($1, capitalize); + break; + case 'r': + $$ = new map_expr($1, reverse_name); + break; + case 'a': + $$ = new map_expr($1, abbreviate_name); + break; + case 'y': + $$ = new extractor_expr($1, find_year, $3); + break; + case 'n': + $$ = new extractor_expr($1, find_last_name, $3); + break; + default: + $$ = $1; + command_error("unknown function `%1'", char($4)); + break; + } + } + + | string '+' number + { $$ = new truncate_expr($1, $3); } + | string '-' number + { $$ = new truncate_expr($1, -$3); } + | string '*' + { $$ = new star_expr($1); } + | '(' optional_conditional ')' + { $$ = $2; } + | '<' optional_conditional '>' + { $$ = new separator_expr($2); } + ; + +optional_number: + /* empty */ + { $$ = -1; } + | number + { $$ = $1; } + ; + +number: + TOKEN_DIGIT + { $$ = $1; } + | number TOKEN_DIGIT + { $$ = $1*10 + $2; } + ; + +digits: + TOKEN_DIGIT + { $$.ndigits = 1; $$.val = $1; } + | digits TOKEN_DIGIT + { $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; } + ; + + +flag: + /* empty */ + { $$ = 0; } + | '+' + { $$ = 1; } + | '-' + { $$ = -1; } + ; + +%% + +/* bison defines const to be empty unless __STDC__ is defined, which it +isn't under cfront */ + +#ifdef const +#undef const +#endif + +const char *spec_ptr; +const char *spec_end; +const char *spec_cur; + +int yylex() +{ + while (spec_ptr < spec_end && csspace(*spec_ptr)) + spec_ptr++; + spec_cur = spec_ptr; + if (spec_ptr >= spec_end) + return 0; + unsigned char c = *spec_ptr++; + if (csalpha(c)) { + yylval.num = c; + return TOKEN_LETTER; + } + if (csdigit(c)) { + yylval.num = c - '0'; + return TOKEN_DIGIT; + } + if (c == '\'') { + yylval.str.start = literals.length(); + for (; spec_ptr < spec_end; spec_ptr++) { + if (*spec_ptr == '\'') { + if (++spec_ptr < spec_end && *spec_ptr == '\'') + literals += '\''; 
+ else { + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + } + else + literals += *spec_ptr; + } + yylval.str.len = literals.length() - yylval.str.start; + return TOKEN_LITERAL; + } + return c; +} + +int set_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_label; + parsed_label = parse_result; + return 1; +} + +int set_date_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_date_label; + parsed_date_label = parse_result; + return 1; +} + +int set_short_label_spec(const char *label_spec) +{ + spec_cur = spec_ptr = label_spec; + spec_end = strchr(label_spec, '\0'); + literals.clear(); + if (yyparse()) + return 0; + delete parsed_short_label; + parsed_short_label = parse_result; + return 1; +} + +void yyerror(const char *message) +{ + if (spec_cur < spec_end) + command_error("label specification %1 before `%2'", message, spec_cur); + else + command_error("label specification %1 at end of string", + message, spec_cur); +} + +void at_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + ref.canonicalize_authors(result); + else { + const char *end, *start = ref.get_authors(&end); + if (start) + result.append(start, end - start); + } +} + +void format_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (tentative) + return; + const label_info *lp = ref.get_label_ptr(); + int num = lp == 0 ? 
ref.get_number() : lp->count; + if (type != '0') + result += format_serial(type, num + 1); + else { + const char *ptr = i_to_a(num + first_number); + int pad = width - strlen(ptr); + while (--pad >= 0) + result += '0'; + result += ptr; + } +} + +static const char *format_serial(char c, int n) +{ + assert(n > 0); + static char buf[128]; // more than enough. + switch (c) { + case 'i': + case 'I': + { + char *p = buf; + // troff uses z and w to represent 10000 and 5000 in Roman + // numerals; I can find no historical basis for this usage + const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI"; + if (n >= 40000) + return i_to_a(n); + while (n >= 10000) { + *p++ = s[0]; + n -= 10000; + } + for (int i = 1000; i > 0; i /= 10, s += 2) { + int m = n/i; + n -= m*i; + switch (m) { + case 3: + *p++ = s[2]; + /* falls through */ + case 2: + *p++ = s[2]; + /* falls through */ + case 1: + *p++ = s[2]; + break; + case 4: + *p++ = s[2]; + *p++ = s[1]; + break; + case 8: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 7: + *p++ = s[1]; + *p++ = s[2]; + *p++ = s[2]; + break; + case 6: + *p++ = s[1]; + *p++ = s[2]; + break; + case 5: + *p++ = s[1]; + break; + case 9: + *p++ = s[2]; + *p++ = s[0]; + } + } + *p = 0; + break; + } + case 'a': + case 'A': + { + char *p = buf; + // this is derived from troff/reg.c + while (n > 0) { + int d = n % 26; + if (d == 0) + d = 26; + n -= d; + n /= 26; + *p++ = c + d - 1; // ASCII dependent + } + *p-- = 0; + // Reverse it. 
+ char *q = buf; + while (q < p) { + char temp = *q; + *q = *p; + *p = temp; + --p; + ++q; + } + break; + } + default: + assert(0); + } + return buf; +} + +void field_expr::evaluate(int, const reference &ref, + string &result, substring_position &) +{ + const char *end; + const char *start = ref.get_field(name, &end); + if (start) { + start = nth_field(number, start, &end); + if (start) + result.append(start, end - start); + } +} + +void literal_expr::evaluate(int, const reference &, + string &result, substring_position &) +{ + result += s; +} + +analyzed_expr::analyzed_expr(expression *e) +: unary_expr(e), flags(e ? e->analyze() : 0) +{ +} + +void analyzed_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr) + expr->evaluate(tentative, ref, result, pos); +} + +void star_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + const label_info *lp = ref.get_label_ptr(); + if (!tentative + && (lp == 0 || lp->total > 1) + && expr) + expr->evaluate(tentative, ref, result, pos); +} + +void separator_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + int is_first = pos.start < 0; + if (expr) + expr->evaluate(tentative, ref, result, pos); + if (is_first) { + pos.start = start_length; + pos.length = result.length() - start_length; + } +} + +void map_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + (*func)(temp.contents(), temp.contents() + temp.length(), result); + } +} + +void extractor_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *end, *start = 
(*func)(temp.contents(), + temp.contents() + temp.length(), + &end); + switch (part) { + case BEFORE: + if (start) + result.append(temp.contents(), start - temp.contents()); + else + result += temp; + break; + case MATCH: + if (start) + result.append(start, end - start); + break; + case AFTER: + if (start) + result.append(end, temp.contents() + temp.length() - end); + break; + default: + assert(0); + } + } +} + +static void first_part(int len, const char *ptr, const char *end, + string &result) +{ + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, ptr); + int counts = ti->sortify_non_empty(token_start, ptr); + if (counts && --len < 0) + break; + if (counts || ti->is_accent()) + result.append(token_start, ptr - token_start); + } +} + +static void last_part(int len, const char *ptr, const char *end, + string &result) +{ + const char *start = ptr; + int count = 0; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr)) + count++; + } + ptr = start; + int skip = count - len; + if (skip > 0) { + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + assert(0); + const token_info *ti = lookup_token(token_start, ptr); + if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) { + ptr = token_start; + break; + } + } + } + first_part(len, ptr, end, result); +} + +void truncate_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &) +{ + if (expr) { + string temp; + substring_position temp_pos; + expr->evaluate(tentative, ref, temp, temp_pos); + const char *start = temp.contents(); + const char *end = start + temp.length(); + if (n > 0) + first_part(n, start, end, result); + else if (n < 0) + last_part(-n, start, end, result); + } +} + +void alternative_expr::evaluate(int tentative, const reference &ref, + 
string &result, substring_position &pos) +{ + int start_length = result.length(); + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() == start_length && expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void list_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); +} + +void substitute_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + int start_length = result.length(); + if (expr1) + expr1->evaluate(tentative, ref, result, pos); + if (result.length() > start_length && result[result.length() - 1] == '-') { + // ought to see if pos covers the - + result.set_length(result.length() - 1); + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } +} + +void conditional_expr::evaluate(int tentative, const reference &ref, + string &result, substring_position &pos) +{ + string temp; + substring_position temp_pos; + if (expr1) + expr1->evaluate(tentative, ref, temp, temp_pos); + if (temp.length() > 0) { + if (expr2) + expr2->evaluate(tentative, ref, result, pos); + } + else { + if (expr3) + expr3->evaluate(tentative, ref, result, pos); + } +} + +void reference::pre_compute_label() +{ + if (parsed_label != 0 + && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) { + label.clear(); + substring_position temp_pos; + parsed_label->evaluate(1, *this, label, temp_pos); + label_ptr = lookup_label(label); + } +} + +void reference::compute_label() +{ + label.clear(); + if (parsed_label) + parsed_label->evaluate(0, *this, label, separator_pos); + if (short_label_flag && parsed_short_label) + parsed_short_label->evaluate(0, *this, short_label, short_separator_pos); + if (date_as_label) { + string new_date; + if (parsed_date_label) { + substring_position temp_pos; + parsed_date_label->evaluate(0, *this, new_date, 
temp_pos); + } + set_date(new_date); + } + if (label_ptr) + label_ptr->count += 1; +} + +void reference::immediate_compute_label() +{ + if (label_ptr) + label_ptr->total = 2; // force use of disambiguator + compute_label(); +} + +int reference::merge_labels(reference **v, int n, label_type type, + string &result) +{ + if (abbreviate_label_ranges) + return merge_labels_by_number(v, n, type, result); + else + return merge_labels_by_parts(v, n, type, result); +} + +int reference::merge_labels_by_number(reference **v, int n, label_type type, + string &result) +{ + if (n <= 1) + return 0; + int num = get_number(); + // Only merge three or more labels. + if (v[0]->get_number() != num + 1 + || v[1]->get_number() != num + 2) + return 0; + int i; + for (i = 2; i < n; i++) + if (v[i]->get_number() != num + i + 1) + break; + result = get_label(type); + result += label_range_indicator; + result += v[i - 1]->get_label(type); + return i; +} + +const substring_position &reference::get_separator_pos(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_separator_pos; + else + return separator_pos; +} + +const string &reference::get_label(label_type type) const +{ + if (type == SHORT_LABEL && short_label_flag) + return short_label; + else + return label; +} + +int reference::merge_labels_by_parts(reference **v, int n, label_type type, + string &result) +{ + if (n <= 0) + return 0; + const string &lb = get_label(type); + const substring_position &sp = get_separator_pos(type); + if (sp.start < 0 + || sp.start != v[0]->get_separator_pos(type).start + || memcmp(lb.contents(), v[0]->get_label(type).contents(), + sp.start) != 0) + return 0; + result = lb; + int i = 0; + do { + result += separate_label_second_parts; + const substring_position &s = v[i]->get_separator_pos(type); + int sep_end_pos = s.start + s.length; + result.append(v[i]->get_label(type).contents() + sep_end_pos, + v[i]->get_label(type).length() - sep_end_pos); + } while (++i < n + && 
sp.start == v[i]->get_separator_pos(type).start + && memcmp(lb.contents(), v[i]->get_label(type).contents(), + sp.start) == 0); + return i; +} + +string label_pool; + +label_info::label_info(const string &s) +: start(label_pool.length()), length(s.length()), count(0), total(1) +{ + label_pool += s; +} + +static label_info **label_table = 0; +static int label_table_size = 0; +static int label_table_used = 0; + +label_info *lookup_label(const string &label) +{ + if (label_table == 0) { + label_table = new label_info *[17]; + label_table_size = 17; + for (int i = 0; i < 17; i++) + label_table[i] = 0; + } + unsigned h = hash_string(label.contents(), label.length()) % label_table_size; + label_info **ptr; + for (ptr = label_table + h; + *ptr != 0; + (ptr == label_table) + ? (ptr = label_table + label_table_size - 1) + : ptr--) + if ((*ptr)->length == label.length() + && memcmp(label_pool.contents() + (*ptr)->start, label.contents(), + label.length()) == 0) { + (*ptr)->total += 1; + return *ptr; + } + label_info *result = *ptr = new label_info(label); + if (++label_table_used * 2 > label_table_size) { + // Rehash the table. + label_info **old_table = label_table; + int old_size = label_table_size; + label_table_size = next_size(label_table_size); + label_table = new label_info *[label_table_size]; + int i; + for (i = 0; i < label_table_size; i++) + label_table[i] = 0; + for (i = 0; i < old_size; i++) + if (old_table[i]) { + unsigned h = hash_string(label_pool.contents() + old_table[i]->start, + old_table[i]->length); + label_info **p; + for (p = label_table + (h % label_table_size); + *p != 0; + (p == label_table) + ? 
(p = label_table + label_table_size - 1) + : --p) + ; + *p = old_table[i]; + } + a_delete old_table; + } + return result; +} + +void clear_labels() +{ + for (int i = 0; i < label_table_size; i++) { + delete label_table[i]; + label_table[i] = 0; + } + label_table_used = 0; + label_pool.clear(); +} + +static void consider_authors(reference **start, reference **end, int i); + +void compute_labels(reference **v, int n) +{ + if (parsed_label + && (parsed_label->analyze() & expression::CONTAINS_AT) + && sort_fields.length() >= 2 + && sort_fields[0] == 'A' + && sort_fields[1] == '+') + consider_authors(v, v + n, 0); + for (int i = 0; i < n; i++) + v[i]->compute_label(); +} + + +/* A reference with a list of authors _needs_ author i +where 0 <= i <= N if there exists a reference with a list of authors + such that != and M >= i +and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0, +A1,...,A(i-1) et al'' because this would match both and +. If a reference needs author i we only have to call +need_author(j) for some j >= i such that the reference also needs +author j. */ + +/* This function handles 2 tasks: +determine which authors are needed (cannot be elided with et al.); +determine which authors can have only last names in the labels. + +References >= start and < end have the same first i author names. +Also they're sorted by A+. */ + +static void consider_authors(reference **start, reference **end, int i) +{ + if (start >= end) + return; + reference **p = start; + if (i >= (*p)->get_nauthors()) { + for (++p; p < end && i >= (*p)->get_nauthors(); p++) + ; + if (p < end && i > 0) { + // If we have an author list and an author list , + // then both lists need C. 
+ for (reference **q = start; q < end; q++) + (*q)->need_author(i - 1); + } + start = p; + } + while (p < end) { + reference **last_name_start = p; + reference **name_start = p; + for (++p; + p < end && i < (*p)->get_nauthors() + && same_author_last_name(**last_name_start, **p, i); + p++) { + if (!same_author_name(**name_start, **p, i)) { + consider_authors(name_start, p, i + 1); + name_start = p; + } + } + consider_authors(name_start, p, i + 1); + if (last_name_start == name_start) { + for (reference **q = last_name_start; q < p; q++) + (*q)->set_last_name_unambiguous(i); + } + // If we have an author list and , then the lists + // need author D and E respectively. + if (name_start > start || p < end) { + for (reference **q = last_name_start; q < p; q++) + (*q)->need_author(i); + } + } +} + +int same_author_last_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, 0, &ae1); + assert(as1 != 0); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, 0, &ae2); + assert(as2 != 0); + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + +int same_author_name(const reference &r1, const reference &r2, int n) +{ + const char *ae1; + const char *as1 = r1.get_sort_field(0, n, -1, &ae1); + assert(as1 != 0); + const char *ae2; + const char *as2 = r2.get_sort_field(0, n, -1, &ae2); + assert(as2 != 0); + return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0; +} + + +void int_set::set(int i) +{ + assert(i >= 0); + int bytei = i >> 3; + if (bytei >= v.length()) { + int old_length = v.length(); + v.set_length(bytei + 1); + for (int j = old_length; j <= bytei; j++) + v[j] = 0; + } + v[bytei] |= 1 << (i & 7); +} + +int int_set::get(int i) const +{ + assert(i >= 0); + int bytei = i >> 3; + return bytei >= v.length() ? 
0 : (v[bytei] & (1 << (i & 7))) != 0; +} + +void reference::set_last_name_unambiguous(int i) +{ + last_name_unambiguous.set(i); +} + +void reference::need_author(int n) +{ + if (n > last_needed_author) + last_needed_author = n; +} + +const char *reference::get_authors(const char **end) const +{ + if (!computed_authors) { + ((reference *)this)->computed_authors = 1; + string &result = ((reference *)this)->authors; + int na = get_nauthors(); + result.clear(); + for (int i = 0; i < na; i++) { + if (last_name_unambiguous.get(i)) { + const char *e, *start = get_author_last_name(i, &e); + assert(start != 0); + result.append(start, e - start); + } + else { + const char *e, *start = get_author(i, &e); + assert(start != 0); + result.append(start, e - start); + } + if (i == last_needed_author + && et_al.length() > 0 + && et_al_min_elide > 0 + && last_needed_author + et_al_min_elide < na + && na >= et_al_min_total) { + result += et_al; + break; + } + if (i < na - 1) { + if (na == 2) + result += join_authors_exactly_two; + else if (i < na - 2) + result += join_authors_default; + else + result += join_authors_last_two; + } + } + } + const char *start = authors.contents(); + *end = start + authors.length(); + return start; +} + +int reference::get_nauthors() const +{ + if (nauthors < 0) { + const char *dummy; + int na; + for (na = 0; get_author(na, &dummy) != 0; na++) + ; + ((reference *)this)->nauthors = na; + } + return nauthors; +} diff --git a/contrib/groff/src/preproc/refer/ref.cc b/contrib/groff/src/preproc/refer/ref.cc new file mode 100644 index 0000000..c3517b1 --- /dev/null +++ b/contrib/groff/src/preproc/refer/ref.cc @@ -0,0 +1,1160 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. +Written by James Clark (jjc@jclark.com) + +This file is part of groff. 
+ +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" + +static const char *find_day(const char *, const char *, const char **); +static int find_month(const char *start, const char *end); +static void abbreviate_names(string &); + +#define DEFAULT_ARTICLES "the\000a\000an" + +string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES)); + +// Multiple occurrences of fields are separated by FIELD_SEPARATOR. 
// Build a reference from the raw record text [start, start + len).
//
// ridp, when non-null, is copied into rid.  When start is null or len is
// not positive the object is left with no fields.  Otherwise the text must
// begin with '%' (asserted) and is split into fields keyed by the character
// that follows the '%'.  Field text is accumulated in the file-static
// temp_fields[] scratch array and then moved into the freshly allocated
// field[] array, in increasing character-code order, with field_index[]
// mapping each field character to its slot.
reference::reference(const char *start, int len, reference_id *ridp)
: h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
  computed_authors(0), last_needed_author(-1), nauthors(-1)
{
  int i;
  // Start with every field character marked as absent.
  for (i = 0; i < 256; i++)
    field_index[i] = NULL_FIELD_INDEX;
  if (ridp)
    rid = *ridp;
  if (start == 0)
    return;
  if (len <= 0)
    return;
  const char *end = start + len;
  const char *ptr = start;
  assert(*ptr == '%');
  // Invariant: at the top of each iteration ptr is at a '%' that starts a
  // field, or at end.
  while (ptr < end) {
    // Case 1: "%c" where c is the annotation field, or "%%c" where c is
    // not in discard_fields.  The text is appended as-is, newlines
    // included, to whatever temp_fields[c] already holds.
    if (ptr + 1 < end && ptr[1] != '\0'
        && ((ptr[1] != '%' && ptr[1] == annotation_field)
            || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
                && discard_fields.search(ptr[2]) < 0))) {
      // For the "%%c" form step over the first '%' so ptr[1] is c.
      if (ptr[1] == '%')
        ptr++;
      string &f = temp_fields[(unsigned char)ptr[1]];
      ptr += 2;
      // Skip white space between the field name and its text.
      while (ptr < end && csspace(*ptr))
        ptr++;
      // Copy whole lines until the input ends or a line starts with '%'.
      for (;;) {
        for (;;) {
          if (ptr >= end) {
            // Input ran out before a newline was seen: supply one.
            f += '\n';
            break;
          }
          f += *ptr;
          if (*ptr++ == '\n')
            break;
        }
        if (ptr >= end || *ptr == '%')
          break;
      }
    }
    // Case 2: ordinary "%c" field that is not being discarded.
    else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
             && discard_fields.search(ptr[1]) < 0) {
      string &f = temp_fields[(unsigned char)ptr[1]];
      if (f.length() > 0) {
        // A repeated field: fields named in MULTI_FIELD_NAMES keep every
        // occurrence, joined by FIELD_SEPARATOR; for any other field the
        // later occurrence replaces the earlier one.
        if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
          f += FIELD_SEPARATOR;
        else
          f.clear();
      }
      ptr += 2;
      if (ptr < end) {
        // At most one space after the field name is dropped.
        if (*ptr == ' ')
          ptr++;
        // Copy the field line by line; continuation lines are joined
        // with a single space.
        for (;;) {
          const char *p = ptr;
          while (ptr < end && *ptr != '\n')
            ptr++;
          // strip trailing white space
          const char *q = ptr;
          while (q > p && q[-1] != '\n' && csspace(q[-1]))
            q--;
          while (p < q)
            f += *p++;
          if (ptr >= end)
            break;
          // Step over the newline; stop if that was the last character
          // or the next line starts a new field.
          ptr++;
          if (ptr >= end)
            break;
          if (*ptr == '%')
            break;
          f += ' ';
        }
      }
    }
    // Case 3: anything else (a discarded field or a malformed "%" line).
    else {
      // skip this field, i.e. advance to the next line that starts with
      // '%' or to the end of the input
      for (;;) {
        while (ptr < end && *ptr++ != '\n')
          ;
        if (ptr >= end || *ptr == '%')
          break;
      }
    }
  }
  // Count the non-empty scratch entries so field[] can be sized exactly.
  for (i = 0; i < 256; i++)
    if (temp_fields[i].length() > 0)
      nfields++;
  field = new string[nfields];
  int j = 0;
  // Move each collected field into its slot; fields whose character is
  // listed in abbreviate_fields are passed through abbreviate_names().
  for (i = 0; i < 256; i++)
    if (temp_fields[i].length() > 0) {
      field[j].move(temp_fields[i]);
      if (abbreviate_fields.search(i) >= 0)
        abbreviate_names(field[j]);
      field_index[i] = j;
      j++;
    }
}
256; i++) + if (field_index[i] != NULL_FIELD_INDEX) + field_index[i] += 1; +} + +void reference::delete_field(unsigned char c) +{ + if (field_index[c] == NULL_FIELD_INDEX) + return; + string *old_field = field; + field = new string[nfields - 1]; + int i; + for (i = 0; i < int(field_index[c]); i++) + field[i].move(old_field[i]); + for (i = field_index[c]; i < nfields - 1; i++) + field[i].move(old_field[i + 1]); + if (nfields > 0) + ad_delete(nfields) old_field; + nfields--; + field_index[c] = NULL_FIELD_INDEX; + for (i = c + 1; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX) + field_index[i] -= 1; +} + +void reference::compute_hash_code() +{ + if (!rid.is_null()) + h = rid.hash(); + else { + h = 0; + for (int i = 0; i < nfields; i++) + if (field[i].length() > 0) { + h <<= 4; + h ^= hash_string(field[i].contents(), field[i].length()); + } + } +} + +void reference::set_number(int n) +{ + no = n; +} + +const char SORT_SEP = '\001'; +const char SORT_SUB_SEP = '\002'; +const char SORT_SUB_SUB_SEP = '\003'; + +// sep specifies additional word separators + +void sortify_words(const char *s, const char *end, const char *sep, + string &result) +{ + int non_empty = 0; + int need_separator = 0; + for (;;) { + const char *token_start = s; + if (!get_token(&s, end)) + break; + if ((s - token_start == 1 + && (*token_start == ' ' + || *token_start == '\n' + || (sep && *token_start != '\0' + && strchr(sep, *token_start) != 0))) + || (s - token_start == 2 + && token_start[0] == '\\' && token_start[1] == ' ')) { + if (non_empty) + need_separator = 1; + } + else { + const token_info *ti = lookup_token(token_start, s); + if (ti->sortify_non_empty(token_start, s)) { + if (need_separator) { + result += ' '; + need_separator = 0; + } + ti->sortify(token_start, s, result); + non_empty = 1; + } + } + } +} + +void sortify_word(const char *s, const char *end, string &result) +{ + for (;;) { + const char *token_start = s; + if (!get_token(&s, end)) + break; + const token_info *ti = 
lookup_token(token_start, s); + ti->sortify(token_start, s, result); + } +} + +void sortify_other(const char *s, int len, string &key) +{ + sortify_words(s, s + len, 0, key); +} + +void sortify_title(const char *s, int len, string &key) +{ + const char *end = s + len; + for (; s < end && (*s == ' ' || *s == '\n'); s++) + ; + const char *ptr = s; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + if (ptr - token_start == 1 + && (*token_start == ' ' || *token_start == '\n')) + break; + } + if (ptr < end) { + int first_word_len = ptr - s - 1; + const char *ae = articles.contents() + articles.length(); + for (const char *a = articles.contents(); + a < ae; + a = strchr(a, '\0') + 1) + if (first_word_len == strlen(a)) { + int j; + for (j = 0; j < first_word_len; j++) + if (a[j] != cmlower(s[j])) + break; + if (j >= first_word_len) { + s = ptr; + for (; s < end && (*s == ' ' || *s == '\n'); s++) + ; + break; + } + } + } + sortify_words(s, end, 0, key); +} + +void sortify_name(const char *s, int len, string &key) +{ + const char *last_name_end; + const char *last_name = find_last_name(s, s + len, &last_name_end); + sortify_word(last_name, last_name_end, key); + key += SORT_SUB_SUB_SEP; + if (last_name > s) + sortify_words(s, last_name, ".", key); + key += SORT_SUB_SUB_SEP; + if (last_name_end < s + len) + sortify_words(last_name_end, s + len, ".,", key); +} + +void sortify_date(const char *s, int len, string &key) +{ + const char *year_end; + const char *year_start = find_year(s, s + len, &year_end); + if (!year_start) { + // Things without years are often `forthcoming', so it makes sense + // that they sort after things with explicit years. 
+ key += 'A'; + sortify_words(s, s + len, 0, key); + return; + } + int n = year_end - year_start; + while (n < 4) { + key += '0'; + n++; + } + while (year_start < year_end) + key += *year_start++; + int m = find_month(s, s + len); + if (m < 0) + return; + key += 'A' + m; + const char *day_end; + const char *day_start = find_day(s, s + len, &day_end); + if (!day_start) + return; + if (day_end - day_start == 1) + key += '0'; + while (day_start < day_end) + key += *day_start++; +} + +// SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification. + +void sortify_label(const char *s, int len, string &key) +{ + const char *end = s + len; + for (;;) { + const char *ptr; + for (ptr = s; + ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP; + ptr++) + ; + if (ptr > s) + sortify_words(s, ptr, 0, key); + s = ptr; + if (s >= end) + break; + key += *s++; + } +} + +void reference::compute_sort_key() +{ + if (sort_fields.length() == 0) + return; + sort_fields += '\0'; + const char *sf = sort_fields.contents(); + while (*sf != '\0') { + if (sf > sort_fields) + sort_key += SORT_SEP; + char f = *sf++; + int n = 1; + if (*sf == '+') { + n = INT_MAX; + sf++; + } + else if (csdigit(*sf)) { + char *ptr; + long l = strtol(sf, &ptr, 10); + if (l == 0 && ptr == sf) + ; + else { + sf = ptr; + if (l < 0) { + n = 1; + } + else { + n = int(l); + } + } + } + if (f == '.') + sortify_label(label.contents(), label.length(), sort_key); + else if (f == AUTHOR_FIELDS[0]) + sortify_authors(n, sort_key); + else + sortify_field(f, n, sort_key); + } + sort_fields.set_length(sort_fields.length() - 1); +} + +void reference::sortify_authors(int n, string &result) const +{ + for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++) + if (contains_field(*p)) { + sortify_field(*p, n, result); + return; + } + sortify_field(AUTHOR_FIELDS[0], n, result); +} + +void reference::canonicalize_authors(string &result) const +{ + int len = result.length(); + sortify_authors(INT_MAX, result); + if 
(result.length() > len) + result += SORT_SUB_SEP; +} + +void reference::sortify_field(unsigned char f, int n, string &result) const +{ + typedef void (*sortify_t)(const char *, int, string &); + sortify_t sortifier = sortify_other; + switch (f) { + case 'A': + case 'E': + sortifier = sortify_name; + break; + case 'D': + sortifier = sortify_date; + break; + case 'B': + case 'J': + case 'T': + sortifier = sortify_title; + break; + } + int fi = field_index[(unsigned char)f]; + if (fi != NULL_FIELD_INDEX) { + string &str = field[fi]; + const char *start = str.contents(); + const char *end = start + str.length(); + for (int i = 0; i < n && start < end; i++) { + const char *p = start; + while (start < end && *start != FIELD_SEPARATOR) + start++; + if (i > 0) + result += SORT_SUB_SEP; + (*sortifier)(p, start - p, result); + if (start < end) + start++; + } + } +} + +int compare_reference(const reference &r1, const reference &r2) +{ + assert(r1.no >= 0); + assert(r2.no >= 0); + const char *s1 = r1.sort_key.contents(); + int n1 = r1.sort_key.length(); + const char *s2 = r2.sort_key.contents(); + int n2 = r2.sort_key.length(); + for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2) + if (*s1 != *s2) + return (int)(unsigned char)*s1 - (int)(unsigned char)*s2; + if (n2 > 0) + return -1; + if (n1 > 0) + return 1; + return r1.no - r2.no; +} + +int same_reference(const reference &r1, const reference &r2) +{ + if (!r1.rid.is_null() && r1.rid == r2.rid) + return 1; + if (r1.h != r2.h) + return 0; + if (r1.nfields != r2.nfields) + return 0; + int i = 0; + for (i = 0; i < 256; i++) + if (r1.field_index != r2.field_index) + return 0; + for (i = 0; i < r1.nfields; i++) + if (r1.field[i] != r2.field[i]) + return 0; + return 1; +} + +const char *find_last_name(const char *start, const char *end, + const char **endp) +{ + const char *ptr = start; + const char *last_word = start; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, end)) + break; + if (ptr - token_start == 1) { + 
if (*token_start == ',') { + *endp = token_start; + return last_word; + } + else if (*token_start == ' ' || *token_start == '\n') { + if (ptr < end && *ptr != ' ' && *ptr != '\n') + last_word = ptr; + } + } + } + *endp = end; + return last_word; +} + +void abbreviate_name(const char *ptr, const char *end, string &result) +{ + const char *last_name_end; + const char *last_name_start = find_last_name(ptr, end, &last_name_end); + int need_period = 0; + for (;;) { + const char *token_start = ptr; + if (!get_token(&ptr, last_name_start)) + break; + const token_info *ti = lookup_token(token_start, ptr); + if (need_period) { + if ((ptr - token_start == 1 && *token_start == ' ') + || (ptr - token_start == 2 && token_start[0] == '\\' + && token_start[1] == ' ')) + continue; + if (ti->is_upper()) + result += period_before_initial; + else + result += period_before_other; + need_period = 0; + } + result.append(token_start, ptr - token_start); + if (ti->is_upper()) { + const char *lower_ptr = ptr; + int first_token = 1; + for (;;) { + token_start = ptr; + if (!get_token(&ptr, last_name_start)) + break; + if ((ptr - token_start == 1 && *token_start == ' ') + || (ptr - token_start == 2 && token_start[0] == '\\' + && token_start[1] == ' ')) + break; + ti = lookup_token(token_start, ptr); + if (ti->is_hyphen()) { + const char *ptr1 = ptr; + if (get_token(&ptr1, last_name_start)) { + ti = lookup_token(ptr, ptr1); + if (ti->is_upper()) { + result += period_before_hyphen; + result.append(token_start, ptr1 - token_start); + ptr = ptr1; + } + } + } + else if (ti->is_upper()) { + // MacDougal -> MacD. 
+ result.append(lower_ptr, ptr - lower_ptr); + lower_ptr = ptr; + first_token = 1; + } + else if (first_token && ti->is_accent()) { + result.append(token_start, ptr - token_start); + lower_ptr = ptr; + } + first_token = 0; + } + need_period = 1; + } + } + if (need_period) + result += period_before_last_name; + result.append(last_name_start, end - last_name_start); +} + +static void abbreviate_names(string &result) +{ + string str; + str.move(result); + const char *ptr = str.contents(); + const char *end = ptr + str.length(); + while (ptr < end) { + const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr); + if (name_end == 0) + name_end = end; + abbreviate_name(ptr, name_end, result); + if (name_end >= end) + break; + ptr = name_end + 1; + result += FIELD_SEPARATOR; + } +} + +void reverse_name(const char *ptr, const char *name_end, string &result) +{ + const char *last_name_end; + const char *last_name_start = find_last_name(ptr, name_end, &last_name_end); + result.append(last_name_start, last_name_end - last_name_start); + while (last_name_start > ptr + && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n')) + last_name_start--; + if (last_name_start > ptr) { + result += ", "; + result.append(ptr, last_name_start - ptr); + } + if (last_name_end < name_end) + result.append(last_name_end, name_end - last_name_end); +} + +void reverse_names(string &result, int n) +{ + if (n <= 0) + return; + string str; + str.move(result); + const char *ptr = str.contents(); + const char *end = ptr + str.length(); + while (ptr < end) { + if (--n < 0) { + result.append(ptr, end - ptr); + break; + } + const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr); + if (name_end == 0) + name_end = end; + reverse_name(ptr, name_end, result); + if (name_end >= end) + break; + ptr = name_end + 1; + result += FIELD_SEPARATOR; + } +} + +// Return number of field separators. 
+ +int join_fields(string &f) +{ + const char *ptr = f.contents(); + int len = f.length(); + int nfield_seps = 0; + int j; + for (j = 0; j < len; j++) + if (ptr[j] == FIELD_SEPARATOR) + nfield_seps++; + if (nfield_seps == 0) + return 0; + string temp; + int field_seps_left = nfield_seps; + for (j = 0; j < len; j++) { + if (ptr[j] == FIELD_SEPARATOR) { + if (nfield_seps == 1) + temp += join_authors_exactly_two; + else if (--field_seps_left == 0) + temp += join_authors_last_two; + else + temp += join_authors_default; + } + else + temp += ptr[j]; + } + f = temp; + return nfield_seps; +} + +void uppercase(const char *start, const char *end, string &result) +{ + for (;;) { + const char *token_start = start; + if (!get_token(&start, end)) + break; + const token_info *ti = lookup_token(token_start, start); + ti->upper_case(token_start, start, result); + } +} + +void lowercase(const char *start, const char *end, string &result) +{ + for (;;) { + const char *token_start = start; + if (!get_token(&start, end)) + break; + const token_info *ti = lookup_token(token_start, start); + ti->lower_case(token_start, start, result); + } +} + +void capitalize(const char *ptr, const char *end, string &result) +{ + int in_small_point_size = 0; + for (;;) { + const char *start = ptr; + if (!get_token(&ptr, end)) + break; + const token_info *ti = lookup_token(start, ptr); + const char *char_end = ptr; + int is_lower = ti->is_lower(); + if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) { + const token_info *ti2 = lookup_token(char_end, ptr); + if (!ti2->is_accent()) + ptr = char_end; + } + if (is_lower) { + if (!in_small_point_size) { + result += "\\s-2"; + in_small_point_size = 1; + } + ti->upper_case(start, char_end, result); + result.append(char_end, ptr - char_end); + } + else { + if (in_small_point_size) { + result += "\\s+2"; + in_small_point_size = 0; + } + result.append(start, ptr - start); + } + } + if (in_small_point_size) + result += "\\s+2"; +} + +void 
capitalize_field(string &str) +{ + string temp; + capitalize(str.contents(), str.contents() + str.length(), temp); + str.move(temp); +} + +int is_terminated(const char *ptr, const char *end) +{ + const char *last_token = end; + for (;;) { + const char *p = ptr; + if (!get_token(&ptr, end)) + break; + last_token = p; + } + return end - last_token == 1 + && (*last_token == '.' || *last_token == '!' || *last_token == '?'); +} + +void reference::output(FILE *fp) +{ + fputs(".]-\n", fp); + for (int i = 0; i < 256; i++) + if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) { + string &f = field[field_index[i]]; + if (!csdigit(i)) { + int j = reverse_fields.search(i); + if (j >= 0) { + int n; + int len = reverse_fields.length(); + if (++j < len && csdigit(reverse_fields[j])) { + n = reverse_fields[j] - '0'; + for (++j; j < len && csdigit(reverse_fields[j]); j++) + // should check for overflow + n = n*10 + reverse_fields[j] - '0'; + } + else + n = INT_MAX; + reverse_names(f, n); + } + } + int is_multiple = join_fields(f) > 0; + if (capitalize_fields.search(i) >= 0) + capitalize_field(f); + if (memchr(f.contents(), '\n', f.length()) == 0) { + fprintf(fp, ".ds [%c ", i); + if (f[0] == ' ' || f[0] == '\\' || f[0] == '"') + putc('"', fp); + put_string(f, fp); + putc('\n', fp); + } + else { + fprintf(fp, ".de [%c\n", i); + put_string(f, fp); + fputs("..\n", fp); + } + if (i == 'P') { + int multiple_pages = 0; + const char *s = f.contents(); + const char *end = f.contents() + f.length(); + for (;;) { + const char *token_start = s; + if (!get_token(&s, end)) + break; + const token_info *ti = lookup_token(token_start, s); + if (ti->is_hyphen() || ti->is_range_sep()) { + multiple_pages = 1; + break; + } + } + fprintf(fp, ".nr [P %d\n", multiple_pages); + } + else if (i == 'E') + fprintf(fp, ".nr [E %d\n", is_multiple); + } + for (const char *p = "TAO"; *p; p++) { + int fi = field_index[(unsigned char)*p]; + if (fi != NULL_FIELD_INDEX) { + string &f = field[fi]; + 
fprintf(fp, ".nr [%c %d\n", *p, + is_terminated(f.contents(), f.contents() + f.length())); + } + } + int t = classify(); + fprintf(fp, ".][ %d %s\n", t, reference_types[t]); + if (annotation_macro.length() > 0 && annotation_field >= 0 + && field_index[annotation_field] != NULL_FIELD_INDEX) { + putc('.', fp); + put_string(annotation_macro, fp); + putc('\n', fp); + put_string(field[field_index[annotation_field]], fp); + } +} + +void reference::print_sort_key_comment(FILE *fp) +{ + fputs(".\\\"", fp); + put_string(sort_key, fp); + putc('\n', fp); +} + +const char *find_year(const char *start, const char *end, const char **endp) +{ + for (;;) { + while (start < end && !csdigit(*start)) + start++; + const char *ptr = start; + if (start == end) + break; + while (ptr < end && csdigit(*ptr)) + ptr++; + if (ptr - start == 4 || ptr - start == 3 + || (ptr - start == 2 + && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) { + *endp = ptr; + return start; + } + start = ptr; + } + return 0; +} + +static const char *find_day(const char *start, const char *end, + const char **endp) +{ + for (;;) { + while (start < end && !csdigit(*start)) + start++; + const char *ptr = start; + if (start == end) + break; + while (ptr < end && csdigit(*ptr)) + ptr++; + if ((ptr - start == 1 && start[0] != '0') + || (ptr - start == 2 && + (start[0] == '1' + || start[0] == '2' + || (start[0] == '3' && start[1] <= '1') + || (start[0] == '0' && start[1] != '0')))) { + *endp = ptr; + return start; + } + start = ptr; + } + return 0; +} + +static int find_month(const char *start, const char *end) +{ + static const char *months[] = { + "january", + "february", + "march", + "april", + "may", + "june", + "july", + "august", + "september", + "october", + "november", + "december", + }; + for (;;) { + while (start < end && !csalpha(*start)) + start++; + const char *ptr = start; + if (start == end) + break; + while (ptr < end && csalpha(*ptr)) + ptr++; + if (ptr - start >= 3) { + for (int i = 0; i < 
sizeof(months)/sizeof(months[0]); i++) { + const char *q = months[i]; + const char *p = start; + for (; p < ptr; p++, q++) + if (cmlower(*p) != *q) + break; + if (p >= ptr) + return i; + } + } + start = ptr; + } + return -1; +} + +int reference::contains_field(char c) const +{ + return field_index[(unsigned char)c] != NULL_FIELD_INDEX; +} + +int reference::classify() +{ + if (contains_field('J')) + return JOURNAL_ARTICLE; + if (contains_field('B')) + return ARTICLE_IN_BOOK; + if (contains_field('G')) + return TECH_REPORT; + if (contains_field('R')) + return TECH_REPORT; + if (contains_field('I')) + return BOOK; + if (contains_field('M')) + return BELL_TM; + return OTHER; +} + +const char *reference::get_year(const char **endp) const +{ + if (field_index['D'] != NULL_FIELD_INDEX) { + string &date = field[field_index['D']]; + const char *start = date.contents(); + const char *end = start + date.length(); + return find_year(start, end, endp); + } + else + return 0; +} + +const char *reference::get_field(unsigned char c, const char **endp) const +{ + if (field_index[c] != NULL_FIELD_INDEX) { + string &f = field[field_index[c]]; + const char *start = f.contents(); + *endp = start + f.length(); + return start; + } + else + return 0; +} + +const char *reference::get_date(const char **endp) const +{ + return get_field('D', endp); +} + +const char *nth_field(int i, const char *start, const char **endp) +{ + while (--i >= 0) { + start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start); + if (!start) + return 0; + start++; + } + const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start); + if (e) + *endp = e; + return start; +} + +const char *reference::get_author(int i, const char **endp) const +{ + for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) { + const char *start = get_field(*f, endp); + if (start) { + if (strchr(MULTI_FIELD_NAMES, *f) != 0) + return nth_field(i, start, endp); + else if (i == 0) + return start; + else + return 0; + } + } + return 0; 
+} + +const char *reference::get_author_last_name(int i, const char **endp) const +{ + for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) { + const char *start = get_field(*f, endp); + if (start) { + if (strchr(MULTI_FIELD_NAMES, *f) != 0) { + start = nth_field(i, start, endp); + if (!start) + return 0; + } + if (*f == 'A') + return find_last_name(start, *endp, endp); + else + return start; + } + } + return 0; +} + +void reference::set_date(string &d) +{ + if (d.length() == 0) + delete_field('D'); + else + insert_field('D', d); +} + +int same_year(const reference &r1, const reference &r2) +{ + const char *ye1; + const char *ys1 = r1.get_year(&ye1); + const char *ye2; + const char *ys2 = r2.get_year(&ye2); + if (ys1 == 0) { + if (ys2 == 0) + return same_date(r1, r2); + else + return 0; + } + else if (ys2 == 0) + return 0; + else if (ye1 - ys1 != ye2 - ys2) + return 0; + else + return memcmp(ys1, ys2, ye1 - ys1) == 0; +} + +int same_date(const reference &r1, const reference &r2) +{ + const char *e1; + const char *s1 = r1.get_date(&e1); + const char *e2; + const char *s2 = r2.get_date(&e2); + if (s1 == 0) + return s2 == 0; + else if (s2 == 0) + return 0; + else if (e1 - s1 != e2 - s2) + return 0; + else + return memcmp(s1, s2, e1 - s1) == 0; +} + +const char *reference::get_sort_field(int i, int si, int ssi, + const char **endp) const +{ + const char *start = sort_key.contents(); + const char *end = start + sort_key.length(); + if (i < 0) { + *endp = end; + return start; + } + while (--i >= 0) { + start = (char *)memchr(start, SORT_SEP, end - start); + if (!start) + return 0; + start++; + } + const char *e = (char *)memchr(start, SORT_SEP, end - start); + if (e) + end = e; + if (si < 0) { + *endp = end; + return start; + } + while (--si >= 0) { + start = (char *)memchr(start, SORT_SUB_SEP, end - start); + if (!start) + return 0; + start++; + } + e = (char *)memchr(start, SORT_SUB_SEP, end - start); + if (e) + end = e; + if (ssi < 0) { + *endp = end; + return start; 
+ } + while (--ssi >= 0) { + start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start); + if (!start) + return 0; + start++; + } + e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start); + if (e) + end = e; + *endp = end; + return start; +} + diff --git a/contrib/groff/src/preproc/refer/ref.h b/contrib/groff/src/preproc/refer/ref.h new file mode 100644 index 0000000..13a984a --- /dev/null +++ b/contrib/groff/src/preproc/refer/ref.h @@ -0,0 +1,120 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
// A (start, length) pair locating a substring.
//
// A default-constructed position has start == -1 (presumably meaning "no
// position recorded" -- confirm against the users of this struct) and
// length == 0, so that neither member is ever read uninitialized.
struct substring_position {
  int start;
  int length;
  substring_position() : start(-1), length(0) { }
};
sortify_field(unsigned char, int, string &) const; + const char *get_author(int, const char **) const; + const char *get_author_last_name(int, const char **) const; + const char *get_date(const char **) const; + const char *get_year(const char **) const; + const char *get_field(unsigned char, const char **) const; + const label_info *get_label_ptr() const { return label_ptr; } + const char *get_authors(const char **) const; + // for sorting + friend int compare_reference(const reference &r1, const reference &r2); + // for merging + friend int same_reference(const reference &, const reference &); + friend int same_year(const reference &, const reference &); + friend int same_date(const reference &, const reference &); + friend int same_author_last_name(const reference &, const reference &, int); + friend int same_author_name(const reference &, const reference &, int); +}; + +const char *find_year(const char *, const char *, const char **); +const char *find_last_name(const char *, const char *, const char **); + +const char *nth_field(int i, const char *start, const char **endp); + +void capitalize(const char *ptr, const char *end, string &result); +void reverse_name(const char *ptr, const char *end, string &result); +void uppercase(const char *ptr, const char *end, string &result); +void lowercase(const char *ptr, const char *end, string &result); +void abbreviate_name(const char *ptr, const char *end, string &result); diff --git a/contrib/groff/src/preproc/refer/refer.cc b/contrib/groff/src/preproc/refer/refer.cc new file mode 100644 index 0000000..b6cefc5 --- /dev/null +++ b/contrib/groff/src/preproc/refer/refer.cc @@ -0,0 +1,1234 @@ +// -*- C++ -*- +/* Copyright (C) 1989-1992, 2000, 2001 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. 
+ +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#include "refer.h" +#include "refid.h" +#include "ref.h" +#include "token.h" +#include "search.h" +#include "command.h" + +const char PRE_LABEL_MARKER = '\013'; +const char POST_LABEL_MARKER = '\014'; +const char LABEL_MARKER = '\015'; // label_type is added on + +#define FORCE_LEFT_BRACKET 04 +#define FORCE_RIGHT_BRACKET 010 + +static FILE *outfp = stdout; + +string capitalize_fields; +string reverse_fields; +string abbreviate_fields; +string period_before_last_name = ". "; +string period_before_initial = "."; +string period_before_hyphen = ""; +string period_before_other = ". "; +string sort_fields; +int annotation_field = -1; +string annotation_macro; +string discard_fields = "XYZ"; +string pre_label = "\\*([."; +string post_label = "\\*(.]"; +string sep_label = ", "; +int accumulate = 0; +int move_punctuation = 0; +int abbreviate_label_ranges = 0; +string label_range_indicator; +int label_in_text = 1; +int label_in_reference = 1; +int date_as_label = 0; +int sort_adjacent_labels = 0; +// Join exactly two authors with this. +string join_authors_exactly_two = " and "; +// When there are more than two authors join the last two with this. +string join_authors_last_two = ", and "; +// Otherwise join authors with this. 
+string join_authors_default = ", "; +string separate_label_second_parts = ", "; +// Use this string to represent that there are other authors. +string et_al = " et al"; +// Use et al only if it can replace at least this many authors. +int et_al_min_elide = 2; +// Use et al only if the total number of authors is at least this. +int et_al_min_total = 3; + + +int compatible_flag = 0; + +int short_label_flag = 0; + +static int recognize_R1_R2 = 1; + +search_list database_list; +int search_default = 1; +static int default_database_loaded = 0; + +static reference **citation = 0; +static int ncitations = 0; +static int citation_max = 0; + +static reference **reference_hash_table = 0; +static int hash_table_size; +static int nreferences = 0; + +static int need_syncing = 0; +string pending_line; +string pending_lf_lines; + +static void output_pending_line(); +static unsigned immediately_handle_reference(const string &); +static void immediately_output_references(); +static unsigned store_reference(const string &); +static void divert_to_temporary_file(); +static reference *make_reference(const string &, unsigned *); +static void usage(FILE *stream); +static void do_file(const char *); +static void split_punct(string &line, string &punct); +static void output_citation_group(reference **v, int n, label_type, FILE *fp); +static void possibly_load_default_database(); + +int main(int argc, char **argv) +{ + program_name = argv[0]; + static char stderr_buf[BUFSIZ]; + setbuf(stderr, stderr_buf); + outfp = stdout; + int finished_options = 0; + int bib_flag = 0; + int done_spec = 0; + + for (--argc, ++argv; + !finished_options && argc > 0 && argv[0][0] == '-' + && argv[0][1] != '\0'; + argv++, argc--) { + const char *opt = argv[0] + 1; + while (opt != 0 && *opt != '\0') { + switch (*opt) { + case 'C': + compatible_flag = 1; + opt++; + break; + case 'B': + bib_flag = 1; + label_in_reference = 0; + label_in_text = 0; + ++opt; + if (*opt == '\0') { + annotation_field = 'X'; + 
annotation_macro = "AP"; + } + else if (csalnum(opt[0]) && opt[1] == '.' && opt[2] != '\0') { + annotation_field = opt[0]; + annotation_macro = opt + 2; + } + opt = 0; + break; + case 'P': + move_punctuation = 1; + opt++; + break; + case 'R': + recognize_R1_R2 = 0; + opt++; + break; + case 'S': + // Not a very useful spec. + set_label_spec("(A.n|Q)', '(D.y|D)"); + done_spec = 1; + pre_label = " ("; + post_label = ")"; + sep_label = "; "; + opt++; + break; + case 'V': + verify_flag = 1; + opt++; + break; + case 'f': + { + const char *num = 0; + if (*++opt == '\0') { + if (argc > 1) { + num = *++argv; + --argc; + } + else { + error("option `f' requires an argument"); + usage(stderr); + exit(1); + } + } + else { + num = opt; + opt = 0; + } + const char *ptr; + for (ptr = num; *ptr; ptr++) + if (!csdigit(*ptr)) { + error("bad character `%1' in argument to -f option", *ptr); + break; + } + if (*ptr == '\0') { + string spec; + spec = '%'; + spec += num; + spec += '\0'; + set_label_spec(spec.contents()); + done_spec = 1; + } + break; + } + case 'b': + label_in_text = 0; + label_in_reference = 0; + opt++; + break; + case 'e': + accumulate = 1; + opt++; + break; + case 'c': + capitalize_fields = ++opt; + opt = 0; + break; + case 'k': + { + char buf[5]; + if (csalpha(*++opt)) + buf[0] = *opt++; + else { + if (*opt != '\0') + error("bad field name `%1'", *opt++); + buf[0] = 'L'; + } + buf[1] = '~'; + buf[2] = '%'; + buf[3] = 'a'; + buf[4] = '\0'; + set_label_spec(buf); + done_spec = 1; + } + break; + case 'a': + { + const char *ptr; + for (ptr = ++opt; *ptr; ptr++) + if (!csdigit(*ptr)) { + error("argument to `a' option not a number"); + break; + } + if (*ptr == '\0') { + reverse_fields = 'A'; + reverse_fields += opt; + } + opt = 0; + } + break; + case 'i': + linear_ignore_fields = ++opt; + opt = 0; + break; + case 'l': + { + char buf[INT_DIGITS*2 + 11]; // A.n+2D.y-3%a + strcpy(buf, "A.n"); + if (*++opt != '\0' && *opt != ',') { + char *ptr; + long n = strtol(opt, &ptr, 10); 
+ if (n == 0 && ptr == opt) { + error("bad integer `%1' in `l' option", opt); + opt = 0; + break; + } + if (n < 0) + n = 0; + opt = ptr; + sprintf(strchr(buf, '\0'), "+%ld", n); + } + strcat(buf, "D.y"); + if (*opt == ',') + opt++; + if (*opt != '\0') { + char *ptr; + long n = strtol(opt, &ptr, 10); + if (n == 0 && ptr == opt) { + error("bad integer `%1' in `l' option", opt); + opt = 0; + break; + } + if (n < 0) + n = 0; + sprintf(strchr(buf, '\0'), "-%ld", n); + opt = ptr; + if (*opt != '\0') + error("argument to `l' option not of form `m,n'"); + } + strcat(buf, "%a"); + if (!set_label_spec(buf)) + assert(0); + done_spec = 1; + } + break; + case 'n': + search_default = 0; + opt++; + break; + case 'p': + { + const char *filename = 0; + if (*++opt == '\0') { + if (argc > 1) { + filename = *++argv; + argc--; + } + else { + error("option `p' requires an argument"); + usage(stderr); + exit(1); + } + } + else { + filename = opt; + opt = 0; + } + database_list.add_file(filename); + } + break; + case 's': + if (*++opt == '\0') + sort_fields = "AD"; + else { + sort_fields = opt; + opt = 0; + } + accumulate = 1; + break; + case 't': + { + char *ptr; + long n = strtol(opt, &ptr, 10); + if (n == 0 && ptr == opt) { + error("bad integer `%1' in `t' option", opt); + opt = 0; + break; + } + if (n < 1) + n = 1; + linear_truncate_len = int(n); + opt = ptr; + break; + } + case '-': + if (opt[1] == '\0') { + finished_options = 1; + opt++; + break; + } + if (strcmp(opt,"-version")==0) { + case 'v': + extern const char *Version_string; + printf("GNU refer (groff) version %s\n", Version_string); + exit(0); + break; + } + if (strcmp(opt,"-help")==0) { + usage(stdout); + exit(0); + break; + } + // fall through + default: + error("unrecognized option `%1'", *opt); + usage(stderr); + exit(1); + break; + } + } + } + if (!done_spec) + set_label_spec("%1"); + if (argc <= 0) { + if (bib_flag) + do_bib("-"); + else + do_file("-"); + } + else { + for (int i = 0; i < argc; i++) { + if (bib_flag) + 
do_bib(argv[i]); + else + do_file(argv[i]); + } + } + if (accumulate) + output_references(); + if (fflush(stdout) < 0) + fatal("output error"); + return 0; +} + +static void usage(FILE *stream) +{ + fprintf(stream, +"usage: %s [-benvCPRS] [-aN] [-cXYZ] [-fN] [-iXYZ] [-kX] [-lM,N] [-p file]\n" +" [-sXYZ] [-tN] [-BL.M] [files ...]\n", + program_name); +} + +static void possibly_load_default_database() +{ + if (search_default && !default_database_loaded) { + char *filename = getenv("REFER"); + if (filename) + database_list.add_file(filename); + else + database_list.add_file(DEFAULT_INDEX, 1); + default_database_loaded = 1; + } +} + +static int is_list(const string &str) +{ + const char *start = str.contents(); + const char *end = start + str.length(); + while (end > start && csspace(end[-1])) + end--; + while (start < end && csspace(*start)) + start++; + return end - start == 6 && memcmp(start, "$LIST$", 6) == 0; +} + +static void do_file(const char *filename) +{ + FILE *fp; + if (strcmp(filename, "-") == 0) { + fp = stdin; + } + else { + errno = 0; + fp = fopen(filename, "r"); + if (fp == 0) { + error("can't open `%1': %2", filename, strerror(errno)); + return; + } + } + current_filename = filename; + fprintf(outfp, ".lf 1 %s\n", filename); + string line; + current_lineno = 0; + for (;;) { + line.clear(); + for (;;) { + int c = getc(fp); + if (c == EOF) { + if (line.length() > 0) + line += '\n'; + break; + } + if (illegal_input_char(c)) + error("illegal input character code %1", c); + else { + line += c; + if (c == '\n') + break; + } + } + int len = line.length(); + if (len == 0) + break; + current_lineno++; + if (len >= 2 && line[0] == '.' 
&& line[1] == '[') { + int start_lineno = current_lineno; + int start_of_line = 1; + string str; + string post; + string pre(line.contents() + 2, line.length() - 3); + for (;;) { + int c = getc(fp); + if (c == EOF) { + error_with_file_and_line(current_filename, start_lineno, + "missing `.]' line"); + break; + } + if (start_of_line) + current_lineno++; + if (start_of_line && c == '.') { + int d = getc(fp); + if (d == ']') { + while ((d = getc(fp)) != '\n' && d != EOF) { + if (illegal_input_char(d)) + error("illegal input character code %1", d); + else + post += d; + } + break; + } + if (d != EOF) + ungetc(d, fp); + } + if (illegal_input_char(c)) + error("illegal input character code %1", c); + else + str += c; + start_of_line = (c == '\n'); + } + if (is_list(str)) { + output_pending_line(); + if (accumulate) + output_references(); + else + error("found `$LIST$' but not accumulating references"); + } + else { + unsigned flags = (accumulate + ? store_reference(str) + : immediately_handle_reference(str)); + if (label_in_text) { + if (accumulate && outfp == stdout) + divert_to_temporary_file(); + if (pending_line.length() == 0) { + warning("can't attach citation to previous line"); + } + else + pending_line.set_length(pending_line.length() - 1); + string punct; + if (move_punctuation) + split_punct(pending_line, punct); + int have_text = pre.length() > 0 || post.length() > 0; + label_type lt = label_type(flags & ~(FORCE_LEFT_BRACKET + |FORCE_RIGHT_BRACKET)); + if ((flags & FORCE_LEFT_BRACKET) || !have_text) + pending_line += PRE_LABEL_MARKER; + pending_line += pre; + char lm = LABEL_MARKER + (int)lt; + pending_line += lm; + pending_line += post; + if ((flags & FORCE_RIGHT_BRACKET) || !have_text) + pending_line += POST_LABEL_MARKER; + pending_line += punct; + pending_line += '\n'; + } + } + need_syncing = 1; + } + else if (len >= 4 + && line[0] == '.' 
&& line[1] == 'l' && line[2] == 'f' + && (compatible_flag || line[3] == '\n' || line[3] == ' ')) { + pending_lf_lines += line; + line += '\0'; + if (interpret_lf_args(line.contents() + 3)) + current_lineno--; + } + else if (recognize_R1_R2 + && len >= 4 + && line[0] == '.' && line[1] == 'R' && line[2] == '1' + && (compatible_flag || line[3] == '\n' || line[3] == ' ')) { + line.clear(); + int start_of_line = 1; + int start_lineno = current_lineno; + for (;;) { + int c = getc(fp); + if (c != EOF && start_of_line) + current_lineno++; + if (start_of_line && c == '.') { + c = getc(fp); + if (c == 'R') { + c = getc(fp); + if (c == '2') { + c = getc(fp); + if (compatible_flag || c == ' ' || c == '\n' || c == EOF) { + while (c != EOF && c != '\n') + c = getc(fp); + break; + } + else { + line += '.'; + line += 'R'; + line += '2'; + } + } + else { + line += '.'; + line += 'R'; + } + } + else + line += '.'; + } + if (c == EOF) { + error_with_file_and_line(current_filename, start_lineno, + "missing `.R2' line"); + break; + } + if (illegal_input_char(c)) + error("illegal input character code %1", int(c)); + else { + line += c; + start_of_line = c == '\n'; + } + } + output_pending_line(); + if (accumulate) + output_references(); + else + nreferences = 0; + process_commands(line, current_filename, start_lineno + 1); + need_syncing = 1; + } + else { + output_pending_line(); + pending_line = line; + } + } + need_syncing = 0; + output_pending_line(); + if (fp != stdin) + fclose(fp); +} + +class label_processing_state { + enum { + NORMAL, + PENDING_LABEL, + PENDING_LABEL_POST, + PENDING_LABEL_POST_PRE, + PENDING_POST + } state; + label_type type; // type of pending labels + int count; // number of pending labels + reference **rptr; // pointer to next reference + int rcount; // number of references left + FILE *fp; + int handle_pending(int c); +public: + label_processing_state(reference **, int, FILE *); + ~label_processing_state(); + void process(int c); +}; + +static void 
output_pending_line() +{ + if (label_in_text && !accumulate && ncitations > 0) { + label_processing_state state(citation, ncitations, outfp); + int len = pending_line.length(); + for (int i = 0; i < len; i++) + state.process((unsigned char)(pending_line[i])); + } + else + put_string(pending_line, outfp); + pending_line.clear(); + if (pending_lf_lines.length() > 0) { + put_string(pending_lf_lines, outfp); + pending_lf_lines.clear(); + } + if (!accumulate) + immediately_output_references(); + if (need_syncing) { + fprintf(outfp, ".lf %d %s\n", current_lineno, current_filename); + need_syncing = 0; + } +} + +static void split_punct(string &line, string &punct) +{ + const char *start = line.contents(); + const char *end = start + line.length(); + const char *ptr = start; + const char *last_token_start = 0; + for (;;) { + if (ptr >= end) + break; + last_token_start = ptr; + if (*ptr == PRE_LABEL_MARKER || *ptr == POST_LABEL_MARKER + || (*ptr >= LABEL_MARKER && *ptr < LABEL_MARKER + N_LABEL_TYPES)) + ptr++; + else if (!get_token(&ptr, end)) + break; + } + if (last_token_start) { + const token_info *ti = lookup_token(last_token_start, end); + if (ti->is_punct()) { + punct.append(last_token_start, end - last_token_start); + line.set_length(last_token_start - start); + } + } +} + +static void divert_to_temporary_file() +{ + outfp = xtmpfile(); +} + +static void store_citation(reference *ref) +{ + if (ncitations >= citation_max) { + if (citation == 0) + citation = new reference*[citation_max = 100]; + else { + reference **old_citation = citation; + citation_max *= 2; + citation = new reference *[citation_max]; + memcpy(citation, old_citation, ncitations*sizeof(reference *)); + a_delete old_citation; + } + } + citation[ncitations++] = ref; +} + +static unsigned store_reference(const string &str) +{ + if (reference_hash_table == 0) { + reference_hash_table = new reference *[17]; + hash_table_size = 17; + for (int i = 0; i < hash_table_size; i++) + reference_hash_table[i] = 0; 
+ } + unsigned flags; + reference *ref = make_reference(str, &flags); + ref->compute_hash_code(); + unsigned h = ref->hash(); + reference **ptr; + for (ptr = reference_hash_table + (h % hash_table_size); + *ptr != 0; + ((ptr == reference_hash_table) + ? (ptr = reference_hash_table + hash_table_size - 1) + : --ptr)) + if (same_reference(**ptr, *ref)) + break; + if (*ptr != 0) { + if (ref->is_merged()) + warning("fields ignored because reference already used"); + delete ref; + ref = *ptr; + } + else { + *ptr = ref; + ref->set_number(nreferences); + nreferences++; + ref->pre_compute_label(); + ref->compute_sort_key(); + if (nreferences*2 >= hash_table_size) { + // Rehash it. + reference **old_table = reference_hash_table; + int old_size = hash_table_size; + hash_table_size = next_size(hash_table_size); + reference_hash_table = new reference*[hash_table_size]; + int i; + for (i = 0; i < hash_table_size; i++) + reference_hash_table[i] = 0; + for (i = 0; i < old_size; i++) + if (old_table[i]) { + reference **p; + for (p = (reference_hash_table + + (old_table[i]->hash() % hash_table_size)); + *p; + ((p == reference_hash_table) + ? 
(p = reference_hash_table + hash_table_size - 1) + : --p)) + ; + *p = old_table[i]; + } + a_delete old_table; + } + } + if (label_in_text) + store_citation(ref); + return flags; +} + +unsigned immediately_handle_reference(const string &str) +{ + unsigned flags; + reference *ref = make_reference(str, &flags); + ref->set_number(nreferences); + if (label_in_text || label_in_reference) { + ref->pre_compute_label(); + ref->immediate_compute_label(); + } + nreferences++; + store_citation(ref); + return flags; +} + +static void immediately_output_references() +{ + for (int i = 0; i < ncitations; i++) { + reference *ref = citation[i]; + if (label_in_reference) { + fputs(".ds [F ", outfp); + const string &label = ref->get_label(NORMAL_LABEL); + if (label.length() > 0 + && (label[0] == ' ' || label[0] == '\\' || label[0] == '"')) + putc('"', outfp); + put_string(label, outfp); + putc('\n', outfp); + } + ref->output(outfp); + delete ref; + } + ncitations = 0; +} + +static void output_citation_group(reference **v, int n, label_type type, + FILE *fp) +{ + if (sort_adjacent_labels) { + // Do an insertion sort. Usually n will be very small. + for (int i = 1; i < n; i++) { + int num = v[i]->get_number(); + reference *temp = v[i]; + int j; + for (j = i - 1; j >= 0 && v[j]->get_number() > num; j--) + v[j + 1] = v[j]; + v[j + 1] = temp; + } + } + // This messes up if !accumulate. 
+ if (accumulate && n > 1) { + // remove duplicates + int j = 1; + for (int i = 1; i < n; i++) + if (v[i]->get_label(type) != v[i - 1]->get_label(type)) + v[j++] = v[i]; + n = j; + } + string merged_label; + for (int i = 0; i < n; i++) { + int nmerged = v[i]->merge_labels(v + i + 1, n - i - 1, type, merged_label); + if (nmerged > 0) { + put_string(merged_label, fp); + i += nmerged; + } + else + put_string(v[i]->get_label(type), fp); + if (i < n - 1) + put_string(sep_label, fp); + } +} + + +label_processing_state::label_processing_state(reference **p, int n, FILE *f) +: state(NORMAL), count(0), rptr(p), rcount(n), fp(f) +{ +} + +label_processing_state::~label_processing_state() +{ + int handled = handle_pending(EOF); + assert(!handled); + assert(rcount == 0); +} + +int label_processing_state::handle_pending(int c) +{ + switch (state) { + case NORMAL: + break; + case PENDING_LABEL: + if (c == POST_LABEL_MARKER) { + state = PENDING_LABEL_POST; + return 1; + } + else { + output_citation_group(rptr, count, type, fp); + rptr += count ; + rcount -= count; + state = NORMAL; + } + break; + case PENDING_LABEL_POST: + if (c == PRE_LABEL_MARKER) { + state = PENDING_LABEL_POST_PRE; + return 1; + } + else { + output_citation_group(rptr, count, type, fp); + rptr += count; + rcount -= count; + put_string(post_label, fp); + state = NORMAL; + } + break; + case PENDING_LABEL_POST_PRE: + if (c >= LABEL_MARKER + && c < LABEL_MARKER + N_LABEL_TYPES + && c - LABEL_MARKER == type) { + count += 1; + state = PENDING_LABEL; + return 1; + } + else { + output_citation_group(rptr, count, type, fp); + rptr += count; + rcount -= count; + put_string(sep_label, fp); + state = NORMAL; + } + break; + case PENDING_POST: + if (c == PRE_LABEL_MARKER) { + put_string(sep_label, fp); + state = NORMAL; + return 1; + } + else { + put_string(post_label, fp); + state = NORMAL; + } + break; + } + return 0; +} + +void label_processing_state::process(int c) +{ + if (handle_pending(c)) + return; + assert(state == 
NORMAL); + switch (c) { + case PRE_LABEL_MARKER: + put_string(pre_label, fp); + state = NORMAL; + break; + case POST_LABEL_MARKER: + state = PENDING_POST; + break; + case LABEL_MARKER: + case LABEL_MARKER + 1: + count = 1; + state = PENDING_LABEL; + type = label_type(c - LABEL_MARKER); + break; + default: + state = NORMAL; + putc(c, fp); + break; + } +} + +extern "C" { + +int rcompare(const void *p1, const void *p2) +{ + return compare_reference(**(reference **)p1, **(reference **)p2); +} + +} + +void output_references() +{ + assert(accumulate); + if (nreferences > 0) { + int j = 0; + int i; + for (i = 0; i < hash_table_size; i++) + if (reference_hash_table[i] != 0) + reference_hash_table[j++] = reference_hash_table[i]; + assert(j == nreferences); + for (; j < hash_table_size; j++) + reference_hash_table[j] = 0; + qsort(reference_hash_table, nreferences, sizeof(reference*), rcompare); + for (i = 0; i < nreferences; i++) + reference_hash_table[i]->set_number(i); + compute_labels(reference_hash_table, nreferences); + } + if (outfp != stdout) { + rewind(outfp); + { + label_processing_state state(citation, ncitations, stdout); + int c; + while ((c = getc(outfp)) != EOF) + state.process(c); + } + ncitations = 0; + fclose(outfp); + outfp = stdout; + } + if (nreferences > 0) { + fputs(".]<\n", outfp); + for (int i = 0; i < nreferences; i++) { + if (sort_fields.length() > 0) + reference_hash_table[i]->print_sort_key_comment(outfp); + if (label_in_reference) { + fputs(".ds [F ", outfp); + const string &label = reference_hash_table[i]->get_label(NORMAL_LABEL); + if (label.length() > 0 + && (label[0] == ' ' || label[0] == '\\' || label[0] == '"')) + putc('"', outfp); + put_string(label, outfp); + putc('\n', outfp); + } + reference_hash_table[i]->output(outfp); + delete reference_hash_table[i]; + reference_hash_table[i] = 0; + } + fputs(".]>\n", outfp); + nreferences = 0; + } + clear_labels(); +} + +static reference *find_reference(const char *query, int query_len) +{ + // 
This is so that error messages look better. + while (query_len > 0 && csspace(query[query_len - 1])) + query_len--; + string str; + for (int i = 0; i < query_len; i++) + str += query[i] == '\n' ? ' ' : query[i]; + str += '\0'; + possibly_load_default_database(); + search_list_iterator iter(&database_list, str.contents()); + reference_id rid; + const char *start; + int len; + if (!iter.next(&start, &len, &rid)) { + error("no matches for `%1'", str.contents()); + return 0; + } + const char *end = start + len; + while (start < end) { + if (*start == '%') + break; + while (start < end && *start++ != '\n') + ; + } + if (start >= end) { + error("found a reference for `%1' but it didn't contain any fields", + str.contents()); + return 0; + } + reference *result = new reference(start, end - start, &rid); + if (iter.next(&start, &len, &rid)) + warning("multiple matches for `%1'", str.contents()); + return result; +} + +static reference *make_reference(const string &str, unsigned *flagsp) +{ + const char *start = str.contents(); + const char *end = start + str.length(); + const char *ptr = start; + while (ptr < end) { + if (*ptr == '%') + break; + while (ptr < end && *ptr++ != '\n') + ; + } + *flagsp = 0; + for (; start < ptr; start++) { + if (*start == '#') + *flagsp = (SHORT_LABEL | (*flagsp & (FORCE_RIGHT_BRACKET + | FORCE_LEFT_BRACKET))); + else if (*start == '[') + *flagsp |= FORCE_LEFT_BRACKET; + else if (*start == ']') + *flagsp |= FORCE_RIGHT_BRACKET; + else if (!csspace(*start)) + break; + } + if (start >= end) { + error("empty reference"); + return new reference; + } + reference *database_ref = 0; + if (start < ptr) + database_ref = find_reference(start, ptr - start); + reference *inline_ref = 0; + if (ptr < end) + inline_ref = new reference(ptr, end - ptr); + if (inline_ref) { + if (database_ref) { + database_ref->merge(*inline_ref); + delete inline_ref; + return database_ref; + } + else + return inline_ref; + } + else if (database_ref) + return database_ref; + 
else + return new reference; +} + +static void do_ref(const string &str) +{ + if (accumulate) + (void)store_reference(str); + else { + (void)immediately_handle_reference(str); + immediately_output_references(); + } +} + +static void trim_blanks(string &str) +{ + const char *start = str.contents(); + const char *end = start + str.length(); + while (end > start && end[-1] != '\n' && csspace(end[-1])) + --end; + str.set_length(end - start); +} + +void do_bib(const char *filename) +{ + FILE *fp; + if (strcmp(filename, "-") == 0) + fp = stdin; + else { + errno = 0; + fp = fopen(filename, "r"); + if (fp == 0) { + error("can't open `%1': %2", filename, strerror(errno)); + return; + } + current_filename = filename; + } + enum { + START, MIDDLE, BODY, BODY_START, BODY_BLANK, BODY_DOT + } state = START; + string body; + for (;;) { + int c = getc(fp); + if (c == EOF) + break; + if (illegal_input_char(c)) { + error("illegal input character code %1", c); + continue; + } + switch (state) { + case START: + if (c == '%') { + body = c; + state = BODY; + } + else if (c != '\n') + state = MIDDLE; + break; + case MIDDLE: + if (c == '\n') + state = START; + break; + case BODY: + body += c; + if (c == '\n') + state = BODY_START; + break; + case BODY_START: + if (c == '\n') { + do_ref(body); + state = START; + } + else if (c == '.') + state = BODY_DOT; + else if (csspace(c)) { + state = BODY_BLANK; + body += c; + } + else { + body += c; + state = BODY; + } + break; + case BODY_BLANK: + if (c == '\n') { + trim_blanks(body); + do_ref(body); + state = START; + } + else if (csspace(c)) + body += c; + else { + body += c; + state = BODY; + } + break; + case BODY_DOT: + if (c == ']') { + do_ref(body); + state = MIDDLE; + } + else { + body += '.'; + body += c; + state = c == '\n' ? 
BODY_START : BODY; + } + break; + default: + assert(0); + } + if (c == '\n') + current_lineno++; + } + switch (state) { + case START: + case MIDDLE: + break; + case BODY: + body += '\n'; + do_ref(body); + break; + case BODY_DOT: + case BODY_START: + do_ref(body); + break; + case BODY_BLANK: + trim_blanks(body); + do_ref(body); + break; + } + fclose(fp); +} + +// from the Dragon Book + +unsigned hash_string(const char *s, int len) +{ + const char *end = s + len; + unsigned h = 0, g; + while (s < end) { + h <<= 4; + h += *s++; + if ((g = h & 0xf0000000) != 0) { + h ^= g >> 24; + h ^= g; + } + } + return h; +} + +int next_size(int n) +{ + static const int table_sizes[] = { + 101, 503, 1009, 2003, 3001, 4001, 5003, 10007, 20011, 40009, + 80021, 160001, 500009, 1000003, 2000003, 4000037, 8000009, + 16000057, 32000011, 64000031, 128000003, 0 + }; + + const int *p; + for (p = table_sizes; *p <= n && *p != 0; p++) + ; + assert(*p != 0); + return *p; +} + diff --git a/contrib/groff/src/preproc/refer/refer.h b/contrib/groff/src/preproc/refer/refer.h new file mode 100644 index 0000000..f0ab3cd --- /dev/null +++ b/contrib/groff/src/preproc/refer/refer.h @@ -0,0 +1,78 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. 
If not, write to the Free Software +Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#include +#include +#include +#include +#include + +#include "errarg.h" +#include "error.h" +#include "lib.h" +#include "stringclass.h" +#include "cset.h" +#include "cmap.h" + +#include "defs.h" + +unsigned hash_string(const char *, int); +int next_size(int); + +extern string capitalize_fields; +extern string reverse_fields; +extern string abbreviate_fields; +extern string period_before_last_name; +extern string period_before_initial; +extern string period_before_hyphen; +extern string period_before_other; +extern string sort_fields; +extern int annotation_field; +extern string annotation_macro; +extern string discard_fields; +extern string articles; +extern int abbreviate_label_ranges; +extern string label_range_indicator; +extern int date_as_label; +extern string join_authors_exactly_two; +extern string join_authors_last_two; +extern string join_authors_default; +extern string separate_label_second_parts; +extern string et_al; +extern int et_al_min_elide; +extern int et_al_min_total; + +extern int compatible_flag; + +extern int set_label_spec(const char *); +extern int set_date_label_spec(const char *); +extern int set_short_label_spec(const char *); + +extern int short_label_flag; + +void clear_labels(); +void command_error(const char *, + const errarg &arg1 = empty_errarg, + const errarg &arg2 = empty_errarg, + const errarg &arg3 = empty_errarg); + +struct reference; + +void compute_labels(reference **, int); diff --git a/contrib/groff/src/preproc/refer/refer.man b/contrib/groff/src/preproc/refer/refer.man new file mode 100644 index 0000000..13708cf --- /dev/null +++ b/contrib/groff/src/preproc/refer/refer.man @@ -0,0 +1,1302 @@ +.ig \"-*- nroff -*- +Copyright (C) 1989-2000 Free Software Foundation, Inc. 
+ +Permission is granted to make and distribute verbatim copies of +this manual provided the copyright notice and this permission notice +are preserved on all copies. + +Permission is granted to copy and distribute modified versions of this +manual under the conditions for verbatim copying, provided that the +entire resulting derived work is distributed under the terms of a +permission notice identical to this one. + +Permission is granted to copy and distribute translations of this +manual into another language, under the above conditions for modified +versions, except that this permission notice may be included in +translations approved by the Free Software Foundation instead of in +the original English. +.. +.de TQ +.br +.ns +.TP \\$1 +.. +.\" Like TP, but if specified indent is more than half +.\" the current line-length - indent, use the default indent. +.de Tp +.ie \\n(.$=0:((0\\$1)*2u>(\\n(.lu-\\n(.iu)) .TP +.el .TP "\\$1" +.. +.\" The BSD man macros can't handle " in arguments to font change macros, +.\" so use \(ts instead of ". +.tr \(ts" +.TH @G@REFER @MAN1EXT@ "@MDATE@" "Groff Version @VERSION@" +.SH NAME +@g@refer \- preprocess bibliographic references for groff +.SH SYNOPSIS +.nr a \n(.j +.ad l +.nr i \n(.i +.in +\w'\fB@g@refer 'u +.ti \niu +.B @g@refer +.de OP +.ie \\n(.$-1 .RI "[\ \fB\\$1\fP" "\\$2" "\ ]" +.el .RB "[\ " "\\$1" "\ ]" +.. +.OP \-benvCPRS +.OP \-a n +.OP \-c fields +.OP \-f n +.OP \-i fields +.OP \-k field +.OP \-l m,n +.OP \-p filename +.OP \-s fields +.OP \-t n +.OP \-B field.macro +.RI [\ filename \|.\|.\|.\ ] +.br +.ad \na +.PP +It is possible to have whitespace between a command line option and its +parameter. +.SH DESCRIPTION +This file documents the GNU version of +.BR refer , +which is part of the groff document formatting system. +.B refer +copies the contents of +.IR filename \|.\|.\|. +to the standard output, +except that lines between +.B .[ +and +.B .] 
+are interpreted as citations, +and lines between +.B .R1 +and +.B .R2 +are interpreted as commands about how citations are to be processed. +.LP +Each citation specifies a reference. +The citation can specify a reference that is contained in +a bibliographic database by giving a set of keywords +that only that reference contains. +Alternatively it can specify a reference by supplying a database +record in the citation. +A combination of these alternatives is also possible. +.LP +For each citation, +.B refer +can produce a mark in the text. +This mark consists of some label which can be separated from +the text and from other labels in various ways. +For each reference it also outputs +.B groff +commands that can be used by a macro package to produce a formatted +reference for each citation. +The output of +.B refer +must therefore be processed using a suitable macro package. +The +.B \-ms +and +.B \-me +macros are both suitable. +The commands to format a citation's reference can be output immediately after +the citation, +or the references may be accumulated, +and the commands output at some later point. +If the references are accumulated, then multiple citations of the same +reference will produce a single formatted reference. +.LP +The interpretation of lines between +.B .R1 +and +.B .R2 +as commands is a new feature of GNU refer. +Documents making use of this feature can still be processed by +Unix refer just by adding the lines +.RS +.LP +.nf +.ft B +\&.de R1 +\&.ig R2 +\&.. +.ft +.fi +.RE +to the beginning of the document. +This will cause +.B troff +to ignore everything between +.B .R1 +and +.BR .R2 . +The effect of some commands can also be achieved by options. +These options are supported mainly for compatibility with Unix refer. +It is usually more convenient to use commands. 
+.LP +.B refer +generates +.B .lf +lines so that filenames and line numbers in messages produced +by commands that read +.B refer +output will be correct; +it also interprets lines beginning with +.B .lf +so that filenames and line numbers in the messages and +.B .lf +lines that it produces will be accurate even if the input has been +preprocessed by a command such as +.BR @g@soelim (@MAN1EXT@). +.SH OPTIONS +.LP +Most options are equivalent to commands +(for a description of these commands see the +.B Commands +subsection): +.TP +.B \-b +.B +no-label-in-text; no-label-in-reference +.TP +.B \-e +.B accumulate +.TP +.B \-n +.B no-default-database +.TP +.B \-C +.B compatible +.TP +.B \-P +.B move-punctuation +.TP +.B \-S +.B +label "(A.n|Q) ', ' (D.y|D)"; bracket-label " (" ) "; " +.TP +.BI \-a n +.B reverse +.BI A n +.TP +.BI \-c fields +.B capitalize +.I fields +.TP +.BI \-f n +.B label +.BI % n +.TP +.BI \-i fields +.B search-ignore +.I fields +.TP +.B \-k +.B label +.B L\(ti%a +.TP +.BI \-k field +.B label +.IB field \(ti%a +.TP +.B \-l +.B label +.BI A.nD.y%a +.TP +.BI \-l m +.B label +.BI A.n+ m D.y%a +.TP +.BI \-l, n +.B label +.BI A.nD.y\- n %a +.TP +.BI \-l m , n +.B label +.BI A.n+ m D.y\- n %a +.TP +.BI \-p filename +.B database +.I filename +.TP +.BI \-s spec +.B sort +.I spec +.TP +.BI \-t n +.B search-truncate +.I n +.LP +These options are equivalent to the following commands with the +addition that the filenames specified on the command line are +processed as if they were arguments to the +.B bibliography +command instead of in the normal way: +.TP +.B \-B +.B +annotate X AP; no-label-in-reference +.TP +.BI \-B field . macro +.B annotate +.I field +.IB macro ; +.B no-label-in-reference +.LP +The following options have no equivalent commands: +.TP +.B \-v +Print the version number. +.TP +.B \-R +Don't recognize lines beginning with +.BR .R1 / .R2 . 
+.SH USAGE +.SS Bibliographic databases +The bibliographic database is a text file consisting of records +separated by one or more blank lines. +Within each record fields start with a +.B % +at the beginning of a line. +Each field has a one character name that immediately follows the +.BR % . +It is best to use only upper and lower case letters for the names +of fields. +The name of the field should be followed by exactly one space, +and then by the contents of the field. +Empty fields are ignored. +The conventional meaning of each field is as follows: +.TP +.B A +The name of an author. +If the name contains a title such as +.B Jr. +at the end, +it should be separated from the last name by a comma. +There can be multiple occurrences of the +.B A +field. +The order is significant. +It is a good idea always to supply an +.B A +field or a +.B Q +field. +.TP +.B B +For an article that is part of a book, the title of the book. +.TP +.B C +The place (city) of publication. +.TP +.B D +The date of publication. +The year should be specified in full. +If the month is specified, the name rather than the number of the month +should be used, but only the first three letters are required. +It is a good idea always to supply a +.B D +field; +if the date is unknown, a value such as +.B in press +or +.B unknown +can be used. +.TP +.B E +For an article that is part of a book, the name of an editor of the book. +Where the work has editors and no authors, +the names of the editors should be given as +.B A +fields and +.B ,\ (ed) +or +.B ,\ (eds) +should be appended to the last author. +.TP +.B G +US Government ordering number. +.TP +.B I +The publisher (issuer). +.TP +.B J +For an article in a journal, the name of the journal. +.TP +.B K +Keywords to be used for searching. +.TP +.B L +Label. +.TP +.B N +Journal issue number. +.TP +.B O +Other information. +This is usually printed at the end of the reference. +.TP +.B P +Page number. +A range of pages can be specified as +.IB m \- n\fR.
+.TP +.B Q +The name of the author, if the author is not a person. +This will only be used if there are no +.B A +fields. +There can only be one +.B Q +field. +.TP +.B R +Technical report number. +.TP +.B S +Series name. +.TP +.B T +Title. +For an article in a book or journal, +this should be the title of the article. +.TP +.B V +Volume number of the journal or book. +.TP +.B X +Annotation. +.LP +For all fields except +.B A +and +.BR E , +if there is more than one occurrence of a particular field in a record, +only the last such field will be used. +.LP +If accent strings are used, they should follow the character to be accented. +This means that the +.B AM +macro must be used with the +.B \-ms +macros. +Accent strings should not be quoted: +use one +.B \e +rather than two. +.SS Citations +The format of a citation is +.RS +.BI .[ opening-text +.br +.I +flags keywords +.br +.I fields +.br +.BI .] closing-text +.RE +.LP +The +.IR opening-text , +.IR closing-text +and +.I flags +components are optional. +Only one of the +.I keywords +and +.I fields +components need be specified. +.LP +The +.I keywords +component says to search the bibliographic databases for a reference +that contains all the words in +.IR keywords . +It is an error if more than one reference is found. +.LP +The +.I fields +component specifies additional fields to replace or supplement +those specified in the reference. +When references are being accumulated and the +.I keywords +component is non-empty, +then additional fields should be specified only on the first +occasion that a particular reference is cited, +and will apply to all citations of that reference. +.LP +The +.I opening-text +and +.I closing-text +components specify strings to be used to bracket the label instead +of the strings specified in the +.B bracket-label +command.
+If either of these components is non-empty, +the strings specified in the +.B bracket-label +command will not be used; +this behaviour can be altered using the +.B [ +and +.B ] +flags. +Note that leading and trailing spaces are significant for these components. +.LP +The +.I flags +component is a list of +non-alphanumeric characters each of which modifies the treatment +of this particular citation. +Unix refer will treat these flags as part of the keywords and +so will ignore them since they are non-alphanumeric. +The following flags are currently recognized: +.TP +.B # +This says to use the label specified by the +.B short-label +command, +instead of that specified by the +.B label +command. +If no short label has been specified, the normal label will be used. +Typically the short label is used with author-date labels +and consists of only the date and possibly a disambiguating letter; +the +.B # +is supposed to be suggestive of a numeric type of label. +.TP +.B [ +Precede +.I opening-text +with the first string specified in the +.B bracket-label +command. +.TP +.B ] +Follow +.I closing-text +with the second string specified in the +.B bracket-label +command. +.LP +One advantage of using the +.B [ +and +.B ] +flags rather than including the brackets in +.I opening-text +and +.I closing-text +is that +you can change the style of bracket used in the document just by changing the +.B bracket-label +command. +Another advantage is that sorting and merging of citations +will not necessarily be inhibited if the flags are used. +.LP +If a label is to be inserted into the text, +it will be attached to the line preceding the +.B .[ +line. +If there is no such line, then an extra line will be inserted before the +.B .[ +line and a warning will be given. +.LP +There is no special notation for making a citation to multiple references. +Just use a sequence of citations, one for each reference. +Don't put anything between the citations.
+The labels for all the citations will be attached to the line preceding +the first citation. +The labels may also be sorted or merged. +See the description of the +.B <> +label expression, and of the +.B sort-adjacent-labels +and +.B abbreviate-label-ranges +command. +A label will not be merged if its citation has a non-empty +.I opening-text +or +.IR closing-text . +However, the labels for a citation using the +.B ] +flag and without any +.I closing-text +immediately followed by a citation using the +.B [ +flag and without any +.I opening-text +may be sorted and merged +even though the first citation's +.I opening-text +or the second citation's +.I closing-text +is non-empty. +(If you wish to prevent this just make the first citation's +.I closing-text +.BR \e& .) +.SS Commands +Commands are contained between lines starting with +.B .R1 +and +.BR .R2 . +Recognition of these lines can be prevented by the +.B \-R +option. +When a +.B .R1 +line is recognized any accumulated references are flushed out. +Neither +.B .R1 +nor +.B .R2 +lines, +nor anything between them +is output. +.LP +Commands are separated by newlines or +.BR ; s. +.B # +introduces a comment that extends to the end of the line +(but does not conceal the newline). +Each command is broken up into words. +Words are separated by spaces or tabs. +A word that begins with +.B \(ts +extends to the next +.B \(ts +that is not followed by another +.BR \(ts . +If there is no such +.B \(ts +the word extends to the end of the line. +Pairs of +.B \(ts +in a word beginning with +.B \(ts +collapse to a single +.BR \(ts . +Neither +.B # +nor +.B ; +are recognized inside +.BR \(ts s. +A line can be continued by ending it with +.BR \e ; +this works everywhere except after a +.BR # . +.LP +.ds n \fR* +Each command +.I name +that is marked with \*n has an associated negative command +.BI no- name +that undoes the effect of +.IR name . +For example, the +.B no-sort +command specifies that references should not be sorted. 
+The negative commands take no arguments. +.LP +In the following description each argument must be a single word; +.I field +is used for a single upper or lower case letter naming a field; +.I fields +is used for a sequence of such letters; +.I m +and +.I n +are used for non-negative numbers; +.I string +is used for an arbitrary string; +.I filename +is used for the name of a file. +.Tp \w'\fBabbreviate-label-ranges'u+2n +.BI abbreviate\*n\ fields\ string1\ string2\ string3\ string4 +Abbreviate the first names of +.IR fields . +An initial letter will be separated from another initial letter by +.IR string1 , +from the last name by +.IR string2 , +and from anything else +(such as a +.B von +or +.BR de ) +by +.IR string3 . +These default to a period followed by a space. +In a hyphenated first name, +the initial of the first part of the name will be separated from the hyphen by +.IR string4 ; +this defaults to a period. +No attempt is made to handle any ambiguities that might +result from abbreviation. +Names are abbreviated before sorting and before +label construction. +.TP +.BI abbreviate-label-ranges\*n\ string +Three or more adjacent labels that refer to consecutive references +will be abbreviated to a label consisting +of the first label, followed by +.I string +followed by the last label. +This is mainly useful with numeric labels. +If +.I string +is omitted it defaults to +.BR \- . +.TP +.B accumulate\*n +Accumulate references instead of writing out each reference +as it is encountered. +Accumulated references will be written out whenever a reference +of the form +.RS +.IP +.B .[ +.br +.B $LIST$ +.br +.B .] +.LP +is encountered, +after all input files have been processed, +and whenever a +.B .R1 +line is recognized. +.RE +.TP +.BI annotate\*n\ field\ string +.I field +is an annotation; +print it at the end of the reference as a paragraph preceded by the line +.RS +.IP +.BI .
string +.LP +If +.I string +is omitted it will default to +.BR AP ; +if +.I field +is also omitted it will default to +.BR X . +Only one field can be an annotation. +.RE +.TP +.BI articles\ string \fR\|.\|.\|. +.IR string \|.\|.\|. +are definite or indefinite articles, and should be ignored at the beginning of +.B T +fields when sorting. +Initially, +.BR the , +.B a +and +.B an +are recognized as articles. +.TP +.BI bibliography\ filename \fR\|.\|.\|. +Write out all the references contained in the bibliographic databases +.IR filename \|.\|.\|. +.TP +.BI bracket-label\ string1\ string2\ string3 +In the text, bracket each label +with +.I string1 +and +.IR string2 . +An occurrence of +.I string2 +immediately followed by +.I string1 +will be turned into +.IR string3 . +The default behaviour is +.RS +.IP +.B +bracket-label \e*([. \e*(.] ", " +.RE +.TP +.BI capitalize\ fields +Convert +.I fields +to caps and small caps. +.TP +.B compatible\*n +Recognize +.B .R1 +and +.B .R2 +even when followed by a character other than space or newline. +.TP +.BI database\ filename \fR\|.\|.\|. +Search the bibliographic databases +.IR filename \|.\|.\|. +For each +.I filename +if an index +.IB filename @INDEX_SUFFIX@ +created by +.BR @g@indxbib (@MAN1EXT@) +exists, then it will be searched instead; +each index can cover multiple databases. +.TP +.BI date-as-label\*n\ string +.I string +is a label expression that specifies a string with which to replace the +.B D +field after constructing the label. +See the +.B "Label expressions" +subsection for a description of label expressions. +This command is useful if you do not want explicit labels in the +reference list, but instead want to handle any necessary +disambiguation by qualifying the date in some way. +The label used in the text would typically be some combination of the +author and date. +In most cases you should also use the +.B no-label-in-reference +command.
+For example, +.RS +.IP +.B +date-as-label D.+yD.y%a*D.-y +.LP +would attach a disambiguating letter to the year part of the +.B D +field in the reference. +.RE +.TP +.B default-database\*n +The default database should be searched. +This is the default behaviour, so the negative version of +this command is more useful. +refer determines whether the default database should be searched +on the first occasion that it needs to do a search. +Thus a +.B no-default-database +command must be given before then, +in order to be effective. +.TP +.BI discard\*n\ fields +When the reference is read, +.I fields +should be discarded; +no string definitions for +.I fields +will be output. +Initially, +.I fields +are +.BR XYZ . +.TP +.BI et-al\*n\ string\ m\ n +Control use of +.B +et al +in the evaluation of +.B @ +expressions in label expressions. +If the number of authors needed to make the author sequence +unambiguous is +.I u +and the total number of authors is +.I t +then the last +.IR t \|\-\| u +authors will be replaced by +.I string +provided that +.IR t \|\-\| u +is not less than +.I m +and +.I t +is not less than +.IR n . +The default behaviour is +.RS +.IP +.B +et-al " et al" 2 3 +.RE +.TP +.BI include\ filename +Include +.I filename +and interpret the contents as commands. +.TP +.BI join-authors\ string1\ string2\ string3 +This says how authors should be joined together. +When there are exactly two authors, they will be joined with +.IR string1 . +When there are more than two authors, all but the last two will +be joined with +.IR string2 , +and the last two authors will be joined with +.IR string3 . +If +.I string3 +is omitted, +it will default to +.IR string1 ; +if +.I string2 +is also omitted it will also default to +.IR string1 . +For example, +.RS +.IP +.B +join-authors " and " ", " ", and " +.LP +will restore the default method for joining authors. 
+.RE +.TP +.B label-in-reference\*n +When outputting the reference, +define the string +.B [F +to be the reference's label. +This is the default behaviour; so the negative version +of this command is more useful. +.TP +.B label-in-text\*n +For each reference output a label in the text. +The label will be separated from the surrounding text as described in the +.B bracket-label +command. +This is the default behaviour; so the negative version +of this command is more useful. +.TP +.BI label\ string +.I string +is a label expression describing how to label each reference. +.TP +.BI separate-label-second-parts\ string +When merging two-part labels, separate the second part of the second +label from the first label with +.IR string . +See the description of the +.B <> +label expression. +.TP +.B move-punctuation\*n +In the text, move any punctuation at the end of line past the label. +It is usually a good idea to give this command unless you are using +superscripted numbers as labels. +.TP +.BI reverse\*n\ string +Reverse the fields whose names +are in +.IR string . +Each field name can be followed by a number which says +how many such fields should be reversed. +If no number is given for a field, all such fields will be reversed. +.TP +.BI search-ignore\*n\ fields +While searching for keys in databases for which no index exists, +ignore the contents of +.IR fields . +Initially, fields +.B XYZ +are ignored. +.TP +.BI search-truncate\*n\ n +Only require the first +.I n +characters of keys to be given. +In effect when searching for a given key +words in the database are truncated to the maximum of +.I n +and the length of the key. +Initially +.I n +is 6. +.TP +.BI short-label\*n\ string +.I string +is a label expression that specifies an alternative (usually shorter) +style of label. +This is used when the +.B # +flag is given in the citation. 
+When using author-date style labels, the identity of the author +or authors is sometimes clear from the context, and so it +may be desirable to omit the author or authors from the label. +The +.B short-label +command will typically be used to specify a label containing just +a date and possibly a disambiguating letter. +.TP +.BI sort\*n\ string +Sort references according to +.BR string . +References will automatically be accumulated. +.I string +should be a list of field names, each followed by a number, +indicating how many fields with the name should be used for sorting. +.B + +can be used to indicate that all the fields with the name should be used. +Also +.B . +can be used to indicate the references should be sorted using the +(tentative) label. +(The +.B +Label expressions +subsection describes the concept of a tentative label.) +.TP +.B sort-adjacent-labels\*n +Sort labels that are adjacent in the text according to their +position in the reference list. +This command should usually be given if the +.B abbreviate-label-ranges +command has been given, +or if the label expression contains a +.B <> +expression. +This will have no effect unless references are being accumulated. +.SS Label expressions +.LP +Label expressions can be evaluated both normally and tentatively. +The result of normal evaluation is used for output. +The result of tentative evaluation, called the +.I +tentative label, +is used to gather the information +that normal evaluation needs to disambiguate the label. +Label expressions specified by the +.B date-as-label +and +.B short-label +commands are not evaluated tentatively. +Normal and tentative evaluation are the same for all types +of expression other than +.BR @ , +.BR * , +and +.B % +expressions. +The description below applies to normal evaluation, +except where otherwise specified. +.TP +.I field +.TQ +.I field\ n +The +.IR n -th +part of +.IR field . +If +.I n +is omitted, it defaults to 1. 
+.TP +.BI ' string ' +The characters in +.I string +literally. +.TP +.B @ +All the authors joined as specified by the +.B join-authors +command. +The whole of each author's name will be used. +However, if the references are sorted by author +(that is the sort specification starts with +.BR A+ ), +then authors' last names will be used instead, provided that this does +not introduce ambiguity, +and also an initial subsequence of the authors may be used +instead of all the authors, again provided that this does not +introduce ambiguity. +The use of only the last name for the +.IR i -th +author of some reference +is considered to be ambiguous if +there is some other reference, +such that the first +.IR i \|-\|1 +authors of the references are the same, +the +.IR i -th +authors are not the same, +but the +.IR i -th +authors' last names are the same. +A proper initial subsequence of the sequence +of authors for some reference is considered to be ambiguous if there is +a reference with some other sequence of authors which also has +that subsequence as a proper initial subsequence. +When an initial subsequence of authors is used, the remaining +authors are replaced by the string specified by the +.B et-al +command; +this command may also specify additional requirements that must be +met before an initial subsequence can be used. +.B @ +tentatively evaluates to a canonical representation of the authors, +such that authors that compare equally for sorting purpose +will have the same representation. +.TP +.BI % n +.TQ +.B %a +.TQ +.B %A +.TQ +.B %i +.TQ +.B %I +The serial number of the reference formatted according to the character +following the +.BR % . +The serial number of a reference is 1 plus the number of earlier references +with same tentative label as this reference. +These expressions tentatively evaluate to an empty string. +.TP +.IB expr * +If there is another reference with the same tentative label as +this reference, then +.IR expr , +otherwise an empty string. 
+It tentatively evaluates to an empty string. +.TP +.IB expr + n +.TQ +.IB expr \- n +The first +.RB ( + ) +or last +.RB ( \- ) +.I n +upper or lower case letters or digits of +.IR expr . +Troff special characters (such as +.BR \e('a ) +count as a single letter. +Accent strings are retained but do not count towards the total. +.TP +.IB expr .l +.I expr +converted to lowercase. +.TP +.IB expr .u +.I expr +converted to uppercase. +.TP +.IB expr .c +.I expr +converted to caps and small caps. +.TP +.IB expr .r +.I expr +reversed so that the last name is first. +.TP +.IB expr .a +.I expr +with first names abbreviated. +Note that fields specified in the +.B abbreviate +command are abbreviated before any labels are evaluated. +Thus +.B .a +is useful only when you want a field to be abbreviated in a label +but not in a reference. +.TP +.IB expr .y +The year part of +.IR expr . +.TP +.IB expr .+y +The part of +.I expr +before the year, or the whole of +.I expr +if it does not contain a year. +.TP +.IB expr .\-y +The part of +.I expr +after the year, or an empty string if +.I expr +does not contain a year. +.TP +.IB expr .n +The last name part of +.IR expr . +.TP +.IB expr1 \(ti expr2 +.I expr1 +except that if the last character of +.I expr1 +is +.B \- +then it will be replaced by +.IR expr2 . +.TP +.I expr1\ expr2 +The concatenation of +.I expr1 +and +.IR expr2 . +.TP +.IB expr1 | expr2 +If +.I expr1 +is non-empty then +.I expr1 +otherwise +.IR expr2 . +.TP +.IB expr1 & expr2 +If +.I expr1 +is non-empty +then +.I expr2 +otherwise an empty string. +.TP +.IB expr1 ? expr2 : expr3 +If +.I expr1 +is non-empty +then +.I expr2 +otherwise +.IR expr3 . +.TP +.BI < expr > +The label is in two parts, which are separated by +.IR expr . 
+Two adjacent two-part labels which have the same first part will be +merged by appending the second part of the second label onto the first +label separated by the string specified in the +.B separate-label-second-parts +command (initially, a comma followed by a space); the resulting label +will also be a two-part label with the same first part as before +merging, and so additional labels can be merged into it. +Note that it is permissible for the first part to be empty; +this may be desirable for expressions used in the +.B short-label +command. +.TP +.BI ( expr ) +The same as +.IR expr . +Used for grouping. +.LP +The above expressions are listed in order of precedence +(highest first); +.B & +and +.B | +have the same precedence. +.SS Macro interface +Each reference starts with a call to the macro +.BR ]- . +The string +.B [F +will be defined to be the label for this reference, +unless the +.B no-label-in-reference +command has been given. +There then follows a series of string definitions, +one for each field: +string +.BI [ X +corresponds to field +.IR X . +The number register +.B [P +is set to 1 if the +.B P +field contains a range of pages. +The +.BR [T , +.B [A +and +.B [O +number registers are set to 1 according as the +.BR T , +.B A +and +.B O +fields end with one of the characters +.BR .?! . +The +.B [E +number register will be set to 1 if the +.B [E +string contains more than one name. +The reference is followed by a call to the +.B ][ +macro. +The first argument to this macro gives a number representing +the type of the reference. +If a reference contains a +.B J +field, it will be classified as type 1, +otherwise if it contains a +.B B +field, it will be type 3, +otherwise if it contains a +.B G +or +.B R +field it will be type 4, +otherwise if it contains a +.B I +field it will be type 2, +otherwise it will be type 0.
+The second argument is a symbolic name for the type: +.BR other , +.BR journal-article , +.BR book , +.B article-in-book +or +.BR tech-report . +Groups of references that have been accumulated +or are produced by the +.B bibliography +command are preceded by a call to the +.B ]< +macro and followed by a call to the +.B ]> +macro. +.SH FILES +.Tp \w'\fB@DEFAULT_INDEX@'u+2n +.B @DEFAULT_INDEX@ +Default database. +.TP +.IB file @INDEX_SUFFIX@ +Index files. +.SH "SEE ALSO" +.BR @g@indxbib (@MAN1EXT@), +.BR @g@lookbib (@MAN1EXT@), +.BR lkbib (@MAN1EXT@) +.br +.SH BUGS +In label expressions, +.B <> +expressions are ignored inside +.BI . char +expressions. diff --git a/contrib/groff/src/preproc/refer/token.cc b/contrib/groff/src/preproc/refer/token.cc new file mode 100644 index 0000000..1cf6890 --- /dev/null +++ b/contrib/groff/src/preproc/refer/token.cc @@ -0,0 +1,378 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#include "refer.h" +#include "token.h" + +#define TOKEN_TABLE_SIZE 1009 +// I believe in Icelandic thorn sorts after z. 
+#define THORN_SORT_KEY "{" /* '{' is the ASCII character immediately after 'z' */ + /* One slot of the token hash table: tok is the token's text (0 while the slot is unused) and ti holds its properties. */ +struct token_table_entry { + const char *tok; + token_info ti; + token_table_entry(); +}; + /* The table itself, indexed by hash_string() modulo TOKEN_TABLE_SIZE, and the number of slots in use. */ +token_table_entry token_table[TOKEN_TABLE_SIZE]; +int ntokens = 0; + /* Advance *ptr over one name, never moving past end: a single character; or '(' and up to two following characters; or '[' and everything up to and including the next ']'. */ +static void skip_name(const char **ptr, const char *end) +{ + if (*ptr < end) { + switch (*(*ptr)++) { + case '(': + if (*ptr < end) { + *ptr += 1; + if (*ptr < end) + *ptr += 1; + } + break; + case '[': + while (*ptr < end) + if (*(*ptr)++ == ']') + break; + break; + } + } +} + /* Consume one token from the text between *ptr and end. Returns 0 if that range is empty; otherwise advances *ptr and returns 1. A token is a single character, or a backslash together with what follows it: a name introduced by '(' or '[', a '*' or 'f' followed by a name, or any other single character. */ +int get_token(const char **ptr, const char *end) +{ + if (*ptr >= end) + return 0; + char c = *(*ptr)++; + if (c == '\\' && *ptr < end) { + switch (**ptr) { + default: + *ptr += 1; + break; + case '(': + case '[': + skip_name(ptr, end); + break; + case '*': + case 'f': + *ptr += 1; + skip_name(ptr, end); + break; + } + } + return 1; +} + /* A default token_info is TOKEN_OTHER with no sort key and no other-case form. */ +token_info::token_info() +: type(TOKEN_OTHER), sort_key(0), other_case(0) +{ +} + /* Set all three fields at once. An other-case string may only be supplied for TOKEN_UPPER or TOKEN_LOWER tokens (asserted). The strings are stored by pointer, not copied. */ +void token_info::set(token_type t, const char *sk, const char *oc) +{ + assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER); + type = t; + sort_key = sk; + other_case = oc; +} + /* Append this token's contribution to a sort key to result: the explicit sort_key if one was set; otherwise, for upper or lower case letter tokens, every character of the token text accepted by csalpha(), mapped through cmlower(); otherwise nothing. */ +void token_info::sortify(const char *start, const char *end, string &result) + const +{ + if (sort_key) + result += sort_key; + else if (type == TOKEN_UPPER || type == TOKEN_LOWER) { + for (; start < end; start++) + if (csalpha(*start)) + result += cmlower(*start); + } +} + /* Return 1 if sortify() would append at least one character for this token text, else 0. */ +int token_info::sortify_non_empty(const char *start, const char *end) const +{ + if (sort_key) + return *sort_key != '\0'; + if (type != TOKEN_UPPER && type != TOKEN_LOWER) + return 0; + for (; start < end; start++) + if (csalpha(*start)) + return 1; + return 0; +} + + /* Append the token text converted to lower case to result. Only TOKEN_UPPER tokens are changed: other_case is used if set, otherwise each character is mapped through cmlower(). */ +void token_info::lower_case(const char *start, const char *end, + string &result) const +{ + if (type != TOKEN_UPPER) { + while (start < end) + result += *start++; + } + else if (other_case) + result += other_case; + else { + while (start < end) + result += cmlower(*start++); + } +} + /* Append the token text converted to upper case to result. Only TOKEN_LOWER tokens are changed: other_case is used if set, otherwise each character is mapped through cmupper(). */ +void token_info::upper_case(const char *start, const char *end, + string &result)
const +{ + if (type != TOKEN_LOWER) { + while (start < end) + result += *start++; + } + else if (other_case) + result += other_case; + else { + while (start < end) + result += cmupper(*start++); + } +} + /* Table slots start out empty (tok == 0). */ +token_table_entry::token_table_entry() +: tok(0) +{ +} + /* Register tok with the given type, sort key and other-case string, overwriting the token_info of an existing entry with the same text. Collisions are resolved by stepping downward through the table, wrapping from slot 0 to the last slot. The pointer tok is stored rather than copied. The assert keeps at least one slot empty, which is what ends an unsuccessful probe in lookup_token(). */ +static void store_token(const char *tok, token_type typ, + const char *sk = 0, const char *oc = 0) +{ + unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE; + for (;;) { + if (token_table[n].tok == 0) { + if (++ntokens == TOKEN_TABLE_SIZE) + assert(0); + token_table[n].tok = tok; + break; + } + if (strcmp(tok, token_table[n].tok) == 0) + break; + if (n == 0) + n = TOKEN_TABLE_SIZE - 1; + else + --n; + } + token_table[n].ti.set(typ, sk, oc); +} + + /* Returned by lookup_token() for text that is not in the table; set to TOKEN_OTHER by the token_initer constructor. */ +token_info default_token_info; + /* Find the token whose text is exactly the characters from start up to (not including) end; the text need not be NUL-terminated. Follows the same probe sequence as store_token(). Returns the token's token_info, or the address of default_token_info when an empty slot is reached first. */ +const token_info *lookup_token(const char *start, const char *end) +{ + unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE; + for (;;) { + if (token_table[n].tok == 0) + break; + if (strlen(token_table[n].tok) == end - start + && memcmp(token_table[n].tok, start, end - start) == 0) + return &(token_table[n].ti); + if (n == 0) + n = TOKEN_TABLE_SIZE - 1; + else + --n; + } + return &default_token_info; +} + /* Register the ASCII tokens: a-z as TOKEN_LOWER and their cmupper() forms as TOKEN_UPPER, the digits as TOKEN_OTHER with themselves as sort key, the characters .,:;?! as TOKEN_PUNCT and - as TOKEN_HYPHEN. */ +static void init_ascii() +{ + const char *p; + for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) { + char buf[2]; + buf[0] = *p; + buf[1] = '\0'; + store_token(strsave(buf), TOKEN_LOWER); + buf[0] = cmupper(buf[0]); + store_token(strsave(buf), TOKEN_UPPER); + } + for (p = "0123456789"; *p; p++) { + char buf[2]; + buf[0] = *p; + buf[1] = '\0'; + const char *s = strsave(buf); + store_token(s, TOKEN_OTHER, s); + } + for (p = ".,:;?!"; *p; p++) { + char buf[2]; + buf[0] = *p; + buf[1] = '\0'; + store_token(strsave(buf), TOKEN_PUNCT); + } + store_token("-", TOKEN_HYPHEN); +} + /* Register a lower/upper case pair that share a sort key; each is recorded as the other's other-case form. */ +static void store_letter(const char *lower, const char *upper, + const char *sort_key = 0) +{ + store_token(lower, TOKEN_LOWER, sort_key, upper); + store_token(upper, TOKEN_UPPER, sort_key, lower); +} + /* Register a pair of single-character letters given by their character codes. */ +static void init_letter(unsigned
char uc_code, unsigned char lc_code, + const char *sort_key) +{ + char lbuf[2]; + lbuf[0] = lc_code; + lbuf[1] = 0; + char ubuf[2]; + ubuf[0] = uc_code; + ubuf[1] = 0; + store_letter(strsave(lbuf), strsave(ubuf), sort_key); +} + /* Register the letter pairs 0xc0-0xde / 0xe0-0xfe (0xd7 / 0xf7 are skipped) with plain ASCII sort keys, then \337 and \377, which are registered as lower case only with "SS" and "Y" as their upper-case forms. */ +static void init_latin1() +{ + init_letter(0xc0, 0xe0, "a"); + init_letter(0xc1, 0xe1, "a"); + init_letter(0xc2, 0xe2, "a"); + init_letter(0xc3, 0xe3, "a"); + init_letter(0xc4, 0xe4, "a"); + init_letter(0xc5, 0xe5, "a"); + init_letter(0xc6, 0xe6, "ae"); + init_letter(0xc7, 0xe7, "c"); + init_letter(0xc8, 0xe8, "e"); + init_letter(0xc9, 0xe9, "e"); + init_letter(0xca, 0xea, "e"); + init_letter(0xcb, 0xeb, "e"); + init_letter(0xcc, 0xec, "i"); + init_letter(0xcd, 0xed, "i"); + init_letter(0xce, 0xee, "i"); + init_letter(0xcf, 0xef, "i"); + + init_letter(0xd0, 0xf0, "d"); + init_letter(0xd1, 0xf1, "n"); + init_letter(0xd2, 0xf2, "o"); + init_letter(0xd3, 0xf3, "o"); + init_letter(0xd4, 0xf4, "o"); + init_letter(0xd5, 0xf5, "o"); + init_letter(0xd6, 0xf6, "o"); + init_letter(0xd8, 0xf8, "o"); + init_letter(0xd9, 0xf9, "u"); + init_letter(0xda, 0xfa, "u"); + init_letter(0xdb, 0xfb, "u"); + init_letter(0xdc, 0xfc, "u"); + init_letter(0xdd, 0xfd, "y"); + init_letter(0xde, 0xfe, THORN_SORT_KEY); + + store_token("\337", TOKEN_LOWER, "ss", "SS"); + store_token("\377", TOKEN_LOWER, "y", "Y"); +} + /* Register the special character named l1 l2 (lower case) / u1 u2 (upper case) in both spellings, \(xx and \[xx]. buf is reused throughout: it still holds the upper-case name when the spelling is switched to brackets, so the second store_letter() call builds the upper-case string first and the lower-case one afterwards. */ +static void init_two_char_letter(char l1, char l2, char u1, char u2, + const char *sk = 0) +{ + char buf[6]; + buf[0] = '\\'; + buf[1] = '('; + buf[2] = l1; + buf[3] = l2; + buf[4] = '\0'; + const char *p = strsave(buf); + buf[2] = u1; + buf[3] = u2; + store_letter(p, strsave(buf), sk); + buf[1] = '['; + buf[4] = ']'; + buf[5] = '\0'; + p = strsave(buf); + buf[2] = l1; + buf[3] = l2; + store_letter(strsave(buf), p, sk); + +} + /* Register the special-character tokens: each of ' : ^ ` ~ combined with each of a e i o u y; the two-character names /l /o ~n ,c oe ae ij (upper-case names obtained with cmupper() on both characters); the letters listed explicitly below; and the hy (TOKEN_HYPHEN) and en (TOKEN_RANGE_SEP) characters. */ +static void init_special_chars() +{ + const char *p; + for (p = "':^`~"; *p; p++) + for (const char *q = "aeiouy"; *q; q++) { + // Use a variable to work around bug in gcc 2.0 + char c = cmupper(*q); + init_two_char_letter(*p, *q,
*p, c); + } + for (p = "/l/o~n,coeaeij"; *p; p += 2) { + // Use variables to work around bug in gcc 2.0 + char c0 = cmupper(p[0]); + char c1 = cmupper(p[1]); + init_two_char_letter(p[0], p[1], c0, c1); + } + init_two_char_letter('v', 's', 'v', 'S', "s"); + init_two_char_letter('v', 'z', 'v', 'Z', "z"); + init_two_char_letter('o', 'a', 'o', 'A', "a"); + init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY); + init_two_char_letter('-', 'd', '-', 'D'); + + store_token("\\(ss", TOKEN_LOWER, 0, "SS"); + store_token("\\[ss]", TOKEN_LOWER, 0, "SS"); + + store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D"); + store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]"); + store_token("\\(hy", TOKEN_HYPHEN); + store_token("\\[hy]", TOKEN_HYPHEN); + store_token("\\(en", TOKEN_RANGE_SEP); + store_token("\\[en]", TOKEN_RANGE_SEP); +} + /* Register string tokens: for each character c in the list below, both the one-character form (backslash, star, c) and the bracketed form (backslash, star, [c]) become TOKEN_ACCENT; further strings are then registered as letters. */ +static void init_strings() +{ + char buf[6]; + buf[0] = '\\'; + buf[1] = '*'; + for (const char *p = "'`^^,:~v_o./;"; *p; p++) { + buf[2] = *p; + buf[3] = '\0'; + store_token(strsave(buf), TOKEN_ACCENT); + buf[2] = '['; + buf[3] = *p; + buf[4] = ']'; + buf[5] = '\0'; + store_token(strsave(buf), TOKEN_ACCENT); + } + + // -ms special letters + store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY); + store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY); + store_letter("\\*(d-", "\\*(D-"); + store_letter("\\*[d-]", "\\*[D-]"); + store_letter("\\*(ae", "\\*(Ae", "ae"); + store_letter("\\*[ae]", "\\*[Ae]", "ae"); + store_letter("\\*(oe", "\\*(Oe", "oe"); + store_letter("\\*[oe]", "\\*[Oe]", "oe"); + + store_token("\\*3", TOKEN_LOWER, "y", "Y"); + store_token("\\*8", TOKEN_LOWER, "ss", "SS"); + store_token("\\*q", TOKEN_LOWER, "o", "O"); +} + /* Helper type whose constructor fills the token table; the single file-static instance below runs it during static initialization. */ +struct token_initer { + token_initer(); +}; + +static token_initer the_token_initer; + /* Populate the table from all four init functions and set default_token_info to TOKEN_OTHER. */ +token_initer::token_initer() +{ + init_ascii(); + init_latin1(); + init_special_chars(); + init_strings(); + default_token_info.set(TOKEN_OTHER); +} diff --git a/contrib/groff/src/preproc/refer/token.h b/contrib/groff/src/preproc/refer/token.h
new file mode 100644 index 0000000..6da430d --- /dev/null +++ b/contrib/groff/src/preproc/refer/token.h @@ -0,0 +1,88 @@ +// -*- C++ -*- +/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc. + Written by James Clark (jjc@jclark.com) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License along +with groff; see the file COPYING. If not, write to the Free Software +Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +enum token_type { + TOKEN_OTHER, + TOKEN_UPPER, + TOKEN_LOWER, + TOKEN_ACCENT, + TOKEN_PUNCT, + TOKEN_HYPHEN, + TOKEN_RANGE_SEP +}; + +class token_info { +private: + token_type type; + const char *sort_key; + const char *other_case; +public: + token_info(); + void set(token_type, const char *sk = 0, const char *oc = 0); + void lower_case(const char *start, const char *end, string &result) const; + void upper_case(const char *start, const char *end, string &result) const; + void sortify(const char *start, const char *end, string &result) const; + int sortify_non_empty(const char *start, const char *end) const; + int is_upper() const; + int is_lower() const; + int is_accent() const; + int is_other() const; + int is_punct() const; + int is_hyphen() const; + int is_range_sep() const; +}; + +inline int token_info::is_upper() const +{ + return type == TOKEN_UPPER; +} + +inline int token_info::is_lower() const +{ + return type == TOKEN_LOWER; +} + +inline int token_info::is_accent() const +{ + return type == TOKEN_ACCENT; +} + 
+inline int token_info::is_other() const +{ + return type == TOKEN_OTHER; +} + +inline int token_info::is_punct() const +{ + return type == TOKEN_PUNCT; +} + +inline int token_info::is_hyphen() const +{ + return type == TOKEN_HYPHEN; +} + +inline int token_info::is_range_sep() const +{ + return type == TOKEN_RANGE_SEP; +} + +int get_token(const char **ptr, const char *end); +const token_info *lookup_token(const char *start, const char *end); -- cgit v1.1