diff options
Diffstat (limited to 'contrib/texinfo/makeinfo/lang.c')
-rw-r--r-- | contrib/texinfo/makeinfo/lang.c | 393 |
1 files changed, 293 insertions, 100 deletions
diff --git a/contrib/texinfo/makeinfo/lang.c b/contrib/texinfo/makeinfo/lang.c index 2938196..c72e8db 100644 --- a/contrib/texinfo/makeinfo/lang.c +++ b/contrib/texinfo/makeinfo/lang.c @@ -1,7 +1,8 @@ /* lang.c -- language-dependent support. - $Id: lang.c,v 1.8 2003/05/01 00:05:27 karl Exp $ + $Id: lang.c,v 1.14 2004/11/22 23:57:33 karl Exp $ - Copyright (C) 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004 Free Software + Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,6 +22,7 @@ #include "system.h" #include "cmds.h" +#include "files.h" #include "lang.h" #include "makeinfo.h" #include "xml.h" @@ -31,6 +33,9 @@ encoding_code_type document_encoding_code = no_encoding; /* Current language code; default is English. */ language_code_type language_code = en; +/* By default, unsupported encoding is an empty string. */ +char *unknown_encoding = NULL; + static iso_map_type us_ascii_map [] = {{NULL, 0, 0}}; /* ASCII map is trivial */ /* Translation table between HTML and ISO Codes. The last item is @@ -137,6 +142,126 @@ static iso_map_type iso8859_1_map [] = { { NULL, 0, 0 } }; + +/* ISO 8859-15, also known as Latin 9, differs from Latin 1 in only a + few positions. http://www.cs.tut.fi/~jkorpela/latin9.html has a good + explanation and listing, summarized here. The names are abbreviated + from the official Unicode names, to fit in a decent line length. 
+ + code position + dec oct hex latin1 latin1 name latin9 latin9 name + + 164 0244 0xA4 U+00A4 currency symbol U+20AC euro sign + 166 0246 0xA6 U+00A6 broken bar U+0160 S with caron + 168 0250 0xA8 U+00A8 diaeresis U+0161 s with caron + 180 0264 0xB4 U+00B4 acute accent U+017D Z with caron + 184 0270 0xB8 U+00B8 cedilla U+017E z with caron + 188 0274 0xBC U+00BC fraction 1/4 U+0152 ligature OE + 189 0275 0xBD U+00BD fraction 1/2 U+0153 ligature oe + 190 0276 0xBE U+00BE fraction 3/4 U+0178 Y with diaeresis +*/ + +static iso_map_type iso8859_15_map [] = { + { "nbsp", 0xA0, 0x00A0 }, + { "iexcl", 0xA1, 0x00A1 }, + { "cent", 0xA2, 0x00A2 }, + { "pound", 0xA3, 0x00A3 }, + { "euro", 0xA4, 0x20AC }, + { "yen", 0xA5, 0x00A5 }, + { "Scaron", 0xA6, 0x0160 }, + { "sect", 0xA7, 0x00A7 }, + { "scaron", 0xA8, 0x0161 }, + { "copy", 0xA9, 0x00A9 }, + { "ordf", 0xAA, 0x00AA }, + { "laquo", 0xAB, 0x00AB }, + { "not", 0xAC, 0x00AC }, + { "shy", 0xAD, 0x00AD }, + { "reg", 0xAE, 0x00AE }, + { "hibar", 0xAF, 0x00AF }, + { "deg", 0xB0, 0x00B0 }, + { "plusmn", 0xB1, 0x00B1 }, + { "sup2", 0xB2, 0x00B2 }, + { "sup3", 0xB3, 0x00B3 }, + { "Zcaron", 0xB4, 0x017D }, + { "micro", 0xB5, 0x00B5 }, + { "para", 0xB6, 0x00B6 }, + { "middot", 0xB7, 0x00B7 }, + { "zcaron", 0xB8, 0x017E }, + { "sup1", 0xB9, 0x00B9 }, + { "ordm", 0xBA, 0x00BA }, + { "raquo", 0xBB, 0x00BB }, + { "OElig", 0xBC, 0x0152 }, + { "oelig", 0xBD, 0x0153 }, + { "Yuml", 0xBE, 0x0178 }, + { "iquest", 0xBF, 0x00BF }, + { "Agrave", 0xC0, 0x00C0 }, + { "Aacute", 0xC1, 0x00C1 }, + { "Acirc", 0xC2, 0x00C2 }, + { "Atilde", 0xC3, 0x00C3 }, + { "Auml", 0xC4, 0x00C4 }, + { "Aring", 0xC5, 0x00C5 }, + { "AElig", 0xC6, 0x00C6 }, + { "Ccedil", 0xC7, 0x00C7 }, + { "Ccedil", 0xC7, 0x00C7 }, + { "Egrave", 0xC8, 0x00C8 }, + { "Eacute", 0xC9, 0x00C9 }, + { "Ecirc", 0xCA, 0x00CA }, + { "Euml", 0xCB, 0x00CB }, + { "Igrave", 0xCC, 0x00CC }, + { "Iacute", 0xCD, 0x00CD }, + { "Icirc", 0xCE, 0x00CE }, + { "Iuml", 0xCF, 0x00CF }, + { "ETH", 0xD0, 0x00D0 }, 
+ { "Ntilde", 0xD1, 0x00D1 }, + { "Ograve", 0xD2, 0x00D2 }, + { "Oacute", 0xD3, 0x00D3 }, + { "Ocirc", 0xD4, 0x00D4 }, + { "Otilde", 0xD5, 0x00D5 }, + { "Ouml", 0xD6, 0x00D6 }, + { "times", 0xD7, 0x00D7 }, + { "Oslash", 0xD8, 0x00D8 }, + { "Ugrave", 0xD9, 0x00D9 }, + { "Uacute", 0xDA, 0x00DA }, + { "Ucirc", 0xDB, 0x00DB }, + { "Uuml", 0xDC, 0x00DC }, + { "Yacute", 0xDD, 0x00DD }, + { "THORN", 0xDE, 0x00DE }, + { "szlig", 0xDF, 0x00DF }, + { "agrave", 0xE0, 0x00E0 }, + { "aacute", 0xE1, 0x00E1 }, + { "acirc", 0xE2, 0x00E2 }, + { "atilde", 0xE3, 0x00E3 }, + { "auml", 0xE4, 0x00E4 }, + { "aring", 0xE5, 0x00E5 }, + { "aelig", 0xE6, 0x00E6 }, + { "ccedil", 0xE7, 0x00E7 }, + { "egrave", 0xE8, 0x00E8 }, + { "eacute", 0xE9, 0x00E9 }, + { "ecirc", 0xEA, 0x00EA }, + { "euml", 0xEB, 0x00EB }, + { "igrave", 0xEC, 0x00EC }, + { "iacute", 0xED, 0x00ED }, + { "icirc", 0xEE, 0x00EE }, + { "iuml", 0xEF, 0x00EF }, + { "eth", 0xF0, 0x00F0 }, + { "ntilde", 0xF1, 0x00F1 }, + { "ograve", 0xF2, 0x00F2 }, + { "oacute", 0xF3, 0x00F3 }, + { "ocirc", 0xF4, 0x00F4 }, + { "otilde", 0xF5, 0x00F5 }, + { "ouml", 0xF6, 0x00F6 }, + { "divide", 0xF7, 0x00F7 }, + { "oslash", 0xF8, 0x00F8 }, + { "ugrave", 0xF9, 0x00F9 }, + { "uacute", 0xFA, 0x00FA }, + { "ucirc", 0xFB, 0x00FB }, + { "uuml", 0xFC, 0x00FC }, + { "yacute", 0xFD, 0x00FD }, + { "thorn", 0xFE, 0x00FE }, + { "yuml", 0xFF, 0x00FF }, + { NULL, 0, 0 } +}; + /* Date: Mon, 31 Mar 2003 00:19:28 +0200 @@ -262,21 +387,21 @@ static iso_map_type iso8859_2_map [] = { encoding_type encoding_table[] = { { no_encoding, "(no encoding)", NULL }, { US_ASCII, "US-ASCII", us_ascii_map }, - { ISO_8859_1, "ISO-8859-1", (iso_map_type *) iso8859_1_map }, - { ISO_8859_2, "ISO-8859-2", (iso_map_type *) iso8859_2_map }, - { ISO_8859_3, "ISO-8859-3", NULL }, - { ISO_8859_4, "ISO-8859-4", NULL }, - { ISO_8859_5, "ISO-8859-5", NULL }, - { ISO_8859_6, "ISO-8859-6", NULL }, - { ISO_8859_7, "ISO-8859-7", NULL }, - { ISO_8859_8, "ISO-8859-8", NULL }, - { ISO_8859_9, 
"ISO-8859-9", NULL }, - { ISO_8859_10, "ISO-8859-10", NULL }, - { ISO_8859_11, "ISO-8859-11", NULL }, - { ISO_8859_12, "ISO-8859-12", NULL }, - { ISO_8859_13, "ISO-8859-13", NULL }, - { ISO_8859_14, "ISO-8859-14", NULL }, - { ISO_8859_15, "ISO-8859-15", NULL }, + { ISO_8859_1, "iso-8859-1", (iso_map_type *) iso8859_1_map }, + { ISO_8859_2, "iso-8859-2", (iso_map_type *) iso8859_2_map }, + { ISO_8859_3, "iso-8859-3", NULL }, + { ISO_8859_4, "iso-8859-4", NULL }, + { ISO_8859_5, "iso-8859-5", NULL }, + { ISO_8859_6, "iso-8859-6", NULL }, + { ISO_8859_7, "iso-8859-7", NULL }, + { ISO_8859_8, "iso-8859-8", NULL }, + { ISO_8859_9, "iso-8859-9", NULL }, + { ISO_8859_10, "iso-8859-10", NULL }, + { ISO_8859_11, "iso-8859-11", NULL }, + { ISO_8859_12, "iso-8859-12", NULL }, + { ISO_8859_13, "iso-8859-13", NULL }, + { ISO_8859_14, "iso-8859-14", NULL }, + { ISO_8859_15, "iso-8859-15", (iso_map_type *) iso8859_15_map }, { last_encoding_code, NULL, NULL } }; @@ -423,13 +548,16 @@ language_type language_table[] = { { zu, "zu", "Zulu" }, { last_language_code, NULL, NULL } }; - - /* @documentlanguage. Maybe we'll do something useful with this in the future. For now, we just recognize it. */ + +/* XML documents can make use of this data. Unfortunately, it clashes with + the structure currently used. So instead of enclosing content into + a language block, we just output an empty element. Anyways, a stream based + parser can make good use of it. */ void -cm_documentlanguage () +cm_documentlanguage (void) { language_code_type c; char *lang_arg; @@ -451,6 +579,12 @@ cm_documentlanguage () if (c == last_language_code) warning (_("%s is not a valid ISO 639 language code"), lang_arg); + if (xml && !docbook) + { + xml_insert_element_with_attribute (DOCUMENTLANGUAGE, START, "xml:lang=\"%s\"", lang_arg); + xml_insert_element (DOCUMENTLANGUAGE, END); + } + free (lang_arg); } @@ -460,8 +594,7 @@ cm_documentlanguage () its equivalent. 
*/ static int -cm_search_iso_map (html) - char *html; +cm_search_iso_map (char *html) { int i; iso_map_type *iso = encoding_table[document_encoding_code].isotab; @@ -483,43 +616,88 @@ cm_search_iso_map (html) /* @documentencoding. Set the translation table. */ void -cm_documentencoding () +cm_documentencoding (void) { - encoding_code_type enc; - char *enc_arg; - - get_rest_of_line (1, &enc_arg); - - /* See if we have this encoding. */ - for (enc = no_encoding+1; enc != last_encoding_code; enc++) + if (!handling_delayed_writes) { - if (strcasecmp (enc_arg, encoding_table[enc].encname) == 0) + encoding_code_type enc; + char *enc_arg; + + /* This is ugly and probably needs to apply to other commands' + argument parsing as well. When we're doing @documentencoding, + we're generally in the frontmatter of the document, and so the + expansion in html/xml/docbook would generally be the empty string. + (Because those modes wait until the first normal text of the + document to start outputting.) The result would thus be a warning + "unrecognized encoding name `'". Sigh. */ + int save_html = html; + int save_xml = xml; + + html = 0; + xml = 0; + get_rest_of_line (1, &enc_arg); + html = save_html; + xml = save_xml; + + /* See if we have this encoding. */ + for (enc = no_encoding+1; enc != last_encoding_code; enc++) { - document_encoding_code = enc; - break; + if (strcasecmp (enc_arg, encoding_table[enc].encname) == 0) + { + document_encoding_code = enc; + break; + } + } + + /* If we didn't find this code, complain. */ + if (enc == last_encoding_code) + { + warning (_("unrecognized encoding name `%s'"), enc_arg); + /* Let the previous one go. 
*/ + if (unknown_encoding && *unknown_encoding) + free (unknown_encoding); + unknown_encoding = xstrdup (enc_arg); } + + else if (encoding_table[document_encoding_code].isotab == NULL) + warning (_("sorry, encoding `%s' not supported"), enc_arg); + + free (enc_arg); } + else if (xml) + { + char *encoding = current_document_encoding (); - /* If we didn't find this code, complain. */ - if (enc == last_encoding_code) - warning (_("unrecognized encoding name `%s'"), enc_arg); + if (encoding && *encoding) + { + insert_string (" encoding=\""); + insert_string (encoding); + insert_string ("\""); + } - else if (encoding_table[document_encoding_code].isotab == NULL) - warning (_("sorry, encoding `%s' not supported"), enc_arg); + free (encoding); + } +} - free (enc_arg); +char * +current_document_encoding (void) +{ + if (document_encoding_code != no_encoding) + return xstrdup (encoding_table[document_encoding_code].encname); + else if (unknown_encoding && *unknown_encoding) + return xstrdup (unknown_encoding); + else + return xstrdup (""); } -/* If html or xml output, add HTML_STR to the output. If not html and +/* If html or xml output, add &HTML_STR; to the output. If not html and the user requested encoded output, add the real 8-bit character corresponding to HTML_STR from the translation tables. Otherwise, add INFO_STR. */ -void -add_encoded_char (html_str, info_str) - char *html_str; - char *info_str; +static void +add_encoded_char (char *html_str, char *info_str) { if (html) add_word_args ("&%s;", html_str); @@ -547,13 +725,8 @@ add_encoded_char (html_str, info_str) /* Output an accent for HTML or XML. 
*/ static void -cm_accent_generic_html (arg, start, end, html_supported, single, - html_solo_standalone, html_solo) - int arg, start, end; - char *html_supported; - int single; - int html_solo_standalone; - char *html_solo; +cm_accent_generic_html (int arg, int start, int end, char *html_supported, + int single, int html_solo_standalone, char *html_solo) { static int valid_html_accent; /* yikes */ @@ -569,20 +742,39 @@ cm_accent_generic_html (arg, start, end, html_supported, single, escape_html = saved_escape_html; } else - { - valid_html_accent = 0; - if (html_solo_standalone) - { /* No special HTML support, so produce standalone char. */ - if (xml) - xml_insert_entity (html_solo); + { /* @dotless{i} is not listed in html_supported but HTML entities + starting with `i' can be used, such as &icirc;. */ + int save_input_text_offset = input_text_offset; + char *accent_contents; + + get_until_in_braces ("\n", &accent_contents); + canon_white (accent_contents); + + if (strstr (accent_contents, "@dotless{i")) + { + add_word_args ("&%c", accent_contents[9]); + valid_html_accent = 1; + } + else + { + /* Search for @dotless{} wasn't successful, so rewind. */ + input_text_offset = save_input_text_offset; + valid_html_accent = 0; + if (html_solo_standalone) + { /* No special HTML support, so produce standalone char. */ + if (xml) + xml_insert_entity (html_solo); + else + add_word_args ("&%s;", html_solo); + } else - add_word_args ("&%s;", html_solo); - } - else - /* If the html_solo does not exist as standalone character - (namely &circ; &grave; &tilde;), then we use - the single character version instead. */ - add_char (single); + /* If the html_solo does not exist as standalone character + (namely &circ; &grave; &tilde;), then we use + the single character version instead. 
*/ + add_char (single); + } + + free (accent_contents); } } else if (arg == END) @@ -598,10 +790,8 @@ cm_accent_generic_html (arg, start, end, html_supported, single, static void -cm_accent_generic_no_headers (arg, start, end, single, html_solo) - int arg, start, end; - int single; - char *html_solo; +cm_accent_generic_no_headers (int arg, int start, int end, int single, + char *html_solo) { if (arg == END) { @@ -628,8 +818,11 @@ cm_accent_generic_no_headers (arg, start, end, single, html_solo) { /* If we didn't find a translation for this character, put the single instead. E.g., &Xuml; does not exist so X&uml; should be produced. */ - warning (_("%s is an invalid ISO code, using %c"), - buffer, single); + /* When the below warning is issued, an author has nothing + wrong in their document, let alone anything ``fixable'' + on their side. So it is commented out for now. */ + /* warning (_("%s is an invalid ISO code, using %c"), + buffer, single); */ add_char (single); } @@ -644,8 +837,7 @@ cm_accent_generic_no_headers (arg, start, end, single, html_solo) special HTML support. */ void -cm_accent (arg) - int arg; +cm_accent (int arg) { int old_escape_html = escape_html; escape_html = 0; @@ -687,14 +879,14 @@ cm_accent (arg) exists as valid standalone character in HTML, e.g., &uml;. */ static void -cm_accent_generic (arg, start, end, html_supported, single, - html_solo_standalone, html_solo) - int arg, start, end; - char *html_supported; - int single; - int html_solo_standalone; - char *html_solo; +cm_accent_generic (int arg, int start, int end, char *html_supported, + int single, int html_solo_standalone, char *html_solo) { + /* Accentuating space characters makes no sense, so issue a warning. 
*/ + if (arg == START && isspace (input_text[input_text_offset])) + warning ("Accent command `@%s' must not be followed by whitespace", + command); + if (html || xml) cm_accent_generic_html (arg, start, end, html_supported, single, html_solo_standalone, html_solo); @@ -712,43 +904,37 @@ cm_accent_generic (arg, start, end, html_supported, single, } void -cm_accent_umlaut (arg, start, end) - int arg, start, end; +cm_accent_umlaut (int arg, int start, int end) { cm_accent_generic (arg, start, end, "aouAOUEeIiy", '"', 1, "uml"); } void -cm_accent_acute (arg, start, end) - int arg, start, end; +cm_accent_acute (int arg, int start, int end) { cm_accent_generic (arg, start, end, "AEIOUYaeiouy", '\'', 1, "acute"); } void -cm_accent_cedilla (arg, start, end) - int arg, start, end; +cm_accent_cedilla (int arg, int start, int end) { cm_accent_generic (arg, start, end, "Cc", ',', 1, "cedil"); } void -cm_accent_hat (arg, start, end) - int arg, start, end; +cm_accent_hat (int arg, int start, int end) { cm_accent_generic (arg, start, end, "AEIOUaeiou", '^', 0, "circ"); } void -cm_accent_grave (arg, start, end) - int arg, start, end; +cm_accent_grave (int arg, int start, int end) { cm_accent_generic (arg, start, end, "AEIOUaeiou", '`', 0, "grave"); } void -cm_accent_tilde (arg, start, end) - int arg, start, end; +cm_accent_tilde (int arg, int start, int end) { cm_accent_generic (arg, start, end, "ANOano", '~', 0, "tilde"); } @@ -757,7 +943,7 @@ cm_accent_tilde (arg, start, end) /* Non-English letters/characters that don't insert themselves. */ void -cm_special_char (arg) +cm_special_char (int arg) { int old_escape_html = escape_html; escape_html = 0; @@ -769,27 +955,35 @@ cm_special_char (arg) && command[1] == 0) { /* Lslash lslash Oslash oslash. Lslash and lslash aren't supported in HTML. 
*/ - if ((html || xml) && command[0] == 'O') + if (command[0] == 'O') add_encoded_char ("Oslash", "/O"); - else if ((html || xml) && command[0] == 'o') + else if (command[0] == 'o') add_encoded_char ("oslash", "/o"); else add_word_args ("/%c", command[0]); } else if (strcmp (command, "exclamdown") == 0) add_encoded_char ("iexcl", "!"); - else if (strcmp (command, "pounds") == 0) - add_encoded_char ("pound" , "#"); else if (strcmp (command, "questiondown") == 0) add_encoded_char ("iquest", "?"); + else if (strcmp (command, "euro") == 0) + /* http://www.cs.tut.fi/~jkorpela/html/euro.html suggests that + &euro; degrades best in old browsers. */ + add_encoded_char ("euro", "Euro "); + else if (strcmp (command, "pounds") == 0) + add_encoded_char ("pound" , "#"); + else if (strcmp (command, "ordf") == 0) + add_encoded_char ("ordf" , "a"); + else if (strcmp (command, "ordm") == 0) + add_encoded_char ("ordm" , "o"); else if (strcmp (command, "AE") == 0) add_encoded_char ("AElig", command); else if (strcmp (command, "ae") == 0) add_encoded_char ("aelig", command); else if (strcmp (command, "OE") == 0) - add_encoded_char ("#140", command); + add_encoded_char ("OElig", command); else if (strcmp (command, "oe") == 0) - add_encoded_char ("#156", command); + add_encoded_char ("oelig", command); else if (strcmp (command, "AA") == 0) add_encoded_char ("Aring", command); else if (strcmp (command, "aa") == 0) @@ -804,8 +998,7 @@ cm_special_char (arg) /* Dotless i or j. */ void -cm_dotless (arg, start, end) - int arg, start, end; +cm_dotless (int arg, int start, int end) { if (arg == END) { |