diff options
author | markm <markm@FreeBSD.org> | 1998-09-09 07:00:04 +0000 |
---|---|---|
committer | markm <markm@FreeBSD.org> | 1998-09-09 07:00:04 +0000 |
commit | 4fcbc3669aa997848e15198cc9fb856287a6788c (patch) | |
tree | 58b20e81687d6d5931f120b50802ed21225bf440 /contrib/perl5/lib/Text | |
download | FreeBSD-src-4fcbc3669aa997848e15198cc9fb856287a6788c.zip FreeBSD-src-4fcbc3669aa997848e15198cc9fb856287a6788c.tar.gz |
Initial import of Perl5. The king is dead; long live the king!
Diffstat (limited to 'contrib/perl5/lib/Text')
-rw-r--r-- | contrib/perl5/lib/Text/Abbrev.pm | 87 | ||||
-rw-r--r-- | contrib/perl5/lib/Text/ParseWords.pm | 256 | ||||
-rw-r--r-- | contrib/perl5/lib/Text/Soundex.pm | 148 | ||||
-rw-r--r-- | contrib/perl5/lib/Text/Tabs.pm | 97 | ||||
-rw-r--r-- | contrib/perl5/lib/Text/Wrap.pm | 125 |
5 files changed, 713 insertions, 0 deletions
diff --git a/contrib/perl5/lib/Text/Abbrev.pm b/contrib/perl5/lib/Text/Abbrev.pm new file mode 100644 index 0000000..ae6797c --- /dev/null +++ b/contrib/perl5/lib/Text/Abbrev.pm @@ -0,0 +1,87 @@ +package Text::Abbrev; +require 5.000; +require Exporter; + +=head1 NAME + +abbrev - create an abbreviation table from a list + +=head1 SYNOPSIS + + use Text::Abbrev; + abbrev $hashref, LIST + + +=head1 DESCRIPTION + +Stores all unambiguous truncations of each element of LIST +as keys key in the associative array referenced to by C<$hashref>. +The values are the original list elements. + +=head1 EXAMPLE + + $hashref = abbrev qw(list edit send abort gripe); + + %hash = abbrev qw(list edit send abort gripe); + + abbrev $hashref, qw(list edit send abort gripe); + + abbrev(*hash, qw(list edit send abort gripe)); + +=cut + +@ISA = qw(Exporter); +@EXPORT = qw(abbrev); + +# Usage: +# &abbrev(*foo,LIST); +# ... +# $long = $foo{$short}; + +sub abbrev { + my (%domain); + my ($name, $ref, $glob); + + if (ref($_[0])) { # hash reference preferably + $ref = shift; + } elsif ($_[0] =~ /^\*/) { # looks like a glob (deprecated) + $glob = shift; + } + my @cmp = @_; + + foreach $name (@_) { + my @extra = split(//,$name); + my $abbrev = shift(@extra); + my $len = 1; + my $cmp; + WORD: foreach $cmp (@cmp) { + next if $cmp eq $name; + while (substr($cmp,0,$len) eq $abbrev) { + last WORD unless @extra; + $abbrev .= shift(@extra); + ++$len; + } + } + $domain{$abbrev} = $name; + while (@extra) { + $abbrev .= shift(@extra); + $domain{$abbrev} = $name; + } + } + if ($ref) { + %$ref = %domain; + return; + } elsif ($glob) { # old style + local (*hash) = $glob; + %hash = %domain; + return; + } + if (wantarray) { + %domain; + } else { + \%domain; + } +} + +1; + diff --git a/contrib/perl5/lib/Text/ParseWords.pm b/contrib/perl5/lib/Text/ParseWords.pm new file mode 100644 index 0000000..2414f80 --- /dev/null +++ b/contrib/perl5/lib/Text/ParseWords.pm @@ -0,0 +1,256 @@ +package Text::ParseWords; + +use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE); +$VERSION = "3.1"; + +require 5.000; + +use Exporter; +@ISA = qw(Exporter); +@EXPORT = qw(shellwords quotewords nested_quotewords parse_line); +@EXPORT_OK = qw(old_shellwords); + + +sub shellwords { + local(@lines) = @_; + $lines[$#lines] =~ s/\s+$//; + return(quotewords('\s+', 0, @lines)); +} + + + +sub quotewords { + my($delim, $keep, @lines) = @_; + my($line, @words, @allwords); + + + foreach $line (@lines) { + @words = parse_line($delim, $keep, $line); + return() unless (@words || !length($line)); + push(@allwords, @words); + } + return(@allwords); +} + + + +sub nested_quotewords { + my($delim, $keep, @lines) = @_; + my($i, @allwords); + + for ($i = 0; $i < @lines; $i++) { + @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]); + return() unless (@{$allwords[$i]} || !length($lines[$i])); + } + return(@allwords); +} + + + +sub parse_line { + # We will be testing undef strings + local($^W) = 0; + + my($delimiter, $keep, $line) = @_; + my($quote, $quoted, $unquoted, $delim, $word, @pieces); + + while (length($line)) { + + ($quote, $quoted, undef, $unquoted, $delim, undef) = + $line =~ m/^(["']) # a $quote + ((?:\\.|(?!\1)[^\\])*) # and $quoted text + \1 # followed by the same quote + ([\000-\377]*) # and the rest + | # --OR-- + ^((?:\\.|[^\\"'])*?) # an $unquoted text + (\Z(?!\n)|$delimiter|(?!^)(?=["'])) + # plus EOL, delimiter, or quote + ([\000-\377]*) # the rest + /x; # extended layout + return() unless( $quote || length($unquoted) || length($delim)); + + $line = $+; + + if ($keep) { + $quoted = "$quote$quoted$quote"; + } + else { + $unquoted =~ s/\\(.)/$1/g; + if (defined $quote) { + $quoted =~ s/\\(.)/$1/g if ($quote eq '"'); + $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'"); + } + } + $word .= defined $quote ? $quoted : $unquoted; + + if (length($delim)) { + push(@pieces, $word); + push(@pieces, $delim) if ($keep eq 'delimiters'); + undef $word; + } + if (!length($line)) { + push(@pieces, $word); + } + } + return(@pieces); +} + + + +sub old_shellwords { + + # Usage: + # use ParseWords; + # @words = old_shellwords($line); + # or + # @words = old_shellwords(@lines); + + local($_) = join('', @_); + my(@words,$snippet,$field); + + s/^\s+//; + while ($_ ne '') { + $field = ''; + for (;;) { + if (s/^"(([^"\\]|\\.)*)"//) { + ($snippet = $1) =~ s#\\(.)#$1#g; + } + elsif (/^"/) { + return(); + } + elsif (s/^'(([^'\\]|\\.)*)'//) { + ($snippet = $1) =~ s#\\(.)#$1#g; + } + elsif (/^'/) { + return(); + } + elsif (s/^\\(.)//) { + $snippet = $1; + } + elsif (s/^([^\s\\'"]+)//) { + $snippet = $1; + } + else { + s/^\s+//; + last; + } + $field .= $snippet; + } + push(@words, $field); + } + @words; +} + +1; + +__END__ + +=head1 NAME + +Text::ParseWords - parse text into an array of tokens or array of arrays + +=head1 SYNOPSIS + + use Text::ParseWords; + @lists = &nested_quotewords($delim, $keep, @lines); + @words = "ewords($delim, $keep, @lines); + @words = &shellwords(@lines); + @words = &parse_line($delim, $keep, $line); + @words = &old_shellwords(@lines); # DEPRECATED! + +=head1 DESCRIPTION + +The &nested_quotewords() and "ewords() functions accept a delimiter +(which can be a regular expression) +and a list of lines and then breaks those lines up into a list of +words ignoring delimiters that appear inside quotes. "ewords() +returns all of the tokens in a single long list, while &nested_quotewords() +returns a list of token lists corresponding to the elements of @lines. +&parse_line() does tokenizing on a single string. The &*quotewords() +functions simply call &parse_lines(), so if you're only splitting +one line you can call &parse_lines() directly and save a function +call. + +The $keep argument is a boolean flag. If true, then the tokens are +split on the specified delimiter, but all other characters (quotes, +backslashes, etc.) are kept in the tokens. If $keep is false then the +&*quotewords() functions remove all quotes and backslashes that are +not themselves backslash-escaped or inside of single quotes (i.e., +"ewords() tries to interpret these characters just like the Bourne +shell). NB: these semantics are significantly different from the +original version of this module shipped with Perl 5.000 through 5.004. +As an additional feature, $keep may be the keyword "delimiters" which +causes the functions to preserve the delimiters in each string as +tokens in the token lists, in addition to preserving quote and +backslash characters. + +&shellwords() is written as a special case of "ewords(), and it +does token parsing with whitespace as a delimiter-- similar to most +Unix shells. + +=head1 EXAMPLES + +The sample program: + + use Text::ParseWords; + @words = "ewords('\s+', 0, q{this is "a test" of\ quotewords \"for you}); + $i = 0; + foreach (@words) { + print "$i: <$_>\n"; + $i++; + } + +produces: + + 0: <this> + 1: <is> + 2: <a test> + 3: <of quotewords> + 4: <"for> + 5: <you> + +demonstrating: + +=over 4 + +=item 0 +a simple word + +=item 1 +multiple spaces are skipped because of our $delim + +=item 2 +use of quotes to include a space in a word + +=item 3 +use of a backslash to include a space in a word + +=item 4 +use of a backslash to remove the special meaning of a double-quote + +=item 5 +another simple word (note the lack of effect of the +backslashed double-quote) + +=back + +Replacing C<"ewords('\s+', 0, q{this is...})> +with C<&shellwords(q{this is...})> +is a simpler way to accomplish the same thing. + +=head1 AUTHORS + +Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original +author unknown). Much of the code for &parse_line() (including the +primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>. + +Examples section another documentation provided by John Heidemann +<johnh@ISI.EDU> + +Bug reports, patches, and nagging provided by lots of folks-- thanks +everybody! Special thanks to Michael Schwern <schwern@envirolink.org> +for assuring me that a &nested_quotewords() would be useful, and to +Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about +error-checking (sort of-- you had to be there). + +=cut diff --git a/contrib/perl5/lib/Text/Soundex.pm b/contrib/perl5/lib/Text/Soundex.pm new file mode 100644 index 0000000..ddc758c --- /dev/null +++ b/contrib/perl5/lib/Text/Soundex.pm @@ -0,0 +1,148 @@ +package Text::Soundex; +require 5.000; +require Exporter; + +@ISA = qw(Exporter); +@EXPORT = qw(&soundex $soundex_nocode); + +# $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $ +# +# Implementation of soundex algorithm as described by Knuth in volume +# 3 of The Art of Computer Programming, with ideas stolen from Ian +# Phillips <ian@pipex.net>. +# +# Mike Stok <Mike.Stok@meiko.concord.ma.us>, 2 March 1994. +# +# Knuth's test cases are: +# +# Euler, Ellery -> E460 +# Gauss, Ghosh -> G200 +# Hilbert, Heilbronn -> H416 +# Knuth, Kant -> K530 +# Lloyd, Ladd -> L300 +# Lukasiewicz, Lissajous -> L222 +# +# $Log: soundex.pl,v $ +# Revision 1.2 1994/03/24 00:30:27 mike +# Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> +# in the way I handles leasing characters which were different but had +# the same soundex code. This showed up comparing it with Oracle's +# soundex output. +# +# Revision 1.1 1994/03/02 13:01:30 mike +# Initial revision +# +# +############################################################################## + +# $soundex_nocode is used to indicate a string doesn't have a soundex +# code, I like undef other people may want to set it to 'Z000'. + +$soundex_nocode = undef; + +sub soundex +{ + local (@s, $f, $fc, $_) = @_; + + push @s, '' unless @s; # handle no args as a single empty string + + foreach (@s) + { + $_ = uc $_; + tr/A-Z//cd; + + if ($_ eq '') + { + $_ = $soundex_nocode; + } + else + { + ($f) = /^(.)/; + tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/; + ($fc) = /^(.)/; + s/^$fc+//; + tr///cs; + tr/0//d; + $_ = $f . $_ . '000'; + s/^(.{4}).*/$1/; + } + } + + wantarray ? @s : shift @s; +} + +1; + +__END__ + +=head1 NAME + +Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth + +=head1 SYNOPSIS + + use Text::Soundex; + + $code = soundex $string; # get soundex code for a string + @codes = soundex @list; # get list of codes for list of strings + + # set value to be returned for strings without soundex code + + $soundex_nocode = 'Z000'; + +=head1 DESCRIPTION + +This module implements the soundex algorithm as described by Donald Knuth +in Volume 3 of B<The Art of Computer Programming>. The algorithm is +intended to hash words (in particular surnames) into a small space using a +simple model which approximates the sound of the word when spoken by an English +speaker. Each word is reduced to a four character string, the first +character being an upper case letter and the remaining three being digits. + +If there is no soundex code representation for a string then the value of +C<$soundex_nocode> is returned. This is initially set to C<undef>, but +many people seem to prefer an I<unlikely> value like C<Z000> +(how unlikely this is depends on the data set being dealt with.) Any value +can be assigned to C<$soundex_nocode>. + +In scalar context C<soundex> returns the soundex code of its first +argument, and in array context a list is returned in which each element is the +soundex code for the corresponding argument passed to C<soundex> e.g. + + @codes = soundex qw(Mike Stok); + +leaves C<@codes> containing C<('M200', 'S320')>. + +=head1 EXAMPLES + +Knuth's examples of various names and the soundex codes they map to +are listed below: + + Euler, Ellery -> E460 + Gauss, Ghosh -> G200 + Hilbert, Heilbronn -> H416 + Knuth, Kant -> K530 + Lloyd, Ladd -> L300 + Lukasiewicz, Lissajous -> L222 + +so: + + $code = soundex 'Knuth'; # $code contains 'K530' + @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' + +=head1 LIMITATIONS + +As the soundex algorithm was originally used a B<long> time ago in the US +it considers only the English alphabet and pronunciation. + +As it is mapping a large space (arbitrary length strings) onto a small +space (single letter plus 3 digits) no inference can be made about the +similarity of two strings which end up with the same soundex code. For +example, both C<Hilbert> and C<Heilbronn> end up with a soundex code +of C<H416>. + +=head1 AUTHOR + +This code was implemented by Mike Stok (C<stok@cybercom.net>) from the +description given by Knuth. Ian Phillips (C<ian@pipex.net>) and Rich Pinder +(C<rpinder@hsc.usc.edu>) supplied ideas and spotted mistakes. diff --git a/contrib/perl5/lib/Text/Tabs.pm b/contrib/perl5/lib/Text/Tabs.pm new file mode 100644 index 0000000..acd7afb --- /dev/null +++ b/contrib/perl5/lib/Text/Tabs.pm @@ -0,0 +1,97 @@ + +package Text::Tabs; + +require Exporter; + +@ISA = (Exporter); +@EXPORT = qw(expand unexpand $tabstop); + +use vars qw($VERSION $tabstop $debug); +$VERSION = 96.121201; + +use strict; + +BEGIN { + $tabstop = 8; + $debug = 0; +} + +sub expand +{ + my @l = @_; + for $_ (@l) { + 1 while s/(^|\n)([^\t\n]*)(\t+)/ + $1. $2 . (" " x + ($tabstop * length($3) + - (length($2) % $tabstop))) + /sex; + } + return @l if wantarray; + return $l[0]; +} + +sub unexpand +{ + my @l = @_; + my @e; + my $x; + my $line; + my @lines; + my $lastbit; + for $x (@l) { + @lines = split("\n", $x, -1); + for $line (@lines) { + $line = expand($line); + @e = split(/(.{$tabstop})/,$line,-1); + $lastbit = pop(@e); + $lastbit = '' unless defined $lastbit; + $lastbit = "\t" + if $lastbit eq " "x$tabstop; + for $_ (@e) { + if ($debug) { + my $x = $_; + $x =~ s/\t/^I\t/gs; + print "sub on '$x'\n"; + } + s/ +$/\t/; + } + $line = join('',@e, $lastbit); + } + $x = join("\n", @lines); + } + return @l if wantarray; + return $l[0]; +} + +1; +__END__ + + +=head1 NAME + +Text::Tabs -- expand and unexpand tabs per the unix expand(1) and unexpand(1) + +=head1 SYNOPSIS + +use Text::Tabs; + +$tabstop = 4; +@lines_without_tabs = expand(@lines_with_tabs); +@lines_with_tabs = unexpand(@lines_without_tabs); + +=head1 DESCRIPTION + +Text::Tabs does about what the unix utilities expand(1) and unexpand(1) +do. Given a line with tabs in it, expand will replace the tabs with +the appropriate number of spaces. Given a line with or without tabs in +it, unexpand will add tabs when it can save bytes by doing so. Invisible +compression with plain ascii! + +=head1 BUGS + +expand doesn't handle newlines very quickly -- do not feed it an +entire document in one string. Instead feed it an array of lines. + +=head1 AUTHOR + +David Muir Sharnoff <muir@idiom.com> diff --git a/contrib/perl5/lib/Text/Wrap.pm b/contrib/perl5/lib/Text/Wrap.pm new file mode 100644 index 0000000..0fe7fb9 --- /dev/null +++ b/contrib/perl5/lib/Text/Wrap.pm @@ -0,0 +1,125 @@ +package Text::Wrap; + +use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION $columns $debug); +use strict; +use Exporter; + +$VERSION = "97.02"; +@ISA = qw(Exporter); +@EXPORT = qw(wrap); +@EXPORT_OK = qw($columns $tabstop fill); + +use Text::Tabs qw(expand unexpand $tabstop); + + +BEGIN { + $columns = 76; # <= screen width + $debug = 0; +} + +sub wrap +{ + my ($ip, $xp, @t) = @_; + + my @rv; + my $t = expand(join(" ",@t)); + + my $lead = $ip; + my $ll = $columns - length(expand($lead)) - 1; + my $nl = ""; + + $t =~ s/^\s+//; + while(length($t) > $ll) { + # remove up to a line length of things that + # aren't new lines and tabs. + if ($t =~ s/^([^\n]{0,$ll})(\s|\Z(?!\n))//) { + my ($l,$r) = ($1,$2); + $l =~ s/\s+$//; + print "WRAP $lead$l..($r)\n" if $debug; + push @rv, unexpand($lead . $l), "\n"; + + } elsif ($t =~ s/^([^\n]{$ll})//) { + print "SPLIT $lead$1..\n" if $debug; + push @rv, unexpand($lead . $1),"\n"; + } + # recompute the leader + $lead = $xp; + $ll = $columns - length(expand($lead)) - 1; + $t =~ s/^\s+//; + } + print "TAIL $lead$t\n" if $debug; + push @rv, $lead.$t if $t ne ""; + return join '', @rv; +} + + +sub fill +{ + my ($ip, $xp, @raw) = @_; + my @para; + my $pp; + + for $pp (split(/\n\s+/, join("\n",@raw))) { + $pp =~ s/\s+/ /g; + my $x = wrap($ip, $xp, $pp); + push(@para, $x); + } + + # if paragraph_indent is the same as line_indent, + # separate paragraphs with blank lines + + return join ($ip eq $xp ? "\n\n" : "\n", @para); +} + +1; +__END__ + +=head1 NAME + +Text::Wrap - line wrapping to form simple paragraphs + +=head1 SYNOPSIS + + use Text::Wrap + + print wrap($initial_tab, $subsequent_tab, @text); + + use Text::Wrap qw(wrap $columns $tabstop fill); + + $columns = 132; + $tabstop = 4; + + print fill($initial_tab, $subsequent_tab, @text); + print fill("", "", `cat book`); + +=head1 DESCRIPTION + +Text::Wrap::wrap() is a very simple paragraph formatter. It formats a +single paragraph at a time by breaking lines at word boundries. +Indentation is controlled for the first line ($initial_tab) and +all subsquent lines ($subsequent_tab) independently. $Text::Wrap::columns +should be set to the full width of your output device. + +Text::Wrap::fill() is a simple multi-paragraph formatter. It formats +each paragraph separately and then joins them together when it's done. It +will destory any whitespace in the original text. It breaks text into +paragraphs by looking for whitespace after a newline. In other respects +it acts like wrap(). + +=head1 EXAMPLE + + print wrap("\t","","This is a bit of text that forms + a normal book-style paragraph"); + +=head1 BUGS + +It's not clear what the correct behavior should be when Wrap() is +presented with a word that is longer than a line. The previous +behavior was to die. Now the word is now split at line-length. + +=head1 AUTHOR + +David Muir Sharnoff <muir@idiom.com> with help from Tim Pierce and +others. Updated by Jacqui Caren. + +=cut |