summaryrefslogtreecommitdiffstats
path: root/contrib/perl5/lib/unicode/mktables.PL
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/perl5/lib/unicode/mktables.PL')
-rwxr-xr-xcontrib/perl5/lib/unicode/mktables.PL174
1 files changed, 141 insertions, 33 deletions
diff --git a/contrib/perl5/lib/unicode/mktables.PL b/contrib/perl5/lib/unicode/mktables.PL
index cef6936..5aca93e 100755
--- a/contrib/perl5/lib/unicode/mktables.PL
+++ b/contrib/perl5/lib/unicode/mktables.PL
@@ -1,28 +1,47 @@
#!../../miniperl
-$UnicodeData = "Unicode.300";
+use bytes;
+
+$UnicodeData = "Unicode.301";
+$SyllableData = "syllables.txt";
+$PropData = "PropList.txt";
+
# Note: we try to keep filenames unique within first 8 chars. Using
# subdirectories for the following helps.
-mkdir "In", 0777;
-mkdir "Is", 0777;
-mkdir "To", 0777;
+mkdir "In", 0755;
+mkdir "Is", 0755;
+mkdir "To", 0755;
@todo = (
# typical
- ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
- ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
- ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
- ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
+ # 005F: SPACING UNDERSCROE
+ ['IsWord', '$cat =~ /^[LMN]/ or $code eq "005F"', ''],
+ ['IsAlnum', '$cat =~ /^[LMN]/', ''],
+ ['IsAlpha', '$cat =~ /^[LM]/', ''],
+ # 0009: HORIZONTAL TABULATION
+ # 000A: LINE FEED
+ # 000B: VERTICAL TABULATION
+ # 000C: FORM FEED
+ # 000D: CARRIAGE RETURN
+ # 0020: SPACE
+ ['IsSpace', '$cat =~ /^Z/ ||
+ $code =~ /^(0009|000A|000B|000C|000D)$/', ''],
+ ['IsSpacePerl',
+ '$cat =~ /^Z/ ||
+ $code =~ /^(0009|000A|000C|000D)$/', ''],
+ ['IsBlank', '$code =~ /^(0020|0009)$/ ||
+ $cat =~ /^Z[^lp]$/', ''],
['IsDigit', '$cat =~ /^Nd$/', ''],
- ['IsUpper', '$cat =~ /^Lu$/', ''],
+ ['IsUpper', '$cat =~ /^L[ut]$/', ''],
['IsLower', '$cat =~ /^Ll$/', ''],
- ['IsASCII', 'hex $code <= 127', ''],
+ ['IsASCII', '$code le "007f"', ''],
['IsCntrl', '$cat =~ /^C/', ''],
- ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
- ['IsPrint', '$cat =~ /^[^C]/', ''],
+ ['IsGraph', '$cat =~ /^([LMNPS]|Co)/', ''],
+ ['IsPrint', '$cat =~ /^([LMNPS]|Co|Zs)/', ''],
['IsPunct', '$cat =~ /^P/', ''],
+ # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
['ToUpper', '$up', '$up'],
['ToLower', '$down', '$down'],
@@ -42,12 +61,14 @@ mkdir "To", 0777;
['IsM', '$cat =~ /^M/', ''], # Mark
['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
+ ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing
['IsN', '$cat =~ /^N/', ''], # Number
['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
['IsNo', '$cat eq "No"', ''], # Number, Other
+ ['IsNl', '$cat eq "Nl"', ''], # Number, Letter
- ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
+ ['IsZ', '$cat =~ /^Z/', ''], # Separator
['IsZs', '$cat eq "Zs"', ''], # Separator, Space
['IsZl', '$cat eq "Zl"', ''], # Separator, Line
['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
@@ -56,6 +77,9 @@ mkdir "To", 0777;
['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
['IsCo', '$cat eq "Co"', ''], # Other, Private Use
['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
+ ['IsCf', '$cat eq "Cf"', ''], # Other, Format
+ ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate
+ ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned
# Informative
@@ -71,9 +95,13 @@ mkdir "To", 0777;
['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
+ ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector
+ ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote
+ ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote
['IsS', '$cat =~ /^S/', ''], # Symbol
['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
+ ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier
['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
['IsSo', '$cat eq "So"', ''], # Symbol, Other
@@ -94,6 +122,15 @@ mkdir "To", 0777;
# and punctuation specific to
# those scripts
+ ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding
+ ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override
+ ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic
+ ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding
+ ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override
+ ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format
+ ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark
+ ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral
+
# Weak types:
['IsBidiEN','$bid eq "EN"', ''], # European Number
@@ -122,7 +159,7 @@ mkdir "To", 0777;
['IsDCfont', '$decomp =~ /^<font>/', ''],
['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
['IsDCinitial', '$decomp =~ /^<initial>/', ''],
- ['IsDCinital', '$decomp =~ /^<medial>/', ''],
+ ['IsDCmedial', '$decomp =~ /^<medial>/', ''],
['IsDCfinal', '$decomp =~ /^<final>/', ''],
['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
['IsDCcircle', '$decomp =~ /^<circle>/', ''],
@@ -133,11 +170,12 @@ mkdir "To", 0777;
['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
['IsDCsmall', '$decomp =~ /^<small>/', ''],
['IsDCsquare', '$decomp =~ /^<square>/', ''],
+ ['IsDCfraction', '$decomp =~ /^<fraction>/', ''],
['IsDCcompat', '$decomp =~ /^<compat>/', ''],
# Number
- ['Number', '$num', '$num'],
+ ['Number', '$num ne ""', '$num'],
# Mirrored
@@ -154,18 +192,41 @@ mkdir "To", 0777;
# Syllables
- ['IsSylV', '$syl eq "V"', ''],
- ['IsSylU', '$syl eq "U"', ''],
- ['IsSylI', '$syl eq "I"', ''],
- ['IsSylA', '$syl eq "A"', ''],
- ['IsSylE', '$syl eq "E"', ''],
- ['IsSylC', '$syl eq "C"', ''],
- ['IsSylO', '$syl eq "O"', ''],
- ['IsSylWV', '$syl eq "V"', ''],
- ['IsSylWI', '$syl eq "I"', ''],
- ['IsSylWA', '$syl eq "A"', ''],
- ['IsSylWE', '$syl eq "E"', ''],
- ['IsSylWC', '$syl eq "C"', ''],
+ syllable_defs(),
+
+# Line break properties - Normative
+
+ ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
+ ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
+ ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
+ ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
+ ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
+ ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
+ ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
+ ['IsLbrkSP','$brk eq "SP"', ''], # Space
+ ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
+
+# Line break properties - Informative
+ ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
+ ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
+ ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
+ ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
+ ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
+ ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
+ ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
+ ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
+ ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
+ ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
+ ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
+ ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
+ ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
+ ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
+ ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
+ ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
+ ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
+ ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
+ ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
+ ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
);
# This is not written for speed...
@@ -197,8 +258,8 @@ END
exit if @ARGV and not grep { $_ eq Block } @ARGV;
print "Block\n";
-open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n";
-open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
+open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
+open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n";
print OUT <<EOH;
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by $0 from e.g. $UnicodeData.
@@ -242,6 +303,8 @@ sub proplist {
my $out;
my $split;
+ return listFromPropFile($wanted) if $val eq $PropData;
+
if ($table =~ /^Arab/) {
open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
@@ -253,10 +316,15 @@ sub proplist {
$split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
}
elsif ($table =~ /^IsSyl/) {
- open(UD, "syllables.txt") or warn "Can't open $table: $!";
+ open(UD, $SyllableData) or warn "Can't open $table: $!";
$split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
}
+ elsif ($table =~ /^IsLbrk/) {
+ open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
+
+ $split = '($code, $brk, $name) = split(/;/);';
+ }
else {
open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
@@ -268,8 +336,8 @@ sub proplist {
eval <<"END";
while (<UD>) {
next if /^#/;
- next if /^\s/;
- chop;
+ next if /^\\s/;
+ s/\\s+\$//;
$split
if ($wanted) {
push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
@@ -303,7 +371,7 @@ END
eval <<"END";
while (<UD>) {
next if /^#/;
- next if /^\s*\$/;
+ next if /^\\s*\$/;
chop;
$split
if ($wanted) {
@@ -336,4 +404,44 @@ END
$out;
}
+sub listFromPropFile {
+ my ($wanted) = @_;
+ my $out;
+
+ open (UD, $PropData) or die "Can't open $PropData: $!\n";
+ local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42?
+
+ <UD>;
+ while (<UD>) {
+ chomp;
+ if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) {
+ s/\(\d+ chars\)//g;
+ s/^\s+//mg;
+ s/\s+$//mg;
+ s/\.\./\t/g;
+ $out = lc $_;
+ last;
+ }
+ }
+ close (UD);
+ "$out\n";
+}
+
+sub syllable_defs {
+ my @defs;
+ my %seen;
+
+ open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
+ while (<SD>) {
+ next if /^\s*(#|$)/;
+ s/\s+$//;
+ ($code, $name, $syl) = split /; */;
+ next unless $syl;
+ push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, ''])
+ unless $seen{$syl}++;
+ }
+ close (SD);
+ return (@defs);
+}
+
# eof
OpenPOWER on IntegriCloud