contrib/tzdata/zishrink.awk


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320

# Convert tzdata source into a smaller version of itself.

# Contributed by Paul Eggert.  This file is in the public domain.

# This is not a general-purpose converter; it is designed for current tzdata.
# 'zic' should treat this script's output as if it were identical to
# this script's input.

# Remember that the abbreviation N now stands for the name NAME.
# If N has already been handed out, report both names on standard
# output (as a '#' comment line) and exit with status 1.

function record_hash(n, name)
{
  if (!used_hashes[n]) {
    # First use of this abbreviation: claim it.
    used_hashes[n] = name
    return
  }
  printf "# ! collision: %s %s\n", used_hashes[n], name
  exit 1
}

# Derive a shortened rule name for NAME, register the pair with
# record_hash so that a clash with an existing abbreviation is
# detected, and return the shortened name.
# N is a local variable; callers pass only NAME.

function gen_rule_name(name, n)
{
  # The mnemonic is simply the first two characters of NAME.
  record_hash(n = substr(name, 1, 2), name)
  # Debugging aid: printf "# %s = %s\n", n, name
  return n
}

# Preload the global 'rule' table (full rule name -> abbreviation) with
# fixed abbreviations, then register every abbreviation with record_hash
# so that any collision, here or with a name generated later, is caught.
# NAME is a local variable used as the loop index; callers pass nothing.

function prehash_rule_names(name)
{
  # Rule names are not part of the tzdb API, so substitute shorter
  # ones.  Shortening them consistently from one release to the next
  # simplifies comparison of the output.  That being said, the
  # 1-letter names below are not standardized in any way, and can
  # change arbitrarily from one release to the next, as the main goal
  # here is compression not comparison.

  # Abbreviating these rule names to one letter saved the most space
  # circa 2018e.
  rule["Arg"] = "A"
  rule["Brazil"] = "B"
  rule["Canada"] = "C"
  rule["Denmark"] = "D"
  rule["EU"] = "E"
  rule["France"] = "F"
  rule["GB-Eire"] = "G"
  rule["Halifax"] = "H"
  rule["Italy"] = "I"
  rule["Jordan"] = "J"
  rule["Egypt"] = "K" # "Kemet" in ancient Egyptian
  rule["Libya"] = "L"
  rule["Morocco"] = "M"
  rule["Neth"] = "N"
  rule["Poland"] = "O" # arbitrary
  rule["Palestine"] = "P"
  rule["Cuba"] = "Q" # Its start sounds like "Q".
  rule["Russia"] = "R"
  rule["Syria"] = "S"
  rule["Turkey"] = "T"
  rule["Uruguay"] = "U"
  rule["Vincennes"] = "V"
  rule["Winn"] = "W"
  rule["Mongol"] = "X" # arbitrary
  rule["NT_YK"] = "Y"
  rule["Zion"] = "Z"
  rule["Austria"] = "a"
  rule["Belgium"] = "b"
  rule["C-Eur"] = "c"
  rule["Algeria"] = "d" # country code DZ
  rule["E-Eur"] = "e"
  rule["Taiwan"] = "f" # Formosa
  rule["Greece"] = "g"
  rule["Hungary"] = "h"
  rule["Iran"] = "i"
  rule["StJohns"] = "j"
  rule["Chatham"] = "k" # arbitrary
  rule["Lebanon"] = "l"
  rule["Mexico"] = "m"
  rule["Tunisia"] = "n" # country code TN
  rule["Moncton"] = "o" # arbitrary
  rule["Port"] = "p"
  rule["Albania"] = "q" # arbitrary
  rule["Regina"] = "r"
  rule["Spain"] = "s"
  rule["Toronto"] = "t"
  rule["US"] = "u"
  rule["Louisville"] = "v" # ville
  rule["Iceland"] = "w" # arbitrary
  rule["Chile"] = "x" # arbitrary
  rule["Para"] = "y" # country code PY
  rule["Romania"] = "z" # arbitrary
  rule["Macau"] = "_" # arbitrary

  # Use ISO 3166 alpha-2 country codes for remaining names that are countries.
  # This is more systematic, and avoids collisions (e.g., Malta and Moldova).
  rule["Armenia"] = "AM"
  rule["Aus"] = "AU"
  rule["Azer"] = "AZ"
  rule["Barb"] = "BB"
  rule["Dhaka"] = "BD"
  rule["Bulg"] = "BG"
  rule["Bahamas"] = "BS"
  rule["Belize"] = "BZ"
  rule["Swiss"] = "CH"
  rule["Cook"] = "CK"
  rule["PRC"] = "CN"
  rule["Cyprus"] = "CY"
  rule["Czech"] = "CZ"
  rule["Germany"] = "DE"
  rule["DR"] = "DO"
  rule["Ecuador"] = "EC"
  rule["Finland"] = "FI"
  rule["Fiji"] = "FJ"
  rule["Falk"] = "FK"
  rule["Ghana"] = "GH"
  rule["Guat"] = "GT"
  rule["Hond"] = "HN"
  rule["Haiti"] = "HT"
  rule["Eire"] = "IE"
  rule["Iraq"] = "IQ"
  rule["Japan"] = "JP"
  rule["Kyrgyz"] = "KG"
  rule["ROK"] = "KR"
  rule["Latvia"] = "LV"
  # NOTE(review): if "Lux" denotes Luxembourg, its ISO 3166 alpha-2 code
  # is LU, not LX -- confirm that LX is intentional before changing it,
  # since a change alters the generated output.
  rule["Lux"] = "LX"
  rule["Moldova"] = "MD"
  rule["Malta"] = "MT"
  rule["Mauritius"] = "MU"
  rule["Namibia"] = "NA"
  rule["Nic"] = "NI"
  rule["Norway"] = "NO"
  rule["Peru"] = "PE"
  rule["Phil"] = "PH"
  rule["Pakistan"] = "PK"
  rule["Sudan"] = "SD"
  rule["Salv"] = "SV"
  rule["Tonga"] = "TO"
  rule["Vanuatu"] = "VU"

  # Avoid collisions.
  rule["Detroit"] = "Dt" # De = Denver

  # Register every abbreviation assigned above; record_hash exits with
  # status 1 if two names were given the same abbreviation.
  for (name in rule) {
    record_hash(rule[name], name)
  }
}

# Process the input line LINE and save its shrunken form for later output.
#
# LINE is one line of zic input that contains something other than a
# comment.  The remaining parameters (FIELD, END, I, N, STARTDEF) are
# local variables; callers pass only LINE.
#
# Globals used:
#   rule[NAME]      abbreviation chosen for rule name NAME (read, extended)
#   zonename        zone or link currently being defined, "" after a Rule
#   zonedef[NAME]   1 + index in output_line of the line that starts the
#                   most recent definition of zone or link NAME
#   output_line[],  saved output lines and their count (appended to;
#   nout            superseded entries are blanked)
#
# Nothing is saved for SystemV rules and for duplicate "FooAsia" rules.

function process_input_line(line, field, end, i, n, startdef)
{
  # Remove comments, normalize spaces, and append a space to each line.
  sub(/#.*/, "", line)
  line = line " "
  gsub(/[\t ]+/, " ", line)

  # Abbreviate keywords.  Do not abbreviate "Link" to just "L",
  # as pre-2017c zic erroneously diagnoses "Li" as ambiguous.
  sub(/^Link /, "Li ", line)
  sub(/^Rule /, "R ", line)
  sub(/^Zone /, "Z ", line)

  # SystemV rules are not needed.
  if (line ~ /^R SystemV /) return

  # Replace FooAsia rules with the same rules without "Asia", as they
  # are duplicates.  A Rule line defining FooAsia is dropped; any other
  # line merely has the "Asia" suffix removed from the name it uses.
  if (match(line, /[^ ]Asia /)) {
    if (line ~ /^R /) return
    line = substr(line, 1, RSTART) substr(line, RSTART + 5)
  }

  # Abbreviate times: first drop leading zeros of each number,
  # then drop ":0" components that are not followed by another ":".
  while (match(line, /[: ]0+[0-9]/))
    line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1)
  while (match(line, /:0[^:]/))
    line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2)

  # Abbreviate weekday names.  Do not abbreviate "Sun" and "Sat", as
  # pre-2017c zic erroneously diagnoses "Su" and "Sa" as ambiguous.
  # Mon, Wed and Fri shrink to one letter; Tue and Thu need two.
  while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) {
    end = RSTART + RLENGTH
    line = substr(line, 1, end - 4) substr(line, end - 1)
  }
  while (match(line, / (last)?(Tue|Thu)[ <>]/)) {
    end = RSTART + RLENGTH
    line = substr(line, 1, end - 3) substr(line, end - 1)
  }

  # Abbreviate "max", "only" and month names.
  # Do not abbreviate "min", as pre-2017c zic erroneously diagnoses "mi"
  # as ambiguous.
  gsub(/ max /, " ma ", line)
  gsub(/ only /, " o ", line)
  gsub(/ Jan /, " Ja ", line)
  gsub(/ Feb /, " F ", line)
  gsub(/ Apr /, " Ap ", line)
  gsub(/ Aug /, " Au ", line)
  gsub(/ Sep /, " S ", line)
  gsub(/ Oct /, " O ", line)
  gsub(/ Nov /, " N ", line)
  gsub(/ Dec /, " D ", line)

  # Strip leading and trailing space.
  sub(/^ /, "", line)
  sub(/ $/, "", line)

  # Remove unnecessary trailing zero fields.
  sub(/ 0+$/, "", line)

  # Remove unnecessary trailing days-of-month "1".
  if (match(line, /[A-Za-z] 1$/))
    line = substr(line, 1, RSTART)

  # Remove unnecessary trailing " Ja" (for January).
  sub(/ Ja$/, "", line)

  n = split(line, field)

  # Abbreviate rule names.  The rule name is field 4 of a Zone line and
  # field 2 of a Rule or continuation line; Link lines have none.
  # A field starting with "-", "+" or a digit is not a rule name.
  i = field[1] == "Z" ? 4 : field[1] == "Li" ? 0 : 2
  if (i && field[i] ~ /^[^-+0-9]/) {
    if (!rule[field[i]])
      rule[field[i]] = gen_rule_name(field[i])
    field[i] = rule[field[i]]
  }

  # If this zone supersedes an earlier one, delete the earlier one
  # from the saved output lines.
  startdef = ""
  if (field[1] == "Z")
    zonename = startdef = field[2]
  else if (field[1] == "Li")
    zonename = startdef = field[3]
  else if (field[1] == "R")
    zonename = ""
  if (startdef) {
    i = zonedef[startdef]
    if (i) {
      # Blank the line that started the earlier definition and every
      # continuation line (one starting with "-", "+" or a digit)
      # that follows it.
      do
        output_line[i - 1] = ""
      while (output_line[i++] ~ /^[-+0-9]/);
    }
    # Record where this definition starts.  This must be done only on
    # the line that starts a definition: if continuation lines also
    # updated the entry, it would end up pointing at the last
    # continuation line, and superseding a multi-line zone would leave
    # its "Z" line and earlier continuation lines in the output.
    zonedef[startdef] = nout + 1
  }

  # Save the line for later output.
  line = field[1]
  for (i = 2; i <= n; i++)
    line = line " " field[i]
  output_line[nout++] = line
}

# Print the saved output lines output_line[0 .. nout-1] in order,
# skipping entries that are empty.  I is a local variable.

function output_saved_lines(i)
{
  i = 0
  while (i < nout) {
    if (output_line[i])
      print output_line[i]
    i++
  }
}

BEGIN {
  # Files that the output normally depends on.  Each starts with a
  # count of 1; the count is bumped when the file is found in 'deps'.
  default_dep["africa"] = 1
  default_dep["antarctica"] = 1
  default_dep["asia"] = 1
  default_dep["australasia"] = 1
  default_dep["backward"] = 1
  default_dep["etcetera"] = 1
  default_dep["europe"] = 1
  default_dep["factory"] = 1
  default_dep["northamerica"] = 1
  default_dep["southamerica"] = 1
  default_dep["systemv"] = 1
  default_dep["ziguard.awk"] = 1
  default_dep["zishrink.awk"] = 1

  # Output a version string from 'version' and related configuration variables
  # supported by tzdb's Makefile.  If you change the makefile or any other files
  # that affect the output of this script, you should append '-SOMETHING'
  # to the contents of 'version', where SOMETHING identifies what was changed.

  # Build 'ddeps', the difference between the actual dependencies in
  # 'deps' and the default ones: extra files are listed by name, and
  # default files that are absent from 'deps' are listed with a "!" prefix.
  ddeps = ""
  ndeps = split(deps, dep)
  i = 1
  while (i <= ndeps) {
    if (!default_dep[dep[i]])
      ddeps = ddeps " " dep[i]
    else
      default_dep[dep[i]]++
    i++
  }
  for (d in default_dep)
    if (default_dep[d] == 1)
      ddeps = ddeps " !" d

  # Emit the header; settings that have their default value are omitted.
  print "# version", version
  if (dataform != "main")
    print "# dataform", dataform
  if (redo != "posix_right")
    print "# redo " redo
  if (ddeps)
    print "# ddeps" ddeps
  print "# This zic input file is in the public domain."

  # Reserve the fixed rule-name abbreviations before reading any input.
  prehash_rule_names()
}

# Shrink and save every input line whose first non-blank character
# is not '#', i.e. every line that is neither blank nor a pure comment.
/^[\t ]*[^#\t ]/ { process_input_line($0) }

# After all input has been read, write out the lines that were saved.
END { output_saved_lines() }