-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathbuild-data.nanp
335 lines (301 loc) · 12.2 KB
/
build-data.nanp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#!/usr/bin/env perl
# THIS SCRIPT IS NOT INTENDED FOR END USERS OR FOR PEOPLE INSTALLING
# THE MODULES, BUT FOR THE AUTHOR'S USE WHEN UPDATING THE DATA FROM OFCOM'S
# PUBLISHED DATA.
use strict;
use warnings;
use LWP::UserAgent;
use XML::XPath;
use Text::CSV_XS;
use lib 'lib';
use Number::Phone::Country;
$| = 1;
my $csv = Text::CSV_XS->new({ binary => 1 });
open(MODULE, '>lib/Number/Phone/NANP/Data.pm') || die("Can't write lib/Number/Phone/NANP/Data.pm\n");
print MODULE "# automatically generated file, don't edit\n\n";
print MODULE q{
# Copyright 2024 David Cantrell
#
# Uses data from:
# libphonenumber http://code.google.com/p/libphonenumber/
# CNAC http://www.cnac.ca/
# NANPA https://www.nationalpooling.com/
# Local Calling Guide https://localcallingguide.com/
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
package Number::Phone::NANP::Data;
use strict;
use warnings;
use vars qw(%areanames %fixed_line_regexes %mobile_regexes);
}.
"our \$VERSION = 1.".join("", (gmtime())[5] + 1900, sprintf('%02d', (gmtime())[4] + 1), map { sprintf('%02d', $_) } (gmtime())[3, 2, 1, 0]).";\n\n".q{
sub _prefixes {
my $number = shift;
map { substr($number, 0, $_) } reverse(1..length($number));
}
sub _areaname {
my $number = shift;
foreach my $prefix (_prefixes($number)) {
if(exists($areanames{$prefix})) { return $areanames{$prefix} };
}
return undef;
}
};
# Extract area names
print MODULE '%areanames = (';
open(LIBPHONENUMBERDATA, 'libphonenumber/resources/geocoding/en/1.txt') || die("Can't read libphonenumber/resources/geocoding/en/1.txt\n");
my @lines = <LIBPHONENUMBERDATA>;
close(LIBPHONENUMBERDATA);
foreach my $line (@lines) {
chomp($line);
next if($line =~ /^(#.*|\s+)?$/);
my($prefix, $area) = split(/\|/, $line);
$area = quotemeta($area);
print MODULE " '$prefix' => \"$area\",\n";
}
print MODULE ' );';
# despite this data being in libphonenumber, it's only useful for
# some countries. eg it looks sane for JM but not for CA
# they list +1 204 as being both fixed line and mobile. Grargh!
# # extract mobile/fixed-line prefix regexes
my $xml = XML::XPath->new(filename => 'libphonenumber/resources/PhoneNumberMetadata.xml');
my @territories = $xml->find('/phoneNumberMetadata/territories/territory')->get_nodelist();
TERRITORY: foreach my $territory (@territories) {
next TERRITORY unless($territory->find('@countryCode').'' eq '1');
(my $fixed = $territory->find("fixedLine/nationalNumberPattern")->string_value()) =~ s/\s+//g;
(my $mobile = $territory->find("mobile/nationalNumberPattern")->string_value()) =~ s/\s+//g;
my $ISO_country_code = $territory->find('@id').'';
if($fixed eq $mobile) {
warn("$ISO_country_code: same regexes for fixed and mobile\n");
next TERRITORY;
} else {
my @area_codes = Number::Phone::Country::_NANP_area_codes($ISO_country_code);
foreach my $area_code (@area_codes) {
foreach my $co (map { sprintf("%03d", $_ ) } 0 .. 999) {
my $number = "${area_code}${co}3333";
if($number =~ /^$fixed$/x && $number =~ /^$mobile$/x) {
warn("$ISO_country_code: Overlap between fixed/mobile regexes for $number; can't be trusted\n");
next TERRITORY;
}
}
}
}
warn("$ISO_country_code: good data to tell what's fixed/mobile\n");
print MODULE "\$fixed_line_regexes{$ISO_country_code} = '$fixed';\n";
print MODULE "\$mobile_regexes{$ISO_country_code} = '$mobile';\n";
}
mkdir('share');
unlink('share/Number-Phone-NANP-Data.db');
open(my $randomfh, "> :raw :bytes", 'share/Number-Phone-NANP-Data.db') || die($!);
# file format:
# The first 6 bytes of the file are: N A N P O P
# These are followed by a 16-bit file format version number
print $randomfh "NANPOP", chr(0), chr(0);
# In version 0 of the format, the next byte is the word length. The word length
# is *not specified* otherwise.
my $WORDLENGTH = 4;
print $randomfh chr($WORDLENGTH);
# The remainder of the file is a list of $WORDLENGTH byte pointers, one for
# each 10,000 number block NPA-NXX-xxxx, starting at 200-000. So the address of
# the pointer is $WORDLENGTH * (NPANXX - 200000). Note that the D digit can't
# be 0 or 1, so the pointers really start at byte $WORDLENGTH * 200.
#
# After the pointer array comes the data that they point at. Those structures
# can be of various types identified by their leading byte, with the NULL
# pointer meaning "no data":
#
# 0: length byte then string
# 1: block of 10 pointers, one for each thousand number block in
# an NPA-NXX-X. Currently these only point at strings, ie type 0.
my %seen_operators = ();
my $next_data_offset = $WORDLENGTH * 800000;
warn("Extracting Canadian operators\n");
open(my $CA_operators, '<', 'data-files/COCodeStatus_ALL.csv') ||
die("Couldn't open data-files/COCodeStatus_ALL.csv: $!\n");
$csv->getline($CA_operators); # header line
$csv->getline($CA_operators); # date stamp?
# format is "NPA","CO Code (NXX)","Status","Exchange Area","Province","Company","OCN","Remarks"
my($NPA_FIELD, $CO_FIELD, $OP_FIELD) = (0, 1, 5);
while(my $row = $csv->getline($CA_operators)) {
last unless($row->[$NPA_FIELD]);
next unless($row->[$OP_FIELD]);
my($prefix, $op) = (join('', $row->[$NPA_FIELD], $row->[$CO_FIELD]), $row->[$OP_FIELD]);
_write_operator_data($prefix, $op);
}
# CSV file format doco at https://www.nanpa.com/reports/thousands-block-reports/region
warn("Extracting US operators\n");
open(my $US_operators, '<', 'data-files/ThousandsBlockAssignment_All_Augmented.txt') ||
die("Couldn't open data-files/ThousandsBlockAssignment_All_Augmented.txt: $!\n");
$csv->getline($US_operators); # header line
my %US = ();
while(my $row = $csv->getline($US_operators)) {
my($npa, $nxx, $thousands, $status, $op) = map { $row->[$_] } (2, 3, 4, 5, 12);
$op = _clean_up_op($op);
next unless($status eq 'AS');
$US{"$npa$nxx"} ||= [];
push @{$US{"$npa$nxx"}}, { thousands => $thousands, op => $op }
}
foreach my $npanxx (keys %US) {
my $data = $US{$npanxx};
_write_multiple_operators($npanxx, $data);
}
warn("Extracting non-US/CA operators\n");
foreach my $NPA (Number::Phone::Country::_non_US_CA_area_codes()) {
warn(" ... $NPA\n");
my $xml = _get_xml_data($NPA);
my %ops = ();
my @ten_thousand_prefixes = $xml->find("/root/prefixdata[x = 'A']")->get_nodelist();
foreach my $prefix (@ten_thousand_prefixes) {
my $nxx = $prefix->find('nxx')->string_value();
$ops{$NPA.$nxx} = _op_from_prefix($prefix);
}
my @thousand_prefixes = $xml->find("/root/prefixdata[x != 'A']")->get_nodelist();
foreach my $prefix (@thousand_prefixes) {
my $nxx = $prefix->find('nxx')->string_value();
if(!exists($ops{$NPA.$nxx}) || !ref($ops{$NPA.$nxx})) {
$ops{$NPA.$nxx} = [];
}
my $x = $prefix->find('x')->string_value();
my $op = _op_from_prefix($prefix);
push @{$ops{$NPA.$nxx}}, { thousands => $x, op => $op };
}
foreach my $co (keys %ops) {
if(!ref($ops{$co})) {
_write_operator_data($co, $ops{$co});
} else {
_write_multiple_operators($co, $ops{$co});
}
}
}
sub _op_from_prefix {
my $prefix = shift;
(my $op = $prefix->find('company-name')->string_value()) =~ s/\s+$//;
return _clean_up_op($op);
}
sub _clean_up_op {
my $op = shift;
if($op =~ /([^\x00-\x7f])/) {
$op =~ s/\x{2013}/-/g; # en-dash
}
if($op =~ /([^\x00-\x7f])/) {
die "Couldn't clean up $1 in $op\n";
}
# leading/trailing space
$op =~ s/^\s+|\s+$//g;
return $op;
}
sub _get_xml_data {
my $NPA = shift;
if(!-e "$NPA.xml" || -M "$NPA.xml" > 14) {
my $retries = 0;
my $xml;
my $data;
my $ua = LWP::UserAgent->new(
ssl_opts => { verify_hostname => 0 },
agent => 'Number::Phone-builder/1.0 ([email protected])'.
($ENV{CI} ? ' (CI build)' : ''),
);
TRY: my $res = $ua->get("https://localcallingguide.com/xmlprefix.php?npa=$NPA&blocks=1");
if($res->is_success()) {
# OK, we got data - but is it a complete download?
$data = $res->content();
eval {
# ...->new only creates an object from the data, it doesn't validate
$xml = XML::XPath->new(xml => $data);
$xml->find("/root");
} || do {
undef $xml;
warn("Can't parse XML for $NPA: $@\n");
};
}
if(!$xml) {
$retries++;
if($retries < 3) {
warn("Couldn't get data from localcallingguide.com for $NPA: retrying\n");
sleep 5;
goto TRY
}
die("Couldn't get data from localcallingguide.com for $NPA\n");
}
open(my $fh, '>', "data-files/$NPA.xml") || die("Can't write data-files/$NPA.xml: $!\n");
print $fh $data;
close($fh);
}
# writing the file above and then re-reading it instead of just returning the data,
# and also making sure to read it from the file into a different variable, is
# because Unicode is fucking witchcraft
open(my $fh, '<', "data-files/$NPA.xml") || die("Can't read data-files/$NPA.xml: $!\n");
my $data = join('', <$fh>);
close($fh);
$xml = XML::XPath->new(xml => $data);
return $xml;
}
sub _write_operator_data {
my($co, $op) = @_;
# area codes 000 to 199 are invalid
seek($randomfh, $WORDLENGTH * ($co - 200000), 0);
if(exists($seen_operators{$op})) {
print $randomfh pack('N', $seen_operators{$op});
} else {
print $randomfh pack('N', $next_data_offset);
seek($randomfh, $next_data_offset, 0);
die("Operator '$op' is longer than 255 bytes\n")
if(length($op) > 255);
print $randomfh pack('CCA*', 0, length($op), $op);
$seen_operators{$op} = $next_data_offset;
$next_data_offset += length($op) + 2;
}
}
sub _write_multiple_operators {
my($co, $data) = @_;
# first see if we can consolidate all ten of the
# thousand blocks into one ten-thousand block
if(
$#{$data} == 9 && # got 10 assigned blocks
scalar(keys(%{{
map { $data->[$_]->{op} => 1 } (0 .. 9)
}})) == 1 # but only one operator
) {
return _write_operator_data($co, $data->[0]->{op});
}
seek($randomfh, $WORDLENGTH * ($co - 200000), 0);
print $randomfh pack('N', $next_data_offset);
seek($randomfh, $next_data_offset, 0);
print $randomfh pack('C', 1); # block-type marker
$next_data_offset++;
# the array might have less than 10 entries, eg
# [
# { thousands => 3, op => "blah", pointer => ... },
# { thousands => 7, op => "otherblah", pointer => ... },
# { thousands => 9, op => "blah", pointer => ... },
# ]
# but the array of pointers in the file is always ten entries
my $start_of_pointers = $next_data_offset;
$next_data_offset += $WORDLENGTH * 10;
foreach my $index (0 .. $#{$data}) {
my($thousands_digit, $op) = map { $data->[$index]->{$_} } qw(thousands op);
seek($randomfh, $start_of_pointers + $WORDLENGTH * $thousands_digit, 0);
if(exists($seen_operators{$op})) {
print $randomfh pack('N', $seen_operators{$op});
} else {
print $randomfh pack('N', $next_data_offset);
seek($randomfh, $next_data_offset, 0);
die("Operator '$op' is longer than 255 bytes\n")
if(length($op) > 255);
print $randomfh pack('CCA*', 0, length($op), $op);
$seen_operators{$op} = $next_data_offset;
$next_data_offset += length($op) + 2;
}
}
}
close(MODULE);