build-data.nanp

#!/usr/bin/env perl

# THIS SCRIPT IS NOT INTENDED FOR END USERS OR FOR PEOPLE INSTALLING
# THE MODULES, BUT FOR THE AUTHOR'S USE WHEN UPDATING THE DATA FROM OFCOM'S
# PUBLISHED DATA.

use strict;
use warnings;
use LWP::UserAgent;
use XML::XPath;
use Text::CSV_XS;

use lib 'lib';
use Number::Phone::Country;

$| = 1;

my $csv = Text::CSV_XS->new({ binary => 1 });

open(MODULE, '>lib/Number/Phone/NANP/Data.pm') || die("Can't write lib/Number/Phone/NANP/Data.pm\n");

print MODULE "# automatically generated file, don't edit\n\n";
print MODULE q{

  # Copyright 2024 David Cantrell
  #
  # Uses data from:
  #   libphonenumber      http://code.google.com/p/libphonenumber/
  #   CNAC                http://www.cnac.ca/
  #   NANPA               https://www.nationalpooling.com/
  #   Local Calling Guide https://localcallingguide.com/
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  # 
  #     http://www.apache.org/licenses/LICENSE-2.0
  # 
  # Unless required by applicable law or agreed to in writing, software
  # distributed under the License is distributed on an "AS IS" BASIS,
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
  package Number::Phone::NANP::Data;

  use strict;
  use warnings;
  use vars qw(%areanames %fixed_line_regexes %mobile_regexes);
}.
"our \$VERSION = 1.".join("", (gmtime())[5] + 1900, sprintf('%02d', (gmtime())[4] + 1), map { sprintf('%02d', $_) } (gmtime())[3, 2, 1, 0]).";\n\n".q{

  sub _prefixes {
      my $number = shift;
      map { substr($number, 0, $_) } reverse(1..length($number));
  }

  sub _areaname {
    my $number = shift;
    foreach my $prefix (_prefixes($number)) {
      if(exists($areanames{$prefix})) { return $areanames{$prefix} };
    }
    return undef;
  }
};

# Extract area names
print MODULE '%areanames = (';
open(LIBPHONENUMBERDATA, 'libphonenumber/resources/geocoding/en/1.txt') || die("Can't read libphonenumber/resources/geocoding/en/1.txt\n");
my @lines = <LIBPHONENUMBERDATA>;
close(LIBPHONENUMBERDATA);
foreach my $line (@lines) {
  chomp($line);
  next if($line =~ /^(#.*|\s+)?$/);
  my($prefix, $area) = split(/\|/, $line);
  $area = quotemeta($area);
  print MODULE "    '$prefix' => \"$area\",\n";
}
print MODULE '  );';

# despite this data being in libphonenumber, it's only useful for
# some countries. eg it looks sane for JM but not for CA
# they list +1 204 as being both fixed line and mobile. Grargh!
# # extract mobile/fixed-line prefix regexes
my $xml = XML::XPath->new(filename => 'libphonenumber/resources/PhoneNumberMetadata.xml');
my @territories = $xml->find('/phoneNumberMetadata/territories/territory')->get_nodelist();
TERRITORY: foreach my $territory (@territories) {
    next TERRITORY unless($territory->find('@countryCode').'' eq '1');
    (my $fixed  = $territory->find("fixedLine/nationalNumberPattern")->string_value()) =~ s/\s+//g;
    (my $mobile = $territory->find("mobile/nationalNumberPattern")->string_value()) =~ s/\s+//g;
    my $ISO_country_code = $territory->find('@id').'';
    if($fixed eq $mobile) {
        warn("$ISO_country_code: same regexes for fixed and mobile\n");
        next TERRITORY;
    } else {
        my @area_codes = Number::Phone::Country::_NANP_area_codes($ISO_country_code);
        foreach my $area_code (@area_codes) {
            foreach my $co (map { sprintf("%03d", $_ ) } 0 .. 999) {
                my $number = "${area_code}${co}3333";
                if($number =~ /^$fixed$/x && $number =~ /^$mobile$/x) {
                    warn("$ISO_country_code: Overlap between fixed/mobile regexes for $number; can't be trusted\n");
                    next TERRITORY;
                }
            }
        }
    }
    warn("$ISO_country_code: good data to tell what's fixed/mobile\n");
    print MODULE "\$fixed_line_regexes{$ISO_country_code} = '$fixed';\n";
    print MODULE "\$mobile_regexes{$ISO_country_code} = '$mobile';\n";
}

mkdir('share');
unlink('share/Number-Phone-NANP-Data.db');
open(my $randomfh, "> :raw :bytes", 'share/Number-Phone-NANP-Data.db') || die($!);

# file format:
# The first 6 bytes of the file are: N A N P O P
# These are followed by a 16-bit file format version number
print $randomfh "NANPOP", chr(0), chr(0);

# In version 0 of the format, the next byte is the word length. The word length
# is *not specified* otherwise.
my $WORDLENGTH = 4;
print $randomfh chr($WORDLENGTH);

# The remainder of the file is a list of $WORDLENGTH byte pointers, one for
# each 10,000 number block NPA-NXX-xxxx, starting at 200-000. So the address of
# the pointer is $WORDLENGTH * (NPANXX - 200000).  Note that the D digit can't
# be 0 or 1, so the pointers really start at byte $WORDLENGTH * 200.
# 
# After the pointer array comes the data that they point at. Those structures
# can be of various types identified by their leading byte, with the NULL
# pointer meaning "no data":
#
# 0: length byte then string
# 1: block of 10 pointers, one for each thousand number block in
#    an NPA-NXX-X. Currently these only point at strings, ie type 0.

my %seen_operators = ();
my $next_data_offset = $WORDLENGTH * 800000;

warn("Extracting Canadian operators\n");
open(my $CA_operators, '<', 'data-files/COCodeStatus_ALL.csv') ||
    die("Couldn't open data-files/COCodeStatus_ALL.csv: $!\n");
$csv->getline($CA_operators); # header line
$csv->getline($CA_operators); # date stamp?
# format is "NPA","CO Code (NXX)","Status","Exchange Area","Province","Company","OCN","Remarks"
my($NPA_FIELD, $CO_FIELD, $OP_FIELD) = (0, 1, 5);
while(my $row = $csv->getline($CA_operators)) {
    last unless($row->[$NPA_FIELD]);
    next unless($row->[$OP_FIELD]);
    my($prefix, $op) = (join('', $row->[$NPA_FIELD], $row->[$CO_FIELD]), $row->[$OP_FIELD]);
    _write_operator_data($prefix, $op);
}

# CSV file format doco at https://www.nanpa.com/reports/thousands-block-reports/region
warn("Extracting US operators\n");
open(my $US_operators, '<', 'data-files/ThousandsBlockAssignment_All_Augmented.txt') ||
    die("Couldn't open data-files/ThousandsBlockAssignment_All_Augmented.txt: $!\n");
$csv->getline($US_operators); # header line
my %US = ();
while(my $row = $csv->getline($US_operators)) {
    my($npa, $nxx, $thousands, $status, $op) = map { $row->[$_] } (2, 3, 4, 5, 12);
    $op = _clean_up_op($op);
    next unless($status eq 'AS');
    $US{"$npa$nxx"} ||= [];
    push @{$US{"$npa$nxx"}}, { thousands => $thousands, op => $op }
}
foreach my $npanxx (keys %US) {
    my $data = $US{$npanxx};
    _write_multiple_operators($npanxx, $data);
}

warn("Extracting non-US/CA operators\n");
foreach my $NPA (Number::Phone::Country::_non_US_CA_area_codes()) {
    warn("  ... $NPA\n");
    my $xml = _get_xml_data($NPA);
    my %ops = ();

    my @ten_thousand_prefixes = $xml->find("/root/prefixdata[x = 'A']")->get_nodelist();
    foreach my $prefix (@ten_thousand_prefixes) {
        my $nxx = $prefix->find('nxx')->string_value();
        $ops{$NPA.$nxx} = _op_from_prefix($prefix);
    }

    my @thousand_prefixes = $xml->find("/root/prefixdata[x != 'A']")->get_nodelist();
    foreach my $prefix (@thousand_prefixes) {
        my $nxx = $prefix->find('nxx')->string_value();
        if(!exists($ops{$NPA.$nxx}) || !ref($ops{$NPA.$nxx})) {
            $ops{$NPA.$nxx} = [];
        }
        my $x = $prefix->find('x')->string_value();
        my $op = _op_from_prefix($prefix);
        push @{$ops{$NPA.$nxx}}, { thousands => $x, op => $op };
    }
    foreach my $co (keys %ops) {
        if(!ref($ops{$co})) {
            _write_operator_data($co, $ops{$co});
        } else {
            _write_multiple_operators($co, $ops{$co});
        }
    }
}

sub _op_from_prefix {
    my $prefix = shift;
    (my $op = $prefix->find('company-name')->string_value()) =~ s/\s+$//;
    return _clean_up_op($op);
}

sub _clean_up_op {
    my $op = shift;
    if($op =~ /([^\x00-\x7f])/) {
        $op =~ s/\x{2013}/-/g; # en-dash
    }
    if($op =~ /([^\x00-\x7f])/) {
        die "Couldn't clean up $1 in $op\n";
    }
    # leading/trailing space
    $op =~ s/^\s+|\s+$//g;
    return $op;
}

sub _get_xml_data {
    my $NPA = shift;
    if(!-e "$NPA.xml" || -M "$NPA.xml" > 14) {
        my $retries = 0;
        my $xml;
        my $data;
        my $ua = LWP::UserAgent->new(
            ssl_opts => { verify_hostname => 0 },
            agent    => 'Number::Phone-builder/1.0 (david@cantrell.org.uk)'.
                        ($ENV{CI} ? ' (CI build)' : ''),
        );
        TRY: my $res = $ua->get("https://localcallingguide.com/xmlprefix.php?npa=$NPA&blocks=1");
        if($res->is_success()) {
            # OK, we got data - but is it a complete download?
            $data = $res->content();
            eval {
                # ...->new only creates an object from the data, it doesn't validate
                $xml = XML::XPath->new(xml => $data);
                $xml->find("/root");
            } || do {
                undef $xml;
                warn("Can't parse XML for $NPA: $@\n");
            };
        }
        if(!$xml) {
            $retries++;
            if($retries < 3) {
                warn("Couldn't get data from localcallingguide.com for $NPA: retrying\n");
                sleep 5;
                goto TRY
            }
            die("Couldn't get data from localcallingguide.com for $NPA\n");
        }
        open(my $fh, '>', "data-files/$NPA.xml") || die("Can't write data-files/$NPA.xml: $!\n");
        print $fh $data;
        close($fh);
    }
    # writing the file above and then re-reading it instead of just returning the data,
    # and also making sure to read it from the file into a different variable, is
    # because Unicode is fucking witchcraft
    open(my $fh, '<', "data-files/$NPA.xml") || die("Can't read data-files/$NPA.xml: $!\n");
    my $data = join('', <$fh>);
    close($fh);
    $xml = XML::XPath->new(xml => $data);
    return $xml;
}

sub _write_operator_data {
    my($co, $op) = @_;
    # area codes 000 to 199 are invalid
    seek($randomfh, $WORDLENGTH * ($co - 200000), 0);
    if(exists($seen_operators{$op})) {
        print $randomfh pack('N', $seen_operators{$op});
    } else {
        print $randomfh pack('N', $next_data_offset);
        seek($randomfh, $next_data_offset, 0);
        die("Operator '$op' is longer than 255 bytes\n")
            if(length($op) > 255);
        print $randomfh pack('CCA*', 0, length($op), $op);
        $seen_operators{$op} = $next_data_offset;
        $next_data_offset += length($op) + 2;
    }
}

sub _write_multiple_operators {
    my($co, $data) = @_;

    # first see if we can consolidate all ten of the
    # thousand blocks into one ten-thousand block
    if(
        $#{$data} == 9 && # got 10 assigned blocks
        scalar(keys(%{{
            map { $data->[$_]->{op} => 1 } (0 .. 9)
        }})) == 1 # but only one operator
    ) {
        return _write_operator_data($co, $data->[0]->{op});
    }

    seek($randomfh, $WORDLENGTH * ($co - 200000), 0);
    print $randomfh pack('N', $next_data_offset);
    seek($randomfh, $next_data_offset, 0);
    print $randomfh pack('C', 1); # block-type marker
    $next_data_offset++;
    # the array might have less than 10 entries, eg
    # [
    #   { thousands => 3, op => "blah",      pointer => ... },
    #   { thousands => 7, op => "otherblah", pointer => ... },
    #   { thousands => 9, op => "blah",      pointer => ... },
    # ]
    # but the array of pointers in the file is always ten entries
    my $start_of_pointers = $next_data_offset;
    $next_data_offset += $WORDLENGTH * 10;

    foreach my $index (0 .. $#{$data}) {
        my($thousands_digit, $op) = map { $data->[$index]->{$_} } qw(thousands op);

        seek($randomfh, $start_of_pointers + $WORDLENGTH * $thousands_digit, 0);
        if(exists($seen_operators{$op})) {
            print $randomfh pack('N', $seen_operators{$op});
        } else {
            print $randomfh pack('N', $next_data_offset);
            seek($randomfh, $next_data_offset, 0);
            die("Operator '$op' is longer than 255 bytes\n")
                if(length($op) > 255);

            print $randomfh pack('CCA*', 0, length($op), $op);
            $seen_operators{$op} = $next_data_offset;
            $next_data_offset += length($op) + 2;
        }
    }
}

close(MODULE);