build-data.uk

#!/usr/bin/env perl

use strict;
use warnings;

use Data::CompactReadonly;
use File::Find::Rule;
use Text::CSV_XS;

# Accumulators shared by the rest of the script. All of them start out empty.

# [prefix, telco, format] triples, one per row kept from the spreadsheets
my @telco_length_data;

# prefixes grouped by the type of number they introduce
my @geo_prefices;
my @free_prefices;
my @network_svc_prefices;
my @corporate_prefices;
my @personal_prefices;
my @pager_prefices;
my @mobile_prefices;
my @special_prefices;
my @adult_prefices;
my @ip_prefices;

# prefix => area name, and prefix => allocation status
my %areanames;
my %statuses;

# Load libphonenumber's English geocoding data for +44 into %areanames.
# Each data line is split on "|": the first field is the prefix (stored with
# its leading 44 removed) and the second is the name recorded for it. Blank
# lines and lines starting with "#" are skipped.
my $area_codes_file = 'libphonenumber/resources/geocoding/en/44.txt';
open(my $area_codes, '<', $area_codes_file)
    or die("Couldn't open $area_codes_file: $!\n");
print "Working on $area_codes_file\n";
while(my $line = <$area_codes>) {
    chomp($line);
    next if(!$line || $line =~ /^#/);
    my @row = split(/\|/, $line);
    $row[0] =~ s/^44//;
    $areanames{$row[0]} = $row[1];
}
# Everything we need is in %areanames now, so don't keep the handle open
# for the remainder of the build.
close($area_codes);

# Pick the first of @candidates that is a column name in $column_of (a hashref
# of column name => position), or die saying which kind of field could not be
# found in $csvfile.
my $pick_field = sub {
    my($kind, $csvfile, $column_of, @candidates) = @_;
    foreach my $candidate (@candidates) {
        return $candidate if(exists($column_of->{$candidate}));
    }
    die("Can't find a $kind field in $csvfile\n");
};

# Read every S?.csv file found under data-files/. Rows whose status is
# Protected or one of the Allocated variants are kept: the row's prefix is
# recorded in %statuses, filed under the number types it matches, and its
# telco and number-length format are queued in @telco_length_data.
foreach my $csvfile (File::Find::Rule->name('S?.csv')->in('data-files')) {
    print "Working on $csvfile\n";
    my $parser = Text::CSV_XS->new();

    open(my $fh, '<', $csvfile) or die "Could not open '$csvfile': $!\n";
    my @rows = ();
    while (my $row = $parser->getline($fh)) {
        push @rows, $row;
    }
    # getline() also returns false when it meets a line it can't parse, which
    # would otherwise silently drop the remainder of the file.
    $parser->eof()
        or warn("Gave up reading $csvfile before the end: " . $parser->error_diag() . "\n");
    close($fh);

    # The first row names the columns; map each name to its position.
    my $header = shift(@rows)
        or die("No header row found in $csvfile\n");
    my %column_of = ();
    foreach my $position (0 .. $#{$header}) {
        $column_of{$header->[$position]} = $position;
    }

    # A file with no data rows has nothing to classify, so it isn't required
    # to have any of the columns looked up below.
    next unless(@rows);

    # Everything from here to the row loop depends only on the header, so it
    # is worked out once per file rather than once per row.

    # The prefix is spread across whichever of these columns the file has.
    my @prefix_fields = grep { exists($column_of{$_}) }
        (qw(SABC D/DE FG), 'NMS Number Block: Number Block');
    my @prefix_columns = @column_of{@prefix_fields};

    # The interesting columns go by different names in different files.
    my $format_field = $pick_field->('format', $csvfile, \%column_of,
        'Number Length',
        'NumberLengthNonGeo',
        'Non Geo Number Length',
        'Geographic Number Length',
    );
    my $telco_field = $pick_field->('telco', $csvfile, \%column_of,
        'Communications Provider',
        'CP Name',
    );
    # NB 'Block Status ' (with a trailing space) is a distinct column name
    my $status_field = $pick_field->('status', $csvfile, \%column_of,
        'Status',
        'Block Status',
        'Block Status ',
        'Block Status Formula',
    );
    my @detail_columns = @column_of{$status_field, $telco_field, $format_field};

    foreach my $row (@rows) {
        # Join the prefix columns together, skipping undefined cells and
        # dropping any whitespace. This works on copies so that the row
        # itself is left as it was read.
        my $prefix = '';
        foreach my $position (@prefix_columns) {
            my $part = $row->[$position];
            next unless(defined($part));
            $part =~ s/\s//g;
            $prefix .= $part;
        }

        my($status, $telco, $format) = @{$row}[@detail_columns];

        $status ||= '';
        $telco  ||= '';
        $format ||= '';

        # Only Protected and the various flavours of Allocated are of interest
        next if(
            $status !~ /^(Protected|Allocated ?(\(Closed Range\)|for Migration only)?)$/
        );
        $statuses{"$prefix"} = $status;

        if($prefix =~ /^[12]/)                         { push @geo_prefices, $prefix }
         elsif($prefix =~ /^80/)                       { push @free_prefices, $prefix }
         elsif($prefix =~ /^55/)                       { push @corporate_prefices, $prefix }
         elsif($prefix =~ /^56/)                       { push @ip_prefices, $prefix }
         elsif($prefix =~ /^70/)                       { push @personal_prefices, $prefix }
         # NB order is important. 7624 is IOM mobiles except 76242 which is pagers. Apparently.
         elsif($prefix =~ /^7([12345789]|624[013-9])/) { push @mobile_prefices, $prefix }
         elsif($prefix =~ /^76/)                       { push @pager_prefices, $prefix }
         elsif($prefix =~ /^(3|8[47]|9)/)              { push @special_prefices, $prefix }
         ## previously allocated for "internet for schools" and "inbound routing codes" some time
         ## before 30 Sep 2020. This classification may be obsolete for the UK now
         # elsif($prefix =~ /^82[09]/)                   { push @network_svc_prefices, $prefix } # internet for schools

        # Checked separately from the chain above: these prefixes have
        # already been filed as special and are adult as well.
        if($prefix =~ /^9(8|0[89])/) { push @adult_prefices, $prefix }
        push @telco_length_data, [$prefix, $telco, $format];
    }
}

# A few ranges are set aside for use in drama. They aren't listed in the
# spreadsheets, so they are added by hand.
# NOTE(review): 9098790 also matches the adult pattern applied to spreadsheet
# rows, but is only recorded as special here - confirm that is intended.
push @special_prefices, qw(9098790 3069990);
push @free_prefices, qw(8081570);

print "Building telco/length data ...\n";
my %telco_by_prefix  = ();
my %format_by_prefix = ();

# Format strings that are stored exactly as they appear in the data
my %known_formats = map { $_ => 1 } qw(2+8 3+7 4+6 4+5 5+5 5+4);

# Work through the queued [prefix, telco, format] triples, filling in a format
# where the data has none, rewriting the wordy formats into the compact
# notation, and recording the telco and format against each prefix.
for my $entry (@telco_length_data) {
    my($prefix, $telco, $format) = @{$entry};

    # Historical corrections, kept for reference:
    #
    # if($prefix eq '1442600') {
    #     warn "Correcting OFCOM's broken data for 1442 60 0\n";
    #     warn "  check this and remove the warning if fixed in XLS file (last checked 2017/06/20)\n";
    #     if($format eq '') { $format = '4+6' }
    # }
    #
    # see https://github.com/DrHyde/perl-modules-Number-Phone/issues/112
    # if($prefix eq '800716') {
    #     warn "Correcting OFCOM's broken data for 8007 16\n";
    #     $format = '0+9/10';
    # }

    # No format given: choose one from the prefix where we can, otherwise
    # leave it empty.
    if($format eq '') {
        $format =
            $prefix =~ /^[37]/      ? '10 digit numbers' :
            $prefix =~ /^2/         ? '2+8' :
            $prefix =~ /^(1.1|11.)/ ? '3+7' :
                                      $format;
    }

    # Rewrite the descriptive formats into the compact notation
    if($format eq '(0)+10' || $format =~ /^10 digit number/i) {
        $format = '0+10';
    }
    elsif($format =~ /^9 +digit number/i) {
        $format = '0+9';
    }
    elsif($format =~ /^7 digit number/i) {
        $format = '0+7';
    }
    elsif($format =~ /^Mixed 4\+5 &(amp;)? 4\+6$/) {
        $format = '4+5/6';
    }
    elsif(!$known_formats{$format}) {
        # Anything else is unexpected, but it's not worth mentioning for
        # 8xx/9xx prefixes or for protected ranges.
        if($prefix !~ /^[89]/ && $statuses{$prefix} ne 'Protected') {
            warn "Unknown format: $format (r: $prefix; t: $telco)\n";
        }
    }

    $telco_by_prefix{$prefix}  = $telco;
    $format_by_prefix{$prefix} = $format;
}

print "Creating Data::CompactReadonly file ...\n";
my $db_file = 'share/Number-Phone-UK-Data.db';
mkdir('share');
unlink($db_file);

# Turn a list of prefixes into a hashref with one key per prefix.
# The values are Booleans to save space in the db if building with
# perl 5.36 or newer.
my $as_lookup = sub { return +{ map { ($_, 1 == 1) } @_ } };

# Prefixes which belong to the Crown Dependencies rather than the UK proper.
#
# taken from libphonenumber
# checked on 2024-12-11
# next check due 2025-12-01 (annually)
# NB for all of these also check build-data.country-mapping
my %prefixes_by_territory = (
    GG => [
        1481, 7781, 7839, 79111, 79117,
    ],
    IM => [
        1624,
        762450,
        762456,
        74576,
        7524,
        7924,
        76240,
        76241,
        76242,
        76243,
        76244,
        76246,
        76248,
        76249,
        808162,
        8440406, 8440906,
        872299,
        845624, 870624,
        900624, 901624, 906624, 907624,
    ],
    JE => [
        1534,
        7509, 7829, 7937,
        77003, 77007, 77008,
        # the rest of 7797 is *reserved* but not allocated
        77977, 77978, 77979,
        800735, 800781,
        # libphonenumber says 808901 is JE, OFCOM say only
        # these three ranges are allocated
        8089012, 8089013, 8089019,
    ],
);

# Flip that round into prefix => territory, which is what goes in the db
my %subclass = ();
foreach my $territory (sort keys %prefixes_by_territory) {
    foreach my $prefix (@{$prefixes_by_territory{$territory}}) {
        $subclass{$prefix} = $territory;
    }
}

Data::CompactReadonly->create(
    $db_file,
    {
        geo_prefices         => $as_lookup->(@geo_prefices),
        network_svc_prefices => $as_lookup->(@network_svc_prefices),
        free_prefices        => $as_lookup->(@free_prefices),
        corporate_prefices   => $as_lookup->(@corporate_prefices),
        personal_prefices    => $as_lookup->(@personal_prefices),
        pager_prefices       => $as_lookup->(@pager_prefices),
        mobile_prefices      => $as_lookup->(@mobile_prefices),
        special_prefices     => $as_lookup->(@special_prefices),
        adult_prefices       => $as_lookup->(@adult_prefices),
        ip_prefices          => $as_lookup->(@ip_prefices),
        areanames            => \%areanames,
        telco                => \%telco_by_prefix,
        format               => \%format_by_prefix,
        subclass             => \%subclass,
    }
);