Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/use gb full #18

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,7 @@ ENV/
.mypy_cache/
_build/
generated/

# PyCharm
.idea/

2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,6 @@ Supported countries

The countries available in the GeoNames database, with their corresponding country codes, are listed below:

Andorra (AD), Argentina (AR), American Samoa (AS), Austria (AT), Australia (AU), Åland Islands (AX), Bangladesh (BD), Belgium (BE), Bulgaria (BG), Bermuda (BM), Brazil (BR), Belarus (BY), Canada (CA), Switzerland (CH), Colombia (CO), Costa Rica (CR), Czechia (CZ), Germany (DE), Denmark (DK), Dominican Republic (DO), Algeria (DZ), Spain (ES), Finland (FI), Faroe Islands (FO), France (FR), United Kingdom of Great Britain and Northern Ireland (GB), French Guiana (GF), Guernsey (GG), Greenland (GL), Guadeloupe (GP), Guatemala (GT), Guam (GU), Croatia (HR), Hungary (HU), Ireland (IE), Isle of Man (IM), India (IN), Iceland (IS), Italy (IT), Jersey (JE), Japan (JP), Liechtenstein (LI), Sri Lanka (LK), Lithuania (LT), Luxembourg (LU), Latvia (LV), Monaco (MC), Republic of Moldova (MD), Marshall Islands (MH), The former Yugoslav Republic of Macedonia (MK), Northern Mariana Islands (MP), Martinique (MQ), Malta (MT), Mexico (MX), Malaysia (MY), New Caledonia (NC), Netherlands (NL), Norway (NO), New Zealand (NZ), Philippines (PH), Pakistan (PK), Poland (PL), Saint Pierre and Miquelon (PM), Puerto Rico (PR), Portugal (PT), Réunion (RE), Romania (RO), Russian Federation (RU), Sweden (SE), Slovenia (SI), Svalbard and Jan Mayen Islands (SJ), Slovakia (SK), San Marino (SM), Thailand (TH), Turkey (TR), Ukraine (UA), United States of America (US), Uruguay (UY), Holy See (VA), United States Virgin Islands (VI), Wallis and Futuna Islands (WF), Mayotte (YT), South Africa (ZA)
Andorra (AD), Argentina (AR), American Samoa (AS), Austria (AT), Australia (AU), Åland Islands (AX), Bangladesh (BD), Belgium (BE), Bulgaria (BG), Bermuda (BM), Brazil (BR), Belarus (BY), Canada (CA), Switzerland (CH), Colombia (CO), Costa Rica (CR), Czechia (CZ), Germany (DE), Denmark (DK), Dominican Republic (DO), Algeria (DZ), Spain (ES), Finland (FI), Faroe Islands (FO), France (FR), United Kingdom of Great Britain and Northern Ireland (GB for the outward code only, GB_full for the full postcode), French Guiana (GF), Guernsey (GG), Greenland (GL), Guadeloupe (GP), Guatemala (GT), Guam (GU), Croatia (HR), Hungary (HU), Ireland (IE), Isle of Man (IM), India (IN), Iceland (IS), Italy (IT), Jersey (JE), Japan (JP), Liechtenstein (LI), Sri Lanka (LK), Lithuania (LT), Luxembourg (LU), Latvia (LV), Monaco (MC), Republic of Moldova (MD), Marshall Islands (MH), The former Yugoslav Republic of Macedonia (MK), Northern Mariana Islands (MP), Martinique (MQ), Malta (MT), Mexico (MX), Malaysia (MY), New Caledonia (NC), Netherlands (NL), Norway (NO), New Zealand (NZ), Philippines (PH), Pakistan (PK), Poland (PL), Saint Pierre and Miquelon (PM), Puerto Rico (PR), Portugal (PT), Réunion (RE), Romania (RO), Russian Federation (RU), Sweden (SE), Slovenia (SI), Svalbard and Jan Mayen Islands (SJ), Slovakia (SK), San Marino (SM), Thailand (TH), Turkey (TR), Ukraine (UA), United States of America (US), Uruguay (UY), Holy See (VA), United States Virgin Islands (VI), Wallis and Futuna Islands (WF), Mayotte (YT), South Africa (ZA)

See `GeoNames database <http://download.geonames.org/export/zip/>`_ for more information.
89 changes: 52 additions & 37 deletions pgeocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,24 @@

import os
import warnings
from zipfile import ZipFile

import numpy as np
import pandas as pd
from pandas.io.common import get_filepath_or_buffer

__version__ = '0.1.2'

STORAGE_DIR = os.path.join(os.path.expanduser('~'),
'pgeocode_data')

DOWNLOAD_URL = "http://download.geonames.org/export/zip/{country}.zip"
DOWNLOAD_URL = "https://download.geonames.org/export/zip/{country}.zip"

DATA_FIELDS = ['country code', 'postal_code', 'place_name',
'state_name', 'state_code', 'county_name', 'county_code',
'community_name', 'community_code',
'latitude', 'longitude', 'accuracy']

COUNTRIES_VALID = ["AD", "AR", "AS", "AT", "AU", "AX", "BD", "BE", "BG", "BM",
"BR", "BY", "CA", "CH", "CO", "CR", "CZ", "DE", "DK", "DO",
"DZ", "ES", "FI", "FO", "FR", "GB", "GF", "GG", "GL", "GP",
"GT", "GU", "HR", "HU", "IE", "IM", "IN", "IS", "IT", "JE",
"JP", "LI", "LK", "LT", "LU", "LV", "MC", "MD", "MH", "MK",
"MP", "MQ", "MT", "MX", "MY", "NC", "NL", "NO", "NZ", "PH",
"PK", "PL", "PM", "PR", "PT", "RE", "RO", "RU", "SE", "SI",
"SJ", "SK", "SM", "TH", "TR", "UA", "US", "UY", "VA", "VI",
"WF", "YT", "ZA"]


class Nominatim(object):
"""Query geographical location from a city name or a postal code
Expand All @@ -43,41 +35,25 @@ class Nominatim(object):
into a single entry
"""
def __init__(self, country='fr', unique=True):

country = country.upper()
if country not in COUNTRIES_VALID:
raise ValueError(('country={} is not a known country code. '
'See the README for a list of supported '
'countries')
.format(country))
if country == 'AR':
warnings.warn("The Argentina data file contains 4-digit postal "
"codes which were replaced with a new system "
"in 1999.")
self.country = country
self._data_path, self._data = self._get_data(country)
country_obj = _Country(country)
self.country = country_obj.name
self.download_path = country_obj.get_download_path()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make this private as well,

Suggested change
self.download_path = country_obj.get_download_path()
self._download_path = country_obj.get_download_path()

self._data_path, self._data = self._get_data(self.country)
if unique:
self._data_frame = self._index_postal_codes()
else:
self._data_frame = self._data
self.unique = unique

@staticmethod
def _get_data(country):
def _get_data(self, country):
"""Load the data from disk; otherwise download and save it"""
from zipfile import ZipFile
from pandas.io.common import get_filepath_or_buffer, _infer_compression
data_path = os.path.join(STORAGE_DIR,
country.upper() + '.txt')
data_path = os.path.join(STORAGE_DIR, country + '.txt')
if os.path.exists(data_path):
data = pd.read_csv(data_path,
dtype={'postal_code': str})
data = pd.read_csv(data_path, dtype={'postal_code': str})
else:
url = DOWNLOAD_URL.format(country=country)
compression = _infer_compression(url, "zip")
reader, encoding, compression = get_filepath_or_buffer(url)[:3]
reader, encoding, compression = get_filepath_or_buffer(self.download_path)[:3]
with ZipFile(reader) as fh_zip:
with fh_zip.open(country.upper() + '.txt') as fh:
with fh_zip.open(country + '.txt') as fh:
data = pd.read_csv(fh,
sep='\t', header=0,
names=DATA_FIELDS,
Expand Down Expand Up @@ -107,6 +83,8 @@ def _index_postal_codes(self):
data_unique[key] = df_unique_cp_group[key].first()
data_unique = data_unique.reset_index()[DATA_FIELDS]
data_unique.to_csv(data_path_unique, index=None)
if self.country == 'GB_full':
data_unique['postal_code'] = data_unique['postal_code'].str.replace(' ', '')
return data_unique

def _normalize_postal_code(self, codes):
Expand All @@ -119,6 +97,8 @@ def _normalize_postal_code(self, codes):

if self.country in ['GB', 'IE', 'CA']:
codes['postal_code'] = codes.postal_code.str.split().str.get(0)
elif self.country == 'GB_full':
codes['postal_code'] = codes.postal_code.str.replace(' ', '')
else:
pass

Expand Down Expand Up @@ -157,10 +137,45 @@ def query_postal_code(self, codes):
return response

def query_location(self, name):
"""Get locations information from a community/minicipality name"""
"""Get locations information from a community/municipality name"""
pass


class _Country:
    """Validate a country code and derive its GeoNames download URL.

    Parameters
    ----------
    country_code : str
        Country code in any letter case, e.g. ``'fr'``, ``'GB'`` or
        ``'gb_full'``.

    Attributes
    ----------
    name : str
        The normalized country code: upper-case, except for the full
        United Kingdom dataset which is spelled ``'GB_full'``.

    Raises
    ------
    ValueError
        If ``country_code`` is not one of ``countries_valid``.
    """

    # Defined once on the class rather than rebuilt for every instance;
    # still reachable as ``instance.countries_valid``.
    countries_valid = frozenset({
        "AD", "AR", "AS", "AT", "AU", "AX", "BD", "BE", "BG", "BM",
        "BR", "BY", "CA", "CH", "CO", "CR", "CZ", "DE", "DK", "DO",
        "DZ", "ES", "FI", "FO", "FR", "GB", "GB_full", "GF", "GG",
        "GL", "GP", "GT", "GU", "HR", "HU", "IE", "IM", "IN", "IS",
        "IT", "JE", "JP", "LI", "LK", "LT", "LU", "LV", "MC", "MD",
        "MH", "MK", "MP", "MQ", "MT", "MX", "MY", "NC", "NL", "NO",
        "NZ", "PH", "PK", "PL", "PM", "PR", "PT", "RE", "RO", "RU",
        "SE", "SI", "SJ", "SK", "SM", "TH", "TR", "UA", "US", "UY",
        "VA", "VI", "WF", "YT", "ZA",
    })

    def __init__(self, country_code):
        self.name = self.get_clean_country(country_code)

    def get_clean_country(self, country_code):
        """Return the normalized form of ``country_code``.

        The code is upper-cased before validation. A warning is emitted
        for Argentina (``'AR'``).

        Raises
        ------
        ValueError
            If the upper-cased code is not a known country code.
        """
        country_code = country_code.upper()
        if country_code == 'AR':
            warnings.warn('The Argentina data file contains the first 5 positions of the postal code.')
        # 'GB_full' is the only mixed-case code, so the upper-cased input
        # can never match its entry in ``countries_valid`` directly.
        if country_code == 'GB_FULL':
            return 'GB_full'
        if country_code in self.countries_valid:
            return country_code
        raise ValueError(('country={} is not a known country code. '
                          'See the README for a list of supported '
                          'countries')
                         .format(country_code))

    def get_download_path(self):
        """Return ``DOWNLOAD_URL`` formatted for this country."""
        # The full UK archive is named ``GB_full.csv.zip``, so '.csv' has
        # to be appended before the URL template adds '.zip'.
        if self.name == 'GB_full':
            name = 'GB_full.csv'
        else:
            name = self.name
        return DOWNLOAD_URL.format(country=name)


class GeoDistance(Nominatim):
""" Distance calculation from a city name or a postal code

Expand Down
49 changes: 49 additions & 0 deletions test_pgeocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ def temp_dir():
shutil.rmtree(path)


# Default (function) scope: building a ``_Country`` is cheap, so every test
# gets its own fresh instance instead of sharing one for the whole session.
@pytest.fixture
def country_gb_full():
    """Return a ``_Country`` built from the lower-case ``'gb_full'`` code."""
    return pgeocode._Country('gb_full')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typically, session-level fixtures are used to run an expensive operation (e.g. loading the data, or file access operations) once per session run.

To test _Country we should rather initialize it in each test that needs it (and not use a fixture).



def _normalize_str(x):
if x is np.nan:
return x
Expand Down Expand Up @@ -85,6 +90,27 @@ def test_download_dataset(temp_dir):
assert len(res2.place_name.split(',')) > 1


def test_download_gb_full_dataset(temp_dir):
    """Check that the GB_full dataset is downloaded, stored and queryable.

    A first ``Nominatim`` instance triggers the download; a second one is
    created afterwards, once the data file already exists on disk. Both
    must expose the same data and resolve the same postcode, written with
    or without the inner space.
    """
    data_file = os.path.join(temp_dir, 'GB_full.txt')
    assert not os.path.exists(data_file)
    nomi = Nominatim('gb_full')
    # the data file was downloaded
    assert os.path.exists(data_file)
    res = nomi.query_postal_code('BS6 5JR')

    # Query the second instance (not the first one again) so that the
    # instance created after the data file exists is actually exercised.
    nomi2 = Nominatim('gb_full')
    res2 = nomi2.query_postal_code('BS65JR')

    assert_array_equal(nomi._data.columns,
                       nomi2._data.columns)
    assert_array_equal(nomi._data_frame.columns,
                       nomi2._data_frame.columns)
    # Compare the two instances with each other; comparing an instance
    # with itself can never fail.
    assert nomi._data.shape == nomi2._data.shape
    assert nomi._data_frame.shape == nomi2._data_frame.shape

    assert res.place_name == 'Bristol'
    assert res2.place_name == 'Bristol'


amarchin marked this conversation as resolved.
Show resolved Hide resolved
def test_nominatim_query_postal_code():
nomi = Nominatim('fr')

Expand Down Expand Up @@ -172,3 +198,26 @@ def test_haversine_distance():
d_pred = haversine_distance(x, y)
# same distance +/- 3 km
assert_allclose(d_ref, d_pred, atol=3)


def test_that_get_clean_country_raises_value_error():
with pytest.raises(ValueError):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
with pytest.raises(ValueError):
with pytest.raises(ValueError, match="not a known country code"):

pgeocode._Country('invalid')


def test_that_get_clean_country_handles_gb_full(country_gb_full):
    """The country built by the fixture is named ``'GB_full'``."""
    normalized_name = country_gb_full.name
    assert normalized_name == 'GB_full'


def test_that_get_clean_country_works_for_valid_countries():
    """A valid upper-case code is kept unchanged as the country name."""
    assert pgeocode._Country('ES').name == 'ES'


def test_that_get_download_path_handles_gb_full(country_gb_full):
    """The GB_full download URL points at the ``GB_full.csv.zip`` archive."""
    expected_url = "https://download.geonames.org/export/zip/GB_full.csv.zip"
    assert country_gb_full.get_download_path() == expected_url


def test_that_get_download_path_works_for_valid_countries():
    """A regular country code maps to ``<CODE>.zip`` on the download host."""
    download_url = pgeocode._Country('CA').get_download_path()
    assert download_url == "https://download.geonames.org/export/zip/CA.zip"