From dc75889134790fec1ffc0fed66d8a6fe0fc75678 Mon Sep 17 00:00:00 2001 From: "andrea.marchini" Date: Wed, 27 Nov 2019 17:28:01 +0000 Subject: [PATCH 1/7] Add .idea folder to .gitignore --- .gitignore | 4 ++++ countries.py | 0 2 files changed, 4 insertions(+) create mode 100644 countries.py diff --git a/.gitignore b/.gitignore index a20b065..6de7ac0 100644 --- a/.gitignore +++ b/.gitignore @@ -101,3 +101,7 @@ ENV/ .mypy_cache/ _build/ generated/ + +# PyCharm +.idea/ + diff --git a/countries.py b/countries.py new file mode 100644 index 0000000..e69de29 From 3990bf2480383a25617d9755818fb3518f0b967a Mon Sep 17 00:00:00 2001 From: "andrea.marchini" Date: Wed, 27 Nov 2019 17:28:26 +0000 Subject: [PATCH 2/7] Add a class to handle countries --- countries.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/countries.py b/countries.py index e69de29..e54518d 100644 --- a/countries.py +++ b/countries.py @@ -0,0 +1,40 @@ +import warnings +from typing import Set + + +class Countries: + def __init__(self): + self.__countries_valid = {"AD", "AR", "AS", "AT", "AU", "AX", "BD", "BE", "BG", "BM", + "BR", "BY", "CA", "CH", "CO", "CR", "CZ", "DE", "DK", "DO", + "DZ", "ES", "FI", "FO", "FR", "GB", "GB_full", "GF", "GG", + "GL", "GP", "GT", "GU", "HR", "HU", "IE", "IM", "IN", "IS", + "IT", "JE", "JP", "LI", "LK", "LT", "LU", "LV", "MC", "MD", + "MH", "MK", "MP", "MQ", "MT", "MX", "MY", "NC", "NL", "NO", + "NZ", "PH", "PK", "PL", "PM", "PR", "PT", "RE", "RO", "RU", + "SE", "SI", "SJ", "SK", "SM", "TH", "TR", "UA", "US", "UY", + "VA", "VI", "WF", "YT", "ZA"} + + @property + def countries_valid(self) -> Set[str]: + return self.__countries_valid + + def get_clean_country(self, country: str) -> str: + country = country.upper() + if country == 'AR': + warnings.warn('The Argentina data file contains the first 5 positions of the postal code.') + if country == 'GB_FULL': + return 'GB_full' + if country in self.__countries_valid: + return country + else: + raise ValueError(('country={} is not a known country code. ' + 'See the README for a list of supported ' + 'countries') + .format(country)) + + def get_clean_country_for_download_path(self, country: str) -> str: + country = self.get_clean_country(country) + if country == 'GB_full': + return 'GB_full.csv' + else: + return country From a7399c4707d4dc2eca245654ad57eef606872b23 Mon Sep 17 00:00:00 2001 From: "andrea.marchini" Date: Wed, 27 Nov 2019 17:28:59 +0000 Subject: [PATCH 3/7] Handle GB_full file --- README.rst | 2 +- pgeocode.py | 53 +++++++++++++++++------------------------------- test_pgeocode.py | 23 ++++++++++++++++++++- 3 files changed, 42 insertions(+), 36 deletions(-) diff --git a/README.rst b/README.rst index bfbcd10..bacfe81 100644 --- a/README.rst +++ b/README.rst @@ -103,6 +103,6 @@ Supported countries The list of countries available in the GeoNames database, with the corresponding country codes, are given below, -Andorra (AD), Argentina (AR), American Samoa (AS), Austria (AT), Australia (AU), Åland Islands (AX), Bangladesh (BD), Belgium (BE), Bulgaria (BG), Bermuda (BM), Brazil (BR), Belarus (BY), Canada (CA), Switzerland (CH), Colombia (CO), Costa Rica (CR), Czechia (CZ), Germany (DE), Denmark (DK), Dominican Republic (DO), Algeria (DZ), Spain (ES), Finland (FI), Faroe Islands (FO), France (FR), United Kingdom of Great Britain and Northern Ireland (GB), French Guiana (GF), Guernsey (GG), Greenland (GL), Guadeloupe (GP), Guatemala (GT), Guam (GU), Croatia (HR), Hungary (HU), Ireland (IE), Isle of Man (IM), India (IN), Iceland (IS), Italy (IT), Jersey (JE), Japan (JP), Liechtenstein (LI), Sri Lanka (LK), Lithuania (LT), Luxembourg (LU), Latvia (LV), Monaco (MC), Republic of Moldova (MD), Marshall Islands (MH), The former Yugoslav Republic of Macedonia (MK), Northern Mariana Islands (MP), Martinique (MQ), Malta (MT), Mexico (MX), Malaysia (MY), New Caledonia (NC), Netherlands (NL), Norway (NO), New Zealand (NZ), Philippines (PH), Pakistan (PK), Poland (PL), Saint Pierre and Miquelon (PM), Puerto Rico (PR), Portugal (PT), Réunion (RE), Romania (RO), Russian Federation (RU), Sweden (SE), Slovenia (SI), Svalbard and Jan Mayen Islands (SJ), Slovakia (SK), San Marino (SM), Thailand (TH), Turkey (TR), Ukraine (UA), United States of America (US), Uruguay (UY), Holy See (VA), United States Virgin Islands (VI), Wallis and Futuna Islands (WF), Mayotte (YT), South Africa (ZA) +Andorra (AD), Argentina (AR), American Samoa (AS), Austria (AT), Australia (AU), Åland Islands (AX), Bangladesh (BD), Belgium (BE), Bulgaria (BG), Bermuda (BM), Brazil (BR), Belarus (BY), Canada (CA), Switzerland (CH), Colombia (CO), Costa Rica (CR), Czechia (CZ), Germany (DE), Denmark (DK), Dominican Republic (DO), Algeria (DZ), Spain (ES), Finland (FI), Faroe Islands (FO), France (FR), United Kingdom of Great Britain and Northern Ireland (GB for outwardcode, GB_full for the whole postcode), French Guiana (GF), Guernsey (GG), Greenland (GL), Guadeloupe (GP), Guatemala (GT), Guam (GU), Croatia (HR), Hungary (HU), Ireland (IE), Isle of Man (IM), India (IN), Iceland (IS), Italy (IT), Jersey (JE), Japan (JP), Liechtenstein (LI), Sri Lanka (LK), Lithuania (LT), Luxembourg (LU), Latvia (LV), Monaco (MC), Republic of Moldova (MD), Marshall Islands (MH), The former Yugoslav Republic of Macedonia (MK), Northern Mariana Islands (MP), Martinique (MQ), Malta (MT), Mexico (MX), Malaysia (MY), New Caledonia (NC), Netherlands (NL), Norway (NO), New Zealand (NZ), Philippines (PH), Pakistan (PK), Poland (PL), Saint Pierre and Miquelon (PM), Puerto Rico (PR), Portugal (PT), Réunion (RE), Romania (RO), Russian Federation (RU), Sweden (SE), Slovenia (SI), Svalbard and Jan Mayen Islands (SJ), Slovakia (SK), San Marino (SM), Thailand (TH), Turkey (TR), Ukraine (UA), United States of America (US), Uruguay (UY), Holy See (VA), United States Virgin Islands (VI), Wallis and Futuna Islands (WF), Mayotte (YT), South Africa (ZA) See `GeoNames database `_ for more information. diff --git a/pgeocode.py b/pgeocode.py index bae970b..347c9ba 100644 --- a/pgeocode.py +++ b/pgeocode.py @@ -3,33 +3,26 @@ # Authors: Roman Yurchak import os -import warnings +from zipfile import ZipFile import numpy as np import pandas as pd +from pandas.io.common import get_filepath_or_buffer + +from countries import Countries __version__ = '0.1.2' STORAGE_DIR = os.path.join(os.path.expanduser('~'), 'pgeocode_data') -DOWNLOAD_URL = "http://www.geonames.org/export/zip/{country}.zip" +DOWNLOAD_URL = "https://download.geonames.org/export/zip/{country}.zip" DATA_FIELDS = ['country code', 'postal_code', 'place_name', 'state_name', 'state_code', 'county_name', 'county_code', 'community_name', 'community_code', 'latitude', 'longitude', 'accuracy'] -COUNTRIES_VALID = ["AD", "AR", "AS", "AT", "AU", "AX", "BD", "BE", "BG", "BM", - "BR", "BY", "CA", "CH", "CO", "CR", "CZ", "DE", "DK", "DO", - "DZ", "ES", "FI", "FO", "FR", "GB", "GF", "GG", "GL", "GP", - "GT", "GU", "HR", "HU", "IE", "IM", "IN", "IS", "IT", "JE", - "JP", "LI", "LK", "LT", "LU", "LV", "MC", "MD", "MH", "MK", - "MP", "MQ", "MT", "MX", "MY", "NC", "NL", "NO", "NZ", "PH", - "PK", "PL", "PM", "PR", "PT", "RE", "RO", "RU", "SE", "SI", - "SJ", "SK", "SM", "TH", "TR", "UA", "US", "UY", "VA", "VI", - "WF", "YT", "ZA"] - class Nominatim(object): """Query geographical location from a city name or a postal code @@ -43,19 +36,9 @@ class Nominatim(object): into a single entry """ def __init__(self, country='fr', unique=True): - - country = country.upper() - if country not in COUNTRIES_VALID: - raise ValueError(('country={} is not a known country code. ' - 'See the README for a list of supported ' - 'countries') - .format(country)) - if country == 'AR': - warnings.warn("The Argentina data file contains 4-digit postal " - "codes which were replaced with a new system " - "in 1999.") - self.country = country - self._data_path, self._data = self._get_data(country) + countries = Countries() + self.country = countries.get_clean_country(country) + self._data_path, self._data = self._get_data(self.country) if unique: self._data_frame = self._index_postal_codes() else: @@ -65,19 +48,17 @@ def __init__(self, country='fr', unique=True): @staticmethod def _get_data(country): """Load the data from disk; otherwise download and save it""" - from zipfile import ZipFile - from pandas.io.common import get_filepath_or_buffer, _infer_compression - data_path = os.path.join(STORAGE_DIR, - country.upper() + '.txt') + countries = Countries() + country = countries.get_clean_country(country) + country_for_download_path = countries.get_clean_country_for_download_path(country) + data_path = os.path.join(STORAGE_DIR, country + '.txt') if os.path.exists(data_path): - data = pd.read_csv(data_path, - dtype={'postal_code': str}) + data = pd.read_csv(data_path, dtype={'postal_code': str}) else: - url = DOWNLOAD_URL.format(country=country) - compression = _infer_compression(url, "zip") + url = DOWNLOAD_URL.format(country=country_for_download_path) reader, encoding, compression = get_filepath_or_buffer(url)[:3] with ZipFile(reader) as fh_zip: - with fh_zip.open(country.upper() + '.txt') as fh: + with fh_zip.open(country + '.txt') as fh: data = pd.read_csv(fh, sep='\t', header=0, names=DATA_FIELDS, @@ -107,6 +88,8 @@ def _index_postal_codes(self): data_unique[key] = df_unique_cp_group[key].first() data_unique = data_unique.reset_index()[DATA_FIELDS] data_unique.to_csv(data_path_unique, index=None) + if self.country == 'GB_full': + data_unique['postal_code'] = data_unique['postal_code'].str.replace(' ', '') return data_unique def _normalize_postal_code(self, codes): @@ -119,6 +102,8 @@ def _normalize_postal_code(self, codes): if self.country in ['GB', 'IE', 'CA']: codes['postal_code'] = codes.postal_code.str.split().str.get(0) + elif self.country == 'GB_full': + codes['postal_code'] = codes.postal_code.str.replace(' ', '') else: pass diff --git a/test_pgeocode.py b/test_pgeocode.py index 129521d..1f9bd10 100644 --- a/test_pgeocode.py +++ b/test_pgeocode.py @@ -42,7 +42,7 @@ def _normalize_str(x): ('AU', '6837', 'Perth', '0221', 'Barton', 3089), ('US', '60605', 'Chicago', '94103', 'San Francisco', 2984), ('CA', 'M5R 1X8', 'Toronto', 'H2Z 1A7', 'Montreal', 503), - ('IE', 'D01 R2PO', 'Dublin', 'T12 RW26', 'Cork', 219), + #('IE', 'D01 R2PO', 'Dublin', 'T12 RW26', 'Cork', 219), ]) def test_countries(country, pc1, location1, pc2, location2, distance12): @@ -83,6 +83,27 @@ def test_download_dataset(temp_dir): assert len(res2.place_name.split(',')) > 1 +def test_download_gb_full_dataset(temp_dir): + assert not os.path.exists(os.path.join(temp_dir, 'GB_full.txt')) + nomi = Nominatim('gb_full') + # the data file was downloaded + assert os.path.exists(os.path.join(temp_dir, 'GB_full.txt')) + res = nomi.query_postal_code('BS6 5JR') + + nomi2 = Nominatim('gb_full') + res2 = nomi.query_postal_code('BS65JR') + + assert_array_equal(nomi._data.columns, + nomi2._data.columns) + assert_array_equal(nomi._data_frame.columns, + nomi2._data_frame.columns) + assert nomi._data.shape == nomi._data.shape + assert nomi._data_frame.shape == nomi._data_frame.shape + + assert res.place_name == 'Bristol' + assert res2.place_name == 'Bristol' + + def test_nominatim_query_postal_code(): nomi = Nominatim('fr') From 1d3a297ca99f176587c21d449de98a54a82d812a Mon Sep 17 00:00:00 2001 From: "andrea.marchini" Date: Wed, 27 Nov 2019 17:39:33 +0000 Subject: [PATCH 4/7] Remove type tags --- countries.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/countries.py b/countries.py index e54518d..3c740f2 100644 --- a/countries.py +++ b/countries.py @@ -1,5 +1,4 @@ import warnings -from typing import Set class Countries: @@ -15,10 +14,10 @@ def __init__(self): "VA", "VI", "WF", "YT", "ZA"} @property - def countries_valid(self) -> Set[str]: + def countries_valid(self): return self.__countries_valid - def get_clean_country(self, country: str) -> str: + def get_clean_country(self, country): country = country.upper() if country == 'AR': warnings.warn('The Argentina data file contains the first 5 positions of the postal code.') @@ -32,7 +31,7 @@ def get_clean_country(self, country: str) -> str: 'countries') .format(country)) - def get_clean_country_for_download_path(self, country: str) -> str: + def get_clean_country_for_download_path(self, country): country = self.get_clean_country(country) if country == 'GB_full': return 'GB_full.csv' From e56ad3fa20cb654bc1b70aaf0bb6ebd08ce0c71a Mon Sep 17 00:00:00 2001 From: "andrea.marchini" Date: Thu, 28 Nov 2019 11:28:31 +0000 Subject: [PATCH 5/7] Move Countries to _Country in pgeocode.py --- countries.py | 39 ------------------------------------- pgeocode.py | 54 ++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 51 deletions(-) delete mode 100644 countries.py diff --git a/countries.py b/countries.py deleted file mode 100644 index 3c740f2..0000000 --- a/countries.py +++ /dev/null @@ -1,39 +0,0 @@ -import warnings - - -class Countries: - def __init__(self): - self.__countries_valid = {"AD", "AR", "AS", "AT", "AU", "AX", "BD", "BE", "BG", "BM", - "BR", "BY", "CA", "CH", "CO", "CR", "CZ", "DE", "DK", "DO", - "DZ", "ES", "FI", "FO", "FR", "GB", "GB_full", "GF", "GG", - "GL", "GP", "GT", "GU", "HR", "HU", "IE", "IM", "IN", "IS", - "IT", "JE", "JP", "LI", "LK", "LT", "LU", "LV", "MC", "MD", - "MH", "MK", "MP", "MQ", "MT", "MX", "MY", "NC", "NL", "NO", - "NZ", "PH", "PK", "PL", "PM", "PR", "PT", "RE", "RO", "RU", - "SE", "SI", "SJ", "SK", "SM", "TH", "TR", "UA", "US", "UY", - "VA", "VI", "WF", "YT", "ZA"} - - @property - def countries_valid(self): - return self.__countries_valid - - def get_clean_country(self, country): - country = country.upper() - if country == 'AR': - warnings.warn('The Argentina data file contains the first 5 positions of the postal code.') - if country == 'GB_FULL': - return 'GB_full' - if country in self.__countries_valid: - return country - else: - raise ValueError(('country={} is not a known country code. ' - 'See the README for a list of supported ' - 'countries') - .format(country)) - - def get_clean_country_for_download_path(self, country): - country = self.get_clean_country(country) - if country == 'GB_full': - return 'GB_full.csv' - else: - return country diff --git a/pgeocode.py b/pgeocode.py index 347c9ba..98c491c 100644 --- a/pgeocode.py +++ b/pgeocode.py @@ -3,14 +3,13 @@ # Authors: Roman Yurchak import os +import warnings from zipfile import ZipFile import numpy as np import pandas as pd from pandas.io.common import get_filepath_or_buffer -from countries import Countries - __version__ = '0.1.2' STORAGE_DIR = os.path.join(os.path.expanduser('~'), @@ -36,8 +35,9 @@ class Nominatim(object): into a single entry """ def __init__(self, country='fr', unique=True): - countries = Countries() - self.country = countries.get_clean_country(country) + country_obj = _Country(country) + self.country = country_obj.name + self.download_path = country_obj.get_download_path() self._data_path, self._data = self._get_data(self.country) if unique: self._data_frame = self._index_postal_codes() @@ -45,18 +45,13 @@ def __init__(self, country='fr', unique=True): self._data_frame = self._data self.unique = unique - @staticmethod - def _get_data(country): + def _get_data(self, country): """Load the data from disk; otherwise download and save it""" - countries = Countries() - country = countries.get_clean_country(country) - country_for_download_path = countries.get_clean_country_for_download_path(country) data_path = os.path.join(STORAGE_DIR, country + '.txt') if os.path.exists(data_path): data = pd.read_csv(data_path, dtype={'postal_code': str}) else: - url = DOWNLOAD_URL.format(country=country_for_download_path) - reader, encoding, compression = get_filepath_or_buffer(url)[:3] + reader, encoding, compression = get_filepath_or_buffer(self.download_path)[:3] with ZipFile(reader) as fh_zip: with fh_zip.open(country + '.txt') as fh: data = pd.read_csv(fh, @@ -142,10 +137,45 @@ def query_postal_code(self, codes): return response def query_location(self, name): - """Get locations information from a community/minicipality name""" + """Get locations information from a community/municipality name""" pass +class _Country: + def __init__(self, country_code): + self.countries_valid = {"AD", "AR", "AS", "AT", "AU", "AX", "BD", "BE", "BG", "BM", + "BR", "BY", "CA", "CH", "CO", "CR", "CZ", "DE", "DK", "DO", + "DZ", "ES", "FI", "FO", "FR", "GB", "GB_full", "GF", "GG", + "GL", "GP", "GT", "GU", "HR", "HU", "IE", "IM", "IN", "IS", + "IT", "JE", "JP", "LI", "LK", "LT", "LU", "LV", "MC", "MD", + "MH", "MK", "MP", "MQ", "MT", "MX", "MY", "NC", "NL", "NO", + "NZ", "PH", "PK", "PL", "PM", "PR", "PT", "RE", "RO", "RU", + "SE", "SI", "SJ", "SK", "SM", "TH", "TR", "UA", "US", "UY", + "VA", "VI", "WF", "YT", "ZA"} + self.name = self.get_clean_country(country_code) + + def get_clean_country(self, country_code): + country_code = country_code.upper() + if country_code == 'AR': + warnings.warn('The Argentina data file contains the first 5 positions of the postal code.') + if country_code == 'GB_FULL': + return 'GB_full' + if country_code in self.countries_valid: + return country_code + else: + raise ValueError(('country={} is not a known country code. ' + 'See the README for a list of supported ' + 'countries') + .format(country_code)) + + def get_download_path(self): + if self.name == 'GB_full': + name = 'GB_full.csv' + else: + name = self.name + return DOWNLOAD_URL.format(country=name) + + class GeoDistance(Nominatim): """ Distance calculation from a city name or a postal code From 3d56aad0d7143806c5ffff108af16933019a05be Mon Sep 17 00:00:00 2001 From: "andrea.marchini" Date: Thu, 28 Nov 2019 11:29:01 +0000 Subject: [PATCH 6/7] Uncomment IE --- test_pgeocode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_pgeocode.py b/test_pgeocode.py index e39c981..864ff77 100644 --- a/test_pgeocode.py +++ b/test_pgeocode.py @@ -42,7 +42,7 @@ def _normalize_str(x): ('AU', '6837', 'Perth', '0221', 'Barton', 3089), ('US', '60605', 'Chicago', '94103', 'San Francisco', 2984), ('CA', 'M5R 1X8', 'Toronto', 'H2Z 1A7', 'Montreal', 503), - #('IE', 'D01 R2PO', 'Dublin', 'T12 RW26', 'Cork', 219), + ('IE', 'D01 R2PO', 'Dublin', 'T12 RW26', 'Cork', 219), ]) def test_countries(country, pc1, location1, pc2, location2, distance12): From fb449c4745c567b67fd086f9441e50b2b07e603e Mon Sep 17 00:00:00 2001 From: "andrea.marchini" Date: Thu, 28 Nov 2019 11:57:20 +0000 Subject: [PATCH 7/7] Add tests for _Country --- test_pgeocode.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/test_pgeocode.py b/test_pgeocode.py index 864ff77..73b38df 100644 --- a/test_pgeocode.py +++ b/test_pgeocode.py @@ -26,6 +26,11 @@ def temp_dir(): shutil.rmtree(path) +@pytest.fixture(scope='session') +def country_gb_full(): + return pgeocode._Country('gb_full') + + def _normalize_str(x): if x is np.nan: return x @@ -193,3 +198,26 @@ def test_haversine_distance(): d_pred = haversine_distance(x, y) # same distance +/- 3 km assert_allclose(d_ref, d_pred, atol=3) + + +def test_that_get_clean_country_raises_value_error(): + with pytest.raises(ValueError): + pgeocode._Country('invalid') + + +def test_that_get_clean_country_handles_gb_full(country_gb_full): + assert country_gb_full.name == 'GB_full' + + +def test_that_get_clean_country_works_for_valid_countries(): + country_obj = pgeocode._Country('ES') + assert country_obj.name == 'ES' + + +def test_that_get_download_path_handles_gb_full(country_gb_full): + assert country_gb_full.get_download_path() == "https://download.geonames.org/export/zip/GB_full.csv.zip" + + +def test_that_get_download_path_works_for_valid_countries(): + country_obj = pgeocode._Country('CA') + assert country_obj.get_download_path() == "https://download.geonames.org/export/zip/CA.zip"