From 9670fe024b6ac484e863e93d970c445ce090a1df Mon Sep 17 00:00:00 2001 From: alalalalaki Date: Wed, 20 Jan 2021 03:30:39 +0900 Subject: [PATCH] Add z2h. Publish 0.0.4. --- jpstat/estat/core.py | 4 ++++ jpstat/util/z2h.py | 47 ++++++++++++++++++++++++++++++++++++++++++++ jpstat/version.py | 2 +- pyproject.toml | 2 +- 4 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 jpstat/util/z2h.py diff --git a/jpstat/estat/core.py b/jpstat/estat/core.py index bc3fbcc..04e29a6 100644 --- a/jpstat/estat/core.py +++ b/jpstat/estat/core.py @@ -11,6 +11,7 @@ from .. import config from .api import API from .util.clean import clean_dict_cols +from ..util.z2h import str_z2h def get_list(statsCode=None, searchWord=None, outputRaw=False, key=None, lang=None, **kwargs): @@ -23,6 +24,7 @@ def get_list(statsCode=None, searchWord=None, outputRaw=False, key=None, lang=No 'STATISTICS_NAME', 'TITLE', 'SURVEY_DATE', 'OPEN_DATE', 'OVERALL_TOTAL_NUMBER'] df = df[cols_simple].pipe(clean_dict_cols, ['STAT_NAME', 'GOV_ORG', 'TITLE']) + df = df.applymap(str_z2h) return df @@ -31,6 +33,7 @@ def get_stat(key=None, lang=None,): data = api.get_list(statsNameList="Y") df = pd.DataFrame(data['DATALIST_INF']['LIST_INF']) df = df.pipe(clean_dict_cols, ['STAT_NAME', 'GOV_ORG']) + df = df.applymap(str_z2h) return df @@ -61,6 +64,7 @@ def get_data(statsDataId, return_note=True, key=None, lang=None, **kwargs): df.drop(col_name, axis=1, inplace=True) df['Value'] = df['$'] df.drop('$', axis=1, inplace=True) + df = df.applymap(str_z2h) if return_note: try: note = pd.DataFrame(data['STATISTICAL_DATA']['DATA_INF']['NOTE']) diff --git a/jpstat/util/z2h.py b/jpstat/util/z2h.py new file mode 100644 index 0000000..1eeab76 --- /dev/null +++ b/jpstat/util/z2h.py @@ -0,0 +1,47 @@ +""" +This code is largely borrowed from the under-maintained package `japandas` +and is used to convert 「全角」 (full-width) strings to 「半角」 (half-width) strings. 
+""" + +__version__ = 0.1 + +from unicodedata import normalize + +# sound marks require special handling +_ZALPHA = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz') +_ZSYMBOL = '!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~　' +_ZDIGIT = '0123456789' + +# mapping from full-width to half-width +_ALPHA_MAPPER = {c: normalize('NFKC', c) for c in _ZALPHA} +_DIGIT_MAPPER = {c: normalize('NFKC', c) for c in _ZDIGIT} +_SYMBOL_MAPPER = {c: normalize('NFKC', c) for c in _ZSYMBOL} + +# add symbols that cannot be normalized, so they are mapped by hand +# https://www.utf8-chartable.de/unicode-utf8-table.pl?start=12224&names=-&utf8=string-literal +_ZSYMBOL_MAPPER = {"〜": "~", } +_SYMBOL_MAPPER.update(_ZSYMBOL_MAPPER) + + +def _ord_dict(dict):  # re-key a char -> char mapping by code point (ord) + return {ord(k): v for k, v in dict.items()} + + +# code-point keyed tables for str.translate +_Z2H_ALPHA = _ord_dict(_ALPHA_MAPPER) +_Z2H_DIGIT = _ord_dict(_DIGIT_MAPPER) +_Z2H_SYMBOL = _ord_dict(_SYMBOL_MAPPER) + +mapper = dict() +mapper.update(_Z2H_ALPHA) +mapper.update(_Z2H_DIGIT) +mapper.update(_Z2H_SYMBOL) + + +def str_z2h(string: str, ):  # values without .translate are returned unchanged + try: + res = string.translate(mapper) + return res + except AttributeError: + return string diff --git a/jpstat/version.py b/jpstat/version.py index c301537..156d6f9 100644 --- a/jpstat/version.py +++ b/jpstat/version.py @@ -1 +1 @@ -__version__ = '0.0.3.2' +__version__ = '0.0.4' diff --git a/pyproject.toml b/pyproject.toml index 6c3092a..7e64e85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "jpstat" -version = "0.0.3.2" +version = "0.0.4" description = "A python library for accessing official statistics of Japan." authors = ["Xuanli Zhu "] license = "MIT"