From e42ac4862146d6a9de5c936ca674096e96b85aac Mon Sep 17 00:00:00 2001
From: Koichi Akabe
Date: Thu, 13 Apr 2023 15:42:13 +0900
Subject: [PATCH] Test examples using doctest (#11)

* Test examples using doctest

* Decompress model in CI

* Update CI.yml

* Update CI.yml
---
 .github/workflows/CI.yml                      |  3 +
 README.md                                     | 67 ++++++++++--------
 docs/source/examples.rst                      | 15 ++--
 requirements-dev.txt                          |  1 -
 src/lib.rs                                    |  5 +-
 tests/data/{model.zst => vaporetto.model.zst} | Bin
 tests/test_vaporetto.py                       | 39 +++-------
 7 files changed, 59 insertions(+), 71 deletions(-)
 rename tests/data/{model.zst => vaporetto.model.zst} (100%)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 24476c6..693a08b 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -32,9 +32,12 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install pytest mypy zstandard
+          python -c "import zstandard;zstandard.ZstdDecompressor().copy_stream(open('tests/data/vaporetto.model.zst','rb'),open('tests/data/vaporetto.model','wb'))"
           pip install vaporetto --no-index --find-links target/wheels --force-reinstall
           mypy --strict tests
           pytest tests/test_vaporetto.py
+          python -m doctest README.md
+          python -m doctest docs/source/examples.rst

   pack-sdist:
     needs: [ test ]
diff --git a/README.md b/README.md
index 6b54245..20079ac 100644
--- a/README.md
+++ b/README.md
@@ -40,39 +40,46 @@ To perform tokenization, follow [the document of Vaporetto](https://github.com/d
 Check the version number as shown below to use compatible models:
 
 ```python
-import vaporetto
-vaporetto.VAPORETTO_VERSION
-#=> "0.6.3"
+>>> import vaporetto
+>>> vaporetto.VAPORETTO_VERSION
+'0.6.3'
+
 ```
 
 Examples:
 
 ```python
 # Import vaporetto module
-import vaporetto
+>>> import vaporetto
 
 # Load the model file
-with open('path/to/model', 'rb') as fp:
-    model = fp.read()
+>>> with open('tests/data/vaporetto.model', 'rb') as fp:
+...     model = fp.read()
 
 # Create an instance of the Vaporetto
-tokenizer = vaporetto.Vaporetto(model, predict_tags = True)
+>>> tokenizer = vaporetto.Vaporetto(model, predict_tags = True)
 
 # Tokenize
-tokenizer.tokenize_to_string('まぁ社長は火星猫だ')
-#=> 'まぁ/名詞/マー 社長/名詞/シャチョー は/助詞/ワ 火星/名詞/カセー 猫/名詞/ネコ だ/助動詞/ダ'
-
-tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
-len(tokens)
-#=> 6
-tokens[0].surface()
-#=> 'まぁ'
-tokens[0].tag(0)
-#=> '名詞'
-tokens[0].tag(1)
-#=> 'マー'
-[token.surface() for token in tokens]
-#=> ['まぁ', '社長', 'は', '火星', '猫', 'だ']
+>>> tokenizer.tokenize_to_string('まぁ社長は火星猫だ')
+'まぁ/名詞/マー 社長/名詞/シャチョー は/助詞/ワ 火星/名詞/カセー 猫/名詞/ネコ だ/助動詞/ダ'
+
+>>> tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
+
+>>> len(tokens)
+6
+
+>>> tokens[0].surface()
+'まぁ'
+
+>>> tokens[0].tag(0)
+'名詞'
+
+>>> tokens[0].tag(1)
+'マー'
+
+>>> [token.surface() for token in tokens]
+['まぁ', '社長', 'は', '火星', '猫', 'だ']
+
 ```
 
 ## Note for distributed models
@@ -81,13 +88,14 @@ The distributed models are compressed in zstd format. If you want to load these
 you must decompress them outside the API.
 
 ```python
-import vaporetto
-import zstandard # zstandard package in PyPI
+>>> import vaporetto
+>>> import zstandard # zstandard package in PyPI
+
+>>> dctx = zstandard.ZstdDecompressor()
+>>> with open('tests/data/vaporetto.model.zst', 'rb') as fp:
+...     with dctx.stream_reader(fp) as dict_reader:
+...         tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True)
 
-dctx = zstandard.ZstdDecompressor()
-with open('path/to/model.zst', 'rb') as fp:
-    dict_reader = dctx.stream_reader(fp)
-    tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True)
 ```
 
 ## Note for KyTea's models
@@ -95,10 +103,9 @@ with open('path/to/model.zst', 'rb') as fp:
 You can also use KyTea's models as follows:
 
 ```python
-with open('path/to/jp-0.4.7-5.mod', 'rb') as fp:
-    model = fp.read()
+>>> with open('path/to/jp-0.4.7-5.mod', 'rb') as fp: # doctest: +SKIP
+...     tokenizer = vaporetto.Vaporetto.create_from_kytea_model(fp.read())
 
-tokenizer = vaporetto.Vaporetto.create_from_kytea_model(model)
 ```
 
 Note: Vaporetto does not support tag prediction with KyTea's models.
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index 59e404d..055a3d2 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -21,7 +21,7 @@ The following example tokenizes a string using a Vaporetto model.
 .. code-block:: python
 
    >>> import vaporetto
-   >>> with open('path/to/model', 'rb') as fp:
+   >>> with open('tests/data/vaporetto.model', 'rb') as fp:
    ...     model = fp.read()
 
    >>> tokenizer = vaporetto.Vaporetto(model, predict_tags = True)
@@ -50,9 +50,9 @@ you must decompress them outside the API:
    >>> import zstandard # zstandard package in PyPI
 
    >>> dctx = zstandard.ZstdDecompressor()
-   >>> with open('path/to/model.zst', 'rb') as fp:
-   ...     dict_reader = dctx.stream_reader(fp)
-   >>> tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True)
+   >>> with open('tests/data/vaporetto.model.zst', 'rb') as fp:
+   ...     with dctx.stream_reader(fp) as dict_reader:
+   ...         tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True)
 
 Tokenize with KyTea model
 -------------------------
@@ -61,7 +61,6 @@ If you want to use a KyTea model, use ``create_from_kytea_model()`` instead.
 
 .. code-block:: python
 
-   >>> with open('path/to/jp-0.4.7-5.mod', 'rb') as fp:
-   ...     model = fp.read()
-
-   >>> tokenizer = vaporetto.Vaporetto.create_from_kytea_model(model)
+   >>> import vaporetto
+   >>> with open('path/to/jp-0.4.7-5.mod', 'rb') as fp: # doctest: +SKIP
+   ...     tokenizer = vaporetto.Vaporetto.create_from_kytea_model(fp.read())
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6c275b2..c8790a8 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,4 +4,3 @@ mypy>=1.2.0
 kytea>=0.1.7
 SudachiPy>=0.6.7
 SudachiDict-core>=20230110
-zstandard>=0.20.0
diff --git a/src/lib.rs b/src/lib.rs
index 3805729..3f144f0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -210,9 +210,8 @@ impl PredictorWrapper {
 ///
 /// Examples:
 ///     >>> import vaporetto
-///     >>> with open('path/to/model', 'rb') as fp:
-///     ...     model = fp.read()
-///     >>> tokenizer = vaporetto.Vaporetto(model, predict_tags = True)
+///     >>> with open('path/to/vaporetto.model', 'rb') as fp:
+///     ...     tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags = True)
 ///     >>> tokenizer.tokenize_to_string('まぁ社長は火星猫だ')
 ///     'まぁ/名詞/マー 社長/名詞/シャチョー は/助詞/ワ 火星/名詞/カセー 猫/名詞/ネコ だ/助動詞/ダ'
 ///     >>> tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
diff --git a/tests/data/model.zst b/tests/data/vaporetto.model.zst
similarity index 100%
rename from tests/data/model.zst
rename to tests/data/vaporetto.model.zst
diff --git a/tests/test_vaporetto.py b/tests/test_vaporetto.py
index 5d81088..028c8ac 100644
--- a/tests/test_vaporetto.py
+++ b/tests/test_vaporetto.py
@@ -3,26 +3,21 @@
 import pathlib
 
 import vaporetto
-import zstandard
 
-MODEL_PATH = pathlib.PurePath(__file__).parent / 'data/model.zst'
+MODEL_PATH = pathlib.PurePath(__file__).parent / 'data/vaporetto.model'
 
 
 def test_tokenlist_empty() -> None:
-    dctx = zstandard.ZstdDecompressor()
     with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read())
+        tokenizer = vaporetto.Vaporetto(fp.read())
 
     tokens = tokenizer.tokenize('')
     assert [] == list(tokens)
 
 
 def test_tokenlist_index() -> None:
-    dctx = zstandard.ZstdDecompressor()
     with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read())
+        tokenizer = vaporetto.Vaporetto(fp.read())
 
     tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
     assert 'まぁ' == tokens[0].surface()
@@ -34,10 +29,8 @@ def test_tokenlist_index() -> None:
 
 
 def test_tokenlist_iter() -> None:
-    dctx = zstandard.ZstdDecompressor()
     with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read())
+        tokenizer = vaporetto.Vaporetto(fp.read())
 
     tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
     assert ['まぁ', '社長', 'は', '火星', '猫', 'だ'] == list(
@@ -46,10 +39,8 @@ def test_tokenlist_iter() -> None:
 
 
 def test_tokenlist_iter_positions() -> None:
-    dctx = zstandard.ZstdDecompressor()
     with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read())
+        tokenizer = vaporetto.Vaporetto(fp.read())
 
     tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
     assert [(0, 2), (2, 4), (4, 5), (5, 7), (7, 8), (8, 9)] == list(
@@ -58,20 +49,16 @@ def test_tokenlist_iter_positions() -> None:
 
 
 def test_wsconst() -> None:
-    dctx = zstandard.ZstdDecompressor()
     with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), wsconst='K')
+        tokenizer = vaporetto.Vaporetto(fp.read(), wsconst='K')
 
     tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
     assert ['まぁ', '社長', 'は', '火星猫', 'だ'] == list(token.surface() for token in tokens)
 
 
 def test_tags_1() -> None:
-    dctx = zstandard.ZstdDecompressor()
     with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags=True)
+        tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags=True)
 
     tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
     assert ['名詞', '名詞', '助詞', '名詞', '名詞', '助動詞'] == list(
@@ -80,10 +67,8 @@ def test_tags_1() -> None:
 
 
 def test_tags_2() -> None:
-    dctx = zstandard.ZstdDecompressor()
     with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags=True)
+        tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags=True)
 
     tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
     assert ['マー', 'シャチョー', 'ワ', 'カセー', 'ネコ', 'ダ'] == list(
@@ -92,18 +77,14 @@ def test_tags_2() -> None:
 
 
 def test_tokenize_to_string_empty() -> None:
-    dctx = zstandard.ZstdDecompressor()
     with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags=True)
+        tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags=True)
 
     assert '' == tokenizer.tokenize_to_string('')
 
 
 def test_tokenize_to_string() -> None:
-    dctx = zstandard.ZstdDecompressor()
     with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags=True)
+        tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags=True)
 
     assert (
         'まぁ/名詞/マー 社長/名詞/シャチョー は/助詞/ワ 火星/名詞/カセー 猫/名詞/ネコ だ/助動詞/ダ'
         == tokenizer.tokenize_to_string('まぁ社長は火星猫だ')