Test examples using doctest (#11)
* Test examples using doctest

* Decompress model in CI

* Update CI.yml

* Update CI.yml
vbkaisetsu authored Apr 13, 2023
1 parent 5b7e435 commit e42ac48
Showing 7 changed files with 59 additions and 71 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/CI.yml
@@ -32,9 +32,12 @@ jobs:
      run: |
        python -m pip install --upgrade pip
        pip install pytest mypy zstandard
+       python -c "import zstandard;zstandard.ZstdDecompressor().copy_stream(open('tests/data/vaporetto.model.zst','rb'),open('tests/data/vaporetto.model','wb'))"
        pip install vaporetto --no-index --find-links target/wheels --force-reinstall
        mypy --strict tests
        pytest tests/test_vaporetto.py
+       python -m doctest README.md
+       python -m doctest docs/source/examples.rst
  pack-sdist:
    needs: [ test ]
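For readability, the `python -c` one-liner added above is equivalent to the following short script. It is only an illustrative expansion, using the same `zstandard` call and the same paths as the workflow step:

```python
# Expanded form of the CI one-liner: stream-decompress the zstd-compressed
# test model into the plain model file that the tests and doctests expect.
import zstandard

with open('tests/data/vaporetto.model.zst', 'rb') as src:
    with open('tests/data/vaporetto.model', 'wb') as dst:
        zstandard.ZstdDecompressor().copy_stream(src, dst)
```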
67 changes: 37 additions & 30 deletions README.md
@@ -40,39 +40,46 @@ To perform tokenization, follow [the document of Vaporetto](https://github.com/d
Check the version number as shown below to use compatible models:

```python
-import vaporetto
-vaporetto.VAPORETTO_VERSION
-#=> "0.6.3"
+>>> import vaporetto
+>>> vaporetto.VAPORETTO_VERSION
+'0.6.3'
+
```

Examples:

```python
-# Import vaporetto module
-import vaporetto
+>>> import vaporetto

-# Load the model file
-with open('path/to/model', 'rb') as fp:
-    model = fp.read()
+>>> with open('tests/data/vaporetto.model', 'rb') as fp:
+...     model = fp.read()

-# Create an instance of the Vaporetto
-tokenizer = vaporetto.Vaporetto(model, predict_tags = True)
+>>> tokenizer = vaporetto.Vaporetto(model, predict_tags = True)

-# Tokenize
-tokenizer.tokenize_to_string('まぁ社長は火星猫だ')
-#=> 'まぁ/名詞/マー 社長/名詞/シャチョー は/助詞/ワ 火星/名詞/カセー 猫/名詞/ネコ だ/助動詞/ダ'
-
-tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
-len(tokens)
-#=> 6
-tokens[0].surface()
-#=> 'まぁ'
-tokens[0].tag(0)
-#=> '名詞'
-tokens[0].tag(1)
-#=> 'マー'
-[token.surface() for token in tokens]
-#=> ['まぁ', '社長', 'は', '火星', '猫', 'だ']
+>>> tokenizer.tokenize_to_string('まぁ社長は火星猫だ')
+'まぁ/名詞/マー 社長/名詞/シャチョー は/助詞/ワ 火星/名詞/カセー 猫/名詞/ネコ だ/助動詞/ダ'
+
+>>> tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
+
+>>> len(tokens)
+6
+
+>>> tokens[0].surface()
+'まぁ'
+
+>>> tokens[0].tag(0)
+'名詞'
+
+>>> tokens[0].tag(1)
+'マー'
+
+>>> [token.surface() for token in tokens]
+['まぁ', '社長', 'は', '火星', '猫', 'だ']
+
```

## Note for distributed models
@@ -81,24 +88,24 @@ The distributed models are compressed in zstd format. If you want to load these
you must decompress them outside the API.

```python
-import vaporetto
-import zstandard # zstandard package in PyPI
+>>> import vaporetto
+>>> import zstandard # zstandard package in PyPI

-dctx = zstandard.ZstdDecompressor()
-with open('path/to/model.zst', 'rb') as fp:
-    dict_reader = dctx.stream_reader(fp)
-    tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True)
+>>> dctx = zstandard.ZstdDecompressor()
+>>> with open('tests/data/vaporetto.model.zst', 'rb') as fp:
+...     with dctx.stream_reader(fp) as dict_reader:
+...         tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True)
+
```

## Note for KyTea's models

You can also use KyTea's models as follows:

```python
-with open('path/to/jp-0.4.7-5.mod', 'rb') as fp:
-    model = fp.read()
-
-tokenizer = vaporetto.Vaporetto.create_from_kytea_model(model)
+>>> with open('path/to/jp-0.4.7-5.mod', 'rb') as fp: # doctest: +SKIP
+...     tokenizer = vaporetto.Vaporetto.create_from_kytea_model(fp.read())
+
```

Note: Vaporetto does not support tag prediction with KyTea's models.
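The examples above were rewritten into `>>>` prompt form because `python -m doctest README.md`, added to the CI, scans a text file for interactive-prompt examples and verifies each expected output, so the fenced snippets double as tests. A minimal sketch of the same check done programmatically, assuming the repository root as working directory and a decompressed model present:

```python
# Run the README examples as doctests, mirroring `python -m doctest README.md`.
import doctest

# module_relative=False resolves the path against the working directory
# (assumed here to be the repository root) rather than a package.
results = doctest.testfile('README.md', module_relative=False)
print(f'{results.attempted} examples attempted, {results.failed} failed')
```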
15 changes: 7 additions & 8 deletions docs/source/examples.rst
@@ -21,7 +21,7 @@ The following example tokenizes a string using a Vaporetto model.
.. code-block:: python

    >>> import vaporetto
-   >>> with open('path/to/model', 'rb') as fp:
+   >>> with open('tests/data/vaporetto.model', 'rb') as fp:
    ...     model = fp.read()
    >>> tokenizer = vaporetto.Vaporetto(model, predict_tags = True)
@@ -50,9 +50,9 @@ you must decompress them outside the API:
    >>> import zstandard # zstandard package in PyPI
    >>> dctx = zstandard.ZstdDecompressor()
-   >>> with open('path/to/model.zst', 'rb') as fp:
-   ...     dict_reader = dctx.stream_reader(fp)
-   >>> tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True)
+   >>> with open('tests/data/vaporetto.model.zst', 'rb') as fp:
+   ...     with dctx.stream_reader(fp) as dict_reader:
+   ...         tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True)
Tokenize with KyTea model
-------------------------
@@ -61,7 +61,6 @@ If you want to use a KyTea model, use ``create_from_kytea_model()`` instead.

.. code-block:: python

-   >>> with open('path/to/jp-0.4.7-5.mod', 'rb') as fp:
-   ...     model = fp.read()
-   >>> tokenizer = vaporetto.Vaporetto.create_from_kytea_model(model)
+   >>> import vaporetto
+   >>> with open('path/to/jp-0.4.7-5.mod', 'rb') as fp: # doctest: +SKIP
+   ...     tokenizer = vaporetto.Vaporetto.create_from_kytea_model(fp.read())
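Both documents mark the KyTea example with `# doctest: +SKIP`, which makes doctest display the example without executing it; that is what keeps `python -m doctest` green even though `path/to/jp-0.4.7-5.mod` does not exist in CI. A toy illustration of the directive's per-example effect:

```python
>>> 1 + 1  # executed and checked as usual
2
>>> open('path/to/jp-0.4.7-5.mod', 'rb')  # doctest: +SKIP
```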
1 change: 0 additions & 1 deletion requirements-dev.txt
@@ -4,4 +4,3 @@ mypy>=1.2.0
kytea>=0.1.7
SudachiPy>=0.6.7
SudachiDict-core>=20230110
-zstandard>=0.20.0
5 changes: 2 additions & 3 deletions src/lib.rs
@@ -210,9 +210,8 @@ impl PredictorWrapper {
 ///
 /// Examples:
 ///     >>> import vaporetto
-///     >>> with open('path/to/model', 'rb') as fp:
-///     ...     model = fp.read()
-///     >>> tokenizer = vaporetto.Vaporetto(model, predict_tags = True)
+///     >>> with open('path/to/vaporetto.model', 'rb') as fp:
+///     ...     tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags = True)
 ///     >>> tokenizer.tokenize_to_string('まぁ社長は火星猫だ')
 ///     'まぁ/名詞/マー 社長/名詞/シャチョー は/助詞/ワ 火星/名詞/カセー 猫/名詞/ネコ だ/助動詞/ダ'
 ///     >>> tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
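PyO3 exposes these `///` doc comments as Python docstrings, so the `>>>` examples inside them could in principle be checked with `doctest.testmod` against the installed module. The CI above does not run them, and the docstring's model path is illustrative, so the following is only a sketch:

```python
# Sketch: run the doctest examples embedded in the compiled module's
# docstrings. A real run would need a model file at the docstring's path.
import doctest

import vaporetto

print(doctest.testmod(vaporetto, verbose=True))
```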
File renamed without changes (tests/data/model.zst → tests/data/vaporetto.model.zst).
39 changes: 10 additions & 29 deletions tests/test_vaporetto.py
@@ -3,26 +3,21 @@
import pathlib

import vaporetto
-import zstandard

-MODEL_PATH = pathlib.PurePath(__file__).parent / 'data/model.zst'
+MODEL_PATH = pathlib.PurePath(__file__).parent / 'data/vaporetto.model'


def test_tokenlist_empty() -> None:
-    dctx = zstandard.ZstdDecompressor()
    with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read())
+        tokenizer = vaporetto.Vaporetto(fp.read())
    tokens = tokenizer.tokenize('')

    assert [] == list(tokens)


def test_tokenlist_index() -> None:
-    dctx = zstandard.ZstdDecompressor()
    with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read())
+        tokenizer = vaporetto.Vaporetto(fp.read())
    tokens = tokenizer.tokenize('まぁ社長は火星猫だ')

    assert 'まぁ' == tokens[0].surface()
@@ -34,10 +29,8 @@ def test_tokenlist_index() -> None:


def test_tokenlist_iter() -> None:
-    dctx = zstandard.ZstdDecompressor()
    with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read())
+        tokenizer = vaporetto.Vaporetto(fp.read())
    tokens = tokenizer.tokenize('まぁ社長は火星猫だ')

    assert ['まぁ', '社長', 'は', '火星', '猫', 'だ'] == list(
@@ -46,10 +39,8 @@ def test_tokenlist_iter() -> None:


def test_tokenlist_iter_positions() -> None:
-    dctx = zstandard.ZstdDecompressor()
    with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read())
+        tokenizer = vaporetto.Vaporetto(fp.read())
    tokens = tokenizer.tokenize('まぁ社長は火星猫だ')

    assert [(0, 2), (2, 4), (4, 5), (5, 7), (7, 8), (8, 9)] == list(
@@ -58,20 +49,16 @@ def test_tokenlist_iter_positions() -> None:


def test_wsconst() -> None:
-    dctx = zstandard.ZstdDecompressor()
    with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), wsconst='K')
+        tokenizer = vaporetto.Vaporetto(fp.read(), wsconst='K')
    tokens = tokenizer.tokenize('まぁ社長は火星猫だ')

    assert ['まぁ', '社長', 'は', '火星猫', 'だ'] == list(token.surface() for token in tokens)


def test_tags_1() -> None:
-    dctx = zstandard.ZstdDecompressor()
    with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags=True)
+        tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags=True)
    tokens = tokenizer.tokenize('まぁ社長は火星猫だ')

    assert ['名詞', '名詞', '助詞', '名詞', '名詞', '助動詞'] == list(
@@ -80,10 +67,8 @@ def test_tags_1() -> None:


def test_tags_2() -> None:
-    dctx = zstandard.ZstdDecompressor()
    with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags=True)
+        tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags=True)
    tokens = tokenizer.tokenize('まぁ社長は火星猫だ')

    assert ['マー', 'シャチョー', 'ワ', 'カセー', 'ネコ', 'ダ'] == list(
Expand All @@ -92,18 +77,14 @@ def test_tags_2() -> None:


def test_tokenize_to_string_empty() -> None:
-    dctx = zstandard.ZstdDecompressor()
    with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags=True)
+        tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags=True)
    assert '' == tokenizer.tokenize_to_string('')


def test_tokenize_to_string() -> None:
-    dctx = zstandard.ZstdDecompressor()
    with open(MODEL_PATH, 'rb') as fp:
-        dict_reader = dctx.stream_reader(fp)
-        tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags=True)
+        tokenizer = vaporetto.Vaporetto(fp.read(), predict_tags=True)
    assert (
        'まぁ/名詞/マー 社長/名詞/シャチョー は/助詞/ワ 火星/名詞/カセー 猫/名詞/ネコ だ/助動詞/ダ'
        == tokenizer.tokenize_to_string('まぁ社長は火星猫だ')
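Every test above now repeats the same open-and-read preamble. A possible follow-up, not part of this commit, is a module-scoped pytest fixture that loads the model bytes once; `model_bytes` is a hypothetical name introduced only for this sketch:

```python
# Hypothetical refactor: load the decompressed model once per test module.
import pathlib

import pytest
import vaporetto

MODEL_PATH = pathlib.PurePath(__file__).parent / 'data/vaporetto.model'


@pytest.fixture(scope='module')
def model_bytes() -> bytes:
    # Read the decompressed model once and share it across tests.
    with open(MODEL_PATH, 'rb') as fp:
        return fp.read()


def test_tokenlist_iter(model_bytes: bytes) -> None:
    tokenizer = vaporetto.Vaporetto(model_bytes)
    tokens = tokenizer.tokenize('まぁ社長は火星猫だ')
    assert ['まぁ', '社長', 'は', '火星', '猫', 'だ'] == [t.surface() for t in tokens]
```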
