From 2bb1d779d4f4acaf70b6dfa35dd1899dccbb1ae6 Mon Sep 17 00:00:00 2001 From: Tanja Date: Fri, 10 Jan 2025 13:52:45 +0100 Subject: [PATCH] Replace pickle with safer alternatives (#13067) * Update slack release notification step * [ENG-1424] Use `pickle` alternatives (#1453) * use json.dump and json.load in count_vectors_featurizer and lexical_syntactic_featurizer instead of pickle * update load and persist in sklearn intent classifier * update persist and load in dietclassifier * update load and persist in sklearn intent classifier * use json.dump and json.load in tracker featurizers * update persist and load of TEDPolicy * updated unexpected intent policy persist and load of model utilities. * save and load fake features * rename patterns.pkl to patterns.json * update poetry.lock * ruff formatting * move skops import * add comments * clean up save_features and load_features * WIP: update model data saving and loading * add tests for save and load features * update tests for test_tracker_featurizer * update tests for test_tracker_featurizer * WIP: serialization of feature arrays. 
* update serialization and deserialization for feature array * remove not needed tests/utils/tensorflow/test_model_data_storage.py * start writing tests for feature array * update feature array tests * update tests * fix linting * add changelog * add new dependencies to .github/dependabot.yml * fix some tests * fix loading and saving of unexpected intent ted policy * fix linting issue * fix converting of features in cvf and lsf * fix lint issues * convert vocab in cvf * fix linting * update crf entity extractor * fix to_dict of crf_token * addressed type issues * ruff formatting * fix typing and lint issues * remove cloudpickle dependency * update logistic_regression_classifier and remove joblib as dependency * update formatting of pyproject.toml * next try: update formatting of pyproject.toml * update logging * update poetry.lock * refactor loading of lexical_syntactic_featurizer * rename FeatureMetadata.type -> FeatureMetadata.data_type * clean up tests test_features.py and test_crf_entity_extractor.py * update test_feature_array.py * check for type when loading tracker featurizer. * update changelog * fix line too long * move import of skops * Prepared release of version 3.10.9.dev1 (#1496) * prepared release of version 3.10.9.dev1 * update minimum model version * Check for 'step_id' and 'active_flow' keys in the metadata when adding 'ActionExecuted' event to flows paths stack. 
* fix parsing of commands * improve logging * formatting * add changelog * fix parse commands for multi step * [ATO-2985] - Windows model loading test (#1537) * Add test for model loading on windows * Improve the error message logged when handling the user message * Add a changelog * Fix Code Quality - line too long * Rasa-sdk-update (#1546) * all rasa-sdk micro updates * update poetry lock * update rasa-sdk in lock file * Remove trailing whitespace * Prepared release of version 3.10.11 (#1570) * prepared release of version 3.10.11 * add comments again in pyproject.toml * update poetry.lock * revert changes in github workflows * undo changes in pyproject.toml * update changelog * revert changes in github workflows * update poetry.lock * update poetry.lock * update pyproject.toml * update poetry.lock * update setuptools = '>=65.5.1,<75.6.0' * update setuptools = '~75.3.0' * reformat code * undo deleting of ping_slack_about_package_release.sh * fix formatting and type issues * downgrade setuptools to 70.3.0 * fixing logging issues (?) 
--------- Co-authored-by: sancharigr --- .github/workflows/continous-integration.yml | 1 - changelog/1424.bugfix.md | 19 + poetry.lock | 454 +++++++++++++----- pyproject.toml | 9 +- .../featurizers/single_state_featurizer.py | 23 +- rasa/core/featurizers/tracker_featurizers.py | 133 ++++- rasa/core/policies/ted_policy.py | 91 ++-- .../core/policies/unexpected_intent_policy.py | 22 +- rasa/nlu/classifiers/diet_classifier.py | 63 ++- .../logistic_regression_classifier.py | 31 +- .../classifiers/sklearn_intent_classifier.py | 53 +- rasa/nlu/extractors/crf_entity_extractor.py | 155 +++--- .../count_vectors_featurizer.py | 59 ++- .../lexical_syntactic_featurizer.py | 69 ++- .../sparse_featurizer/regex_featurizer.py | 8 +- rasa/shared/nlu/training_data/features.py | 122 ++++- rasa/shared/utils/io.py | 1 + rasa/utils/common.py | 7 +- rasa/utils/io.py | 76 +-- rasa/utils/tensorflow/feature_array.py | 370 ++++++++++++++ rasa/utils/tensorflow/model_data.py | 198 +------- scripts/ping_slack_about_package_release.sh | 1 - .../featurizers/test_tracker_featurizer.py | 38 +- .../extractors/test_crf_entity_extractor.py | 121 ++++- .../shared/nlu/training_data/test_features.py | 219 ++++++++- tests/utils/tensorflow/test_feature_array.py | 197 ++++++++ tests/utils/test_io.py | 18 - 27 files changed, 1942 insertions(+), 616 deletions(-) create mode 100644 changelog/1424.bugfix.md create mode 100644 rasa/utils/tensorflow/feature_array.py mode change 100755 => 100644 scripts/ping_slack_about_package_release.sh create mode 100644 tests/utils/tensorflow/test_feature_array.py diff --git a/.github/workflows/continous-integration.yml b/.github/workflows/continous-integration.yml index 67594243b1f1..b6f2bd8d38f3 100644 --- a/.github/workflows/continous-integration.yml +++ b/.github/workflows/continous-integration.yml @@ -1307,7 +1307,6 @@ jobs: with: args: "💥 New *Rasa Open Source * version `${{ github.ref_name }}` has been released!" 
- send_slack_notification_for_release_on_failure: name: Notify Slack & Publish Release Notes runs-on: ubuntu-24.04 diff --git a/changelog/1424.bugfix.md b/changelog/1424.bugfix.md new file mode 100644 index 000000000000..d71648f6da3c --- /dev/null +++ b/changelog/1424.bugfix.md @@ -0,0 +1,19 @@ +Replace `pickle` and `joblib` with safer alternatives, e.g. `json`, `safetensors`, and `skops`, for +serializing components. + +**Note**: This is a model breaking change. Please retrain your model. + +If you have a custom component that inherits from one of the components listed below and modified the `persist` or +`load` method, make sure to update your code. Please contact us in case you encounter any problems. + +Affected components: + +- `CountVectorFeaturizer` +- `LexicalSyntacticFeaturizer` +- `LogisticRegressionClassifier` +- `SklearnIntentClassifier` +- `DIETClassifier` +- `CRFEntityExtractor` +- `TrackerFeaturizer` +- `TEDPolicy` +- `UnexpectedIntentTEDPolicy` \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 58af70bada4b..ba2092a01ff4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1412,7 +1412,7 @@ requests = ">=2.0" name = "filelock" version = "3.12.2" description = "A platform independent file lock." 
-optional = true +optional = false python-versions = ">=3.7" files = [ {file = "filelock-3.12.2-py3-none-any.whl", hash = "sha256:cbb791cdea2a72f23da6ac5b5269ab0a0d161e9ef0100e653b69049a7706d1ec"}, @@ -2214,18 +2214,18 @@ socks = ["socksio (==1.*)"] [[package]] name = "huggingface-hub" -version = "0.16.2" +version = "0.27.0" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" -optional = true -python-versions = ">=3.7.0" +optional = false +python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.16.2-py3-none-any.whl", hash = "sha256:92facff575c11a8cf4b35d184ae67867a577a1b30865edcd8a9c5a48d2202133"}, - {file = "huggingface_hub-0.16.2.tar.gz", hash = "sha256:205abbf02a3408129a309f09e6d1a88d0c82de296b498682a813d9baa91c272f"}, + {file = "huggingface_hub-0.27.0-py3-none-any.whl", hash = "sha256:8f2e834517f1f1ddf1ecc716f91b120d7333011b7485f665a9a412eacb1a2a81"}, + {file = "huggingface_hub-0.27.0.tar.gz", hash = "sha256:902cce1a1be5739f5589e560198a65a8edcfd3b830b1666f36e4b961f0454fac"}, ] [package.dependencies] filelock = "*" -fsspec = "*" +fsspec = ">=2023.5.0" packaging = ">=20.9" pyyaml = ">=5.1" requests = "*" @@ -2233,16 +2233,18 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", 
"types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] -inference = ["aiohttp", "pydantic"] -quality = ["black (>=23.1,<24.0)", "mypy (==0.982)", "ruff (>=0.0.241)"] +hf-transfer = ["hf-transfer (>=0.1.4)"] +inference = ["aiohttp"] +quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.5.0)"] tensorflow = ["graphviz", "pydot", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] -torch = ["torch"] -typing = ["pydantic", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] +tensorflow-testing = ["keras (<3.0)", "tensorflow"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", 
"pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +torch = ["safetensors[torch]", "torch"] +typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] [[package]] name = "humanfriendly" @@ -2416,30 +2418,30 @@ files = [ [[package]] name = "joblib" -version = "1.2.0" +version = "1.4.2" description = "Lightweight pipelining with Python functions" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"}, - {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"}, + {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, + {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, ] [[package]] name = "jsonpickle" -version = "3.0.1" -description = "Python library for serializing any arbitrary object graph into JSON" +version = "3.0.4" +description = "Serialize any Python object to JSON" optional = false python-versions = ">=3.7" files = [ - {file = "jsonpickle-3.0.1-py2.py3-none-any.whl", hash = "sha256:130d8b293ea0add3845de311aaba55e6d706d0bb17bc123bd2c8baf8a39ac77c"}, - {file = "jsonpickle-3.0.1.tar.gz", hash = "sha256:032538804795e73b94ead410800ac387fdb6de98f8882ac957fcd247e3a85200"}, + {file = "jsonpickle-3.0.4-py3-none-any.whl", hash = "sha256:04ae7567a14269579e3af66b76bda284587458d7e8a204951ca8f71a3309952e"}, + {file = "jsonpickle-3.0.4.tar.gz", hash = "sha256:a1b14c8d6221cd8f394f2a97e735ea1d7edc927fbd135b26f2f8700657c8c62b"}, ] [package.extras] -docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] -testing = ["ecdsa", "feedparser", "gmpy2", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", 
"pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -testing-libs = ["simplejson", "ujson"] +docs = ["furo", "rst.linker (>=1.9)", "sphinx"] +packaging = ["build", "twine"] +testing = ["bson", "ecdsa", "feedparser", "gmpy2", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-benchmark", "pytest-benchmark[histogram]", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-ruff (>=0.2.1)", "scikit-learn", "scipy", "scipy (>=1.9.3)", "simplejson", "sqlalchemy", "ujson"] [[package]] name = "jsonschema" @@ -4746,6 +4748,138 @@ botocore = ">=1.12.36,<2.0a.0" [package.extras] crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] +[[package]] +name = "safetensors" +version = "0.4.5" +description = "" +optional = false +python-versions = ">=3.7" +files = [ + {file = "safetensors-0.4.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a63eaccd22243c67e4f2b1c3e258b257effc4acd78f3b9d397edc8cf8f1298a7"}, + {file = "safetensors-0.4.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:23fc9b4ec7b602915cbb4ec1a7c1ad96d2743c322f20ab709e2c35d1b66dad27"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6885016f34bef80ea1085b7e99b3c1f92cb1be78a49839203060f67b40aee761"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:133620f443450429322f238fda74d512c4008621227fccf2f8cf4a76206fea7c"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4fb3e0609ec12d2a77e882f07cced530b8262027f64b75d399f1504ffec0ba56"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0f1dd769f064adc33831f5e97ad07babbd728427f98e3e1db6902e369122737"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:c6d156bdb26732feada84f9388a9f135528c1ef5b05fae153da365ad4319c4c5"}, + {file = "safetensors-0.4.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9e347d77e2c77eb7624400ccd09bed69d35c0332f417ce8c048d404a096c593b"}, + {file = "safetensors-0.4.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9f556eea3aec1d3d955403159fe2123ddd68e880f83954ee9b4a3f2e15e716b6"}, + {file = "safetensors-0.4.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9483f42be3b6bc8ff77dd67302de8ae411c4db39f7224dec66b0eb95822e4163"}, + {file = "safetensors-0.4.5-cp310-none-win32.whl", hash = "sha256:7389129c03fadd1ccc37fd1ebbc773f2b031483b04700923c3511d2a939252cc"}, + {file = "safetensors-0.4.5-cp310-none-win_amd64.whl", hash = "sha256:e98ef5524f8b6620c8cdef97220c0b6a5c1cef69852fcd2f174bb96c2bb316b1"}, + {file = "safetensors-0.4.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:21f848d7aebd5954f92538552d6d75f7c1b4500f51664078b5b49720d180e47c"}, + {file = "safetensors-0.4.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bb07000b19d41e35eecef9a454f31a8b4718a185293f0d0b1c4b61d6e4487971"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09dedf7c2fda934ee68143202acff6e9e8eb0ddeeb4cfc24182bef999efa9f42"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:59b77e4b7a708988d84f26de3ebead61ef1659c73dcbc9946c18f3b1786d2688"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d3bc83e14d67adc2e9387e511097f254bd1b43c3020440e708858c684cbac68"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39371fc551c1072976073ab258c3119395294cf49cdc1f8476794627de3130df"}, + {file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6c19feda32b931cae0acd42748a670bdf56bee6476a046af20181ad3fee4090"}, + {file = 
"safetensors-0.4.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a659467495de201e2f282063808a41170448c78bada1e62707b07a27b05e6943"}, + {file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bad5e4b2476949bcd638a89f71b6916fa9a5cae5c1ae7eede337aca2100435c0"}, + {file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a3a315a6d0054bc6889a17f5668a73f94f7fe55121ff59e0a199e3519c08565f"}, + {file = "safetensors-0.4.5-cp311-none-win32.whl", hash = "sha256:a01e232e6d3d5cf8b1667bc3b657a77bdab73f0743c26c1d3c5dd7ce86bd3a92"}, + {file = "safetensors-0.4.5-cp311-none-win_amd64.whl", hash = "sha256:cbd39cae1ad3e3ef6f63a6f07296b080c951f24cec60188378e43d3713000c04"}, + {file = "safetensors-0.4.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:473300314e026bd1043cef391bb16a8689453363381561b8a3e443870937cc1e"}, + {file = "safetensors-0.4.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:801183a0f76dc647f51a2d9141ad341f9665602a7899a693207a82fb102cc53e"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1524b54246e422ad6fb6aea1ac71edeeb77666efa67230e1faf6999df9b2e27f"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3139098e3e8b2ad7afbca96d30ad29157b50c90861084e69fcb80dec7430461"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65573dc35be9059770808e276b017256fa30058802c29e1038eb1c00028502ea"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd33da8e9407559f8779c82a0448e2133737f922d71f884da27184549416bfed"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3685ce7ed036f916316b567152482b7e959dc754fcc4a8342333d222e05f407c"}, + {file = "safetensors-0.4.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:dde2bf390d25f67908278d6f5d59e46211ef98e44108727084d4637ee70ab4f1"}, + {file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7469d70d3de970b1698d47c11ebbf296a308702cbaae7fcb993944751cf985f4"}, + {file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3a6ba28118636a130ccbb968bc33d4684c48678695dba2590169d5ab03a45646"}, + {file = "safetensors-0.4.5-cp312-none-win32.whl", hash = "sha256:c859c7ed90b0047f58ee27751c8e56951452ed36a67afee1b0a87847d065eec6"}, + {file = "safetensors-0.4.5-cp312-none-win_amd64.whl", hash = "sha256:b5a8810ad6a6f933fff6c276eae92c1da217b39b4d8b1bc1c0b8af2d270dc532"}, + {file = "safetensors-0.4.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:25e5f8e2e92a74f05b4ca55686234c32aac19927903792b30ee6d7bd5653d54e"}, + {file = "safetensors-0.4.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81efb124b58af39fcd684254c645e35692fea81c51627259cdf6d67ff4458916"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:585f1703a518b437f5103aa9cf70e9bd437cb78eea9c51024329e4fb8a3e3679"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4b99fbf72e3faf0b2f5f16e5e3458b93b7d0a83984fe8d5364c60aa169f2da89"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b17b299ca9966ca983ecda1c0791a3f07f9ca6ab5ded8ef3d283fff45f6bcd5f"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76ded72f69209c9780fdb23ea89e56d35c54ae6abcdec67ccb22af8e696e449a"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2783956926303dcfeb1de91a4d1204cd4089ab441e622e7caee0642281109db3"}, + {file = "safetensors-0.4.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d94581aab8c6b204def4d7320f07534d6ee34cd4855688004a4354e63b639a35"}, + {file = 
"safetensors-0.4.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:67e1e7cb8678bb1b37ac48ec0df04faf689e2f4e9e81e566b5c63d9f23748523"}, + {file = "safetensors-0.4.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:dbd280b07e6054ea68b0cb4b16ad9703e7d63cd6890f577cb98acc5354780142"}, + {file = "safetensors-0.4.5-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:77d9b228da8374c7262046a36c1f656ba32a93df6cc51cd4453af932011e77f1"}, + {file = "safetensors-0.4.5-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:500cac01d50b301ab7bb192353317035011c5ceeef0fca652f9f43c000bb7f8d"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75331c0c746f03158ded32465b7d0b0e24c5a22121743662a2393439c43a45cf"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:670e95fe34e0d591d0529e5e59fd9d3d72bc77b1444fcaa14dccda4f36b5a38b"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:098923e2574ff237c517d6e840acada8e5b311cb1fa226019105ed82e9c3b62f"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ca0902d2648775089fa6a0c8fc9e6390c5f8ee576517d33f9261656f851e3f"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f0032bedc869c56f8d26259fe39cd21c5199cd57f2228d817a0e23e8370af25"}, + {file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f4b15f51b4f8f2a512341d9ce3475cacc19c5fdfc5db1f0e19449e75f95c7dc8"}, + {file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f6594d130d0ad933d885c6a7b75c5183cb0e8450f799b80a39eae2b8508955eb"}, + {file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:60c828a27e852ded2c85fc0f87bf1ec20e464c5cd4d56ff0e0711855cc2e17f8"}, + {file = "safetensors-0.4.5-cp37-none-win32.whl", hash = 
"sha256:6d3de65718b86c3eeaa8b73a9c3d123f9307a96bbd7be9698e21e76a56443af5"}, + {file = "safetensors-0.4.5-cp37-none-win_amd64.whl", hash = "sha256:5a2d68a523a4cefd791156a4174189a4114cf0bf9c50ceb89f261600f3b2b81a"}, + {file = "safetensors-0.4.5-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:e7a97058f96340850da0601a3309f3d29d6191b0702b2da201e54c6e3e44ccf0"}, + {file = "safetensors-0.4.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:63bfd425e25f5c733f572e2246e08a1c38bd6f2e027d3f7c87e2e43f228d1345"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3664ac565d0e809b0b929dae7ccd74e4d3273cd0c6d1220c6430035befb678e"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:313514b0b9b73ff4ddfb4edd71860696dbe3c1c9dc4d5cc13dbd74da283d2cbf"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31fa33ee326f750a2f2134a6174773c281d9a266ccd000bd4686d8021f1f3dac"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:09566792588d77b68abe53754c9f1308fadd35c9f87be939e22c623eaacbed6b"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309aaec9b66cbf07ad3a2e5cb8a03205663324fea024ba391594423d0f00d9fe"}, + {file = "safetensors-0.4.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:53946c5813b8f9e26103c5efff4a931cc45d874f45229edd68557ffb35ffb9f8"}, + {file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:868f9df9e99ad1e7f38c52194063a982bc88fedc7d05096f4f8160403aaf4bd6"}, + {file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9cc9449bd0b0bc538bd5e268221f0c5590bc5c14c1934a6ae359d44410dc68c4"}, + {file = "safetensors-0.4.5-cp38-none-win32.whl", hash = "sha256:83c4f13a9e687335c3928f615cd63a37e3f8ef072a3f2a0599fa09f863fb06a2"}, + {file = 
"safetensors-0.4.5-cp38-none-win_amd64.whl", hash = "sha256:b98d40a2ffa560653f6274e15b27b3544e8e3713a44627ce268f419f35c49478"}, + {file = "safetensors-0.4.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:cf727bb1281d66699bef5683b04d98c894a2803442c490a8d45cd365abfbdeb2"}, + {file = "safetensors-0.4.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96f1d038c827cdc552d97e71f522e1049fef0542be575421f7684756a748e457"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:139fbee92570ecea774e6344fee908907db79646d00b12c535f66bc78bd5ea2c"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c36302c1c69eebb383775a89645a32b9d266878fab619819ce660309d6176c9b"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d641f5b8149ea98deb5ffcf604d764aad1de38a8285f86771ce1abf8e74c4891"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b4db6a61d968de73722b858038c616a1bebd4a86abe2688e46ca0cc2d17558f2"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b75a616e02f21b6f1d5785b20cecbab5e2bd3f6358a90e8925b813d557666ec1"}, + {file = "safetensors-0.4.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:788ee7d04cc0e0e7f944c52ff05f52a4415b312f5efd2ee66389fb7685ee030c"}, + {file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:87bc42bd04fd9ca31396d3ca0433db0be1411b6b53ac5a32b7845a85d01ffc2e"}, + {file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4037676c86365a721a8c9510323a51861d703b399b78a6b4486a54a65a975fca"}, + {file = "safetensors-0.4.5-cp39-none-win32.whl", hash = "sha256:1500418454529d0ed5c1564bda376c4ddff43f30fce9517d9bee7bcce5a8ef50"}, + {file = "safetensors-0.4.5-cp39-none-win_amd64.whl", hash = 
"sha256:9d1a94b9d793ed8fe35ab6d5cea28d540a46559bafc6aae98f30ee0867000cab"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fdadf66b5a22ceb645d5435a0be7a0292ce59648ca1d46b352f13cff3ea80410"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d42ffd4c2259f31832cb17ff866c111684c87bd930892a1ba53fed28370c918c"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd8a1f6d2063a92cd04145c7fd9e31a1c7d85fbec20113a14b487563fdbc0597"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:951d2fcf1817f4fb0ef0b48f6696688a4e852a95922a042b3f96aaa67eedc920"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ac85d9a8c1af0e3132371d9f2d134695a06a96993c2e2f0bbe25debb9e3f67a"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e3cec4a29eb7fe8da0b1c7988bc3828183080439dd559f720414450de076fcab"}, + {file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c7db3006a4915151ce1913652e907cdede299b974641a83fbc092102ac41b644"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f68bf99ea970960a237f416ea394e266e0361895753df06e3e06e6ea7907d98b"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8158938cf3324172df024da511839d373c40fbfaa83e9abf467174b2910d7b4c"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:540ce6c4bf6b58cb0fd93fa5f143bc0ee341c93bb4f9287ccd92cf898cc1b0dd"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", 
hash = "sha256:bfeaa1a699c6b9ed514bd15e6a91e74738b71125a9292159e3d6b7f0a53d2cde"}, + {file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:01c8f00da537af711979e1b42a69a8ec9e1d7112f208e0e9b8a35d2c381085ef"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a0dd565f83b30f2ca79b5d35748d0d99dd4b3454f80e03dfb41f0038e3bdf180"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:023b6e5facda76989f4cba95a861b7e656b87e225f61811065d5c501f78cdb3f"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9633b663393d5796f0b60249549371e392b75a0b955c07e9c6f8708a87fc841f"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78dd8adfb48716233c45f676d6e48534d34b4bceb50162c13d1f0bdf6f78590a"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8e8deb16c4321d61ae72533b8451ec4a9af8656d1c61ff81aa49f966406e4b68"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:52452fa5999dc50c4decaf0c53aa28371f7f1e0fe5c2dd9129059fbe1e1599c7"}, + {file = "safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d5f23198821e227cfc52d50fa989813513db381255c6d100927b012f0cfec63d"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f4beb84b6073b1247a773141a6331117e35d07134b3bb0383003f39971d414bb"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:68814d599d25ed2fdd045ed54d370d1d03cf35e02dce56de44c651f828fb9b7b"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0b6453c54c57c1781292c46593f8a37254b8b99004c68d6c3ce229688931a22"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:adaa9c6dead67e2dd90d634f89131e43162012479d86e25618e821a03d1eb1dc"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73e7d408e9012cd17511b382b43547850969c7979efc2bc353f317abaf23c84c"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:775409ce0fcc58b10773fdb4221ed1eb007de10fe7adbdf8f5e8a56096b6f0bc"}, + {file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:834001bed193e4440c4a3950a31059523ee5090605c907c66808664c932b549c"}, + {file = "safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310"}, +] + +[package.extras] +all = ["safetensors[jax]", "safetensors[numpy]", "safetensors[paddlepaddle]", "safetensors[pinned-tf]", "safetensors[quality]", "safetensors[testing]", "safetensors[torch]"] +dev = ["safetensors[all]"] +jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "safetensors[numpy]"] +mlx = ["mlx (>=0.0.9)"] +numpy = ["numpy (>=1.21.6)"] +paddlepaddle = ["paddlepaddle (>=2.4.1)", "safetensors[numpy]"] +pinned-tf = ["safetensors[numpy]", "tensorflow (==2.11.0)"] +quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"] +tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"] +testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"] +torch = ["safetensors[numpy]", "torch (>=1.10)"] + [[package]] name = "sanic" version = "21.12.2" @@ -5014,19 +5148,18 @@ tornado = ["tornado (>=5)"] [[package]] name = "setuptools" -version = "68.0.0" +version = "70.3.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "setuptools-68.0.0-py3-none-any.whl", hash = 
"sha256:11e52c67415a381d10d6b462ced9cfb97066179f0e871399e006c4ab101fc85f"}, - {file = "setuptools-68.0.0.tar.gz", hash = "sha256:baf1fdb41c6da4cd2eae722e135500da913332ab3f2f5c7d33af9b492acb5235"}, + {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"}, + {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks 
(!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -5056,6 +5189,28 @@ six = "*" tabulate = "*" tqdm = ">=2.0" +[[package]] +name = "skops" +version = "0.9.0" +description = "A set of tools to push scikit-learn based models to and pull from Hugging Face Hub" +optional = false +python-versions = ">=3.8" +files = [ + {file = "skops-0.9.0-py3-none-any.whl", hash = "sha256:05645199bf6976e1f6dbba4a0704799cd5d2fcef18a98b069b4c84744e1a80a1"}, + {file = "skops-0.9.0.tar.gz", hash = "sha256:3e39333d65f26d5863ad44db5001b4cfe6a29642274ac37af54fb834813aee3f"}, +] + +[package.dependencies] +huggingface-hub = ">=0.17.0" +packaging = ">=17.0" +scikit-learn = ">=0.24" +tabulate = ">=0.8.8" + +[package.extras] +docs = ["fairlearn (>=0.7.0)", "matplotlib (>=3.3)", "numpydoc (>=1.0.0)", "pandas (>=1)", "scikit-learn-intelex (>=2021.7.1)", "sphinx (>=3.2.0)", "sphinx-gallery (>=0.7.0)", "sphinx-issues (>=1.2.0)", "sphinx-prompt (>=1.3.0)", "sphinx-rtd-theme (>=1)"] +rich = ["rich (>=12)"] +tests = ["catboost (>=1.0)", "fairlearn (>=0.7.0)", "flake8 (>=3.8.2)", "flaky (>=3.7.0)", "lightgbm (>=3)", "matplotlib (>=3.3)", "pandas (>=1)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "quantile-forest (>=1.0.0)", "rich (>=12)", "types-requests (>=2.28.5)", "xgboost (>=1.6)"] + [[package]] name = "slack-sdk" version = "3.21.3" @@ -5963,56 +6118,129 @@ files = [ [[package]] name = "tokenizers" -version = "0.13.3" -description = "Fast and Customizable Tokenizers" +version = "0.15.2" +description = "" optional = true -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = 
"sha256:f3835c5be51de8c0a092058a4d4380cb9244fb34681fd0a295fbf0a52a5fdf33"}, - {file = "tokenizers-0.13.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4ef4c3e821730f2692489e926b184321e887f34fb8a6b80b8096b966ba663d07"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5fd1a6a25353e9aa762e2aae5a1e63883cad9f4e997c447ec39d071020459bc"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee0b1b311d65beab83d7a41c56a1e46ab732a9eed4460648e8eb0bd69fc2d059"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ef4215284df1277dadbcc5e17d4882bda19f770d02348e73523f7e7d8b8d396"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4d53976079cff8a033f778fb9adca2d9d69d009c02fa2d71a878b5f3963ed30"}, - {file = "tokenizers-0.13.3-cp310-cp310-win32.whl", hash = "sha256:1f0e3b4c2ea2cd13238ce43548959c118069db7579e5d40ec270ad77da5833ce"}, - {file = "tokenizers-0.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:89649c00d0d7211e8186f7a75dfa1db6996f65edce4b84821817eadcc2d3c79e"}, - {file = "tokenizers-0.13.3-cp311-cp311-macosx_10_11_universal2.whl", hash = "sha256:56b726e0d2bbc9243872b0144515ba684af5b8d8cd112fb83ee1365e26ec74c8"}, - {file = "tokenizers-0.13.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc5c022ce692e1f499d745af293ab9ee6f5d92538ed2faf73f9708c89ee59ce6"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f55c981ac44ba87c93e847c333e58c12abcbb377a0c2f2ef96e1a266e4184ff2"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f247eae99800ef821a91f47c5280e9e9afaeed9980fc444208d5aa6ba69ff148"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:4b3e3215d048e94f40f1c95802e45dcc37c5b05eb46280fc2ccc8cd351bff839"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ba2b0bf01777c9b9bc94b53764d6684554ce98551fec496f71bc5be3a03e98b"}, - {file = "tokenizers-0.13.3-cp311-cp311-win32.whl", hash = "sha256:cc78d77f597d1c458bf0ea7c2a64b6aa06941c7a99cb135b5969b0278824d808"}, - {file = "tokenizers-0.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:ecf182bf59bd541a8876deccf0360f5ae60496fd50b58510048020751cf1724c"}, - {file = "tokenizers-0.13.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:0527dc5436a1f6bf2c0327da3145687d3bcfbeab91fed8458920093de3901b44"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07cbb2c307627dc99b44b22ef05ff4473aa7c7cc1fec8f0a8b37d8a64b1a16d2"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4560dbdeaae5b7ee0d4e493027e3de6d53c991b5002d7ff95083c99e11dd5ac0"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64064bd0322405c9374305ab9b4c07152a1474370327499911937fd4a76d004b"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8c6e2ab0f2e3d939ca66aa1d596602105fe33b505cd2854a4c1717f704c51de"}, - {file = "tokenizers-0.13.3-cp37-cp37m-win32.whl", hash = "sha256:6cc29d410768f960db8677221e497226e545eaaea01aa3613fa0fdf2cc96cff4"}, - {file = "tokenizers-0.13.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fc2a7fdf864554a0dacf09d32e17c0caa9afe72baf9dd7ddedc61973bae352d8"}, - {file = "tokenizers-0.13.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8791dedba834c1fc55e5f1521be325ea3dafb381964be20684b92fdac95d79b7"}, - {file = "tokenizers-0.13.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:d607a6a13718aeb20507bdf2b96162ead5145bbbfa26788d6b833f98b31b26e1"}, - {file = 
"tokenizers-0.13.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3791338f809cd1bf8e4fee6b540b36822434d0c6c6bc47162448deee3f77d425"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2f35f30e39e6aab8716f07790f646bdc6e4a853816cc49a95ef2a9016bf9ce6"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:310204dfed5aa797128b65d63538a9837cbdd15da2a29a77d67eefa489edda26"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0f9b92ea052305166559f38498b3b0cae159caea712646648aaa272f7160963"}, - {file = "tokenizers-0.13.3-cp38-cp38-win32.whl", hash = "sha256:9a3fa134896c3c1f0da6e762d15141fbff30d094067c8f1157b9fdca593b5806"}, - {file = "tokenizers-0.13.3-cp38-cp38-win_amd64.whl", hash = "sha256:8e7b0cdeace87fa9e760e6a605e0ae8fc14b7d72e9fc19c578116f7287bb873d"}, - {file = "tokenizers-0.13.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:00cee1e0859d55507e693a48fa4aef07060c4bb6bd93d80120e18fea9371c66d"}, - {file = "tokenizers-0.13.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a23ff602d0797cea1d0506ce69b27523b07e70f6dda982ab8cf82402de839088"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70ce07445050b537d2696022dafb115307abdffd2a5c106f029490f84501ef97"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:280ffe95f50eaaf655b3a1dc7ff1d9cf4777029dbbc3e63a74e65a056594abc3"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97acfcec592f7e9de8cadcdcda50a7134423ac8455c0166b28c9ff04d227b371"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd7730c98a3010cd4f523465867ff95cd9d6430db46676ce79358f65ae39797b"}, - {file = "tokenizers-0.13.3-cp39-cp39-win32.whl", hash = 
"sha256:48625a108029cb1ddf42e17a81b5a3230ba6888a70c9dc14e81bc319e812652d"}, - {file = "tokenizers-0.13.3-cp39-cp39-win_amd64.whl", hash = "sha256:bc0a6f1ba036e482db6453571c9e3e60ecd5489980ffd95d11dc9f960483d783"}, - {file = "tokenizers-0.13.3.tar.gz", hash = "sha256:2e546dbb68b623008a5442353137fbb0123d311a6d7ba52f2667c8862a75af2e"}, -] - -[package.extras] -dev = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] -docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] + {file = "tokenizers-0.15.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:52f6130c9cbf70544287575a985bf44ae1bda2da7e8c24e97716080593638012"}, + {file = "tokenizers-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:054c1cc9c6d68f7ffa4e810b3d5131e0ba511b6e4be34157aa08ee54c2f8d9ee"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a9b9b070fdad06e347563b88c278995735292ded1132f8657084989a4c84a6d5"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea621a7eef4b70e1f7a4e84dd989ae3f0eeb50fc8690254eacc08acb623e82f1"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf7fd9a5141634fa3aa8d6b7be362e6ae1b4cda60da81388fa533e0b552c98fd"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44f2a832cd0825295f7179eaf173381dc45230f9227ec4b44378322d900447c9"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b9ec69247a23747669ec4b0ca10f8e3dfb3545d550258129bd62291aabe8605"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b6a4c78da863ff26dbd5ad9a8ecc33d8a8d97b535172601cf00aee9d7ce9ce"}, + {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5ab2a4d21dcf76af60e05af8063138849eb1d6553a0d059f6534357bce8ba364"}, + {file = 
"tokenizers-0.15.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a47acfac7e511f6bbfcf2d3fb8c26979c780a91e06fb5b9a43831b2c0153d024"}, + {file = "tokenizers-0.15.2-cp310-none-win32.whl", hash = "sha256:064ff87bb6acdbd693666de9a4b692add41308a2c0ec0770d6385737117215f2"}, + {file = "tokenizers-0.15.2-cp310-none-win_amd64.whl", hash = "sha256:3b919afe4df7eb6ac7cafd2bd14fb507d3f408db7a68c43117f579c984a73843"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:89cd1cb93e4b12ff39bb2d626ad77e35209de9309a71e4d3d4672667b4b256e7"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cfed5c64e5be23d7ee0f0e98081a25c2a46b0b77ce99a4f0605b1ec43dd481fa"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a907d76dcfda37023ba203ab4ceeb21bc5683436ebefbd895a0841fd52f6f6f2"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ea60479de6fc7b8ae756b4b097572372d7e4032e2521c1bbf3d90c90a99ff0"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:48e2b9335be2bc0171df9281385c2ed06a15f5cf121c44094338306ab7b33f2c"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:112a1dd436d2cc06e6ffdc0b06d55ac019a35a63afd26475205cb4b1bf0bfbff"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4620cca5c2817177ee8706f860364cc3a8845bc1e291aaf661fb899e5d1c45b0"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccd73a82751c523b3fc31ff8194702e4af4db21dc20e55b30ecc2079c5d43cb7"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:107089f135b4ae7817affe6264f8c7a5c5b4fd9a90f9439ed495f54fcea56fb4"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:0ff110ecc57b7aa4a594396525a3451ad70988e517237fe91c540997c4e50e29"}, + {file = "tokenizers-0.15.2-cp311-none-win32.whl", hash = "sha256:6d76f00f5c32da36c61f41c58346a4fa7f0a61be02f4301fd30ad59834977cc3"}, + {file = "tokenizers-0.15.2-cp311-none-win_amd64.whl", hash = "sha256:cc90102ed17271cf0a1262babe5939e0134b3890345d11a19c3145184b706055"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f86593c18d2e6248e72fb91c77d413a815153b8ea4e31f7cd443bdf28e467670"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0774bccc6608eca23eb9d620196687c8b2360624619623cf4ba9dc9bd53e8b51"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d0222c5b7c9b26c0b4822a82f6a7011de0a9d3060e1da176f66274b70f846b98"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3835738be1de66624fff2f4f6f6684775da4e9c00bde053be7564cbf3545cc66"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0143e7d9dcd811855c1ce1ab9bf5d96d29bf5e528fd6c7824d0465741e8c10fd"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db35825f6d54215f6b6009a7ff3eedee0848c99a6271c870d2826fbbedf31a38"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f5e64b0389a2be47091d8cc53c87859783b837ea1a06edd9d8e04004df55a5c"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e0480c452217edd35eca56fafe2029fb4d368b7c0475f8dfa3c5c9c400a7456"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a33ab881c8fe70474980577e033d0bc9a27b7ab8272896e500708b212995d834"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a308a607ca9de2c64c1b9ba79ec9a403969715a1b8ba5f998a676826f1a7039d"}, + {file = 
"tokenizers-0.15.2-cp312-none-win32.whl", hash = "sha256:b8fcfa81bcb9447df582c5bc96a031e6df4da2a774b8080d4f02c0c16b42be0b"}, + {file = "tokenizers-0.15.2-cp312-none-win_amd64.whl", hash = "sha256:38d7ab43c6825abfc0b661d95f39c7f8af2449364f01d331f3b51c94dcff7221"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:38bfb0204ff3246ca4d5e726e8cc8403bfc931090151e6eede54d0e0cf162ef0"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c861d35e8286a53e06e9e28d030b5a05bcbf5ac9d7229e561e53c352a85b1fc"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:936bf3842db5b2048eaa53dade907b1160f318e7c90c74bfab86f1e47720bdd6"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:620beacc3373277700d0e27718aa8b25f7b383eb8001fba94ee00aeea1459d89"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2735ecbbf37e52db4ea970e539fd2d450d213517b77745114f92867f3fc246eb"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:473c83c5e2359bb81b0b6fde870b41b2764fcdd36d997485e07e72cc3a62264a"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968fa1fb3c27398b28a4eca1cbd1e19355c4d3a6007f7398d48826bbe3a0f728"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:865c60ae6eaebdde7da66191ee9b7db52e542ed8ee9d2c653b6d190a9351b980"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7c0d8b52664ab2d4a8d6686eb5effc68b78608a9008f086a122a7b2996befbab"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:f33dfbdec3784093a9aebb3680d1f91336c56d86cc70ddf88708251da1fe9064"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_10_12_x86_64.whl", hash = 
"sha256:d44ba80988ff9424e33e0a49445072ac7029d8c0e1601ad25a0ca5f41ed0c1d6"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:dce74266919b892f82b1b86025a613956ea0ea62a4843d4c4237be2c5498ed3a"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0ef06b9707baeb98b316577acb04f4852239d856b93e9ec3a299622f6084e4be"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73e2e74bbb07910da0d37c326869f34113137b23eadad3fc00856e6b3d9930c"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4eeb12daf02a59e29f578a865f55d87cd103ce62bd8a3a5874f8fdeaa82e336b"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ba9f6895af58487ca4f54e8a664a322f16c26bbb442effd01087eba391a719e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccec77aa7150e38eec6878a493bf8c263ff1fa8a62404e16c6203c64c1f16a26"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3f40604f5042ff210ba82743dda2b6aa3e55aa12df4e9f2378ee01a17e2855e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5645938a42d78c4885086767c70923abad047163d809c16da75d6b290cb30bbe"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05a77cbfebe28a61ab5c3891f9939cc24798b63fa236d84e5f29f3a85a200c00"}, + {file = "tokenizers-0.15.2-cp37-none-win32.whl", hash = "sha256:361abdc068e8afe9c5b818769a48624687fb6aaed49636ee39bec4e95e1a215b"}, + {file = "tokenizers-0.15.2-cp37-none-win_amd64.whl", hash = "sha256:7ef789f83eb0f9baeb4d09a86cd639c0a5518528f9992f38b28e819df397eb06"}, + {file = "tokenizers-0.15.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4fe1f74a902bee74a3b25aff180fbfbf4f8b444ab37c4d496af7afd13a784ed2"}, + {file = 
"tokenizers-0.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4c4b89038a684f40a6b15d6b09f49650ac64d951ad0f2a3ea9169687bbf2a8ba"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d05a1b06f986d41aed5f2de464c003004b2df8aaf66f2b7628254bcbfb72a438"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:508711a108684111ec8af89d3a9e9e08755247eda27d0ba5e3c50e9da1600f6d"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:daa348f02d15160cb35439098ac96e3a53bacf35885072611cd9e5be7d333daa"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:494fdbe5932d3416de2a85fc2470b797e6f3226c12845cadf054dd906afd0442"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c2d60f5246f4da9373f75ff18d64c69cbf60c3bca597290cea01059c336d2470"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93268e788825f52de4c7bdcb6ebc1fcd4a5442c02e730faa9b6b08f23ead0e24"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6fc7083ab404019fc9acafe78662c192673c1e696bd598d16dc005bd663a5cf9"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41e39b41e5531d6b2122a77532dbea60e171ef87a3820b5a3888daa847df4153"}, + {file = "tokenizers-0.15.2-cp38-none-win32.whl", hash = "sha256:06cd0487b1cbfabefb2cc52fbd6b1f8d4c37799bd6c6e1641281adaa6b2504a7"}, + {file = "tokenizers-0.15.2-cp38-none-win_amd64.whl", hash = "sha256:5179c271aa5de9c71712e31cb5a79e436ecd0d7532a408fa42a8dbfa4bc23fd9"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:82f8652a74cc107052328b87ea8b34291c0f55b96d8fb261b3880216a9f9e48e"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:02458bee6f5f3139f1ebbb6d042b283af712c0981f5bc50edf771d6b762d5e4f"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c9a09cd26cca2e1c349f91aa665309ddb48d71636370749414fbf67bc83c5343"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:158be8ea8554e5ed69acc1ce3fbb23a06060bd4bbb09029431ad6b9a466a7121"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ddba9a2b0c8c81633eca0bb2e1aa5b3a15362b1277f1ae64176d0f6eba78ab1"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ef5dd1d39797044642dbe53eb2bc56435308432e9c7907728da74c69ee2adca"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:454c203164e07a860dbeb3b1f4a733be52b0edbb4dd2e5bd75023ffa8b49403a"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cf6b7f1d4dc59af960e6ffdc4faffe6460bbfa8dce27a58bf75755ffdb2526d"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2ef09bbc16519f6c25d0c7fc0c6a33a6f62923e263c9d7cca4e58b8c61572afb"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c9a2ebdd2ad4ec7a68e7615086e633857c85e2f18025bd05d2a4399e6c5f7169"}, + {file = "tokenizers-0.15.2-cp39-none-win32.whl", hash = "sha256:918fbb0eab96fe08e72a8c2b5461e9cce95585d82a58688e7f01c2bd546c79d0"}, + {file = "tokenizers-0.15.2-cp39-none-win_amd64.whl", hash = "sha256:524e60da0135e106b254bd71f0659be9f89d83f006ea9093ce4d1fab498c6d0d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6a9b648a58281c4672212fab04e60648fde574877d0139cd4b4f93fe28ca8944"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7c7d18b733be6bbca8a55084027f7be428c947ddf871c500ee603e375013ffba"}, + {file = 
"tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:13ca3611de8d9ddfbc4dc39ef54ab1d2d4aaa114ac8727dfdc6a6ec4be017378"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:237d1bf3361cf2e6463e6c140628e6406766e8b27274f5fcc62c747ae3c6f094"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67a0fe1e49e60c664915e9fb6b0cb19bac082ab1f309188230e4b2920230edb3"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4e022fe65e99230b8fd89ebdfea138c24421f91c1a4f4781a8f5016fd5cdfb4d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d857be2df69763362ac699f8b251a8cd3fac9d21893de129bc788f8baaef2693"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:708bb3e4283177236309e698da5fcd0879ce8fd37457d7c266d16b550bcbbd18"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c35e09e9899b72a76e762f9854e8750213f67567787d45f37ce06daf57ca78"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1257f4394be0d3b00de8c9e840ca5601d0a4a8438361ce9c2b05c7d25f6057b"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02272fe48280e0293a04245ca5d919b2c94a48b408b55e858feae9618138aeda"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dc3ad9ebc76eabe8b1d7c04d38be884b8f9d60c0cdc09b0aa4e3bcf746de0388"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:32e16bdeffa7c4f46bf2152172ca511808b952701d13e7c18833c0b73cb5c23f"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fb16ba563d59003028b678d2361a27f7e4ae0ab29c7a80690efa20d829c81fdb"}, 
+ {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:2277c36d2d6cdb7876c274547921a42425b6810d38354327dd65a8009acf870c"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1cf75d32e8d250781940d07f7eece253f2fe9ecdb1dc7ba6e3833fa17b82fcbc"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1b3b31884dc8e9b21508bb76da80ebf7308fdb947a17affce815665d5c4d028"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b10122d8d8e30afb43bb1fe21a3619f62c3e2574bff2699cf8af8b0b6c5dc4a3"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d88b96ff0fe8e91f6ef01ba50b0d71db5017fa4e3b1d99681cec89a85faf7bf7"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:37aaec5a52e959892870a7c47cef80c53797c0db9149d458460f4f31e2fb250e"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e2ea752f2b0fe96eb6e2f3adbbf4d72aaa1272079b0dfa1145507bd6a5d537e6"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:4b19a808d8799fda23504a5cd31d2f58e6f52f140380082b352f877017d6342b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c86e5e068ac8b19204419ed8ca90f9d25db20578f5881e337d203b314f4104"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de19c4dc503c612847edf833c82e9f73cd79926a384af9d801dcf93f110cea4e"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea09acd2fe3324174063d61ad620dec3bcf042b495515f27f638270a7d466e8b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cf27fd43472e07b57cf420eee1e814549203d56de00b5af8659cb99885472f1f"}, + 
{file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7ca22bd897537a0080521445d91a58886c8c04084a6a19e6c78c586e0cfa92a5"}, + {file = "tokenizers-0.15.2.tar.gz", hash = "sha256:e6e9c6e019dd5484be5beafc775ae6c925f4c69a3487040ed09b45e13df2cb91"}, +] + +[package.dependencies] +huggingface_hub = ">=0.16.4,<1.0" + +[package.extras] +dev = ["tokenizers[testing]"] +docs = ["setuptools_rust", "sphinx", "sphinx_rtd_theme"] testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] [[package]] @@ -6103,69 +6331,71 @@ telegram = ["requests"] [[package]] name = "transformers" -version = "4.26.0" +version = "4.36.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = true -python-versions = ">=3.7.0" +python-versions = ">=3.8.0" files = [ - {file = "transformers-4.26.0-py3-none-any.whl", hash = "sha256:6a902eee6098d9a737faadf185b8df5a169acc695ebbde5a81b90528f43e665f"}, - {file = "transformers-4.26.0.tar.gz", hash = "sha256:d7859bd83829a3682ca632197ee5c72556e1063d199ab84eec35c4f23b3d73a3"}, + {file = "transformers-4.36.2-py3-none-any.whl", hash = "sha256:462066c4f74ee52516f12890dcc9ec71d1a5e97998db621668455117a54330f6"}, + {file = "transformers-4.36.2.tar.gz", hash = "sha256:d8068e897e47793281501e547d2bbdfc5b8556409c2cb6c3d9e2ca77d4c0b4ec"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=0.11.0,<1.0" +huggingface-hub = ">=0.19.3,<1.0" numpy = ">=1.17" packaging = ">=20.0" pyyaml = ">=5.1" regex = "!=2019.12.17" requests = "*" -tokenizers = ">=0.11.1,<0.11.3 || >0.11.3,<0.14" +safetensors = ">=0.3.1" +tokenizers = ">=0.14,<0.19" tqdm = ">=4.27" [package.extras] -accelerate = ["accelerate (>=0.10.0)"] -all = ["Pillow", "accelerate (>=0.10.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1)", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8)", "optuna", "phonemizer", "protobuf 
(<=3.20.2)", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.4,<2.12)", "tensorflow-text", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.7,!=1.12.0)", "torchaudio"] +accelerate = ["accelerate (>=0.21.0)"] +agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.10,!=1.12.0)"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] codecarbon = ["codecarbon (==1.2.0)"] -deepspeed = ["accelerate (>=0.10.0)", "deepspeed (>=0.6.5)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.10.0)", "beautifulsoup4", "black (==22.3)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.6.5)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf (<=3.20.2)", "psutil", "pytest", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "safetensors (>=0.2.1)", "sentencepiece (>=0.1.91,!=0.1.92)", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow", "accelerate (>=0.10.0)", "beautifulsoup4", "black (==22.3)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate 
(>=0.2.0)", "faiss-cpu", "flake8 (>=3.8.3)", "flax (>=0.4.1)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8)", "optuna", "parameterized", "phonemizer", "protobuf (<=3.20.2)", "psutil", "pyctcdecode (>=0.4.0)", "pytest", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "safetensors (>=0.2.1)", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorflow (>=2.4,<2.12)", "tensorflow-text", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.7,!=1.12.0)", "torchaudio", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow", "beautifulsoup4", "black (==22.3)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flake8 (>=3.8.3)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf (<=3.20.2)", "psutil", "pyctcdecode (>=0.4.0)", "pytest", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "safetensors (>=0.2.1)", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorflow (>=2.4,<2.12)", "tensorflow-text", "tf2onnx", "timeout-decorator", "tokenizers (>=0.11.1,!=0.11.3,<0.14)"] -dev-torch = ["GitPython (<3.1.19)", "Pillow", "beautifulsoup4", "black (==22.3)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", 
"faiss-cpu", "flake8 (>=3.8.3)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf (<=3.20.2)", "psutil", "pyctcdecode (>=0.4.0)", "pytest", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "safetensors (>=0.2.1)", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.7,!=1.12.0)", "torchaudio", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] -docs = ["Pillow", "accelerate (>=0.10.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1)", "hf-doc-builder", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8)", "optuna", "phonemizer", "protobuf (<=3.20.2)", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.4,<2.12)", "tensorflow-text", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.7,!=1.12.0)", "torchaudio"] +deepspeed = ["accelerate (>=0.21.0)", "deepspeed (>=0.9.3)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.21.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython 
(<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>=2.6,<2.16)", 
"tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.14,<0.19)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +docs = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] docs-specific = ["hf-doc-builder"] -fairscale = ["fairscale (>0.3)"] -flax = ["flax (>=0.4.1)", "jax (>=0.2.8,!=0.3.2,<=0.3.6)", "jaxlib (>=0.1.65,<=0.3.6)", "optax (>=0.0.8)"] +flax = ["flax (>=0.4.1,<=0.7.0)", "jax 
(>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"] flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] ftfy = ["ftfy"] -integrations = ["optuna", "ray[tune]", "sigopt"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] modelcreation = ["cookiecutter (==1.7.3)"] -natten = ["natten (>=0.14.4)"] +natten = ["natten (>=0.14.6)"] onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] optuna = ["optuna"] -quality = ["GitPython (<3.1.19)", "black (==22.3)", "datasets (!=2.5.0)", "flake8 (>=3.8.3)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)"] -ray = ["ray[tune]"] +quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<2.0.0)"] +ray = ["ray[tune] (>=2.7.0)"] retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] sagemaker = ["sagemaker (>=2.31.0)"] -sentencepiece = ["protobuf (<=3.20.2)", "sentencepiece (>=0.1.91,!=0.1.92)"] -serving = ["fastapi", "pydantic", "starlette", "uvicorn"] +sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] +serving = ["fastapi", "pydantic (<2)", "starlette", "uvicorn"] sigopt = ["sigopt"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "black (==22.3)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf (<=3.20.2)", 
"psutil", "pytest", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "safetensors (>=0.2.1)", "timeout-decorator"] -tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.4,<2.12)", "tensorflow-text", "tf2onnx"] -tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.4,<2.12)", "tensorflow-text", "tf2onnx"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "tensorboard", "timeout-decorator"] +tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] timm = ["timm"] -tokenizers = ["tokenizers (>=0.11.1,!=0.11.3,<0.14)"] -torch = ["torch (>=1.7,!=1.12.0)"] +tokenizers = ["tokenizers (>=0.14,<0.19)"] +torch = ["accelerate (>=0.21.0)", "torch (>=1.10,!=1.12.0)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -torchhub = ["filelock", "huggingface-hub (>=0.11.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf (<=3.20.2)", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.7,!=1.12.0)", "tqdm (>=4.27)"] -video = ["decord (==0.6.0)"] -vision = ["Pillow"] +torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] +torchhub = ["filelock", "huggingface-hub (>=0.19.3,<1.0)", 
"importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "tqdm (>=4.27)"] +video = ["av (==9.2.0)", "decord (==0.6.0)"] +vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "twilio" @@ -6956,4 +7186,4 @@ transformers = ["sentencepiece", "transformers"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.11" -content-hash = "4c84d994449f859816e48dd00d77f31f6f9d964e29a9f6060300c51d923786e0" +content-hash = "c1c51259ab3b886039dcf7eb746a45815d4b8afcaa4bdbe179c891810aee553f" diff --git a/pyproject.toml b/pyproject.toml index 0944c09460d6..d2a48c527aab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,7 +120,6 @@ sanic-cors = "~2.0.0" sanic-jwt = "^1.6.0" sanic-routing = "^0.7.2" websockets = ">=10.0,<11.0" -cloudpickle = ">=1.2,<2.3" aiohttp = ">=3.9.0,<3.10" questionary = ">=1.5.1,<1.11.0" prompt-toolkit = "^3.0,<3.0.29" @@ -133,10 +132,9 @@ psycopg2-binary = ">=2.8.2,<2.10.0" python-dateutil = "~2.8" protobuf = ">=4.23.3,< 4.23.4" tensorflow_hub = "^0.13.0" -setuptools = ">=65.5.1" +setuptools = "~70.3.0" ujson = ">=1.35,<6.0" regex = ">=2020.6,<2022.11" -joblib = ">=0.15.1,<1.3.0" sentry-sdk = ">=0.17.0,<1.15.0" aio-pika = ">=6.7.1,<8.2.4" aiogram = "<2.26" @@ -156,6 +154,9 @@ dnspython = "2.3.0" wheel = ">=0.38.1" certifi = ">=2023.7.22" cryptography = ">=41.0.7" +skops = "0.9.0" +safetensors = "~0.4.5" + [[tool.poetry.dependencies.tensorflow-io-gcs-filesystem]] version = "==0.31" markers = "sys_platform == 'win32'" @@ -285,7 +286,7 @@ version = "~3.2.0" optional = true [tool.poetry.dependencies.transformers] -version = ">=4.13.0, <=4.26.0" +version = "~4.36.2" optional = true [tool.poetry.dependencies.sentencepiece] diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py index 7d8c504084c1..0a6c921491a4 100644 --- 
a/rasa/core/featurizers/single_state_featurizer.py +++ b/rasa/core/featurizers/single_state_featurizer.py @@ -1,7 +1,8 @@ import logging +from typing import List, Optional, Dict, Text, Set, Any + import numpy as np import scipy.sparse -from typing import List, Optional, Dict, Text, Set, Any from rasa.core.featurizers.precomputation import MessageContainerForCoreFeaturization from rasa.nlu.extractors.extractor import EntityTagSpec @@ -362,6 +363,26 @@ def encode_all_labels( for action in domain.action_names_or_texts ] + def to_dict(self) -> Dict[str, Any]: + return { + "action_texts": self.action_texts, + "entity_tag_specs": self.entity_tag_specs, + "feature_states": self._default_feature_states, + } + + @classmethod + def create_from_dict( + cls, data: Dict[str, Any] + ) -> Optional["SingleStateFeaturizer"]: + if not data: + return None + + featurizer = SingleStateFeaturizer() + featurizer.action_texts = data["action_texts"] + featurizer._default_feature_states = data["feature_states"] + featurizer.entity_tag_specs = data["entity_tag_specs"] + return featurizer + class IntentTokenizerSingleStateFeaturizer(SingleStateFeaturizer): """A SingleStateFeaturizer for use with policies that predict intent labels.""" diff --git a/rasa/core/featurizers/tracker_featurizers.py b/rasa/core/featurizers/tracker_featurizers.py index 42df6e4e1187..9c6dbca92d47 100644 --- a/rasa/core/featurizers/tracker_featurizers.py +++ b/rasa/core/featurizers/tracker_featurizers.py @@ -1,11 +1,9 @@ from __future__ import annotations -from pathlib import Path -from collections import defaultdict -from abc import abstractmethod -import jsonpickle -import logging -from tqdm import tqdm +import logging +from abc import abstractmethod +from collections import defaultdict +from pathlib import Path from typing import ( Tuple, List, @@ -18,25 +16,30 @@ Set, DefaultDict, cast, + Type, + Callable, + ClassVar, ) + import numpy as np +from tqdm import tqdm -from rasa.core.featurizers.single_state_featurizer 
import SingleStateFeaturizer -from rasa.core.featurizers.precomputation import MessageContainerForCoreFeaturization -from rasa.core.exceptions import InvalidTrackerFeaturizerUsageError import rasa.shared.core.trackers import rasa.shared.utils.io -from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES, ACTION_NAME -from rasa.shared.nlu.training_data.features import Features -from rasa.shared.core.trackers import DialogueStateTracker -from rasa.shared.core.domain import State, Domain -from rasa.shared.core.events import Event, ActionExecuted, UserUttered +from rasa.core.exceptions import InvalidTrackerFeaturizerUsageError +from rasa.core.featurizers.precomputation import MessageContainerForCoreFeaturization +from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer from rasa.shared.core.constants import ( USER, ACTION_UNLIKELY_INTENT_NAME, PREVIOUS_ACTION, ) +from rasa.shared.core.domain import State, Domain +from rasa.shared.core.events import Event, ActionExecuted, UserUttered +from rasa.shared.core.trackers import DialogueStateTracker from rasa.shared.exceptions import RasaException +from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES, ACTION_NAME +from rasa.shared.nlu.training_data.features import Features from rasa.utils.tensorflow.constants import LABEL_PAD_ID from rasa.utils.tensorflow.model_data import ragged_array_to_ndarray @@ -64,6 +67,10 @@ def __str__(self) -> Text: class TrackerFeaturizer: """Base class for actual tracker featurizers.""" + # Class registry to store all subclasses + _registry: ClassVar[Dict[str, Type["TrackerFeaturizer"]]] = {} + _featurizer_type: str = "TrackerFeaturizer" + def __init__( self, state_featurizer: Optional[SingleStateFeaturizer] = None ) -> None: @@ -74,6 +81,36 @@ def __init__( """ self.state_featurizer = state_featurizer + @classmethod + def register(cls, featurizer_type: str) -> Callable: + """Decorator to register featurizer subclasses.""" + + def wrapper(subclass: 
Type["TrackerFeaturizer"]) -> Type["TrackerFeaturizer"]: + cls._registry[featurizer_type] = subclass + # Store the type identifier in the class for serialization + subclass._featurizer_type = featurizer_type + return subclass + + return wrapper + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "TrackerFeaturizer": + """Create featurizer instance from dictionary.""" + featurizer_type = data.pop("type") + + if featurizer_type not in cls._registry: + raise ValueError(f"Unknown featurizer type: {featurizer_type}") + + # Get the correct subclass and instantiate it + subclass = cls._registry[featurizer_type] + return subclass.create_from_dict(data) + + @classmethod + @abstractmethod + def create_from_dict(cls, data: Dict[str, Any]) -> "TrackerFeaturizer": + """Each subclass must implement its own creation from dict method.""" + pass + @staticmethod def _create_states( tracker: DialogueStateTracker, @@ -465,9 +502,7 @@ def persist(self, path: Union[Text, Path]) -> None: self.state_featurizer.entity_tag_specs = [] # noinspection PyTypeChecker - rasa.shared.utils.io.write_text_file( - str(jsonpickle.encode(self)), featurizer_file - ) + rasa.shared.utils.io.dump_obj_as_json_to_file(featurizer_file, self.to_dict()) @staticmethod def load(path: Union[Text, Path]) -> Optional[TrackerFeaturizer]: @@ -481,7 +516,17 @@ def load(path: Union[Text, Path]) -> Optional[TrackerFeaturizer]: """ featurizer_file = Path(path) / FEATURIZER_FILE if featurizer_file.is_file(): - return jsonpickle.decode(rasa.shared.utils.io.read_file(featurizer_file)) + data = rasa.shared.utils.io.read_json_file(featurizer_file) + + if "type" not in data: + logger.error( + f"Couldn't load featurizer for policy. " + f"File '{featurizer_file}' does not contain all " + f"necessary information. 'type' is missing." + ) + return None + + return TrackerFeaturizer.from_dict(data) logger.error( f"Couldn't load featurizer for policy. 
" @@ -508,7 +553,16 @@ def _remove_action_unlikely_intent_from_events(events: List[Event]) -> List[Even ) ] + def to_dict(self) -> Dict[str, Any]: + return { + "type": self.__class__._featurizer_type, + "state_featurizer": ( + self.state_featurizer.to_dict() if self.state_featurizer else None + ), + } + +@TrackerFeaturizer.register("FullDialogueTrackerFeaturizer") class FullDialogueTrackerFeaturizer(TrackerFeaturizer): """Creates full dialogue training data for time distributed architectures. @@ -646,7 +700,20 @@ def prediction_states( return trackers_as_states + def to_dict(self) -> Dict[str, Any]: + return super().to_dict() + @classmethod + def create_from_dict(cls, data: Dict[str, Any]) -> "FullDialogueTrackerFeaturizer": + state_featurizer = SingleStateFeaturizer.create_from_dict( + data["state_featurizer"] + ) + return cls( + state_featurizer, + ) + + +@TrackerFeaturizer.register("MaxHistoryTrackerFeaturizer") class MaxHistoryTrackerFeaturizer(TrackerFeaturizer): """Truncates the tracker history into `max_history` long sequences. @@ -887,7 +954,25 @@ def prediction_states( return trackers_as_states + def to_dict(self) -> Dict[str, Any]: + data = super().to_dict() + data.update( + { + "remove_duplicates": self.remove_duplicates, + "max_history": self.max_history, + } + ) + return data + + @classmethod + def create_from_dict(cls, data: Dict[str, Any]) -> "MaxHistoryTrackerFeaturizer": + state_featurizer = SingleStateFeaturizer.create_from_dict( + data["state_featurizer"] + ) + return cls(state_featurizer, data["max_history"], data["remove_duplicates"]) + +@TrackerFeaturizer.register("IntentMaxHistoryTrackerFeaturizer") class IntentMaxHistoryTrackerFeaturizer(MaxHistoryTrackerFeaturizer): """Truncates the tracker history into `max_history` long sequences. 
@@ -1166,6 +1251,18 @@ def prediction_states( return trackers_as_states + def to_dict(self) -> Dict[str, Any]: + return super().to_dict() + + @classmethod + def create_from_dict( + cls, data: Dict[str, Any] + ) -> "IntentMaxHistoryTrackerFeaturizer": + state_featurizer = SingleStateFeaturizer.create_from_dict( + data["state_featurizer"] + ) + return cls(state_featurizer, data["max_history"], data["remove_duplicates"]) + def _is_prev_action_unlikely_intent_in_state(state: State) -> bool: prev_action_name = state.get(PREVIOUS_ACTION, {}).get(ACTION_NAME) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index c5f895e3ce64..af96af627de6 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -1,15 +1,15 @@ from __future__ import annotations -import logging -from rasa.engine.recipes.default_recipe import DefaultV1Recipe +import logging from pathlib import Path from collections import defaultdict import contextlib +from typing import Any, List, Optional, Text, Dict, Tuple, Union, Type import numpy as np import tensorflow as tf -from typing import Any, List, Optional, Text, Dict, Tuple, Union, Type +from rasa.engine.recipes.default_recipe import DefaultV1Recipe from rasa.engine.graph import ExecutionContext from rasa.engine.storage.resource import Resource from rasa.engine.storage.storage import ModelStorage @@ -49,18 +49,22 @@ from rasa.shared.core.events import EntitiesAdded, Event from rasa.shared.core.domain import Domain from rasa.shared.nlu.training_data.message import Message -from rasa.shared.nlu.training_data.features import Features +from rasa.shared.nlu.training_data.features import ( + Features, + save_features, + load_features, +) import rasa.shared.utils.io import rasa.utils.io from rasa.utils import train_utils -from rasa.utils.tensorflow.models import RasaModel, TransformerRasaModel -from rasa.utils.tensorflow import rasa_layers -from rasa.utils.tensorflow.model_data import ( - RasaModelData, 
- FeatureSignature, +from rasa.utils.tensorflow.feature_array import ( FeatureArray, - Data, + serialize_nested_feature_arrays, + deserialize_nested_feature_arrays, ) +from rasa.utils.tensorflow.models import RasaModel, TransformerRasaModel +from rasa.utils.tensorflow import rasa_layers +from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature, Data from rasa.utils.tensorflow.model_data_utils import convert_to_data_format from rasa.utils.tensorflow.constants import ( LABEL, @@ -961,22 +965,32 @@ def persist_model_utilities(self, model_path: Path) -> None: model_path: Path where model is to be persisted """ model_filename = self._metadata_filename() - rasa.utils.io.json_pickle( - model_path / f"{model_filename}.priority.pkl", self.priority - ) - rasa.utils.io.pickle_dump( - model_path / f"{model_filename}.meta.pkl", self.config + rasa.shared.utils.io.dump_obj_as_json_to_file( + model_path / f"{model_filename}.priority.json", self.priority ) - rasa.utils.io.pickle_dump( - model_path / f"{model_filename}.data_example.pkl", self.data_example + rasa.shared.utils.io.dump_obj_as_json_to_file( + model_path / f"{model_filename}.meta.json", self.config ) - rasa.utils.io.pickle_dump( - model_path / f"{model_filename}.fake_features.pkl", self.fake_features + # save data example + serialize_nested_feature_arrays( + self.data_example, + str(model_path / f"{model_filename}.data_example.st"), + str(model_path / f"{model_filename}.data_example_metadata.json"), ) - rasa.utils.io.pickle_dump( - model_path / f"{model_filename}.label_data.pkl", + # save label data + serialize_nested_feature_arrays( dict(self._label_data.data) if self._label_data is not None else {}, + str(model_path / f"{model_filename}.label_data.st"), + str(model_path / f"{model_filename}.label_data_metadata.json"), + ) + # save fake features + metadata = save_features( + self.fake_features, str(model_path / f"{model_filename}.fake_features.st") + ) + rasa.shared.utils.io.dump_obj_as_json_to_file( 
+ model_path / f"{model_filename}.fake_features_metadata.json", metadata ) + entity_tag_specs = ( [tag_spec._asdict() for tag_spec in self._entity_tag_specs] if self._entity_tag_specs @@ -994,18 +1008,29 @@ def _load_model_utilities(cls, model_path: Path) -> Dict[Text, Any]: model_path: Path where model is to be persisted. """ tf_model_file = model_path / f"{cls._metadata_filename()}.tf_model" - loaded_data = rasa.utils.io.pickle_load( - model_path / f"{cls._metadata_filename()}.data_example.pkl" + + # load data example + loaded_data = deserialize_nested_feature_arrays( + str(model_path / f"{cls._metadata_filename()}.data_example.st"), + str(model_path / f"{cls._metadata_filename()}.data_example_metadata.json"), ) - label_data = rasa.utils.io.pickle_load( - model_path / f"{cls._metadata_filename()}.label_data.pkl" + # load label data + loaded_label_data = deserialize_nested_feature_arrays( + str(model_path / f"{cls._metadata_filename()}.label_data.st"), + str(model_path / f"{cls._metadata_filename()}.label_data_metadata.json"), ) - fake_features = rasa.utils.io.pickle_load( - model_path / f"{cls._metadata_filename()}.fake_features.pkl" + label_data = RasaModelData(data=loaded_label_data) + + # load fake features + metadata = rasa.shared.utils.io.read_json_file( + model_path / f"{cls._metadata_filename()}.fake_features_metadata.json" ) - label_data = RasaModelData(data=label_data) - priority = rasa.utils.io.json_unpickle( - model_path / f"{cls._metadata_filename()}.priority.pkl" + fake_features = load_features( + str(model_path / f"{cls._metadata_filename()}.fake_features.st"), metadata + ) + + priority = rasa.shared.utils.io.read_json_file( + model_path / f"{cls._metadata_filename()}.priority.json" ) entity_tag_specs = rasa.shared.utils.io.read_json_file( model_path / f"{cls._metadata_filename()}.entity_tag_specs.json" @@ -1023,8 +1048,8 @@ def _load_model_utilities(cls, model_path: Path) -> Dict[Text, Any]: ) for tag_spec in entity_tag_specs ] - model_config = 
rasa.utils.io.pickle_load( - model_path / f"{cls._metadata_filename()}.meta.pkl" + model_config = rasa.shared.utils.io.read_json_file( + model_path / f"{cls._metadata_filename()}.meta.json" ) return { @@ -1070,7 +1095,7 @@ def _load( ) -> TEDPolicy: featurizer = TrackerFeaturizer.load(model_path) - if not (model_path / f"{cls._metadata_filename()}.data_example.pkl").is_file(): + if not (model_path / f"{cls._metadata_filename()}.data_example.st").is_file(): return cls( config, model_storage, diff --git a/rasa/core/policies/unexpected_intent_policy.py b/rasa/core/policies/unexpected_intent_policy.py index d5b39a561b82..ca788662f133 100644 --- a/rasa/core/policies/unexpected_intent_policy.py +++ b/rasa/core/policies/unexpected_intent_policy.py @@ -5,6 +5,7 @@ import numpy as np import tensorflow as tf + import rasa.utils.common from rasa.engine.graph import ExecutionContext from rasa.engine.recipes.default_recipe import DefaultV1Recipe @@ -16,6 +17,7 @@ from rasa.shared.core.trackers import DialogueStateTracker from rasa.shared.core.constants import SLOTS, ACTIVE_LOOP, ACTION_UNLIKELY_INTENT_NAME from rasa.shared.core.events import UserUttered, ActionExecuted +import rasa.shared.utils.io from rasa.shared.nlu.constants import ( INTENT, TEXT, @@ -103,8 +105,6 @@ ) from rasa.utils.tensorflow import layers from rasa.utils.tensorflow.model_data import RasaModelData, FeatureArray, Data - -import rasa.utils.io as io_utils from rasa.core.exceptions import RasaCoreException from rasa.shared.utils import common @@ -881,9 +881,12 @@ def persist_model_utilities(self, model_path: Path) -> None: model_path: Path where model is to be persisted """ super().persist_model_utilities(model_path) - io_utils.pickle_dump( - model_path / f"{self._metadata_filename()}.label_quantiles.pkl", - self.label_quantiles, + + from safetensors.numpy import save_file + + save_file( + {str(k): np.array(v) for k, v in self.label_quantiles.items()}, + model_path / 
f"{self._metadata_filename()}.label_quantiles.st", ) @classmethod @@ -894,9 +897,14 @@ def _load_model_utilities(cls, model_path: Path) -> Dict[Text, Any]: model_path: Path where model is to be persisted. """ model_utilties = super()._load_model_utilities(model_path) - label_quantiles = io_utils.pickle_load( - model_path / f"{cls._metadata_filename()}.label_quantiles.pkl" + + from safetensors.numpy import load_file + + loaded_label_quantiles = load_file( + model_path / f"{cls._metadata_filename()}.label_quantiles.st" ) + label_quantiles = {int(k): list(v) for k, v in loaded_label_quantiles.items()} + model_utilties.update({"label_quantiles": label_quantiles}) return model_utilties diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index bea4735da6fe..b53eb5db8d76 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -1,18 +1,17 @@ from __future__ import annotations + import copy import logging from collections import defaultdict from pathlib import Path - -from rasa.exceptions import ModelNotFound -from rasa.nlu.featurizers.featurizer import Featurizer +from typing import Any, Dict, List, Optional, Text, Tuple, Union, TypeVar, Type import numpy as np import scipy.sparse import tensorflow as tf -from typing import Any, Dict, List, Optional, Text, Tuple, Union, TypeVar, Type - +from rasa.exceptions import ModelNotFound +from rasa.nlu.featurizers.featurizer import Featurizer from rasa.engine.graph import ExecutionContext, GraphComponent from rasa.engine.recipes.default_recipe import DefaultV1Recipe from rasa.engine.storage.resource import Resource @@ -20,18 +19,21 @@ from rasa.nlu.extractors.extractor import EntityExtractorMixin from rasa.nlu.classifiers.classifier import IntentClassifier import rasa.shared.utils.io -import rasa.utils.io as io_utils import rasa.nlu.utils.bilou_utils as bilou_utils from rasa.shared.constants import DIAGNOSTIC_DATA from 
rasa.nlu.extractors.extractor import EntityTagSpec from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.utils import train_utils from rasa.utils.tensorflow import rasa_layers +from rasa.utils.tensorflow.feature_array import ( + FeatureArray, + serialize_nested_feature_arrays, + deserialize_nested_feature_arrays, +) from rasa.utils.tensorflow.models import RasaModel, TransformerRasaModel from rasa.utils.tensorflow.model_data import ( RasaModelData, FeatureSignature, - FeatureArray, ) from rasa.nlu.constants import TOKENS_NAMES, DEFAULT_TRANSFORMER_SIZE from rasa.shared.nlu.constants import ( @@ -118,7 +120,6 @@ POSSIBLE_TAGS = [ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_GROUP] - DIETClassifierT = TypeVar("DIETClassifierT", bound="DIETClassifier") @@ -1085,18 +1086,24 @@ def persist(self) -> None: self.model.save(str(tf_model_file)) - io_utils.pickle_dump( - model_path / f"{file_name}.data_example.pkl", self._data_example - ) - io_utils.pickle_dump( - model_path / f"{file_name}.sparse_feature_sizes.pkl", - self._sparse_feature_sizes, + # save data example + serialize_nested_feature_arrays( + self._data_example, + model_path / f"{file_name}.data_example.st", + model_path / f"{file_name}.data_example_metadata.json", ) - io_utils.pickle_dump( - model_path / f"{file_name}.label_data.pkl", + # save label data + serialize_nested_feature_arrays( dict(self._label_data.data) if self._label_data is not None else {}, + model_path / f"{file_name}.label_data.st", + model_path / f"{file_name}.label_data_metadata.json", ) - io_utils.json_pickle( + + rasa.shared.utils.io.dump_obj_as_json_to_file( + model_path / f"{file_name}.sparse_feature_sizes.json", + self._sparse_feature_sizes, + ) + rasa.shared.utils.io.dump_obj_as_json_to_file( model_path / f"{file_name}.index_label_id_mapping.json", self.index_label_id_mapping, ) @@ -1185,15 +1192,22 @@ def _load_from_files( ]: file_name = cls.__name__ - data_example = io_utils.pickle_load( - model_path / 
f"{file_name}.data_example.pkl" + # load data example + data_example = deserialize_nested_feature_arrays( + str(model_path / f"{file_name}.data_example.st"), + str(model_path / f"{file_name}.data_example_metadata.json"), ) - label_data = io_utils.pickle_load(model_path / f"{file_name}.label_data.pkl") - label_data = RasaModelData(data=label_data) - sparse_feature_sizes = io_utils.pickle_load( - model_path / f"{file_name}.sparse_feature_sizes.pkl" + # load label data + loaded_label_data = deserialize_nested_feature_arrays( + str(model_path / f"{file_name}.label_data.st"), + str(model_path / f"{file_name}.label_data_metadata.json"), + ) + label_data = RasaModelData(data=loaded_label_data) + + sparse_feature_sizes = rasa.shared.utils.io.read_json_file( + model_path / f"{file_name}.sparse_feature_sizes.json" ) - index_label_id_mapping = io_utils.json_unpickle( + index_label_id_mapping = rasa.shared.utils.io.read_json_file( model_path / f"{file_name}.index_label_id_mapping.json" ) entity_tag_specs = rasa.shared.utils.io.read_json_file( @@ -1213,7 +1227,6 @@ def _load_from_files( for tag_spec in entity_tag_specs ] - # jsonpickle converts dictionary keys to strings index_label_id_mapping = { int(key): value for key, value in index_label_id_mapping.items() } diff --git a/rasa/nlu/classifiers/logistic_regression_classifier.py b/rasa/nlu/classifiers/logistic_regression_classifier.py index c652d20af9c0..46303a11697e 100644 --- a/rasa/nlu/classifiers/logistic_regression_classifier.py +++ b/rasa/nlu/classifiers/logistic_regression_classifier.py @@ -1,20 +1,19 @@ import logging from typing import Any, Text, Dict, List, Type, Tuple -import joblib from scipy.sparse import hstack, vstack, csr_matrix from sklearn.linear_model import LogisticRegression +from rasa.engine.graph import ExecutionContext, GraphComponent +from rasa.engine.recipes.default_recipe import DefaultV1Recipe from rasa.engine.storage.resource import Resource from rasa.engine.storage.storage import ModelStorage 
-from rasa.engine.recipes.default_recipe import DefaultV1Recipe -from rasa.engine.graph import ExecutionContext, GraphComponent from rasa.nlu.classifiers import LABEL_RANKING_LENGTH -from rasa.nlu.featurizers.featurizer import Featurizer from rasa.nlu.classifiers.classifier import IntentClassifier -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.shared.nlu.training_data.message import Message +from rasa.nlu.featurizers.featurizer import Featurizer from rasa.shared.nlu.constants import TEXT, INTENT +from rasa.shared.nlu.training_data.message import Message +from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.utils.tensorflow.constants import RANKING_LENGTH logger = logging.getLogger(__name__) @@ -158,9 +157,11 @@ def process(self, messages: List[Message]) -> List[Message]: def persist(self) -> None: """Persist this model into the passed directory.""" + import skops.io as sio + with self._model_storage.write_to(self._resource) as model_dir: - path = model_dir / f"{self._resource.name}.joblib" - joblib.dump(self.clf, path) + path = model_dir / f"{self._resource.name}.skops" + sio.dump(self.clf, path) logger.debug(f"Saved intent classifier to '{path}'.") @classmethod @@ -173,9 +174,21 @@ def load( **kwargs: Any, ) -> "LogisticRegressionClassifier": """Loads trained component (see parent class for full docstring).""" + import skops.io as sio + try: with model_storage.read_from(resource) as model_dir: - classifier = joblib.load(model_dir / f"{resource.name}.joblib") + classifier_file = model_dir / f"{resource.name}.skops" + unknown_types = sio.get_untrusted_types(file=classifier_file) + + if unknown_types: + logger.debug( + f"Untrusted types ({unknown_types}) found when " + f"loading {classifier_file}!", + ) + raise ValueError() + + classifier = sio.load(classifier_file, trusted=unknown_types) component = cls( config, execution_context.node_name, model_storage, resource ) diff --git 
a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 5c941d3d8806..3aa656f0f3ba 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -1,6 +1,6 @@ from __future__ import annotations + import logging -from rasa.nlu.featurizers.dense_featurizer.dense_featurizer import DenseFeaturizer import typing import warnings from typing import Any, Dict, List, Optional, Text, Tuple, Type @@ -8,18 +8,18 @@ import numpy as np import rasa.shared.utils.io -import rasa.utils.io as io_utils from rasa.engine.graph import GraphComponent, ExecutionContext from rasa.engine.recipes.default_recipe import DefaultV1Recipe from rasa.engine.storage.resource import Resource from rasa.engine.storage.storage import ModelStorage -from rasa.shared.constants import DOCS_URL_TRAINING_DATA_NLU from rasa.nlu.classifiers import LABEL_RANKING_LENGTH +from rasa.nlu.classifiers.classifier import IntentClassifier +from rasa.nlu.featurizers.dense_featurizer.dense_featurizer import DenseFeaturizer +from rasa.shared.constants import DOCS_URL_TRAINING_DATA_NLU from rasa.shared.exceptions import RasaException from rasa.shared.nlu.constants import TEXT -from rasa.nlu.classifiers.classifier import IntentClassifier -from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message +from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.utils.tensorflow.constants import FEATURIZERS logger = logging.getLogger(__name__) @@ -266,14 +266,20 @@ def predict(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: def persist(self) -> None: """Persist this model into the passed directory.""" + import skops.io as sio + with self._model_storage.write_to(self._resource) as model_dir: file_name = self.__class__.__name__ - classifier_file_name = model_dir / f"{file_name}_classifier.pkl" - encoder_file_name = model_dir / 
f"{file_name}_encoder.pkl" + classifier_file_name = model_dir / f"{file_name}_classifier.skops" + encoder_file_name = model_dir / f"{file_name}_encoder.json" if self.clf and self.le: - io_utils.json_pickle(encoder_file_name, self.le.classes_) - io_utils.json_pickle(classifier_file_name, self.clf.best_estimator_) + # convert self.le.classes_ (numpy array of strings) to a list in order + # to use json dump + rasa.shared.utils.io.dump_obj_as_json_to_file( + encoder_file_name, list(self.le.classes_) + ) + sio.dump(self.clf.best_estimator_, classifier_file_name) @classmethod def load( @@ -286,21 +292,36 @@ def load( ) -> SklearnIntentClassifier: """Loads trained component (see parent class for full docstring).""" from sklearn.preprocessing import LabelEncoder + import skops.io as sio try: with model_storage.read_from(resource) as model_dir: file_name = cls.__name__ - classifier_file = model_dir / f"{file_name}_classifier.pkl" + classifier_file = model_dir / f"{file_name}_classifier.skops" if classifier_file.exists(): - classifier = io_utils.json_unpickle(classifier_file) + unknown_types = sio.get_untrusted_types(file=classifier_file) - encoder_file = model_dir / f"{file_name}_encoder.pkl" - classes = io_utils.json_unpickle(encoder_file) - encoder = LabelEncoder() - encoder.classes_ = classes + if unknown_types: + logger.error( + f"Untrusted types ({unknown_types}) found when " + f"loading {classifier_file}!" 
+ ) + raise ValueError() + else: + classifier = sio.load(classifier_file, trusted=unknown_types) + + encoder_file = model_dir / f"{file_name}_encoder.json" + classes = rasa.shared.utils.io.read_json_file(encoder_file) - return cls(config, model_storage, resource, classifier, encoder) + encoder = LabelEncoder() + intent_classifier = cls( + config, model_storage, resource, classifier, encoder + ) + # convert list of strings (class labels) back to numpy array of + # strings + intent_classifier.transform_labels_str2num(classes) + return intent_classifier except ValueError: logger.debug( f"Failed to load '{cls.__name__}' from model storage. Resource " diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 1332c250d55a..357d6044fec8 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -1,12 +1,12 @@ from __future__ import annotations -from collections import OrderedDict -from enum import Enum import logging import typing +from collections import OrderedDict +from enum import Enum +from typing import Any, Dict, List, Optional, Text, Tuple, Callable, Type import numpy as np -from typing import Any, Dict, List, Optional, Text, Tuple, Callable, Type import rasa.nlu.utils.bilou_utils as bilou_utils import rasa.shared.utils.io @@ -15,13 +15,12 @@ from rasa.engine.recipes.default_recipe import DefaultV1Recipe from rasa.engine.storage.resource import Resource from rasa.engine.storage.storage import ModelStorage +from rasa.nlu.constants import TOKENS_NAMES +from rasa.nlu.extractors.extractor import EntityExtractorMixin from rasa.nlu.test import determine_token_labels from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY -from rasa.nlu.extractors.extractor import EntityExtractorMixin from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.shared.nlu.training_data.message import 
Message -from rasa.nlu.constants import TOKENS_NAMES +from rasa.shared.constants import DOCS_URL_COMPONENTS from rasa.shared.nlu.constants import ( TEXT, ENTITIES, @@ -32,7 +31,8 @@ SPLIT_ENTITIES_BY_COMMA, SPLIT_ENTITIES_BY_COMMA_DEFAULT_VALUE, ) -from rasa.shared.constants import DOCS_URL_COMPONENTS +from rasa.shared.nlu.training_data.message import Message +from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.utils.tensorflow.constants import BILOU_FLAG, FEATURIZERS logger = logging.getLogger(__name__) @@ -41,6 +41,9 @@ from sklearn_crfsuite import CRF +CONFIG_FEATURES = "features" + + class CRFToken: def __init__( self, @@ -60,6 +63,29 @@ def __init__( self.entity_role_tag = entity_role_tag self.entity_group_tag = entity_group_tag + def to_dict(self) -> Dict[str, Any]: + return { + "text": self.text, + "pos_tag": self.pos_tag, + "pattern": self.pattern, + "dense_features": [str(x) for x in list(self.dense_features)], + "entity_tag": self.entity_tag, + "entity_role_tag": self.entity_role_tag, + "entity_group_tag": self.entity_group_tag, + } + + @classmethod + def create_from_dict(cls, data: Dict[str, Any]) -> "CRFToken": + return cls( + data["text"], + data["pos_tag"], + data["pattern"], + np.array([float(x) for x in data["dense_features"]]), + data["entity_tag"], + data["entity_role_tag"], + data["entity_group_tag"], + ) + class CRFEntityExtractorOptions(str, Enum): """Features that can be used for the 'CRFEntityExtractor'.""" @@ -137,7 +163,7 @@ def get_default_config() -> Dict[Text, Any]: # "is the preceding token in title case?" 
# POS features require SpacyTokenizer # pattern feature require RegexFeaturizer - CRFEntityExtractor.CONFIG_FEATURES: [ + CONFIG_FEATURES: [ [ CRFEntityExtractorOptions.LOW, CRFEntityExtractorOptions.TITLE, @@ -200,7 +226,7 @@ def __init__( ) def _validate_configuration(self) -> None: - if len(self.component_config.get(self.CONFIG_FEATURES, [])) % 2 != 1: + if len(self.component_config.get(CONFIG_FEATURES, [])) % 2 != 1: raise ValueError( "Need an odd number of crf feature lists to have a center word." ) @@ -251,9 +277,11 @@ def train(self, training_data: TrainingData) -> Resource: ] dataset = [self._convert_to_crf_tokens(example) for example in entity_examples] - self._train_model(dataset) + self.entity_taggers = self.train_model( + dataset, self.component_config, self.crf_order + ) - self.persist() + self.persist(dataset) return self._resource @@ -299,7 +327,9 @@ def extract_entities(self, message: Message) -> List[Dict[Text, Any]]: if include_tag_features: self._add_tag_to_crf_token(crf_tokens, predictions) - features = self._crf_tokens_to_features(crf_tokens, include_tag_features) + features = self._crf_tokens_to_features( + crf_tokens, self.component_config, include_tag_features + ) predictions[tag_name] = entity_tagger.predict_marginals_single(features) # convert predictions into a list of tags and a list of confidences @@ -389,27 +419,25 @@ def load( **kwargs: Any, ) -> CRFEntityExtractor: """Loads trained component (see parent class for full docstring).""" - import joblib - try: - entity_taggers = OrderedDict() with model_storage.read_from(resource) as model_dir: - # We have to load in the same order as we persisted things as otherwise - # the predictions might be off - file_names = sorted(model_dir.glob("**/*.pkl")) - if not file_names: - logger.debug( - "Failed to load model for 'CRFEntityExtractor'. " - "Maybe you did not provide enough training data and " - "no model was trained." 
- ) - return cls(config, model_storage, resource) + dataset = rasa.shared.utils.io.read_json_file( + model_dir / "crf_dataset.json" + ) + crf_order = rasa.shared.utils.io.read_json_file( + model_dir / "crf_order.json" + ) + + dataset = [ + [CRFToken.create_from_dict(token_data) for token_data in sub_list] + for sub_list in dataset + ] - for file_name in file_names: - name = file_name.stem[1:] - entity_taggers[name] = joblib.load(file_name) + entity_taggers = cls.train_model(dataset, config, crf_order) - return cls(config, model_storage, resource, entity_taggers) + entity_extractor = cls(config, model_storage, resource, entity_taggers) + entity_extractor.crf_order = crf_order + return entity_extractor except ValueError: logger.warning( f"Failed to load {cls.__name__} from model storage. Resource " @@ -417,23 +445,29 @@ def load( ) return cls(config, model_storage, resource) - def persist(self) -> None: + def persist(self, dataset: List[List[CRFToken]]) -> None: """Persist this model into the passed directory.""" - import joblib - with self._model_storage.write_to(self._resource) as model_dir: - if self.entity_taggers: - for idx, (name, entity_tagger) in enumerate( - self.entity_taggers.items() - ): - model_file_name = model_dir / f"{idx}{name}.pkl" - joblib.dump(entity_tagger, model_file_name) + data_to_store = [ + [token.to_dict() for token in sub_list] for sub_list in dataset + ] + rasa.shared.utils.io.dump_obj_as_json_to_file( + model_dir / "crf_dataset.json", data_to_store + ) + rasa.shared.utils.io.dump_obj_as_json_to_file( + model_dir / "crf_order.json", self.crf_order + ) + + @classmethod def _crf_tokens_to_features( - self, crf_tokens: List[CRFToken], include_tag_features: bool = False + cls, + crf_tokens: List[CRFToken], + config: Dict[str, Any], + include_tag_features: bool = False, ) -> List[Dict[Text, Any]]: """Convert the list of tokens into discrete features.""" - configured_features = self.component_config[self.CONFIG_FEATURES] + configured_features = 
config[CONFIG_FEATURES] sentence_features = [] for token_idx in range(len(crf_tokens)): @@ -444,28 +478,31 @@ def _crf_tokens_to_features( half_window_size = window_size // 2 window_range = range(-half_window_size, half_window_size + 1) - token_features = self._create_features_for_token( + token_features = cls._create_features_for_token( crf_tokens, token_idx, half_window_size, window_range, include_tag_features, + config, ) sentence_features.append(token_features) return sentence_features + @classmethod def _create_features_for_token( - self, + cls, crf_tokens: List[CRFToken], token_idx: int, half_window_size: int, window_range: range, include_tag_features: bool, + config: Dict[str, Any], ) -> Dict[Text, Any]: """Convert a token into discrete features including words before and after.""" - configured_features = self.component_config[self.CONFIG_FEATURES] + configured_features = config[CONFIG_FEATURES] prefixes = [str(i) for i in window_range] token_features = {} @@ -505,13 +542,13 @@ def _create_features_for_token( # set in the training data, 'matched' is either 'True' or # 'False' depending on whether the token actually matches the # pattern or not - regex_patterns = self.function_dict[feature](token) + regex_patterns = cls.function_dict[feature](token) for pattern_name, matched in regex_patterns.items(): token_features[ f"{prefix}:{feature}:{pattern_name}" ] = matched else: - value = self.function_dict[feature](token) + value = cls.function_dict[feature](token) token_features[f"{prefix}:{feature}"] = value return token_features @@ -635,38 +672,46 @@ def _get_tags(self, message: Message) -> Dict[Text, List[Text]]: return tags - def _train_model(self, df_train: List[List[CRFToken]]) -> None: + @classmethod + def train_model( + cls, + df_train: List[List[CRFToken]], + config: Dict[str, Any], + crf_order: List[str], + ) -> OrderedDict[str, CRF]: """Train the crf tagger based on the training data.""" import sklearn_crfsuite - self.entity_taggers = OrderedDict() + 
entity_taggers = OrderedDict() - for tag_name in self.crf_order: + for tag_name in crf_order: logger.debug(f"Training CRF for '{tag_name}'.") # add entity tag features for second level CRFs include_tag_features = tag_name != ENTITY_ATTRIBUTE_TYPE X_train = ( - self._crf_tokens_to_features(sentence, include_tag_features) + cls._crf_tokens_to_features(sentence, config, include_tag_features) for sentence in df_train ) y_train = ( - self._crf_tokens_to_tags(sentence, tag_name) for sentence in df_train + cls._crf_tokens_to_tags(sentence, tag_name) for sentence in df_train ) entity_tagger = sklearn_crfsuite.CRF( algorithm="lbfgs", # coefficient for L1 penalty - c1=self.component_config["L1_c"], + c1=config["L1_c"], # coefficient for L2 penalty - c2=self.component_config["L2_c"], + c2=config["L2_c"], # stop earlier - max_iterations=self.component_config["max_iterations"], + max_iterations=config["max_iterations"], # include transitions that are possible, but not observed all_possible_transitions=True, ) entity_tagger.fit(X_train, y_train) - self.entity_taggers[tag_name] = entity_tagger + entity_taggers[tag_name] = entity_tagger logger.debug("Training finished.") + + return entity_taggers diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 0c76b71fa6ea..98cecba9ca3e 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -1,29 +1,31 @@ from __future__ import annotations + import logging import re +from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type, Union + +import numpy as np import scipy.sparse -from typing import Any, Dict, List, Optional, Text, Tuple, Set, Type -from rasa.nlu.tokenizers.tokenizer import Tokenizer +from sklearn.feature_extraction.text import CountVectorizer import rasa.shared.utils.io from rasa.engine.graph import GraphComponent, 
ExecutionContext from rasa.engine.recipes.default_recipe import DefaultV1Recipe from rasa.engine.storage.resource import Resource from rasa.engine.storage.storage import ModelStorage -from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer -from rasa.nlu.utils.spacy_utils import SpacyModel -from rasa.shared.constants import DOCS_URL_COMPONENTS -import rasa.utils.io as io_utils -from sklearn.feature_extraction.text import CountVectorizer -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.shared.nlu.training_data.message import Message -from rasa.shared.exceptions import RasaException, FileIOException from rasa.nlu.constants import ( TOKENS_NAMES, MESSAGE_ATTRIBUTES, DENSE_FEATURIZABLE_ATTRIBUTES, ) +from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer +from rasa.nlu.tokenizers.tokenizer import Tokenizer +from rasa.nlu.utils.spacy_utils import SpacyModel +from rasa.shared.constants import DOCS_URL_COMPONENTS +from rasa.shared.exceptions import RasaException, FileIOException from rasa.shared.nlu.constants import TEXT, INTENT, INTENT_RESPONSE_KEY, ACTION_NAME +from rasa.shared.nlu.training_data.message import Message +from rasa.shared.nlu.training_data.training_data import TrainingData BUFFER_SLOTS_PREFIX = "buf_" @@ -686,6 +688,31 @@ def _is_any_model_trained( """Check if any model got trained.""" return any(value is not None for value in attribute_vocabularies.values()) + @staticmethod + def convert_vocab( + vocab: Dict[str, Union[int, Optional[Dict[str, int]]]], to_int: bool + ) -> Dict[str, Union[None, int, np.int64, Dict[str, Union[int, np.int64]]]]: + """Converts vocabulary indices to Python ints (`to_int`) or to numpy int64.""" + + def convert_value(value: int) -> Union[int, np.int64]: + """Helper function to convert a single value based on to_int flag.""" + return int(value) if to_int else np.int64(value) + + result_dict: Dict[ + str, Union[None, int, np.int64, Dict[str,
Union[int, np.int64]]] + ] = {} + for key, sub_dict in vocab.items(): + if isinstance(sub_dict, int): + result_dict[key] = convert_value(sub_dict) + elif not sub_dict: + result_dict[key] = None + else: + result_dict[key] = { + sub_key: convert_value(value) for sub_key, value in sub_dict.items() + } + + return result_dict + def persist(self) -> None: """Persist this model into the passed directory. @@ -699,17 +726,18 @@ def persist(self) -> None: attribute_vocabularies = self._collect_vectorizer_vocabularies() if self._is_any_model_trained(attribute_vocabularies): # Definitely need to persist some vocabularies - featurizer_file = model_dir / "vocabularies.pkl" + featurizer_file = model_dir / "vocabularies.json" # Only persist vocabulary from one attribute if `use_shared_vocab`. # Can be loaded and distributed to all attributes. - vocab = ( + loaded_vocab = ( attribute_vocabularies[TEXT] if self.use_shared_vocab else attribute_vocabularies ) + vocab = self.convert_vocab(loaded_vocab, to_int=True) - io_utils.json_pickle(featurizer_file, vocab) + rasa.shared.utils.io.dump_obj_as_json_to_file(featurizer_file, vocab) # Dump OOV words separately as they might have been modified during # training @@ -784,8 +812,9 @@ def load( """Loads trained component (see parent class for full docstring).""" try: with model_storage.read_from(resource) as model_dir: - featurizer_file = model_dir / "vocabularies.pkl" - vocabulary = io_utils.json_unpickle(featurizer_file) + featurizer_file = model_dir / "vocabularies.json" + vocabulary = rasa.shared.utils.io.read_json_file(featurizer_file) + vocabulary = cls.convert_vocab(vocabulary, to_int=False) share_vocabulary = config["use_shared_vocab"] diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index 92312197755a..2c4ee3928348 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ 
b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -1,9 +1,7 @@ from __future__ import annotations + import logging from collections import OrderedDict - -import scipy.sparse -import numpy as np from typing import ( Any, Dict, @@ -17,30 +15,34 @@ Union, ) +import numpy as np +import scipy.sparse + +import rasa.shared.utils.io +import rasa.utils.io from rasa.engine.graph import ExecutionContext, GraphComponent from rasa.engine.recipes.default_recipe import DefaultV1Recipe from rasa.engine.storage.resource import Resource from rasa.engine.storage.storage import ModelStorage +from rasa.nlu.constants import TOKENS_NAMES +from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY, SpacyTokenizer from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer -from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer -from rasa.nlu.constants import TOKENS_NAMES from rasa.shared.constants import DOCS_URL_COMPONENTS -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.shared.nlu.training_data.message import Message -from rasa.shared.nlu.constants import TEXT from rasa.shared.exceptions import InvalidConfigException -import rasa.shared.utils.io -import rasa.utils.io +from rasa.shared.nlu.constants import TEXT +from rasa.shared.nlu.training_data.message import Message +from rasa.shared.nlu.training_data.training_data import TrainingData logger = logging.getLogger(__name__) - END_OF_SENTENCE = "EOS" BEGIN_OF_SENTENCE = "BOS" FEATURES = "features" +SEPERATOR = "###" + @DefaultV1Recipe.register( DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER, is_trainable=True @@ -72,7 +74,7 @@ class LexicalSyntacticFeaturizer(SparseFeaturizer, GraphComponent): of the token at position `t+1`. 
""" - FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.pkl" + FILENAME_FEATURE_TO_IDX_DICT = "feature_to_idx_dict.json" # NOTE: "suffix5" of the token "is" will be "is". Hence, when combining multiple # prefixes, short words will be represented/encoded repeatedly. @@ -489,6 +491,32 @@ def create( """Creates a new untrained component (see parent class for full docstring).""" return cls(config, model_storage, resource, execution_context) + @staticmethod + def _restructure_feature_to_idx_dict( + loaded_data: Dict[str, Dict[str, int]], + ) -> Dict[Tuple[int, str], Dict[str, int]]: + """Reconstructs the feature to idx dict. + + When storing the feature_to_idx_dict to disk, we need to convert the tuple (key) + into a string to be able to store it via json. When loading the data + we need to reconstruct the tuple from the stored string. + + Args: + loaded_data: The loaded feature to idx dict from file. + + Returns: + The reconstructed feature_to_idx_dict + """ + feature_to_idx_dict = {} + for tuple_string, feature_value in loaded_data.items(): + # Example of tuple_string: "1###low" + index, feature_name = tuple_string.split(SEPERATOR) + + feature_key = (int(index), feature_name) + feature_to_idx_dict[feature_key] = feature_value + + return feature_to_idx_dict + @classmethod def load( cls, @@ -501,10 +529,13 @@ def load( """Loads trained component (see parent class for full docstring).""" try: with model_storage.read_from(resource) as model_path: - feature_to_idx_dict = rasa.utils.io.json_unpickle( + loaded_data = rasa.shared.utils.io.read_json_file( model_path / cls.FILENAME_FEATURE_TO_IDX_DICT, - encode_non_string_keys=True, ) + + # convert the key back into tuple + feature_to_idx_dict = cls._restructure_feature_to_idx_dict(loaded_data) + return cls( config=config, model_storage=model_storage, @@ -529,9 +560,13 @@ def persist(self) -> None: if not self._feature_to_idx_dict: return None + # as we cannot dump tuples, convert the tuple into a string + 
restructured_feature_dict = { + f"{k[0]}{SEPERATOR}{k[1]}": v for k, v in self._feature_to_idx_dict.items() + } + with self._model_storage.write_to(self._resource) as model_path: - rasa.utils.io.json_pickle( + rasa.shared.utils.io.dump_obj_as_json_to_file( model_path / self.FILENAME_FEATURE_TO_IDX_DICT, - self._feature_to_idx_dict, - encode_non_string_keys=True, + restructured_feature_dict, ) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index fee53fd5b4f6..baed7f2c4852 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -1,11 +1,13 @@ from __future__ import annotations + import logging import re from typing import Any, Dict, List, Optional, Text, Tuple, Type + import numpy as np import scipy.sparse -from rasa.nlu.tokenizers.tokenizer import Tokenizer +from rasa.nlu.tokenizers.tokenizer import Tokenizer import rasa.shared.utils.io import rasa.utils.io import rasa.nlu.utils.pattern_utils as pattern_utils @@ -240,7 +242,7 @@ def load( try: with model_storage.read_from(resource) as model_dir: - patterns_file_name = model_dir / "patterns.pkl" + patterns_file_name = model_dir / "patterns.json" known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name) except (ValueError, FileNotFoundError): logger.warning( @@ -258,7 +260,7 @@ def load( def _persist(self) -> None: with self._model_storage.write_to(self._resource) as model_dir: - regex_file = model_dir / "patterns.pkl" + regex_file = model_dir / "patterns.json" rasa.shared.utils.io.dump_obj_as_json_to_file( regex_file, self.known_patterns ) diff --git a/rasa/shared/nlu/training_data/features.py b/rasa/shared/nlu/training_data/features.py index d981c1563bb0..0c5553df20c8 100644 --- a/rasa/shared/nlu/training_data/features.py +++ b/rasa/shared/nlu/training_data/features.py @@ -1,15 +1,133 @@ from __future__ import annotations -from typing 
import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set + import itertools +from dataclasses import dataclass +from typing import Iterable, Union, Text, Optional, List, Any, Tuple, Dict, Set import numpy as np import scipy.sparse +from safetensors.numpy import save_file, load_file -import rasa.shared.utils.io import rasa.shared.nlu.training_data.util +import rasa.shared.utils.io from rasa.shared.nlu.constants import FEATURE_TYPE_SEQUENCE, FEATURE_TYPE_SENTENCE +@dataclass +class FeatureMetadata: + data_type: str + attribute: str + origin: Union[str, List[str]] + is_sparse: bool + shape: tuple + safetensors_key: str + + +def save_features( + features_dict: Dict[Text, List[Features]], file_name: str +) -> Dict[str, Any]: + """Save a dictionary of Features lists to disk using safetensors. + + Args: + features_dict: Dictionary mapping strings to lists of Features objects + file_name: File to save the features to + + Returns: + The metadata to reconstruct the features. + """ + # All tensors are stored in a single safetensors file + tensors_to_save = {} + # Metadata will be stored separately + metadata = {} + + for key, features_list in features_dict.items(): + feature_metadata_list = [] + + for idx, feature in enumerate(features_list): + # Create a unique key for this tensor in the safetensors file + safetensors_key = f"{key}_{idx}" + + # Convert sparse matrices to dense if needed + if feature.is_sparse(): + # For sparse matrices, use the COO format + coo = feature.features.tocoo() # type:ignore[union-attr] + # Save data, row indices and col indices separately + tensors_to_save[f"{safetensors_key}_data"] = coo.data + tensors_to_save[f"{safetensors_key}_row"] = coo.row + tensors_to_save[f"{safetensors_key}_col"] = coo.col + else: + tensors_to_save[safetensors_key] = feature.features + + # Store metadata + metadata_item = FeatureMetadata( + data_type=feature.type, + attribute=feature.attribute, + origin=feature.origin, + is_sparse=feature.is_sparse(), + 
shape=feature.features.shape, + safetensors_key=safetensors_key, + ) + feature_metadata_list.append(vars(metadata_item)) + + metadata[key] = feature_metadata_list + + # Save tensors + save_file(tensors_to_save, file_name) + + return metadata + + +def load_features( + filename: str, metadata: Dict[str, Any] +) -> Dict[Text, List[Features]]: + """Load Features dictionary from disk. + + Args: + filename: File name of the safetensors file. + metadata: Metadata to reconstruct the features. + + Returns: + Dictionary mapping strings to lists of Features objects + """ + # Load tensors + tensors = load_file(filename) + + # Reconstruct the features dictionary + features_dict: Dict[Text, List[Features]] = {} + + for key, feature_metadata_list in metadata.items(): + features_list = [] + + for meta in feature_metadata_list: + safetensors_key = meta["safetensors_key"] + + if meta["is_sparse"]: + # Reconstruct sparse matrix from COO format + data = tensors[f"{safetensors_key}_data"] + row = tensors[f"{safetensors_key}_row"] + col = tensors[f"{safetensors_key}_col"] + + features_matrix = scipy.sparse.coo_matrix( + (data, (row, col)), shape=tuple(meta["shape"]) + ).tocsr() # Convert back to CSR format + else: + features_matrix = tensors[safetensors_key] + + # Reconstruct Features object + features = Features( + features=features_matrix, + feature_type=meta["data_type"], + attribute=meta["attribute"], + origin=meta["origin"], + ) + + features_list.append(features) + + features_dict[key] = features_list + + return features_dict + + class Features: """Stores the features produced by any featurizer.""" diff --git a/rasa/shared/utils/io.py b/rasa/shared/utils/io.py index de2b1bc28f6c..3b13b5c18063 100644 --- a/rasa/shared/utils/io.py +++ b/rasa/shared/utils/io.py @@ -12,6 +12,7 @@ import warnings import random import string + import portalocker from ruamel import yaml as yaml diff --git a/rasa/utils/common.py b/rasa/utils/common.py index 27b754664317..f99f54042a1f 100644 --- 
a/rasa/utils/common.py +++ b/rasa/utils/common.py @@ -8,6 +8,7 @@ import tempfile import warnings from pathlib import Path +from socket import SOCK_DGRAM, SOCK_STREAM from types import TracebackType from typing import ( Any, @@ -24,8 +25,9 @@ Tuple, ) -from socket import SOCK_DGRAM, SOCK_STREAM import numpy as np + +import rasa.shared.utils.io import rasa.utils.io from rasa.constants import ( DEFAULT_LOG_LEVEL_LIBRARIES, @@ -36,7 +38,6 @@ ) from rasa.shared.constants import DEFAULT_LOG_LEVEL, ENV_LOG_LEVEL, TCP_PROTOCOL from rasa.shared.exceptions import RasaException -import rasa.shared.utils.io logger = logging.getLogger(__name__) @@ -153,7 +154,7 @@ def configure_logging_from_file(logging_config_file: Text) -> None: try: logging.config.dictConfig(logging_config_dict) except (ValueError, TypeError, AttributeError, ImportError) as e: - logging.debug( + logger.debug( f"The logging config file {logging_config_file} could not " f"be applied because it failed validation against " f"the built-in Python logging schema. " diff --git a/rasa/utils/io.py b/rasa/utils/io.py index 3388ef98b049..da0800c61f0a 100644 --- a/rasa/utils/io.py +++ b/rasa/utils/io.py @@ -2,13 +2,13 @@ import filecmp import logging import os -import pickle +import re import tempfile import warnings -import re from asyncio import AbstractEventLoop from pathlib import Path -from typing import Text, Any, Union, List, Type, Callable, TYPE_CHECKING, Pattern +from typing import Text, Any, List, Type, Callable, TYPE_CHECKING, Pattern + from typing_extensions import Protocol import rasa.shared.constants @@ -81,29 +81,6 @@ def enable_async_loop_debugging( return event_loop -def pickle_dump(filename: Union[Text, Path], obj: Any) -> None: - """Saves object to file. - - Args: - filename: the filename to save the object to - obj: the object to store - """ - with open(filename, "wb") as f: - pickle.dump(obj, f) - - -def pickle_load(filename: Union[Text, Path]) -> Any: - """Loads an object from a file. 
- - Args: - filename: the filename to load the object from - - Returns: the loaded object - """ - with open(filename, "rb") as f: - return pickle.load(f) - - def create_temporary_file(data: Any, suffix: Text = "", mode: Text = "w+") -> Text: """Creates a tempfile.NamedTemporaryFile object for data.""" encoding = None if "b" in mode else rasa.shared.utils.io.DEFAULT_ENCODING @@ -124,7 +101,6 @@ def create_temporary_directory() -> Text: def create_path(file_path: Text) -> None: """Makes sure all directories in the 'file_path' exists.""" - parent_dir = os.path.dirname(os.path.abspath(file_path)) if not os.path.exists(parent_dir): os.makedirs(parent_dir) @@ -160,8 +136,8 @@ def create_validator( function: Callable[[Text], bool], error_message: Text ) -> Type["Validator"]: """Helper method to create `Validator` classes from callable functions. Should be - removed when questionary supports `Validator` objects.""" - + removed when questionary supports `Validator` objects. + """ from prompt_toolkit.validation import Validator, ValidationError from prompt_toolkit.document import Document @@ -175,48 +151,6 @@ def validate(document: Document) -> None: return FunctionValidator -def json_unpickle( - file_name: Union[Text, Path], encode_non_string_keys: bool = False -) -> Any: - """Unpickle an object from file using json. - - Args: - file_name: the file to load the object from - encode_non_string_keys: If set to `True` then jsonpickle will encode non-string - dictionary keys instead of coercing them into strings via `repr()`. - - Returns: the object - """ - import jsonpickle.ext.numpy as jsonpickle_numpy - import jsonpickle - - jsonpickle_numpy.register_handlers() - - file_content = rasa.shared.utils.io.read_file(file_name) - return jsonpickle.loads(file_content, keys=encode_non_string_keys) - - -def json_pickle( - file_name: Union[Text, Path], obj: Any, encode_non_string_keys: bool = False -) -> None: - """Pickle an object to a file using json. 
- - Args: - file_name: the file to store the object to - obj: the object to store - encode_non_string_keys: If set to `True` then jsonpickle will encode non-string - dictionary keys instead of coercing them into strings via `repr()`. - """ - import jsonpickle.ext.numpy as jsonpickle_numpy - import jsonpickle - - jsonpickle_numpy.register_handlers() - - rasa.shared.utils.io.write_text_file( - jsonpickle.dumps(obj, keys=encode_non_string_keys), file_name - ) - - def get_emoji_regex() -> Pattern: """Returns regex to identify emojis.""" return re.compile( diff --git a/rasa/utils/tensorflow/feature_array.py b/rasa/utils/tensorflow/feature_array.py new file mode 100644 index 000000000000..9af50e3ceb66 --- /dev/null +++ b/rasa/utils/tensorflow/feature_array.py @@ -0,0 +1,370 @@ +from typing import Dict, Any, List, Tuple, Optional, Union + +import numpy as np +import scipy.sparse +from safetensors.numpy import load_file +from safetensors.numpy import save_file + +import rasa.shared.utils.io + + +def _recursive_serialize( + array: Any, prefix: str, data_dict: Dict[str, Any], metadata: List[Dict[str, Any]] +) -> None: + """Recursively serialize arrays and matrices for high dimensional data.""" + if isinstance(array, np.ndarray) and array.ndim <= 2: + data_key = f"{prefix}_array" + data_dict[data_key] = array + metadata.append({"type": "dense", "key": data_key, "shape": array.shape}) + + elif isinstance(array, list) and all([isinstance(v, float) for v in array]): + data_key = f"{prefix}_list" + data_dict[data_key] = np.array(array, dtype=np.float32) + metadata.append({"type": "list", "key": data_key}) + + elif isinstance(array, list) and all([isinstance(v, int) for v in array]): + data_key = f"{prefix}_list" + data_dict[data_key] = np.array(array, dtype=np.int64) + metadata.append({"type": "list", "key": data_key}) + + elif isinstance(array, scipy.sparse.spmatrix): + data_key_data = f"{prefix}_data" + data_key_row = f"{prefix}_row" + data_key_col = f"{prefix}_col" + array = 
array.tocoo() + data_dict.update( + { + data_key_data: array.data, + data_key_row: array.row, + data_key_col: array.col, + } + ) + metadata.append({"type": "sparse", "key": prefix, "shape": array.shape}) + + elif isinstance(array, list) or isinstance(array, np.ndarray): + group_metadata = {"type": "group", "subcomponents": []} + for idx, item in enumerate(array): + new_prefix = f"{prefix}_{idx}" + _recursive_serialize( + item, new_prefix, data_dict, group_metadata["subcomponents"] + ) + metadata.append(group_metadata) + + +def _serialize_nested_data( + nested_data: Dict[str, Dict[str, List["FeatureArray"]]], + prefix: str, + data_dict: Dict[str, np.ndarray], + metadata: List[Dict[str, Union[str, List]]], +) -> None: + """Handle serialization across dictionary and list levels.""" + for outer_key, inner_dict in nested_data.items(): + inner_metadata = {"key": outer_key, "components": []} + + for inner_key, feature_arrays in inner_dict.items(): + array_metadata = { + "key": inner_key, + "number_of_dimensions": feature_arrays[0].number_of_dimensions, + "features": [], + } + + for idx, feature_array in enumerate(feature_arrays): + feature_prefix = f"{prefix}_{outer_key}_{inner_key}_{idx}" + _recursive_serialize( + feature_array.tolist(), + feature_prefix, + data_dict, + array_metadata["features"], + ) + + inner_metadata["components"].append( # type:ignore[attr-defined] + array_metadata + ) + + metadata.append(inner_metadata) + + +def serialize_nested_feature_arrays( + nested_feature_array: Dict[str, Dict[str, List["FeatureArray"]]], + data_filename: str, + metadata_filename: str, +) -> None: + data_dict: Dict[str, np.ndarray] = {} + metadata: List[Dict[str, Union[str, List]]] = [] + + _serialize_nested_data(nested_feature_array, "component", data_dict, metadata) + + # Save serialized data and metadata + save_file(data_dict, data_filename) + rasa.shared.utils.io.dump_obj_as_json_to_file(metadata_filename, metadata) + + +def _recursive_deserialize( + metadata: 
List[Dict[str, Any]], data: Dict[str, Any] +) -> List[Any]: + """Recursively deserialize arrays and matrices for high dimensional data.""" + result = [] + + for item in metadata: + if item["type"] == "dense": + key = item["key"] + array = np.asarray(data[key]).reshape(item["shape"]) + result.append(array) + + elif item["type"] == "list": + key = item["key"] + result.append(list(data[key])) + + elif item["type"] == "sparse": + data_vals = data[f"{item['key']}_data"] + row_vals = data[f"{item['key']}_row"] + col_vals = data[f"{item['key']}_col"] + sparse_matrix = scipy.sparse.coo_matrix( + (data_vals, (row_vals, col_vals)), shape=item["shape"] + ) + result.append(sparse_matrix) + elif item["type"] == "group": + sublist = _recursive_deserialize(item["subcomponents"], data) + result.append(sublist) + + return result + + +def _deserialize_nested_data( + metadata: List[Dict[str, Any]], data_dict: Dict[str, Any] +) -> Dict[str, Dict[str, List["FeatureArray"]]]: + """Handle deserialization across all dictionary and list levels.""" + result: Dict[str, Dict[str, List["FeatureArray"]]] = {} + + for outer_item in metadata: + outer_key = outer_item["key"] + result[outer_key] = {} + + for inner_item in outer_item["components"]: + inner_key = inner_item["key"] + feature_arrays = [] + + # Reconstruct the list of FeatureArrays + for feature_item in inner_item["features"]: + # Reconstruct the list of FeatureArrays + feature_array_data = _recursive_deserialize([feature_item], data_dict) + # Prepare the input for the FeatureArray; + # ensure it is np.ndarray compatible + input_array = np.array(feature_array_data[0], dtype=object) + feature_array = FeatureArray( + input_array, inner_item["number_of_dimensions"] + ) + feature_arrays.append(feature_array) + + result[outer_key][inner_key] = feature_arrays + + return result + + +def deserialize_nested_feature_arrays( + data_filename: str, metadata_filename: str +) -> Dict[str, Dict[str, List["FeatureArray"]]]: + metadata = 
rasa.shared.utils.io.read_json_file(metadata_filename) + data_dict = load_file(data_filename) + + return _deserialize_nested_data(metadata, data_dict) + + +class FeatureArray(np.ndarray): + """Stores any kind of features ready to be used by a RasaModel. + + Next to the input numpy array of features, it also received the number of + dimensions of the features. + As our features can have 1 to 4 dimensions we might have different number of numpy + arrays stacked. The number of dimensions helps us to figure out how to handle this + particular feature array. Also, it is automatically determined whether the feature + array is sparse or not and the number of units is determined as well. + + Subclassing np.array: https://numpy.org/doc/stable/user/basics.subclassing.html + """ + + def __new__( + cls, input_array: np.ndarray, number_of_dimensions: int + ) -> "FeatureArray": + """Create and return a new object. See help(type) for accurate signature.""" + FeatureArray._validate_number_of_dimensions(number_of_dimensions, input_array) + + feature_array = np.asarray(input_array).view(cls) + + if number_of_dimensions <= 2: + feature_array.units = input_array.shape[-1] + feature_array.is_sparse = isinstance(input_array[0], scipy.sparse.spmatrix) + elif number_of_dimensions == 3: + feature_array.units = input_array[0].shape[-1] + feature_array.is_sparse = isinstance(input_array[0], scipy.sparse.spmatrix) + elif number_of_dimensions == 4: + feature_array.units = input_array[0][0].shape[-1] + feature_array.is_sparse = isinstance( + input_array[0][0], scipy.sparse.spmatrix + ) + else: + raise ValueError( + f"Number of dimensions '{number_of_dimensions}' currently not " + f"supported." + ) + + feature_array.number_of_dimensions = number_of_dimensions + + return feature_array + + def __init__( + self, input_array: Any, number_of_dimensions: int, **kwargs: Any + ) -> None: + """Initialize. FeatureArray. 
+ + Needed in order to avoid 'Invalid keyword argument number_of_dimensions + to function FeatureArray.__init__ ' + Args: + input_array: the array that contains features + number_of_dimensions: number of dimensions in input_array + """ + super().__init__(**kwargs) + self.number_of_dimensions = number_of_dimensions + + def __array_finalize__(self, obj: Optional[np.ndarray]) -> None: + """This method is called when the system allocates a new array from obj. + + Args: + obj: A subclass (subtype) of ndarray. + """ + if obj is None: + return + + self.units = getattr(obj, "units", None) + self.number_of_dimensions = getattr( + obj, "number_of_dimensions", None + ) # type: ignore[assignment] + self.is_sparse = getattr(obj, "is_sparse", None) + + default_attributes = { + "units": self.units, + "number_of_dimensions": self.number_of_dimensions, + "is_spare": self.is_sparse, + } + self.__dict__.update(default_attributes) + + # pytype: disable=attribute-error + def __array_ufunc__( + self, ufunc: Any, method: str, *inputs: Any, **kwargs: Any + ) -> Any: + """Overwrite this method as we are subclassing numpy array. + + Args: + ufunc: The ufunc object that was called. + method: A string indicating which Ufunc method was called + (one of "__call__", "reduce", "reduceat", "accumulate", "outer", + "inner"). + *inputs: A tuple of the input arguments to the ufunc. + **kwargs: Any additional arguments + + Returns: + The result of the operation. 
+ """ + f = { + "reduce": ufunc.reduce, + "accumulate": ufunc.accumulate, + "reduceat": ufunc.reduceat, + "outer": ufunc.outer, + "at": ufunc.at, + "__call__": ufunc, + } + # convert the inputs to np.ndarray to prevent recursion, call the function, + # then cast it back as FeatureArray + output = FeatureArray( + f[method](*(i.view(np.ndarray) for i in inputs), **kwargs), + number_of_dimensions=kwargs["number_of_dimensions"], + ) + output.__dict__ = self.__dict__ # carry forward attributes + return output + + def __reduce__(self) -> Tuple[Any, Any, Any]: + """Needed in order to pickle this object. + + Returns: + A tuple. + """ + pickled_state = super(FeatureArray, self).__reduce__() + if isinstance(pickled_state, str): + raise TypeError("np array __reduce__ returned string instead of tuple.") + new_state = pickled_state[2] + ( + self.number_of_dimensions, + self.is_sparse, + self.units, + ) + return pickled_state[0], pickled_state[1], new_state + + def __setstate__(self, state: Any, **kwargs: Any) -> None: + """Sets the state. + + Args: + state: The state argument must be a sequence that contains the following + elements version, shape, dtype, isFortan, rawdata. + **kwargs: Any additional parameter + """ + # Needed in order to load the object + self.number_of_dimensions = state[-3] + self.is_sparse = state[-2] + self.units = state[-1] + super(FeatureArray, self).__setstate__(state[0:-3], **kwargs) + + # pytype: enable=attribute-error + + @staticmethod + def _validate_number_of_dimensions( + number_of_dimensions: int, input_array: np.ndarray + ) -> None: + """Validates if the input array has given number of dimensions. 
+ + Args: + number_of_dimensions: number of dimensions + input_array: input array + + Raises: ValueError in case the dimensions do not match + """ + # when loading the feature arrays from disk, the shape represents + # the correct number of dimensions + if len(input_array.shape) == number_of_dimensions: + return + + _sub_array = input_array + dim = 0 + # Go number_of_dimensions into the given input_array + for i in range(1, number_of_dimensions + 1): + _sub_array = _sub_array[0] + if isinstance(_sub_array, scipy.sparse.spmatrix): + dim = i + break + if isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0: + # sequence dimension is 0, we are dealing with "fake" features + dim = i + break + + # If the resulting sub_array is sparse, the remaining number of dimensions + # should be at least 2 + if isinstance(_sub_array, scipy.sparse.spmatrix): + if dim > 2: + raise ValueError( + f"Given number of dimensions '{number_of_dimensions}' does not " + f"match dimensions of given input array: {input_array}." + ) + elif isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0: + # sequence dimension is 0, we are dealing with "fake" features, + # but they should be of dim 2 + if dim > 2: + raise ValueError( + f"Given number of dimensions '{number_of_dimensions}' does not " + f"match dimensions of given input array: {input_array}." + ) + # If the resulting sub_array is dense, the sub_array should be a single number + elif not np.issubdtype(type(_sub_array), np.integer) and not isinstance( + _sub_array, (np.float32, np.float64) + ): + raise ValueError( + f"Given number of dimensions '{number_of_dimensions}' does not match " + f"dimensions of given input array: {input_array}." 
+ ) diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py index 128ff6cbd575..393756972305 100644 --- a/rasa/utils/tensorflow/model_data.py +++ b/rasa/utils/tensorflow/model_data.py @@ -20,6 +20,8 @@ import scipy.sparse from sklearn.model_selection import train_test_split +from rasa.utils.tensorflow.feature_array import FeatureArray + logger = logging.getLogger(__name__) @@ -37,199 +39,6 @@ def ragged_array_to_ndarray(ragged_array: Iterable[np.ndarray]) -> np.ndarray: return np.array(ragged_array, dtype=object) -class FeatureArray(np.ndarray): - """Stores any kind of features ready to be used by a RasaModel. - - Next to the input numpy array of features, it also received the number of - dimensions of the features. - As our features can have 1 to 4 dimensions we might have different number of numpy - arrays stacked. The number of dimensions helps us to figure out how to handle this - particular feature array. Also, it is automatically determined whether the feature - array is sparse or not and the number of units is determined as well. - - Subclassing np.array: https://numpy.org/doc/stable/user/basics.subclassing.html - """ - - def __new__( - cls, input_array: np.ndarray, number_of_dimensions: int - ) -> "FeatureArray": - """Create and return a new object. 
See help(type) for accurate signature.""" - FeatureArray._validate_number_of_dimensions(number_of_dimensions, input_array) - - feature_array = np.asarray(input_array).view(cls) - - if number_of_dimensions <= 2: - feature_array.units = input_array.shape[-1] - feature_array.is_sparse = isinstance(input_array[0], scipy.sparse.spmatrix) - elif number_of_dimensions == 3: - feature_array.units = input_array[0].shape[-1] - feature_array.is_sparse = isinstance(input_array[0], scipy.sparse.spmatrix) - elif number_of_dimensions == 4: - feature_array.units = input_array[0][0].shape[-1] - feature_array.is_sparse = isinstance( - input_array[0][0], scipy.sparse.spmatrix - ) - else: - raise ValueError( - f"Number of dimensions '{number_of_dimensions}' currently not " - f"supported." - ) - - feature_array.number_of_dimensions = number_of_dimensions - - return feature_array - - def __init__( - self, input_array: Any, number_of_dimensions: int, **kwargs: Any - ) -> None: - """Initialize. FeatureArray. - - Needed in order to avoid 'Invalid keyword argument number_of_dimensions - to function FeatureArray.__init__ ' - Args: - input_array: the array that contains features - number_of_dimensions: number of dimensions in input_array - """ - super().__init__(**kwargs) - self.number_of_dimensions = number_of_dimensions - - def __array_finalize__(self, obj: Optional[np.ndarray]) -> None: - """This method is called when the system allocates a new array from obj. - - Args: - obj: A subclass (subtype) of ndarray. 
- """ - if obj is None: - return - - self.units = getattr(obj, "units", None) - self.number_of_dimensions = getattr(obj, "number_of_dimensions", None) # type: ignore[assignment] # noqa:E501 - self.is_sparse = getattr(obj, "is_sparse", None) - - default_attributes = { - "units": self.units, - "number_of_dimensions": self.number_of_dimensions, - "is_spare": self.is_sparse, - } - self.__dict__.update(default_attributes) - - # pytype: disable=attribute-error - def __array_ufunc__( - self, ufunc: Any, method: Text, *inputs: Any, **kwargs: Any - ) -> Any: - """Overwrite this method as we are subclassing numpy array. - - Args: - ufunc: The ufunc object that was called. - method: A string indicating which Ufunc method was called - (one of "__call__", "reduce", "reduceat", "accumulate", "outer", - "inner"). - *inputs: A tuple of the input arguments to the ufunc. - **kwargs: Any additional arguments - - Returns: - The result of the operation. - """ - f = { - "reduce": ufunc.reduce, - "accumulate": ufunc.accumulate, - "reduceat": ufunc.reduceat, - "outer": ufunc.outer, - "at": ufunc.at, - "__call__": ufunc, - } - # convert the inputs to np.ndarray to prevent recursion, call the function, - # then cast it back as FeatureArray - output = FeatureArray( - f[method](*(i.view(np.ndarray) for i in inputs), **kwargs), - number_of_dimensions=kwargs["number_of_dimensions"], - ) - output.__dict__ = self.__dict__ # carry forward attributes - return output - - def __reduce__(self) -> Tuple[Any, Any, Any]: - """Needed in order to pickle this object. - - Returns: - A tuple. - """ - pickled_state = super(FeatureArray, self).__reduce__() - if isinstance(pickled_state, str): - raise TypeError("np array __reduce__ returned string instead of tuple.") - new_state = pickled_state[2] + ( - self.number_of_dimensions, - self.is_sparse, - self.units, - ) - return pickled_state[0], pickled_state[1], new_state - - def __setstate__(self, state: Any, **kwargs: Any) -> None: - """Sets the state. 
- - Args: - state: The state argument must be a sequence that contains the following - elements version, shape, dtype, isFortan, rawdata. - **kwargs: Any additional parameter - """ - # Needed in order to load the object - self.number_of_dimensions = state[-3] - self.is_sparse = state[-2] - self.units = state[-1] - super(FeatureArray, self).__setstate__(state[0:-3], **kwargs) - - # pytype: enable=attribute-error - - @staticmethod - def _validate_number_of_dimensions( - number_of_dimensions: int, input_array: np.ndarray - ) -> None: - """Validates if the the input array has given number of dimensions. - - Args: - number_of_dimensions: number of dimensions - input_array: input array - - Raises: ValueError in case the dimensions do not match - """ - _sub_array = input_array - dim = 0 - # Go number_of_dimensions into the given input_array - for i in range(1, number_of_dimensions + 1): - _sub_array = _sub_array[0] - if isinstance(_sub_array, scipy.sparse.spmatrix): - dim = i - break - if isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0: - # sequence dimension is 0, we are dealing with "fake" features - dim = i - break - - # If the resulting sub_array is sparse, the remaining number of dimensions - # should be at least 2 - if isinstance(_sub_array, scipy.sparse.spmatrix): - if dim > 2: - raise ValueError( - f"Given number of dimensions '{number_of_dimensions}' does not " - f"match dimensions of given input array: {input_array}." - ) - elif isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0: - # sequence dimension is 0, we are dealing with "fake" features, - # but they should be of dim 2 - if dim > 2: - raise ValueError( - f"Given number of dimensions '{number_of_dimensions}' does not " - f"match dimensions of given input array: {input_array}." 
- ) - # If the resulting sub_array is dense, the sub_array should be a single number - elif not np.issubdtype(type(_sub_array), np.integer) and not isinstance( - _sub_array, (np.float32, np.float64) - ): - raise ValueError( - f"Given number of dimensions '{number_of_dimensions}' does not match " - f"dimensions of given input array: {input_array}." - ) - - class FeatureSignature(NamedTuple): """Signature of feature arrays. @@ -270,8 +79,7 @@ def __init__( label_sub_key: Optional[Text] = None, data: Optional[Data] = None, ) -> None: - """ - Initializes the RasaModelData object. + """Initializes the RasaModelData object. Args: label_key: the key of a label used for balancing, etc. diff --git a/scripts/ping_slack_about_package_release.sh b/scripts/ping_slack_about_package_release.sh old mode 100755 new mode 100644 index ef97ead7a178..b8ee68b4b6cd --- a/scripts/ping_slack_about_package_release.sh +++ b/scripts/ping_slack_about_package_release.sh @@ -7,4 +7,3 @@ if [[ ${GITHUB_TAG} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then --data "{\"text\":\"💥 New *Rasa Open Source* version ${GITHUB_TAG} has been released! 
https://github.com/RasaHQ/rasa/releases/tag/${GITHUB_TAG}\"}" \ "https://hooks.slack.com/services/T0GHWFTS8/BMTQQL47K/${SLACK_WEBHOOK_TOKEN}" fi - diff --git a/tests/core/featurizers/test_tracker_featurizer.py b/tests/core/featurizers/test_tracker_featurizer.py index 99ffea6e9641..20a0a08f558d 100644 --- a/tests/core/featurizers/test_tracker_featurizer.py +++ b/tests/core/featurizers/test_tracker_featurizer.py @@ -34,7 +34,25 @@ def test_fail_to_load_non_existent_featurizer(): assert TrackerFeaturizer.load("non_existent_class") is None -def test_persist_and_load_tracker_featurizer(tmp_path: Text, moodbot_domain: Domain): +def test_persist_and_load_full_dialogue_tracker_featurizer( + tmp_path: Text, moodbot_domain: Domain +): + state_featurizer = SingleStateFeaturizer() + state_featurizer.prepare_for_training(moodbot_domain) + tracker_featurizer = FullDialogueTrackerFeaturizer(state_featurizer) + + tracker_featurizer.persist(tmp_path) + + loaded_tracker_featurizer = TrackerFeaturizer.load(tmp_path) + + assert loaded_tracker_featurizer is not None + assert loaded_tracker_featurizer.state_featurizer is not None + assert loaded_tracker_featurizer.to_dict() == tracker_featurizer.to_dict() + + +def test_persist_and_load_max_history_tracker_featurizer( + tmp_path: Text, moodbot_domain: Domain +): state_featurizer = SingleStateFeaturizer() state_featurizer.prepare_for_training(moodbot_domain) tracker_featurizer = MaxHistoryTrackerFeaturizer(state_featurizer) @@ -45,6 +63,23 @@ def test_persist_and_load_tracker_featurizer(tmp_path: Text, moodbot_domain: Dom assert loaded_tracker_featurizer is not None assert loaded_tracker_featurizer.state_featurizer is not None + assert loaded_tracker_featurizer.to_dict() == tracker_featurizer.to_dict() + + +def test_persist_and_load_intent_max_history_tracker_featurizer( + tmp_path: Text, moodbot_domain: Domain +): + state_featurizer = SingleStateFeaturizer() + state_featurizer.prepare_for_training(moodbot_domain) + tracker_featurizer = 
IntentMaxHistoryTrackerFeaturizer(state_featurizer) + + tracker_featurizer.persist(tmp_path) + + loaded_tracker_featurizer = TrackerFeaturizer.load(tmp_path) + + assert loaded_tracker_featurizer is not None + assert loaded_tracker_featurizer.state_featurizer is not None + assert loaded_tracker_featurizer.to_dict() == tracker_featurizer.to_dict() def test_convert_action_labels_to_ids(domain: Domain): @@ -127,7 +162,6 @@ def compare_featurized_states( """Compares two lists of featurized states and returns True if they are identical and False otherwise. """ - if len(states1) != len(states2): return False diff --git a/tests/nlu/extractors/test_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py index b2342ab30fb7..bf095385017b 100644 --- a/tests/nlu/extractors/test_crf_entity_extractor.py +++ b/tests/nlu/extractors/test_crf_entity_extractor.py @@ -1,23 +1,25 @@ import copy from typing import Dict, Text, List, Any, Callable +import numpy as np import pytest from rasa.engine.graph import ExecutionContext from rasa.engine.storage.resource import Resource from rasa.engine.storage.storage import ModelStorage +from rasa.nlu.constants import SPACY_DOCS +from rasa.nlu.extractors.crf_entity_extractor import ( + CRFEntityExtractor, + CRFEntityExtractorOptions, + CRFToken, +) from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer -from rasa.nlu.constants import SPACY_DOCS from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.utils.spacy_utils import SpacyModel, SpacyNLP from rasa.shared.importers.rasa import RasaFileImporter from rasa.shared.nlu.constants import TEXT, ENTITIES from rasa.shared.nlu.training_data.message import Message -from rasa.nlu.extractors.crf_entity_extractor import ( - CRFEntityExtractor, - CRFEntityExtractorOptions, -) @pytest.fixture() @@ -204,7 +206,7 @@ def test_crf_use_dense_features( 
spacy_featurizer.process([message]) text_data = crf_extractor._convert_to_crf_tokens(message) - features = crf_extractor._crf_tokens_to_features(text_data) + features = crf_extractor._crf_tokens_to_features(text_data, component_config) assert "0:text_dense_features" in features[0] dense_features, _ = message.get_dense_features(TEXT, []) @@ -249,3 +251,110 @@ def test_process_unfeaturized_input( assert processed_message.get(TEXT) == message_text assert processed_message.get(ENTITIES) == [] + + +@pytest.fixture +def sample_data(): + return { + "text": "apple", + "pos_tag": "NOUN", + "pattern": {"length": 5, "is_capitalized": False}, + "dense_features": np.array([0.1, 0.2, 0.3]), + "entity_tag": "B-FOOD", + "entity_role_tag": "INGREDIENT", + "entity_group_tag": "ITEM", + } + + +@pytest.fixture +def sample_token(sample_data): + return CRFToken( + sample_data["text"], + sample_data["pos_tag"], + sample_data["pattern"], + sample_data["dense_features"], + sample_data["entity_tag"], + sample_data["entity_role_tag"], + sample_data["entity_group_tag"], + ) + + +def test_crf_token_to_dict(sample_data, sample_token): + token_dict = sample_token.to_dict() + + assert token_dict["text"] == sample_data["text"] + assert token_dict["pos_tag"] == sample_data["pos_tag"] + assert token_dict["pattern"] == sample_data["pattern"] + assert token_dict["dense_features"] == [ + str(x) for x in sample_data["dense_features"] + ] + assert token_dict["entity_tag"] == sample_data["entity_tag"] + assert token_dict["entity_role_tag"] == sample_data["entity_role_tag"] + assert token_dict["entity_group_tag"] == sample_data["entity_group_tag"] + + +def test_crf_token_create_from_dict(sample_data): + dict_data = { + "text": sample_data["text"], + "pos_tag": sample_data["pos_tag"], + "pattern": sample_data["pattern"], + "dense_features": [str(x) for x in sample_data["dense_features"]], + "entity_tag": sample_data["entity_tag"], + "entity_role_tag": sample_data["entity_role_tag"], + "entity_group_tag": 
sample_data["entity_group_tag"], + } + + token = CRFToken.create_from_dict(dict_data) + + assert token.text == sample_data["text"] + assert token.pos_tag == sample_data["pos_tag"] + assert token.pattern == sample_data["pattern"] + np.testing.assert_array_equal(token.dense_features, sample_data["dense_features"]) + assert token.entity_tag == sample_data["entity_tag"] + assert token.entity_role_tag == sample_data["entity_role_tag"] + assert token.entity_group_tag == sample_data["entity_group_tag"] + + +def test_crf_token_roundtrip_conversion(sample_token): + token_dict = sample_token.to_dict() + new_token = CRFToken.create_from_dict(token_dict) + + assert new_token.text == sample_token.text + assert new_token.pos_tag == sample_token.pos_tag + assert new_token.pattern == sample_token.pattern + np.testing.assert_array_equal(new_token.dense_features, sample_token.dense_features) + assert new_token.entity_tag == sample_token.entity_tag + assert new_token.entity_role_tag == sample_token.entity_role_tag + assert new_token.entity_group_tag == sample_token.entity_group_tag + + +def test_crf_token_empty_dense_features(sample_data): + sample_data["dense_features"] = np.array([]) + token = CRFToken( + sample_data["text"], + sample_data["pos_tag"], + sample_data["pattern"], + sample_data["dense_features"], + sample_data["entity_tag"], + sample_data["entity_role_tag"], + sample_data["entity_group_tag"], + ) + token_dict = token.to_dict() + new_token = CRFToken.create_from_dict(token_dict) + np.testing.assert_array_equal(new_token.dense_features, np.array([])) + + +def test_crf_token_empty_pattern(sample_data): + sample_data["pattern"] = {} + token = CRFToken( + sample_data["text"], + sample_data["pos_tag"], + sample_data["pattern"], + sample_data["dense_features"], + sample_data["entity_tag"], + sample_data["entity_role_tag"], + sample_data["entity_group_tag"], + ) + token_dict = token.to_dict() + new_token = CRFToken.create_from_dict(token_dict) + assert new_token.pattern == {} 
diff --git a/tests/shared/nlu/training_data/test_features.py b/tests/shared/nlu/training_data/test_features.py index bd0f29fa046b..457e9648f28f 100644 --- a/tests/shared/nlu/training_data/test_features.py +++ b/tests/shared/nlu/training_data/test_features.py @@ -1,17 +1,56 @@ import itertools +import os +import tempfile +from pathlib import Path from typing import Optional, Text, List, Dict, Tuple, Any import numpy as np import pytest import scipy.sparse -from rasa.shared.nlu.training_data.features import Features from rasa.shared.nlu.constants import ( FEATURE_TYPE_SENTENCE, FEATURE_TYPE_SEQUENCE, TEXT, INTENT, ) +from rasa.shared.nlu.training_data.features import ( + Features, + FeatureMetadata, + save_features, + load_features, +) + + +@pytest.fixture +def safe_tensors_tmp_file() -> str: + with tempfile.NamedTemporaryFile(delete=False, suffix=".safetensors") as f: + yield f.name + os.unlink(f.name) + + +@pytest.fixture +def dense_features() -> Features: + features_matrix = np.array([[1, 2, 3], [4, 5, 6]]) + return Features( + features=features_matrix, + feature_type="dense", + attribute="test", + origin="test_origin", + ) + + +@pytest.fixture +def sparse_features() -> Features: + features_matrix = scipy.sparse.csr_matrix( + ([1, 2, 3], ([0, 1, 1], [0, 1, 2])), shape=(2, 3) + ) + return Features( + features=features_matrix, + feature_type="sparse", + attribute="test", + origin="test_origin", + ) @pytest.mark.parametrize( @@ -181,6 +220,7 @@ def _generate_feature_list_and_modifications( instantiate `Features` that differ from the aforementioned list of features in exactly one property (i.e. 
type, sequence length (if the given `type` is sequence type only), attribute, origin) + Args: is_sparse: whether all features should be sparse type: the type to be used for all features @@ -190,7 +230,6 @@ def _generate_feature_list_and_modifications( a list of kwargs dictionaries that can be used to instantiate `Features` that differ from the aforementioned list of features in exactly one property """ - seq_len = 3 first_dim = 1 if type == FEATURE_TYPE_SENTENCE else 3 @@ -467,3 +506,179 @@ def test_reduce_raises_if_combining_different_origins_or_attributes(differ: Text expected_origin = ["origin-1"] with pytest.raises(ValueError, match=message): Features.reduce(features_list, expected_origins=expected_origin) + + +def test_feature_metadata(): + metadata = FeatureMetadata( + data_type="dense", + attribute="text", + origin="test", + is_sparse=False, + shape=(10, 5), + safetensors_key="key_0", + ) + + assert metadata.data_type == "dense" + assert metadata.attribute == "text" + assert metadata.origin == "test" + assert not metadata.is_sparse + assert metadata.shape == (10, 5) + assert metadata.safetensors_key == "key_0" + + +def test_save_dense_features(safe_tensors_tmp_file: str, dense_features: Features): + features_dict = {"test_key": [dense_features]} + metadata = save_features(features_dict, safe_tensors_tmp_file) + + assert "test_key" in metadata + assert len(metadata["test_key"]) == 1 + assert metadata["test_key"][0]["data_type"] == "dense" + assert metadata["test_key"][0]["shape"] == (2, 3) + assert not metadata["test_key"][0]["is_sparse"] + assert Path(safe_tensors_tmp_file).exists() + + +def test_save_sparse_features(safe_tensors_tmp_file: str, sparse_features: Features): + features_dict = {"test_key": [sparse_features]} + metadata = save_features(features_dict, safe_tensors_tmp_file) + + assert "test_key" in metadata + assert len(metadata["test_key"]) == 1 + assert metadata["test_key"][0]["data_type"] == "sparse" + assert metadata["test_key"][0]["shape"] == 
(2, 3) + assert metadata["test_key"][0]["is_sparse"] + assert Path(safe_tensors_tmp_file).exists() + + +def test_save_mixed_features( + safe_tensors_tmp_file: str, dense_features: Features, sparse_features: Features +): + features_dict = {"test_key": [dense_features, sparse_features]} + metadata = save_features(features_dict, safe_tensors_tmp_file) + + assert "test_key" in metadata + assert len(metadata["test_key"]) == 2 + assert metadata["test_key"][0]["data_type"] == "dense" + assert metadata["test_key"][1]["data_type"] == "sparse" + assert Path(safe_tensors_tmp_file).exists() + + +def test_save_multiple_keys( + safe_tensors_tmp_file: str, dense_features: Features, sparse_features: Features +): + features_dict = {"dense_key": [dense_features], "sparse_key": [sparse_features]} + metadata = save_features(features_dict, safe_tensors_tmp_file) + + assert "dense_key" in metadata + assert "sparse_key" in metadata + assert metadata["dense_key"][0]["data_type"] == "dense" + assert metadata["sparse_key"][0]["data_type"] == "sparse" + assert Path(safe_tensors_tmp_file).exists() + + +@pytest.fixture +def setup_save_load( + safe_tensors_tmp_file: str, dense_features: Features, sparse_features: Features +) -> Tuple[str, Dict[str, Any], Dict[str, List[Features]]]: + features_dict = {"dense_key": [dense_features], "sparse_key": [sparse_features]} + metadata = save_features(features_dict, safe_tensors_tmp_file) + return safe_tensors_tmp_file, metadata, features_dict + + +def test_load_dense_features( + setup_save_load: Tuple[str, Dict[str, Any], Dict[str, List[Features]]], +): + temp_file, metadata, original_dict = setup_save_load + loaded_dict = load_features(temp_file, metadata) + + assert "dense_key" in loaded_dict + assert len(loaded_dict["dense_key"]) == 1 + assert not loaded_dict["dense_key"][0].is_sparse() + np.testing.assert_array_equal( + loaded_dict["dense_key"][0].features, original_dict["dense_key"][0].features + ) + + +def test_load_sparse_features( + 
setup_save_load: Tuple[str, Dict[str, Any], Dict[str, List[Features]]], +): + temp_file, metadata, original_dict = setup_save_load + loaded_dict = load_features(temp_file, metadata) + + assert "sparse_key" in loaded_dict + assert len(loaded_dict["sparse_key"]) == 1 + assert loaded_dict["sparse_key"][0].is_sparse() + assert ( + loaded_dict["sparse_key"][0].features != original_dict["sparse_key"][0].features + ).nnz == 0 + + +def test_load_preserves_metadata( + setup_save_load: Tuple[str, Dict[str, Any], Dict[str, List[Features]]], +): + temp_file, metadata, original_dict = setup_save_load + loaded_dict = load_features(temp_file, metadata) + + for key in original_dict: + for orig_feat, loaded_feat in zip(original_dict[key], loaded_dict[key]): + assert orig_feat.type == loaded_feat.type + assert orig_feat.attribute == loaded_feat.attribute + assert orig_feat.origin == loaded_feat.origin + + +def test_load_nonexistent_file(): + with pytest.raises(Exception): + load_features("nonexistent.safetensors", {}) + + +def test_load_invalid_metadata(safe_tensors_tmp_file: str, dense_features: Features): + features_dict = {"test_key": [dense_features]} + metadata = save_features(features_dict, safe_tensors_tmp_file) + + # Corrupt the metadata + metadata["test_key"][0]["safetensors_key"] = "invalid_key" + + with pytest.raises(Exception): + load_features(safe_tensors_tmp_file, metadata) + + +def test_end_to_end(safe_tensors_tmp_file: str): + # Create test data + dense_matrix = np.array([[1, 2], [3, 4]]) + sparse_matrix = scipy.sparse.csr_matrix(([1, 2], ([0, 1], [0, 1])), shape=(2, 2)) + + features_dict = { + "group1": [ + Features(dense_matrix, "dense", "test1", "origin1"), + Features(sparse_matrix, "sparse", "test2", "origin2"), + ], + "group2": [ + Features(dense_matrix * 2, "dense", "test3", ["origin3", "origin4"]) + ], + } + + # Save features + metadata = save_features(features_dict, safe_tensors_tmp_file) + + # Load features + loaded_dict = 
load_features(safe_tensors_tmp_file, metadata) + + # Verify structure + assert set(loaded_dict.keys()) == set(features_dict.keys()) + assert len(loaded_dict["group1"]) == 2 + assert len(loaded_dict["group2"]) == 1 + + # Verify dense features + np.testing.assert_array_equal( + loaded_dict["group1"][0].features, features_dict["group1"][0].features + ) + + # Verify sparse features + assert ( + loaded_dict["group1"][1].features != features_dict["group1"][1].features + ).nnz == 0 + + # Verify metadata + assert loaded_dict["group1"][0].type == "dense" + assert loaded_dict["group1"][1].type == "sparse" + assert loaded_dict["group2"][0].origin == ["origin3", "origin4"] diff --git a/tests/utils/tensorflow/test_feature_array.py b/tests/utils/tensorflow/test_feature_array.py new file mode 100644 index 000000000000..95be7ba993a6 --- /dev/null +++ b/tests/utils/tensorflow/test_feature_array.py @@ -0,0 +1,197 @@ +import numpy as np +import scipy.sparse + +from rasa.utils.tensorflow.feature_array import ( + _recursive_serialize, + _serialize_nested_data, + _deserialize_nested_data, +) +from rasa.utils.tensorflow.model_data import RasaModelData + + +def test_recursive_serialize_numpy_array(): + data_dict = {} + metadata = [] + + _recursive_serialize(np.array([1, 2, 3]), "test_array", data_dict, metadata) + assert "test_array_array" in data_dict + assert metadata[0] == {"type": "dense", "key": "test_array_array", "shape": (3,)} + + +def test_recursive_serialize_floats(): + data_dict = {} + metadata = [] + + _recursive_serialize([1.0, 2.0, 3.0], "test_list", data_dict, metadata) + assert "test_list_list" in data_dict + assert metadata[0] == {"type": "list", "key": "test_list_list"} + + +def test_recursive_serialize_sparse_matrix(): + data_dict = {} + metadata = [] + + sparse_matrix = scipy.sparse.random(5, 10, density=0.1, format="coo") + _recursive_serialize(sparse_matrix, "test_sparse", data_dict, metadata) + assert "test_sparse_data" in data_dict + assert "test_sparse_row" in 
data_dict + assert "test_sparse_col" in data_dict + assert metadata[0] == { + "type": "sparse", + "key": "test_sparse", + "shape": sparse_matrix.shape, + } + + +def test_serialize_model_data(model_data: RasaModelData): + nested_data = model_data.data + + data_dict = {} + metadata = [] + _serialize_nested_data(nested_data, "component", data_dict, metadata) + + assert len(metadata) == 5 + + assert metadata[0]["key"] == "text" + assert len(metadata[0]["components"]) == 1 + assert metadata[0]["components"][0]["key"] == "sentence" + assert metadata[0]["components"][0]["number_of_dimensions"] == 3 + assert len(metadata[0]["components"][0]["features"]) == 2 + assert metadata[0]["components"][0]["features"][0]["type"] == "group" + assert len(metadata[0]["components"][0]["features"][0]["subcomponents"]) == 5 + assert ( + metadata[0]["components"][0]["features"][0]["subcomponents"][0]["type"] + == "dense" + ) + assert metadata[0]["components"][0]["features"][0]["subcomponents"][0]["shape"] == ( + 5, + 14, + ) + assert metadata[0]["components"][0]["features"][1]["type"] == "group" + assert len(metadata[0]["components"][0]["features"][1]["subcomponents"]) == 5 + assert ( + metadata[0]["components"][0]["features"][1]["subcomponents"][0]["type"] + == "sparse" + ) + assert metadata[0]["components"][0]["features"][1]["subcomponents"][0]["shape"] == ( + 5, + 10, + ) + + assert metadata[3]["key"] == "label" + assert len(metadata[3]["components"]) == 1 + assert metadata[3]["components"][0]["key"] == "ids" + assert metadata[3]["components"][0]["number_of_dimensions"] == 1 + assert metadata[3]["components"][0]["features"][0]["type"] == "list" + assert ( + metadata[3]["components"][0]["features"][0]["key"] + == "component_label_ids_0_list" + ) + + assert len(data_dict) == 87 + assert ( + data_dict["component_label_ids_0_list"] + == model_data.data["label"]["ids"][0].view(np.ndarray) + ).all() + + +def test_serialize_and_deserialize_model_data(model_data: RasaModelData): + actual_data = 
model_data.data + + data_dict = {} + metadata = [] + _serialize_nested_data(actual_data, "component", data_dict, metadata) + + loaded_data = _deserialize_nested_data(metadata, data_dict) + + assert len(actual_data) == len(loaded_data) + + assert len(actual_data["text"]["sentence"]) == len(loaded_data["text"]["sentence"]) + + # text.sentence has a dimension of 3 + assert len(actual_data["text"]["sentence"][0]) == len( + loaded_data["text"]["sentence"][0] + ) + # assert that the numpy arrays of the actual and loaded data in + # text.sentence are the same + for i in range(0, 5): + assert ( + actual_data["text"]["sentence"][0][i] + == loaded_data["text"]["sentence"][0][i] + ).all() + assert len(actual_data["text"]["sentence"][1]) == len( + loaded_data["text"]["sentence"][1] + ) + # assert that the sparse matrices of the actual and loaded data in + # text.sentence are the same + for i in range(0, 5): + assert ( + actual_data["text"]["sentence"][1][i] + == loaded_data["text"]["sentence"][1][i] + ).data.all() + + # action_text.sequence has a dimension of 4 + assert len(actual_data["action_text"]["sequence"]) == len( + loaded_data["action_text"]["sequence"] + ) + assert len(actual_data["action_text"]["sequence"][0]) == len( + loaded_data["action_text"]["sequence"][0] + ) + # assert that the sparse matrices of the actual and loaded data in + # action_text.sequence are the same + for i in range(0, 5): + for j in range(0, len(actual_data["action_text"]["sequence"][0][i])): + assert ( + actual_data["action_text"]["sequence"][0][i][j] + == loaded_data["action_text"]["sequence"][0][i][j] + ).data.all() + assert len(actual_data["action_text"]["sequence"][1]) == len( + loaded_data["action_text"]["sequence"][1] + ) + # assert that the numpy array of the actual and loaded data in + # action_text.sequence are the same + for i in range(0, 5): + for j in range(0, len(actual_data["action_text"]["sequence"][1][i])): + assert ( + actual_data["action_text"]["sequence"][1][i][j] + == 
loaded_data["action_text"]["sequence"][1][i][j] + ).all() + + # dialogue.sentence has a dimension of 3 + assert len(actual_data["dialogue"]["sentence"]) == len( + loaded_data["dialogue"]["sentence"] + ) + assert len(actual_data["dialogue"]["sentence"][0]) == len( + loaded_data["dialogue"]["sentence"][0] + ) + # assert that the numpy array of the actual and loaded data in + # dialogue.sentence are the same + for i in range(0, 5): + assert ( + actual_data["dialogue"]["sentence"][0][i] + == loaded_data["dialogue"]["sentence"][0][i] + ).all() + + # label.ids has a dimension of 1 + assert len(actual_data["label"]["ids"]) == len(loaded_data["label"]["ids"]) + # assert that the numpy array of the actual and loaded data in + # label.ids are the same + assert ( + actual_data["label"]["ids"][0].view(np.ndarray) + == loaded_data["label"]["ids"][0].view(np.ndarray) + ).all() + + # entities.tag_ids has a dimension of 3 + assert len(actual_data["entities"]["tag_ids"]) == len( + loaded_data["entities"]["tag_ids"] + ) + assert len(actual_data["entities"]["tag_ids"][0]) == len( + loaded_data["entities"]["tag_ids"][0] + ) + # assert that the numpy array of the actual and loaded data in + # entities.tag_ids are the same + for i in range(0, 5): + assert ( + actual_data["entities"]["tag_ids"][0][i] + == loaded_data["entities"]["tag_ids"][0][i] + ).all() diff --git a/tests/utils/test_io.py b/tests/utils/test_io.py index ac788373a422..3042b113aaf3 100644 --- a/tests/utils/test_io.py +++ b/tests/utils/test_io.py @@ -1,5 +1,3 @@ -from pathlib import Path -from typing import Dict, Text import pytest from _pytest.tmpdir import TempPathFactory from prompt_toolkit.document import Document @@ -71,22 +69,6 @@ def is_valid(user_input) -> None: assert e.value.message == error_message -@pytest.mark.parametrize( - "input,kwargs,expected", - [ - ({(1, 2): 3}, {}, {repr((1, 2)): 3}), - ({(1, 2): 3}, {"encode_non_string_keys": True}, {(1, 2): 3}), - ], -) -def test_write_and_load_dict_via_jsonpickle( - 
tmp_path: Path, input: Dict, kwargs: Dict[Text, bool], expected: Dict -): - file_name = tmp_path / "bla.pkl" - rasa.utils.io.json_pickle(file_name=file_name, obj=input, **kwargs) - loaded = rasa.utils.io.json_unpickle(file_name=file_name, **kwargs) - assert loaded == expected - - def test_empty_directories_are_equal(tmp_path_factory: TempPathFactory): dir1 = tmp_path_factory.mktemp("dir1") dir2 = tmp_path_factory.mktemp("dir2")