diff --git a/keras_nlp/__init__.py b/keras_nlp/__init__.py index 30f8a53b16..407a4b7a71 100644 --- a/keras_nlp/__init__.py +++ b/keras_nlp/__init__.py @@ -26,5 +26,6 @@ from keras_nlp import samplers from keras_nlp import tokenizers from keras_nlp import utils +from keras_nlp.utils.preset_utils import upload_preset from keras_nlp.version_utils import __version__ from keras_nlp.version_utils import version diff --git a/keras_nlp/models/backbone.py b/keras_nlp/models/backbone.py index bfdc8207ad..08b9f86e96 100644 --- a/keras_nlp/models/backbone.py +++ b/keras_nlp/models/backbone.py @@ -17,6 +17,7 @@ from keras_nlp.backend import keras from keras_nlp.utils.preset_utils import check_preset_class from keras_nlp.utils.preset_utils import load_from_preset +from keras_nlp.utils.preset_utils import save_to_preset from keras_nlp.utils.python_utils import classproperty from keras_nlp.utils.python_utils import format_docstring @@ -141,6 +142,14 @@ def from_preset( config_overrides=kwargs, ) + def save_to_preset(self, preset): + """Save backbone to a preset directory. + + Args: + preset: The path to the local model preset directory. + """ + save_to_preset(self, preset) + def __init_subclass__(cls, **kwargs): # Use __init_subclass__ to setup a correct docstring for from_preset. super().__init_subclass__(**kwargs) diff --git a/keras_nlp/tokenizers/tokenizer.py b/keras_nlp/tokenizers/tokenizer.py index 4c26e45241..834b99e5b1 100644 --- a/keras_nlp/tokenizers/tokenizer.py +++ b/keras_nlp/tokenizers/tokenizer.py @@ -18,6 +18,8 @@ from keras_nlp.layers.preprocessing.preprocessing_layer import ( PreprocessingLayer, ) +from keras_nlp.utils.preset_utils import TOKENIZER_CONFIG_FILE +from keras_nlp.utils.preset_utils import save_to_preset @keras_nlp_export("keras_nlp.tokenizers.Tokenizer") @@ -121,5 +123,13 @@ def token_to_id(self, token: str) -> int: f"{self.__class__.__name__}." ) + def save_to_preset(self, preset): + """Save tokenizer to a preset directory. 
+ + Args: + preset: The path to the local model preset directory. + """ + save_to_preset(self, preset, config_filename=TOKENIZER_CONFIG_FILE) + def call(self, inputs, *args, training=None, **kwargs): return self.tokenize(inputs, *args, **kwargs) diff --git a/keras_nlp/utils/preset_utils.py b/keras_nlp/utils/preset_utils.py index 01c11a3db1..dcee9bc66f 100644 --- a/keras_nlp/utils/preset_utils.py +++ b/keras_nlp/utils/preset_utils.py @@ -16,6 +16,9 @@ import json import os +from absl import logging + +from keras_nlp.api_export import keras_nlp_export from keras_nlp.backend import config as backend_config from keras_nlp.backend import keras @@ -27,6 +30,8 @@ KAGGLE_PREFIX = "kaggle://" GS_PREFIX = "gs://" TOKENIZER_ASSET_DIR = "assets/tokenizer" +CONFIG_FILE = "config.json" +TOKENIZER_CONFIG_FILE = "tokenizer.json" def get_file(preset, path): @@ -155,6 +160,115 @@ def save_to_preset( metadata_file.write(json.dumps(metadata, indent=4)) +def _validate_tokenizer(preset, allow_incomplete=False): + config_path = get_file(preset, TOKENIZER_CONFIG_FILE) + if not os.path.exists(config_path): + if allow_incomplete: + logging.warning( + f"`{TOKENIZER_CONFIG_FILE}` is missing from the preset directory `{preset}`." + ) + return + else: + raise FileNotFoundError( + f"`{TOKENIZER_CONFIG_FILE}` is missing from the preset directory `{preset}`. " + "To upload the model without a tokenizer, " + "set `allow_incomplete=True`." + ) + try: + with open(config_path) as config_file: + config = json.load(config_file) + except Exception as e: + raise ValueError( + f"Tokenizer config file `{config_path}` is an invalid json file. " + f"Error message: {e}" + ) + layer = keras.saving.deserialize_keras_object(config) + + if not config["assets"]: + raise ValueError( + f"Tokenizer config file {config_path} is missing `assets`." 
+ ) + + for asset in config["assets"]: + asset_path = os.path.join(preset, asset) + if not os.path.exists(asset_path): + raise FileNotFoundError( + f"Asset `{asset}` doesn't exist in the preset directory `{preset}`." + ) + config_dir = os.path.dirname(config_path) + asset_dir = os.path.join(config_dir, TOKENIZER_ASSET_DIR) + + tokenizer = get_tokenizer(layer) + if not tokenizer: + raise ValueError(f"Model or layer `{layer}` is missing tokenizer.") + tokenizer.load_assets(asset_dir) + + +def _validate_backbone(preset): + config_path = os.path.join(preset, CONFIG_FILE) + if not os.path.exists(config_path): + raise FileNotFoundError( + f"`{CONFIG_FILE}` is missing from the preset directory `{preset}`." + ) + try: + with open(config_path) as config_file: + config = json.load(config_file) + except Exception as e: + raise ValueError( + f"Config file `{config_path}` is an invalid json file. " + f"Error message: {e}" + ) + + if config["weights"]: + weights_path = os.path.join(preset, config["weights"]) + if not os.path.exists(weights_path): + raise FileNotFoundError( + f"The weights file doesn't exist in preset directory `{preset}`." + ) + else: + raise ValueError( + f"No weights listed in `{CONFIG_FILE}`. Make sure to use " + "`save_to_preset()` which adds additional data to a serialized " + "Keras object." + ) + + +@keras_nlp_export("keras_nlp.upload_preset") +def upload_preset( + uri, + preset, + allow_incomplete=False, +): + """Upload a preset directory to a model hub. + + Args: + uri: The URI identifying the model to upload to. + URIs with format + `kaggle://<KAGGLE_USERNAME>/<MODEL>/<FRAMEWORK>/<VARIATION>` + will be uploaded to Kaggle Hub. + preset: The path to the local model preset directory. + allow_incomplete: If True, allows the upload of presets without + a tokenizer configuration. Otherwise, a tokenizer + is required. + """ + + # Check if preset directory exists. 
+ if not os.path.exists(preset): + raise FileNotFoundError(f"The preset directory {preset} doesn't exist.") + + _validate_backbone(preset) + _validate_tokenizer(preset, allow_incomplete) + + if uri.startswith(KAGGLE_PREFIX): + kaggle_handle = uri.removeprefix(KAGGLE_PREFIX) + kagglehub.model_upload(kaggle_handle, preset) + else: + raise ValueError( + f"Unexpected URI `'{uri}'`. Kaggle upload format should follow " + "`kaggle://<KAGGLE_USERNAME>/<MODEL>/<FRAMEWORK>/<VARIATION>`." + ) + + def load_from_preset( preset, load_weights=True, diff --git a/keras_nlp/utils/preset_utils_test.py b/keras_nlp/utils/preset_utils_test.py index 44dc39f477..289e13b6ab 100644 --- a/keras_nlp/utils/preset_utils_test.py +++ b/keras_nlp/utils/preset_utils_test.py @@ -18,6 +18,7 @@ import pytest from absl.testing import parameterized +from keras_nlp import upload_preset from keras_nlp.models.albert.albert_classifier import AlbertClassifier from keras_nlp.models.backbone import Backbone from keras_nlp.models.bert.bert_classifier import BertClassifier @@ -105,3 +106,14 @@ def test_preset_errors(self): with self.assertRaisesRegex(ValueError, "Unknown preset identifier"): AlbertClassifier.from_preset("snaggle://bort/bort/bort") + + def test_upload_empty_preset(self): + temp_dir = self.get_temp_dir() + empty_preset = os.path.join(temp_dir, "empty") + os.mkdir(empty_preset) + uri = "kaggle://test/test/test" + + with self.assertRaises(FileNotFoundError): + upload_preset(uri, empty_preset) + + # TODO: add more tests to cover various invalid scenarios such as invalid json, missing files, etc.