Skip to content

Commit

Permalink
Upload Model to Kaggle (keras-team#1512)
Browse files Browse the repository at this point in the history
* Initial Kaggle upload.

* Address review comments.

* Add upload validations.

* Address review comments.

* Fix init.

* Address review comments.

* Improve error handling.

* Address review comments.
  • Loading branch information
SamanehSaadat authored Mar 25, 2024
1 parent 45d8bd3 commit 0dc383c
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 0 deletions.
1 change: 1 addition & 0 deletions keras_nlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,6 @@
from keras_nlp import samplers
from keras_nlp import tokenizers
from keras_nlp import utils
from keras_nlp.utils.preset_utils import upload_preset
from keras_nlp.version_utils import __version__
from keras_nlp.version_utils import version
9 changes: 9 additions & 0 deletions keras_nlp/models/backbone.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from keras_nlp.backend import keras
from keras_nlp.utils.preset_utils import check_preset_class
from keras_nlp.utils.preset_utils import load_from_preset
from keras_nlp.utils.preset_utils import save_to_preset
from keras_nlp.utils.python_utils import classproperty
from keras_nlp.utils.python_utils import format_docstring

Expand Down Expand Up @@ -141,6 +142,14 @@ def from_preset(
config_overrides=kwargs,
)

def save_to_preset(self, preset):
    """Write this backbone into a local preset directory.

    Delegates to the module-level `save_to_preset` helper imported from
    `keras_nlp.utils.preset_utils`.

    Args:
        preset: The path to the local model preset directory.
    """
    save_to_preset(self, preset)

def __init_subclass__(cls, **kwargs):
# Use __init_subclass__ to setup a correct docstring for from_preset.
super().__init_subclass__(**kwargs)
Expand Down
10 changes: 10 additions & 0 deletions keras_nlp/tokenizers/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from keras_nlp.layers.preprocessing.preprocessing_layer import (
PreprocessingLayer,
)
from keras_nlp.utils.preset_utils import TOKENIZER_CONFIG_FILE
from keras_nlp.utils.preset_utils import save_to_preset


@keras_nlp_export("keras_nlp.tokenizers.Tokenizer")
Expand Down Expand Up @@ -121,5 +123,13 @@ def token_to_id(self, token: str) -> int:
f"{self.__class__.__name__}."
)

def save_to_preset(self, preset):
    """Write this tokenizer into a local preset directory.

    Delegates to the module-level `save_to_preset` helper, asking it to use
    `TOKENIZER_CONFIG_FILE` as the config filename.

    Args:
        preset: The path to the local model preset directory.
    """
    save_to_preset(self, preset, config_filename=TOKENIZER_CONFIG_FILE)

def call(self, inputs, *args, training=None, **kwargs):
    """Forward `inputs` and any extra arguments to `self.tokenize`.

    `training` is accepted but deliberately not passed on to `tokenize`.
    """
    return self.tokenize(inputs, *args, **kwargs)
114 changes: 114 additions & 0 deletions keras_nlp/utils/preset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
import json
import os

from absl import logging

from keras_nlp.api_export import keras_nlp_export
from keras_nlp.backend import config as backend_config
from keras_nlp.backend import keras

Expand All @@ -27,6 +30,8 @@
KAGGLE_PREFIX = "kaggle://"
GS_PREFIX = "gs://"
TOKENIZER_ASSET_DIR = "assets/tokenizer"
CONFIG_FILE = "config.json"
TOKENIZER_CONFIG_FILE = "tokenizer.json"


def get_file(preset, path):
Expand Down Expand Up @@ -155,6 +160,115 @@ def save_to_preset(
metadata_file.write(json.dumps(metadata, indent=4))


def _validate_tokenizer(preset, allow_incomplete=False):
    """Check that the tokenizer stored in a local preset directory is usable.

    Reads the tokenizer config from the preset, deserializes it, verifies
    that every asset listed in the config exists on disk, and finally loads
    the assets into the tokenizer.

    Args:
        preset: The path to the local model preset directory.
        allow_incomplete: If True, a missing tokenizer config only logs a
            warning and validation stops. Otherwise it is an error.

    Raises:
        FileNotFoundError: If the tokenizer config is missing (and
            `allow_incomplete` is False) or a listed asset does not exist.
        ValueError: If the config cannot be parsed as JSON, lists no
            assets, or no tokenizer can be obtained from the deserialized
            object.
    """
    config_path = get_file(preset, TOKENIZER_CONFIG_FILE)
    if not os.path.exists(config_path):
        if allow_incomplete:
            logging.warning(
                f"`{TOKENIZER_CONFIG_FILE}` is missing from the preset "
                f"directory `{preset}`."
            )
            return
        raise FileNotFoundError(
            f"`{TOKENIZER_CONFIG_FILE}` is missing from the preset "
            f"directory `{preset}`. "
            "To upload the model without a tokenizer, "
            "set `allow_incomplete=True`."
        )

    try:
        with open(config_path) as config_file:
            config = json.load(config_file)
    except Exception as e:
        # Chain the original error so the parser's traceback is not lost.
        raise ValueError(
            f"Tokenizer config file `{config_path}` is an invalid json file. "
            f"Error message: {e}"
        ) from e
    layer = keras.saving.deserialize_keras_object(config)

    # Use `.get()` so a config with no `assets` key produces the same
    # descriptive error as an empty list, rather than a bare KeyError.
    assets = config.get("assets")
    if not assets:
        raise ValueError(
            f"Tokenizer config file {config_path} is missing `assets`."
        )

    for asset in assets:
        asset_path = os.path.join(preset, asset)
        if not os.path.exists(asset_path):
            raise FileNotFoundError(
                f"Asset `{asset}` doesn't exist in the preset directory "
                f"`{preset}`."
            )

    # Assets are loaded from the asset directory next to the config file.
    config_dir = os.path.dirname(config_path)
    asset_dir = os.path.join(config_dir, TOKENIZER_ASSET_DIR)

    tokenizer = get_tokenizer(layer)
    if not tokenizer:
        raise ValueError(f"Model or layer `{layer}` is missing tokenizer.")
    tokenizer.load_assets(asset_dir)


def _validate_backbone(preset):
    """Check that a local preset directory contains a usable backbone.

    Verifies that the config file exists and parses as JSON, that it names
    a weights file, and that the weights file exists in the preset
    directory.

    Args:
        preset: The path to the local model preset directory.

    Raises:
        FileNotFoundError: If the config file or the weights file it names
            does not exist in the preset directory.
        ValueError: If the config cannot be parsed as JSON or lists no
            weights.
    """
    config_path = os.path.join(preset, CONFIG_FILE)
    if not os.path.exists(config_path):
        raise FileNotFoundError(
            f"`{CONFIG_FILE}` is missing from the preset directory `{preset}`."
        )

    try:
        with open(config_path) as config_file:
            config = json.load(config_file)
    except Exception as e:
        # Chain the original error so the parser's traceback is not lost.
        raise ValueError(
            f"Config file `{config_path}` is an invalid json file. "
            f"Error message: {e}"
        ) from e

    # Use `.get()` because a config with no `weights` key at all is exactly
    # the case the error below describes; indexing would raise a bare
    # KeyError instead.
    weights = config.get("weights")
    if not weights:
        raise ValueError(
            f"No weights listed in `{CONFIG_FILE}`. Make sure to use "
            "`save_to_preset()` which adds additional data to a serialized "
            "Keras object."
        )

    weights_path = os.path.join(preset, weights)
    if not os.path.exists(weights_path):
        raise FileNotFoundError(
            f"The weights file doesn't exist in preset directory `{preset}`."
        )


@keras_nlp_export("keras_nlp.upload_preset")
def upload_preset(
    uri,
    preset,
    allow_incomplete=False,
):
    """Validate a local preset directory and upload it to a model hub.

    The backbone and tokenizer in the preset are validated before the URI
    is inspected, so an invalid preset is reported before an invalid URI.

    Args:
        uri: The URI identifying the model to upload to. URIs of the form
            `kaggle://<KAGGLE_USERNAME>/<MODEL>/<FRAMEWORK>/<VARIATION>`
            are uploaded to Kaggle Hub.
        preset: The path to the local model preset directory.
        allow_incomplete: If True, allows the upload of presets without
            a tokenizer configuration. Otherwise, a tokenizer is required.

    Raises:
        FileNotFoundError: If `preset` does not exist.
        ValueError: If `uri` does not start with the Kaggle prefix.
    """
    # Nothing can be validated if the preset path is absent.
    if not os.path.exists(preset):
        raise FileNotFoundError(f"The preset directory {preset} doesn't exist.")

    _validate_backbone(preset)
    _validate_tokenizer(preset, allow_incomplete)

    # Kaggle is the only supported destination; reject anything else.
    if not uri.startswith(KAGGLE_PREFIX):
        raise ValueError(
            f"Unexpected URI `'{uri}'`. Kaggle upload format should follow "
            "`kaggle://<KAGGLE_USERNAME>/<MODEL>/<FRAMEWORK>/<VARIATION>`."
        )

    kagglehub.model_upload(uri.removeprefix(KAGGLE_PREFIX), preset)


def load_from_preset(
preset,
load_weights=True,
Expand Down
12 changes: 12 additions & 0 deletions keras_nlp/utils/preset_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import pytest
from absl.testing import parameterized

from keras_nlp import upload_preset
from keras_nlp.models.albert.albert_classifier import AlbertClassifier
from keras_nlp.models.backbone import Backbone
from keras_nlp.models.bert.bert_classifier import BertClassifier
Expand Down Expand Up @@ -105,3 +106,14 @@ def test_preset_errors(self):

with self.assertRaisesRegex(ValueError, "Unknown preset identifier"):
AlbertClassifier.from_preset("snaggle://bort/bort/bort")

def test_upload_empty_preset(self):
    """Uploading an existing but empty preset directory must fail."""
    empty_preset = os.path.join(self.get_temp_dir(), "empty")
    os.mkdir(empty_preset)

    with self.assertRaises(FileNotFoundError):
        upload_preset("kaggle://test/test/test", empty_preset)

# TODO: add more tests to cover various invalid scenarios, such as invalid JSON, missing files, etc.

0 comments on commit 0dc383c

Please sign in to comment.