diff --git a/keras_nlp/tokenizers/byte_pair_tokenizer.py b/keras_nlp/tokenizers/byte_pair_tokenizer.py
index d6b4f08ede..cc549c28e0 100644
--- a/keras_nlp/tokenizers/byte_pair_tokenizer.py
+++ b/keras_nlp/tokenizers/byte_pair_tokenizer.py
@@ -22,7 +22,6 @@
 import json
 import os
 from typing import Iterable
-from typing import List
 
 import keras
 import regex as re
@@ -388,17 +387,17 @@ def set_vocabulary_and_merges(self, vocabulary, merges):
             default=self.merge_ranks_lookup_default,
         )
 
-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""
         self._check_vocabulary()
         return self.vocabulary.keys()
 
-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         self._check_vocabulary()
         return len(self.vocabulary)
 
-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         # This will be slow, but keep memory usage down compared to building a
         # dict. Assuming the main use case is looking up a few special tokens
@@ -411,7 +410,7 @@ def id_to_token(self, id: int) -> str:
                 return token
         raise ValueError(f"`id` is out of the vocabulary. Received: {id}")
 
-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         self._check_vocabulary()
         return self.vocabulary[token]
diff --git a/keras_nlp/tokenizers/byte_tokenizer.py b/keras_nlp/tokenizers/byte_tokenizer.py
index 3aefc4a01d..4d5c4a87ed 100644
--- a/keras_nlp/tokenizers/byte_tokenizer.py
+++ b/keras_nlp/tokenizers/byte_tokenizer.py
@@ -155,11 +155,11 @@ class ByteTokenizer(tokenizer.Tokenizer):
 
     def __init__(
         self,
-        lowercase: bool = True,
-        sequence_length: int = None,
-        normalization_form: str = None,
-        errors: str = "replace",
-        replacement_char: int = 65533,
+        lowercase=True,
+        sequence_length=None,
+        normalization_form=None,
+        errors="replace",
+        replacement_char=65533,
         dtype="int32",
         **kwargs,
     ):
@@ -198,8 +198,8 @@ def __init__(
             [i.tobytes() for i in np.arange(256, dtype=np.uint8)]
         )
 
-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         return 256
 
     def tokenize(self, inputs):
diff --git a/keras_nlp/tokenizers/sentence_piece_tokenizer.py b/keras_nlp/tokenizers/sentence_piece_tokenizer.py
index da7b002454..fb01828c6a 100644
--- a/keras_nlp/tokenizers/sentence_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/sentence_piece_tokenizer.py
@@ -15,7 +15,6 @@
 import base64
 import binascii
 import os
-from typing import List
 
 import keras
 import tensorflow as tf
@@ -108,7 +107,7 @@ def train_sentence_piece_file(ds, path, size):
     def __init__(
         self,
         proto=None,
-        sequence_length: int = None,
+        sequence_length=None,
         dtype="int32",
         **kwargs,
     ) -> None:
@@ -172,12 +171,12 @@ def set_proto(self, proto):
         # byte array as a string for saving.
         self.proto = proto_bytes
 
-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         self._check_vocabulary()
         return int(self._sentence_piece.vocab_size().numpy())
 
-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary."""
         self._check_vocabulary()
         return tensor_to_list(
@@ -186,7 +185,7 @@ def get_vocabulary(self) -> List[str]:
             )
         )
 
-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         self._check_vocabulary()
         if id >= self.vocabulary_size() or id < 0:
@@ -196,7 +195,7 @@ def id_to_token(self, id: int) -> str:
             )
         return tensor_to_list(self._sentence_piece.id_to_string(id))
 
-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         self._check_vocabulary()
         return int(self._sentence_piece.string_to_id(token).numpy())
diff --git a/keras_nlp/tokenizers/tokenizer.py b/keras_nlp/tokenizers/tokenizer.py
index f522098fb2..9418741ea2 100644
--- a/keras_nlp/tokenizers/tokenizer.py
+++ b/keras_nlp/tokenizers/tokenizer.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List
-
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.layers.preprocessing.preprocessing_layer import (
     PreprocessingLayer,
 )
@@ -105,28 +103,28 @@ def detokenize(self, inputs, *args, **kwargs):
             f"{self.__class__.__name__}."
         )
 
-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings terms."""
         raise NotImplementedError(
             "No implementation of `get_vocabulary()` was found for "
             f"{self.__class__.__name__}."
         )
 
-    def vocabulary_size(self) -> int:
+    def vocabulary_size(self):
         """Returns the total size of the token id space."""
         raise NotImplementedError(
             "No implementation of `vocabulary_size()` was found for "
             f"{self.__class__.__name__}."
         )
 
-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         raise NotImplementedError(
             "No implementation of `id_to_token()` was found for "
             f"{self.__class__.__name__}."
         )
 
-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         raise NotImplementedError(
             "No implementation of `token_to_id()` was found for "
diff --git a/keras_nlp/tokenizers/unicode_codepoint_tokenizer.py b/keras_nlp/tokenizers/unicode_codepoint_tokenizer.py
index 5fe8f0144d..578e03bca7 100644
--- a/keras_nlp/tokenizers/unicode_codepoint_tokenizer.py
+++ b/keras_nlp/tokenizers/unicode_codepoint_tokenizer.py
@@ -206,14 +206,14 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
 
     def __init__(
         self,
-        sequence_length: int = None,
-        lowercase: bool = True,
-        normalization_form: str = None,
-        errors: str = "replace",
-        replacement_char: int = 65533,
-        input_encoding: str = "UTF-8",
-        output_encoding: str = "UTF-8",
-        vocabulary_size: int = None,
+        sequence_length=None,
+        lowercase=True,
+        normalization_form=None,
+        errors="replace",
+        replacement_char=65533,
+        input_encoding="UTF-8",
+        output_encoding="UTF-8",
+        vocabulary_size=None,
         dtype="int32",
         **kwargs,
     ) -> None:
@@ -275,7 +275,7 @@ def get_config(self):
         )
         return config
 
-    def vocabulary_size(self) -> int:
+    def vocabulary_size(self):
         """Get the size of the tokenizer vocabulary. None implies no vocabulary
         size was provided"""
         return self._vocabulary_size
diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index bcf3a7cb5e..4b9b90a943 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -15,7 +15,6 @@
 import os
 import re
 from typing import Iterable
-from typing import List
 
 import keras
 import tensorflow as tf
@@ -334,15 +333,15 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
     def __init__(
         self,
         vocabulary=None,
-        sequence_length: int = None,
-        lowercase: bool = False,
-        strip_accents: bool = False,
-        split: bool = True,
-        split_on_cjk: bool = True,
-        suffix_indicator: str = "##",
-        oov_token: str = "[UNK]",
-        special_tokens: List[str] = None,
-        special_tokens_in_strings: bool = False,
+        sequence_length=None,
+        lowercase=False,
+        strip_accents=False,
+        split=True,
+        split_on_cjk=True,
+        suffix_indicator="##",
+        oov_token="[UNK]",
+        special_tokens=None,
+        special_tokens_in_strings=False,
         dtype="int32",
         **kwargs,
     ) -> None:
@@ -437,17 +436,17 @@ def set_vocabulary(self, vocabulary):
             support_detokenization=True,
         )
 
-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""
         self._check_vocabulary()
         return self.vocabulary
 
-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         self._check_vocabulary()
         return len(self.vocabulary)
 
-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         self._check_vocabulary()
         if id >= self.vocabulary_size() or id < 0:
@@ -457,7 +456,7 @@ def id_to_token(self, id: int) -> str:
             )
         return self.vocabulary[id]
 
-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         # This will be slow, but keep memory usage down compared to building a
         # dict. Assuming the main use case is looking up a few special tokens
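
The change above is signature-only: the removed annotations had no runtime effect, so every call site keeps working unchanged. As a quick illustrative check (not part of the patch; the tiny vocabulary below is invented for the example), the public methods touched here can be exercised like this:

import keras_nlp

# Toy vocabulary, purely for illustration.
vocab = ["[PAD]", "[UNK]", "the", "qu", "##ick", "fox"]
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    oov_token="[UNK]",
)

print(tokenizer.vocabulary_size())   # 6
print(tokenizer.token_to_id("the"))  # 2
print(tokenizer.id_to_token(3))      # "qu"
print(tokenizer.get_vocabulary())    # the full list of string tokens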