Remove straggler type annotations (keras-team#1536)
Currently Keras as a whole is not using type annotations, but we still
have a few stragglers. Removing them, as they occasionally cause
confusion.
mattdangerw authored Mar 29, 2024
1 parent 5341426 commit 1286784
Showing 6 changed files with 45 additions and 50 deletions.
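
The change is mechanical throughout: each signature loses its PEP 484 annotations, and the type information lives on in the docstrings. A generic before/after sketch (not a specific line from this commit):

    # Before: annotated signature, the straggler style being removed.
    def token_to_id(self, token: str) -> int:
        """Convert a string token to an integer id."""

    # After: plain signature; the docstring still documents the types.
    def token_to_id(self, token):
        """Convert a string token to an integer id."""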
11 changes: 5 additions & 6 deletions keras_nlp/tokenizers/byte_pair_tokenizer.py
@@ -22,7 +22,6 @@
 import json
 import os
 from typing import Iterable
-from typing import List

 import keras
 import regex as re
@@ -388,17 +387,17 @@ def set_vocabulary_and_merges(self, vocabulary, merges):
             default=self.merge_ranks_lookup_default,
         )

-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""
         self._check_vocabulary()
         return self.vocabulary.keys()

-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         self._check_vocabulary()
         return len(self.vocabulary)

-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         # This will be slow, but keep memory usage down compared to building a
         # dict. Assuming the main use case is looking up a few special tokens
@@ -411,7 +410,7 @@ def id_to_token(self, id: int) -> str:
                 return token
         raise ValueError(f"`id` is out of the vocabulary. Received: {id}")

-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         self._check_vocabulary()
         return self.vocabulary[token]
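The vocabulary helpers above behave the same without annotations. A minimal usage sketch, assuming a toy vocabulary and merge list rather than a real preset:

    import keras_nlp

    # Toy vocabulary and merge rules, purely for illustration.
    vocab = {"butter": 0, "fly": 1, "butterfly": 2}
    merges = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]

    tokenizer = keras_nlp.tokenizers.BytePairTokenizer(
        vocabulary=vocab, merges=merges
    )
    print(tokenizer.vocabulary_size())         # 3
    print(tokenizer.token_to_id("butterfly"))  # 2
    print(tokenizer.id_to_token(2))            # butterfly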
14 changes: 7 additions & 7 deletions keras_nlp/tokenizers/byte_tokenizer.py
@@ -155,11 +155,11 @@ class ByteTokenizer(tokenizer.Tokenizer):

     def __init__(
         self,
-        lowercase: bool = True,
-        sequence_length: int = None,
-        normalization_form: str = None,
-        errors: str = "replace",
-        replacement_char: int = 65533,
+        lowercase=True,
+        sequence_length=None,
+        normalization_form=None,
+        errors="replace",
+        replacement_char=65533,
         dtype="int32",
         **kwargs,
     ):
@@ -198,8 +198,8 @@ def __init__(
             [i.tobytes() for i in np.arange(256, dtype=np.uint8)]
         )

-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         return 256

     def tokenize(self, inputs):
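Here vocabulary_size() is a constant: a byte tokenizer always has exactly 256 possible ids, one per byte value. A quick sketch:

    import keras_nlp

    tokenizer = keras_nlp.tokenizers.ByteTokenizer()
    print(tokenizer.vocabulary_size())  # 256, one id per byte value.
    print(tokenizer.tokenize("hi"))     # Byte values, e.g. [104, 105].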
13 changes: 6 additions & 7 deletions keras_nlp/tokenizers/sentence_piece_tokenizer.py
@@ -15,7 +15,6 @@
 import base64
 import binascii
 import os
-from typing import List

 import keras
 import tensorflow as tf
@@ -108,7 +107,7 @@ def train_sentence_piece_file(ds, path, size):
     def __init__(
         self,
         proto=None,
-        sequence_length: int = None,
+        sequence_length=None,
         dtype="int32",
         **kwargs,
     ) -> None:
@@ -172,12 +171,12 @@ def set_proto(self, proto):
         # byte array as a string for saving.
         self.proto = proto_bytes

-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         self._check_vocabulary()
         return int(self._sentence_piece.vocab_size().numpy())

-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary."""
         self._check_vocabulary()
         return tensor_to_list(
@@ -186,7 +185,7 @@ def get_vocabulary(self) -> List[str]:
             )
         )

-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         self._check_vocabulary()
         if id >= self.vocabulary_size() or id < 0:
@@ -196,7 +195,7 @@ def id_to_token(self, id: int) -> str:
             )
         return tensor_to_list(self._sentence_piece.id_to_string(id))

-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         self._check_vocabulary()
         return int(self._sentence_piece.string_to_id(token).numpy())
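Note that this tokenizer unwraps tensors at the API boundary: vocabulary_size() and token_to_id() return plain Python ints. A sketch, where "model.spm" is a placeholder path to any trained SentencePiece proto:

    import keras_nlp

    # "model.spm" is a placeholder; substitute a real SentencePiece model file.
    tokenizer = keras_nlp.tokenizers.SentencePieceTokenizer(proto="model.spm")
    print(tokenizer.vocabulary_size())  # Plain int, not a tensor.
    print(tokenizer.id_to_token(0))     # Model-dependent, often "<unk>".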
10 changes: 4 additions & 6 deletions keras_nlp/tokenizers/tokenizer.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.layers.preprocessing.preprocessing_layer import (
     PreprocessingLayer,
@@ -105,28 +103,28 @@ def detokenize(self, inputs, *args, **kwargs):
             f"{self.__class__.__name__}."
         )

-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings terms."""
         raise NotImplementedError(
             "No implementation of `get_vocabulary()` was found for "
             f"{self.__class__.__name__}."
         )

-    def vocabulary_size(self) -> int:
+    def vocabulary_size(self):
         """Returns the total size of the token id space."""
         raise NotImplementedError(
             "No implementation of `vocabulary_size()` was found for "
             f"{self.__class__.__name__}."
         )

-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         raise NotImplementedError(
             "No implementation of `id_to_token()` was found for "
             f"{self.__class__.__name__}."
         )

-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         raise NotImplementedError(
             "No implementation of `token_to_id()` was found for "
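Since the base class only raises NotImplementedError, each concrete tokenizer overrides these four methods (plus tokenize()). A minimal sketch of a custom subclass with a hypothetical three-token vocabulary; a production subclass would use TensorFlow ops so it works inside a tf.data pipeline:

    import keras_nlp

    class ToyTokenizer(keras_nlp.tokenizers.Tokenizer):
        """A sketch only: eager, Python-level lookups."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self._vocab = ["[PAD]", "yes", "no"]

        def tokenize(self, inputs):
            # Map each word to its index, falling back to 0 for unknowns.
            return [self.token_to_id(w) if w in self._vocab else 0
                    for w in inputs.split()]

        def get_vocabulary(self):
            return self._vocab

        def vocabulary_size(self):
            return len(self._vocab)

        def id_to_token(self, id):
            return self._vocab[id]

        def token_to_id(self, token):
            return self._vocab.index(token)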
18 changes: 9 additions & 9 deletions keras_nlp/tokenizers/unicode_codepoint_tokenizer.py
@@ -206,14 +206,14 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):

     def __init__(
         self,
-        sequence_length: int = None,
-        lowercase: bool = True,
-        normalization_form: str = None,
-        errors: str = "replace",
-        replacement_char: int = 65533,
-        input_encoding: str = "UTF-8",
-        output_encoding: str = "UTF-8",
-        vocabulary_size: int = None,
+        sequence_length=None,
+        lowercase=True,
+        normalization_form=None,
+        errors="replace",
+        replacement_char=65533,
+        input_encoding="UTF-8",
+        output_encoding="UTF-8",
+        vocabulary_size=None,
         dtype="int32",
         **kwargs,
     ) -> None:
@@ -275,7 +275,7 @@ def get_config(self):
         )
         return config

-    def vocabulary_size(self) -> int:
+    def vocabulary_size(self):
         """Get the size of the tokenizer vocabulary. None implies no vocabulary
         size was provided"""
         return self._vocabulary_size
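As the docstring above notes, vocabulary_size() simply echoes whatever cap was passed at construction, or None. A sketch:

    import keras_nlp

    tokenizer = keras_nlp.tokenizers.UnicodeCodepointTokenizer()
    print(tokenizer.tokenize("héllo"))  # Codepoints, e.g. [104, 233, 108, 108, 111].
    print(tokenizer.vocabulary_size())  # None: no vocabulary_size was passed.

    capped = keras_nlp.tokenizers.UnicodeCodepointTokenizer(vocabulary_size=1000)
    print(capped.vocabulary_size())     # 1000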
29 changes: 14 additions & 15 deletions keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -15,7 +15,6 @@
 import os
 import re
 from typing import Iterable
-from typing import List

 import keras
 import tensorflow as tf
@@ -334,15 +333,15 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
     def __init__(
         self,
         vocabulary=None,
-        sequence_length: int = None,
-        lowercase: bool = False,
-        strip_accents: bool = False,
-        split: bool = True,
-        split_on_cjk: bool = True,
-        suffix_indicator: str = "##",
-        oov_token: str = "[UNK]",
-        special_tokens: List[str] = None,
-        special_tokens_in_strings: bool = False,
+        sequence_length=None,
+        lowercase=False,
+        strip_accents=False,
+        split=True,
+        split_on_cjk=True,
+        suffix_indicator="##",
+        oov_token="[UNK]",
+        special_tokens=None,
+        special_tokens_in_strings=False,
         dtype="int32",
         **kwargs,
     ) -> None:
@@ -437,17 +436,17 @@ def set_vocabulary(self, vocabulary):
             support_detokenization=True,
         )

-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""
         self._check_vocabulary()
         return self.vocabulary

-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         self._check_vocabulary()
         return len(self.vocabulary)

-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         self._check_vocabulary()
         if id >= self.vocabulary_size() or id < 0:
@@ -457,7 +456,7 @@ def id_to_token(self, id: int) -> str:
             )
         return self.vocabulary[id]

-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         # This will be slow, but keep memory usage down compared to building a
         # dict. Assuming the main use case is looking up a few special tokens
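A usage sketch of the methods above, assuming a toy vocabulary (the "##" prefix marks subword continuations):

    import keras_nlp

    # Toy vocabulary, purely for illustration.
    vocab = ["[UNK]", "the", "qu", "##ick", "fox"]
    tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary=vocab, lowercase=True
    )
    print(tokenizer.tokenize("The quick fox"))  # [1, 2, 3, 4]
    print(tokenizer.token_to_id("##ick"))       # 3
    print(tokenizer.id_to_token(3))             # ##ick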
