Remove straggler type annotations (keras-team#1536)
Currently Keras as a whole is not using type annotations, but we still
have a few stragglers. Removing them, as they occasionally cause
confusion.
mattdangerw authored Mar 29, 2024
1 parent 5341426 commit 1286784
Showing 6 changed files with 45 additions and 50 deletions.
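
The change is mechanical throughout: each signature loses its PEP 484 annotations, and the type information lives on in the docstrings. A generic before/after sketch (not a specific line from this commit):

    # Before: annotated signature, the straggler style being removed.
    def token_to_id(self, token: str) -> int:
        """Convert a string token to an integer id."""

    # After: plain signature; the docstring still documents the types.
    def token_to_id(self, token):
        """Convert a string token to an integer id."""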
11 changes: 5 additions & 6 deletions keras_nlp/tokenizers/byte_pair_tokenizer.py
@@ -22,7 +22,6 @@
 import json
 import os
 from typing import Iterable
-from typing import List

 import keras
 import regex as re
@@ -388,17 +387,17 @@ def set_vocabulary_and_merges(self, vocabulary, merges):
             default=self.merge_ranks_lookup_default,
         )

-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""
         self._check_vocabulary()
         return self.vocabulary.keys()

-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         self._check_vocabulary()
         return len(self.vocabulary)

-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         # This will be slow, but keep memory usage down compared to building a
         # dict. Assuming the main use case is looking up a few special tokens
@@ -411,7 +410,7 @@ def id_to_token(self, id: int) -> str:
                 return token
         raise ValueError(f"`id` is out of the vocabulary. Received: {id}")

-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         self._check_vocabulary()
         return self.vocabulary[token]
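The vocabulary helpers above behave the same without annotations. A minimal usage sketch, assuming a toy vocabulary and merge list rather than a real preset:

    import keras_nlp

    # Toy vocabulary and merge rules, purely for illustration.
    vocab = {"butter": 0, "fly": 1, "butterfly": 2}
    merges = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]

    tokenizer = keras_nlp.tokenizers.BytePairTokenizer(
        vocabulary=vocab, merges=merges
    )
    print(tokenizer.vocabulary_size())         # 3
    print(tokenizer.token_to_id("butterfly"))  # 2
    print(tokenizer.id_to_token(2))            # butterfly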
14 changes: 7 additions & 7 deletions keras_nlp/tokenizers/byte_tokenizer.py
@@ -155,11 +155,11 @@ class ByteTokenizer(tokenizer.Tokenizer):

     def __init__(
         self,
-        lowercase: bool = True,
-        sequence_length: int = None,
-        normalization_form: str = None,
-        errors: str = "replace",
-        replacement_char: int = 65533,
+        lowercase=True,
+        sequence_length=None,
+        normalization_form=None,
+        errors="replace",
+        replacement_char=65533,
         dtype="int32",
         **kwargs,
     ):
@@ -198,8 +198,8 @@ def __init__(
             [i.tobytes() for i in np.arange(256, dtype=np.uint8)]
         )

-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         return 256

     def tokenize(self, inputs):
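Here vocabulary_size() is a constant: a byte tokenizer always has exactly 256 possible ids, one per byte value. A quick sketch:

    import keras_nlp

    tokenizer = keras_nlp.tokenizers.ByteTokenizer()
    print(tokenizer.vocabulary_size())  # 256, one id per byte value.
    print(tokenizer.tokenize("hi"))     # Byte values, e.g. [104, 105].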
13 changes: 6 additions & 7 deletions keras_nlp/tokenizers/sentence_piece_tokenizer.py
@@ -15,7 +15,6 @@
 import base64
 import binascii
 import os
-from typing import List

 import keras
 import tensorflow as tf
@@ -108,7 +107,7 @@ def train_sentence_piece_file(ds, path, size):
     def __init__(
         self,
         proto=None,
-        sequence_length: int = None,
+        sequence_length=None,
         dtype="int32",
         **kwargs,
     ) -> None:
@@ -172,12 +171,12 @@ def set_proto(self, proto):
         # byte array as a string for saving.
         self.proto = proto_bytes

-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         self._check_vocabulary()
         return int(self._sentence_piece.vocab_size().numpy())

-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary."""
         self._check_vocabulary()
         return tensor_to_list(
@@ -186,7 +185,7 @@ def get_vocabulary(self) -> List[str]:
             )
         )

-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         self._check_vocabulary()
         if id >= self.vocabulary_size() or id < 0:
@@ -196,7 +195,7 @@ def id_to_token(self, id: int) -> str:
             )
         return tensor_to_list(self._sentence_piece.id_to_string(id))

-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         self._check_vocabulary()
         return int(self._sentence_piece.string_to_id(token).numpy())
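Note that this tokenizer unwraps tensors at the API boundary: vocabulary_size() and token_to_id() return plain Python ints. A sketch, where "model.spm" is a placeholder path to any trained SentencePiece proto:

    import keras_nlp

    # "model.spm" is a placeholder; substitute a real SentencePiece model file.
    tokenizer = keras_nlp.tokenizers.SentencePieceTokenizer(proto="model.spm")
    print(tokenizer.vocabulary_size())  # Plain int, not a tensor.
    print(tokenizer.id_to_token(0))     # Model-dependent, often "<unk>".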
10 changes: 4 additions & 6 deletions keras_nlp/tokenizers/tokenizer.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import List
-
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.layers.preprocessing.preprocessing_layer import (
     PreprocessingLayer,
@@ -105,28 +103,28 @@ def detokenize(self, inputs, *args, **kwargs):
             f"{self.__class__.__name__}."
         )

-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings terms."""
         raise NotImplementedError(
             "No implementation of `get_vocabulary()` was found for "
             f"{self.__class__.__name__}."
         )

-    def vocabulary_size(self) -> int:
+    def vocabulary_size(self):
         """Returns the total size of the token id space."""
         raise NotImplementedError(
             "No implementation of `vocabulary_size()` was found for "
             f"{self.__class__.__name__}."
         )

-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         raise NotImplementedError(
             "No implementation of `id_to_token()` was found for "
             f"{self.__class__.__name__}."
         )

-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         raise NotImplementedError(
             "No implementation of `token_to_id()` was found for "
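Since the base class only raises NotImplementedError, each concrete tokenizer overrides these four methods (plus tokenize()). A minimal sketch of a custom subclass with a hypothetical three-token vocabulary; a production subclass would use TensorFlow ops so it works inside a tf.data pipeline:

    import keras_nlp

    class ToyTokenizer(keras_nlp.tokenizers.Tokenizer):
        """A sketch only: eager, Python-level lookups."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self._vocab = ["[PAD]", "yes", "no"]

        def tokenize(self, inputs):
            # Map each word to its index, falling back to 0 for unknowns.
            return [self.token_to_id(w) if w in self._vocab else 0
                    for w in inputs.split()]

        def get_vocabulary(self):
            return self._vocab

        def vocabulary_size(self):
            return len(self._vocab)

        def id_to_token(self, id):
            return self._vocab[id]

        def token_to_id(self, token):
            return self._vocab.index(token)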
18 changes: 9 additions & 9 deletions keras_nlp/tokenizers/unicode_codepoint_tokenizer.py
@@ -206,14 +206,14 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):

     def __init__(
         self,
-        sequence_length: int = None,
-        lowercase: bool = True,
-        normalization_form: str = None,
-        errors: str = "replace",
-        replacement_char: int = 65533,
-        input_encoding: str = "UTF-8",
-        output_encoding: str = "UTF-8",
-        vocabulary_size: int = None,
+        sequence_length=None,
+        lowercase=True,
+        normalization_form=None,
+        errors="replace",
+        replacement_char=65533,
+        input_encoding="UTF-8",
+        output_encoding="UTF-8",
+        vocabulary_size=None,
         dtype="int32",
         **kwargs,
     ) -> None:
@@ -275,7 +275,7 @@ def get_config(self):
         )
         return config

-    def vocabulary_size(self) -> int:
+    def vocabulary_size(self):
         """Get the size of the tokenizer vocabulary. None implies no vocabulary
         size was provided"""
         return self._vocabulary_size
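As the docstring above notes, vocabulary_size() simply echoes whatever cap was passed at construction, or None. A sketch:

    import keras_nlp

    tokenizer = keras_nlp.tokenizers.UnicodeCodepointTokenizer()
    print(tokenizer.tokenize("héllo"))  # Codepoints, e.g. [104, 233, 108, 108, 111].
    print(tokenizer.vocabulary_size())  # None: no vocabulary_size was passed.

    capped = keras_nlp.tokenizers.UnicodeCodepointTokenizer(vocabulary_size=1000)
    print(capped.vocabulary_size())     # 1000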
29 changes: 14 additions & 15 deletions keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -15,7 +15,6 @@
 import os
 import re
 from typing import Iterable
-from typing import List

 import keras
 import tensorflow as tf
@@ -334,15 +333,15 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
     def __init__(
         self,
         vocabulary=None,
-        sequence_length: int = None,
-        lowercase: bool = False,
-        strip_accents: bool = False,
-        split: bool = True,
-        split_on_cjk: bool = True,
-        suffix_indicator: str = "##",
-        oov_token: str = "[UNK]",
-        special_tokens: List[str] = None,
-        special_tokens_in_strings: bool = False,
+        sequence_length=None,
+        lowercase=False,
+        strip_accents=False,
+        split=True,
+        split_on_cjk=True,
+        suffix_indicator="##",
+        oov_token="[UNK]",
+        special_tokens=None,
+        special_tokens_in_strings=False,
         dtype="int32",
         **kwargs,
     ) -> None:
@@ -437,17 +436,17 @@ def set_vocabulary(self, vocabulary):
             support_detokenization=True,
         )

-    def get_vocabulary(self) -> List[str]:
+    def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""
         self._check_vocabulary()
         return self.vocabulary

-    def vocabulary_size(self) -> int:
-        """Get the size of the tokenizer vocabulary."""
+    def vocabulary_size(self):
+        """Get the integer size of the tokenizer vocabulary."""
         self._check_vocabulary()
         return len(self.vocabulary)

-    def id_to_token(self, id: int) -> str:
+    def id_to_token(self, id):
         """Convert an integer id to a string token."""
         self._check_vocabulary()
         if id >= self.vocabulary_size() or id < 0:
@@ -457,7 +456,7 @@ def id_to_token(self, id: int) -> str:
             )
         return self.vocabulary[id]

-    def token_to_id(self, token: str) -> int:
+    def token_to_id(self, token):
         """Convert a string token to an integer id."""
         # This will be slow, but keep memory usage down compared to building a
         # dict. Assuming the main use case is looking up a few special tokens
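A usage sketch of the methods above, assuming a toy vocabulary (the "##" prefix marks subword continuations):

    import keras_nlp

    # Toy vocabulary, purely for illustration.
    vocab = ["[UNK]", "the", "qu", "##ick", "fox"]
    tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary=vocab, lowercase=True
    )
    print(tokenizer.tokenize("The quick fox"))  # [1, 2, 3, 4]
    print(tokenizer.token_to_id("##ick"))       # 3
    print(tokenizer.id_to_token(3))             # ##ick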
