Skip to content

Commit

Permalink
Disable cleaning of spaces
Browse files Browse the repository at this point in the history
Now the outputs match the ones before AutoTokenizer was introduced.
  • Loading branch information
Marcin Kardas committed Feb 14, 2023
1 parent 7ee7c97 commit 1a1e448
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
12 changes: 10 additions & 2 deletions galai/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,11 @@ def generate(
if not return_full_text:
out_tokens = out_tokens[:, input_v.shape[1]:]
# we keep special tokens such as [START_REF] or <work>
- decoded = self.tokenizer.batch_decode(out_tokens, skip_special_tokens=False)
+ decoded = self.tokenizer.batch_decode(
+ out_tokens,
+ skip_special_tokens=False,
+ clean_up_tokenization_spaces=False,
+ )
# so we manually remove </s> and <pad>
decoded = [
text.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "")
Expand Down Expand Up @@ -431,7 +435,11 @@ def generate_reference(
)
# cut-off the prompts
generated_tokens = out["sequences"][:, prompt_length:]
- decoded = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)
+ decoded = self.tokenizer.batch_decode(
+ generated_tokens,
+ skip_special_tokens=False,
+ clean_up_tokenization_spaces=False,
+ )
references = []
unfinished_generation = False
for text in decoded:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

PACKAGE_NAME = 'galai'
- VERSION = "1.1.5"
+ VERSION = "1.1.6"
DESCRIPTION = "API for the GALACTICA model"
KEYWORDS = "Scientific Intelligence"
URL = 'https://github.com/paperswithcode/galai'
Expand Down

0 comments on commit 1a1e448

Please sign in to comment.