Merge pull request #75 from LlmKira/dev
(feat): Novelai tokenizer re-implement || New LLM
sudoskys authored Sep 26, 2024
2 parents a04ca0f + f4af2e6 commit 9cbed20
Showing 28 changed files with 3,950 additions and 950 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -177,3 +177,4 @@ cython_debug/
/playground/art_assert/
/playground/unpack/
/playground/boom-train/
/frontend/
26 changes: 24 additions & 2 deletions README.md
@@ -16,6 +16,7 @@ The goal of this repository is to use Pydantic to build legitimate requests to a
- [x] tool.random_prompt
- [x] tool.paint_mask
- [x] tool.image_metadata
- [x] tokenizer
- [x] /ai/generate-image
- [x] /user/subscription
- [x] /user/login
@@ -85,7 +86,7 @@ from dotenv import load_dotenv
from pydantic import SecretStr

from novelai_python import APIError, LoginCredential
from novelai_python.sdk.ai.generate import TextLLMModel, LLM
from novelai_python.sdk.ai.generate import TextLLMModel, LLM, get_default_preset

load_dotenv()
username = os.getenv("NOVELAI_USER", None)
@@ -99,7 +100,13 @@ login_credential = LoginCredential(

async def chat(prompt: str):
try:
agent = LLM.build(prompt=prompt, model=TextLLMModel.Kayra)
model = TextLLMModel.ERATO # llama3
parameters = get_default_preset(model).parameters
agent = LLM.build(
prompt=prompt,
model=model,
parameters=None  # None auto-selects the default preset (see the explicit sketch below)
)
result = await agent.request(session=login_credential)
except APIError as e:
raise Exception(f"Error: {e.message}")
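The `parameters` fetched via `get_default_preset` above can also be passed explicitly instead of `None` — a minimal sketch reusing the names from the example (not a separate API, just the explicit variant):

```python
model = TextLLMModel.ERATO
parameters = get_default_preset(model).parameters
agent = LLM.build(
    prompt="Hello",
    model=model,
    parameters=parameters,  # pin the preset instead of auto-selecting
)
```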
@@ -126,6 +133,21 @@ pip install novelai_python
python3 -m novelai_python.server -h '127.0.0.1' -p 7888
```

#### Tokenizer

```python
from novelai_python._enum import get_tokenizer_model, TextLLMModel
from novelai_python.tokenizer import NaiTokenizer

tokenizer_package = NaiTokenizer(get_tokenizer_model(TextLLMModel.ERATO))
t_text = "a fox jumped over the lazy dog"
encode_tokens = tokenizer_package.encode(t_text)
print(tokenizer_package.tokenize_text(t_text))
print(f"Tokenized text: {encode_tokens}")
print(tokenizer_package.decode(tokenizer_package.encode(t_text)))

```

## Acknowledgements 🙏

[BackEnd](https://api.novelai.net/docs)
176 changes: 108 additions & 68 deletions pdm.lock

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions playground/generate.py
@@ -27,18 +27,20 @@ async def chat(prompt="Hello"):
try:
agent = LLM.build(
prompt=prompt,
model=TextLLMModel.Kayra,
model=TextLLMModel.ERATO,
)
result = await agent.request(session=credential)
result = await agent.request(session=_login_credential)
except APIError as e:
logger.exception(e)
print(f"Error: {e.message}")
return None
except Exception as e:
logger.exception(e)
else:
print(f"Result: \n{result.text}")
print(f"Result:\n{result.text}")


loop = asyncio.get_event_loop()
loop.run_until_complete(chat())
loop = asyncio.new_event_loop()
loop.run_until_complete(chat(
prompt="a fox jumped over the lazy dog, and the dog barked at the fox. The fox ran away."
))
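Side note on the loop change above: for a script that runs a single coroutine, `asyncio.run` is the more idiomatic entry point and is equivalent here:

```python
import asyncio

# Creates a fresh event loop, runs the coroutine, and closes the loop afterwards.
asyncio.run(chat(prompt="a fox jumped over the lazy dog, and the dog barked at the fox."))
```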
6 changes: 2 additions & 4 deletions playground/generate_stream.py
@@ -34,14 +34,12 @@ async def stream(prompt="Hello"):
try:
agent = LLMStream.build(
prompt=prompt,
model=TextLLMModel.Kayra,
model=TextLLMModel.ERATO,
)
_data = []
# Now you can process each chunk of the stream with an async for loop
generator = agent.request(session=credential)
generator = agent.request(session=_login_credential)
async for data in generator:
data: LLMStreamResp
print(data.text)  # or do any other processing you need
_data.append(data)
except APIError as e:
print(f"Error: {e.message}")
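Once the stream is exhausted, the collected chunks can be stitched into the final completion — a one-line sketch assuming each `LLMStreamResp.text` carries one incremental delta, as the loop above suggests:

```python
final_text = "".join(chunk.text for chunk in _data)
print(final_text)
```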
94 changes: 94 additions & 0 deletions playground/tokenizer/tokenizer_demo.py
@@ -0,0 +1,94 @@
import json
import os
import pathlib
import zlib
from typing import Dict, List, Optional

import requests
from json_repair import repair_json
from pydantic import BaseModel, model_validator
from tokenizers import Tokenizer, pre_tokenizers, Regex, decoders
from tokenizers.models import BPE

# https://novelai.net/tokenizer/compressed/llama3nai_tokenizer.def?v=2&static=true

model_name = "clip_tokenizer"
model_full_name = f"{model_name}.def"
url = f"https://novelai.net/tokenizer/compressed/{model_full_name}?v=2&static=true"
if not os.path.exists(model_full_name):
print(f"Downloading {model_full_name} from {url}")
response = requests.get(url)
response.raise_for_status()
# write the downloaded definition to disk
with open(model_full_name, "wb") as f:
f.write(response.content)


class TokenizerSetting(BaseModel):
class TokenizerConfig(BaseModel):
splitRegex: str
maxEncodeChars: Optional[int] = None
maxNoWhitespaceChars: Optional[int] = None
ignoreMerges: Optional[bool] = False

config: TokenizerConfig
specialTokens: List[str]
vocab: Dict[str, int]
merges: list

@model_validator(mode="after")
def ensure(self):
self.merges = [tuple(merge) for merge in self.merges]
return self


# Read and decompress the file (a raw DEFLATE stream, hence the negative wbits)
file = pathlib.Path(__file__).parent.joinpath(model_full_name)
encoded_data = file.read_bytes()
decompress_obj = zlib.decompressobj(-zlib.MAX_WBITS)
decode = decompress_obj.decompress(encoded_data)

# Repair and parse the JSON
repaired_json = repair_json(decode.decode('utf-8'), return_objects=True)
json.dump(repaired_json, open(f"{model_name}.json", "w"), indent=2)
tokenizer_setting = TokenizerSetting.model_validate(repaired_json)

# Build the BPE tokenizer
tokenizer = Tokenizer(BPE(
vocab=tokenizer_setting.vocab,
merges=tokenizer_setting.merges,
ignore_merges=tokenizer_setting.config.ignoreMerges
))

# Register the special tokens
tokenizer.add_special_tokens(tokenizer_setting.specialTokens)
print(tokenizer.token_to_id(" "))
if tokenizer_setting.config.maxEncodeChars:
tokenizer.enable_truncation(max_length=tokenizer_setting.config.maxEncodeChars)
# Set the normalizer (left unset here)
# tokenizer.normalizer = normalizers.Sequence([])

# Configure the pre-tokenizer
pre_zus = [
pre_tokenizers.Split(
behavior="merged_with_next",
pattern=Regex(tokenizer_setting.config.splitRegex)
),
]
if tokenizer.token_to_id(" ") is None:
pre_zus.append(pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False))
pre_tokenizer = pre_tokenizers.Sequence(pre_zus)

tokenizer.pre_tokenizer = pre_tokenizer
tokenizer.decoder = decoders.ByteLevel()

# Use the tokenizer
text = "Hello, World! This is a test."
encoded = tokenizer.encode(text, add_special_tokens=True)
print(f"Pre-tokenized text: {pre_tokenizer.pre_tokenize_str(text)}")
print(f"Encoded tokens: {encoded.tokens}")
print(f"Token IDs: {encoded.ids}")

# Decode
decoded = tokenizer.decode(encoded.ids)
print(f"Decoded text:{decoded}")
22 changes: 14 additions & 8 deletions playground/tokenizer/usage.py
@@ -1,16 +1,22 @@
from novelai_python.tokenizer import ImagePromptTokenizer
from novelai_python._enum import TextTokenizerGroup, get_tokenizer_model, TextLLMModel
from novelai_python.tokenizer import NaiTokenizer
from novelai_python.utils.encode import b64_to_tokens

tokenizer_util = ImagePromptTokenizer(ImagePromptTokenizer.MODEL_V2_PATH)
text = "The quick brown fox jumps over the goblin."
token_id = tokenizer_util.encode(text)
print("Token IDs:", token_id)
decoded_text = tokenizer_util.decode(token_id)
print("Decoded text:", decoded_text)
tokenizer_package = NaiTokenizer(get_tokenizer_model(TextLLMModel.ERATO))
t_text = "a fox jumped over the lazy dog"
encode_tokens = tokenizer_package.encode(t_text)
print(tokenizer_package.tokenize_text(t_text))
print(f"Tokenized text: {encode_tokens}")
print(tokenizer_package.decode(tokenizer_package.encode(t_text)))

b64 = "UfQBADoAAABIAQAAGQAAANwAAAATAAAAexQAAEAAAAD/mwAA2GkAAJ8DAAAXAQAAtT4AAC8WAAA="
oks = b64_to_tokens(b64)
print(oks)


def limit_prompt_shown(raw_text: str, token_limit=225):
assert isinstance(raw_text, str), "raw_text must be a string"
tokenizer = ImagePromptTokenizer(ImagePromptTokenizer.MODEL_V2_PATH)
tokenizer = NaiTokenizer(TextTokenizerGroup.NERDSTASH_V2)
token_array = tokenizer.encode(raw_text)
used_tokens_len = len(token_array)
if used_tokens_len > token_limit:
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "novelai-python"
version = "0.4.17"
version = "0.5.0"
description = "NovelAI Python Binding With Pydantic"
authors = [
{ name = "sudoskys", email = "[email protected]" },
@@ -26,6 +26,8 @@ dependencies = [
"ftfy>=6.2.0",
"regex>=2023.12.25",
"tokenizers>=0.15.2",
"json-repair>=0.29.4",
"robust-downloader>=0.0.2",
]
requires-python = ">=3.9"
readme = "README.md"
1 change: 0 additions & 1 deletion src/novelai_python/__init__.py
@@ -2,7 +2,6 @@
# @Time : 2023/11/18 12:18 AM
# @Author : sudoskys
# @File : __init__.py

from ._exceptions import (
NovelAiError,
APIError,
94 changes: 94 additions & 0 deletions src/novelai_python/_enum.py
@@ -0,0 +1,94 @@
from enum import Enum
from typing import Optional, Union


class TextLLMModel(Enum):
NEO_2B = "2.7B"
J_6B = "6B"
J_6B_V3 = "6B-v3"
J_6B_V4 = "6B-v4"
GENJI_PYTHON_6B = "genji-python-6b"
GENJI_JP_6B = "genji-jp-6b"
GENJI_JP_6B_V2 = "genji-jp-6b-v2"
EUTERPE_V0 = "euterpe-v0"
EUTERPE_V2 = "euterpe-v2"
KRAKE_V1 = "krake-v1"
KRAKE_V2 = "krake-v2"
BLUE = "blue"
RED = "red"
GREEN = "green"
PURPLE = "purple"
PINK = "pink"
YELLOW = "yellow"
WHITE = "white"
BLACK = "black"
CASSANDRA = "cassandra"
COMMENT_BOT = "hypebot"
INFILL = "infillmodel"
CLIO = "clio-v1"
KAYRA = "kayra-v1"
ERATO = "llama-3-erato-v1"


class TextTokenizerGroup(object):
GENJI = "genji_tokenizer.def"
PILE = "pile_tokenizer.def"
PILE_NAI = "pile_tokenizer.def"
NAI_INLINE = "gpt2_tokenizer.def"
NERDSTASH_V2 = "nerdstash_tokenizer_v2.def"
NERDSTASH = "nerdstash_tokenizer.def"
LLAMA3 = "llama3_tokenizer.def"
GPT2 = "gpt2_tokenizer.def"
CLIP = "clip_tokenizer.def"


TextLLMModelTypeAlias = Union[TextLLMModel, str]

TOKENIZER_MODEL_MAP = {
TextLLMModel.GENJI_JP_6B_V2: TextTokenizerGroup.GENJI,
TextLLMModel.CASSANDRA: TextTokenizerGroup.PILE,
TextLLMModel.KRAKE_V2: TextTokenizerGroup.PILE,
TextLLMModel.INFILL: TextTokenizerGroup.NAI_INLINE,
TextLLMModel.KAYRA: TextTokenizerGroup.NERDSTASH_V2,
TextLLMModel.BLUE: TextTokenizerGroup.NERDSTASH_V2,
TextLLMModel.PINK: TextTokenizerGroup.NERDSTASH_V2,
TextLLMModel.YELLOW: TextTokenizerGroup.NERDSTASH_V2,
TextLLMModel.RED: TextTokenizerGroup.NERDSTASH_V2,
TextLLMModel.GREEN: TextTokenizerGroup.NERDSTASH_V2,
TextLLMModel.BLACK: TextTokenizerGroup.NERDSTASH_V2,
TextLLMModel.CLIO: TextTokenizerGroup.NERDSTASH,
TextLLMModel.PURPLE: TextTokenizerGroup.LLAMA3,
TextLLMModel.WHITE: TextTokenizerGroup.LLAMA3,
TextLLMModel.ERATO: TextTokenizerGroup.LLAMA3,
}

COLORS_LLM = [
TextLLMModel.BLUE,
TextLLMModel.RED,
TextLLMModel.GREEN,
TextLLMModel.PURPLE,
TextLLMModel.PINK,
TextLLMModel.YELLOW,
TextLLMModel.WHITE,
TextLLMModel.BLACK,
]


def get_llm_group(model: TextLLMModel) -> Optional[TextTokenizerGroup]:
if isinstance(model, str):
model = TextLLMModel(model)
return TOKENIZER_MODEL_MAP.get(model, None)


def get_tokenizer_model(model: TextLLMModel) -> str:
if isinstance(model, str):
model = TextLLMModel(model)
group = TOKENIZER_MODEL_MAP.get(model, TextTokenizerGroup.GPT2)
return group


def get_tokenizer_model_url(model: TextLLMModel) -> str:
model_name = get_tokenizer_model(model)
if not model_name.endswith(".def"):
model_name = f"{model_name}.def"
return f"https://novelai.net/tokenizer/compressed/{model_name}?v=2&static=true"
9 changes: 6 additions & 3 deletions src/novelai_python/_response/ai/generate.py
@@ -2,8 +2,8 @@

from pydantic import BaseModel, ConfigDict

from novelai_python.sdk.ai.generate._enum import TOKENIZER, TextLLMModel # noqa
from novelai_python.tokenizer import LLMTokenizer
from novelai_python._enum import get_tokenizer_model, TextLLMModel
from novelai_python.tokenizer import NaiTokenizer
from novelai_python.utils.encode import b64_to_tokens

if TYPE_CHECKING:
@@ -20,4 +20,7 @@ class LLMResp(BaseModel):

@staticmethod
def decode_token(token_str, model: TextLLMModel) -> str:
return LLMTokenizer().decode(b64_to_tokens(token_str), tokenizer_name=TOKENIZER.get(model))
dtype = 'uint32' if model in [TextLLMModel.ERATO] else 'uint16'
return NaiTokenizer(model=get_tokenizer_model(model)).decode(
b64_to_tokens(token_str, dtype=dtype)
)
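The `dtype` switch exists because Erato's Llama 3 vocabulary has token IDs above 65535, so its base64 payloads pack 32-bit rather than 16-bit integers. A hypothetical standalone decoder illustrating the idea (`b64_to_ids` is illustrative, not the library's `b64_to_tokens`, and little-endian packing is an assumption):

```python
import base64
import struct

def b64_to_ids(b64_str: str, dtype: str = "uint16") -> list:
    """Unpack a base64 string of fixed-width token IDs (assumed little-endian)."""
    raw = base64.b64decode(b64_str)
    width, fmt = (4, "I") if dtype == "uint32" else (2, "H")
    return list(struct.unpack(f"<{len(raw) // width}{fmt}", raw))
```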
9 changes: 6 additions & 3 deletions src/novelai_python/_response/ai/generate_stream.py
@@ -2,8 +2,8 @@

from pydantic import BaseModel, ConfigDict

from novelai_python.sdk.ai.generate._enum import TOKENIZER, TextLLMModel # noqa
from novelai_python.tokenizer import LLMTokenizer
from novelai_python._enum import get_tokenizer_model, TextLLMModel
from novelai_python.tokenizer import NaiTokenizer
from novelai_python.utils.encode import b64_to_tokens


@@ -20,4 +20,7 @@ class LLMStreamResp(BaseModel):

@staticmethod
def decode(token_str, model: TextLLMModel) -> str:
return LLMTokenizer().decode(b64_to_tokens(token_str), tokenizer_name=TOKENIZER.get(model))
dtype = 'uint32' if model in [TextLLMModel.ERATO] else 'uint16'
return NaiTokenizer(model=get_tokenizer_model(model)).decode(
b64_to_tokens(token_str, dtype=dtype)
)