Skip to content

Commit

Permalink
Add tokenizer o200k_base for gpt-4o- models (#7564)
Browse files Browse the repository at this point in the history
* Add tokenizer o200k_base for gpt-4o- models

* Add valid special chars

* Fix regex: use the same join + raw-string logic as the other tokenizers instead of a regex
  • Loading branch information
PopDaph authored Sep 23, 2024
1 parent de14352 commit 1b8a006
Show file tree
Hide file tree
Showing 3 changed files with 200,048 additions and 5 deletions.
16 changes: 11 additions & 5 deletions core/src/providers/openai.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ use crate::providers::llm::{
};
use crate::providers::provider::{ModelError, ModelErrorRetryOptions, Provider, ProviderID};
use crate::providers::tiktoken::tiktoken::{
batch_tokenize_async, cl100k_base_singleton, p50k_base_singleton, r50k_base_singleton, CoreBPE,
batch_tokenize_async, cl100k_base_singleton, o200k_base_singleton, p50k_base_singleton,
r50k_base_singleton, CoreBPE,
};
use crate::providers::tiktoken::tiktoken::{decode_async, encode_async};
use crate::run::Credentials;
Expand Down Expand Up @@ -1668,10 +1669,15 @@ impl OpenAILLM {
match self.id.as_str() {
"code_davinci-002" | "code-cushman-001" => p50k_base_singleton(),
"text-davinci-002" | "text-davinci-003" => p50k_base_singleton(),
_ => match self.id.starts_with("gpt-3.5-turbo") || self.id.starts_with("gpt-4") {
true => cl100k_base_singleton(),
false => r50k_base_singleton(),
},
_ => {
if self.id.starts_with("gpt-4o-") {
o200k_base_singleton()
} else if self.id.starts_with("gpt-3.5-turbo") || self.id.starts_with("gpt-4") {
cl100k_base_singleton()
} else {
r50k_base_singleton()
}
}
}
}

Expand Down
Loading

0 comments on commit 1b8a006

Please sign in to comment.