fix: wrap and tokenize algorithm
sigoden committed Nov 3, 2023
1 parent b34e40e commit 148794b
Showing 2 changed files with 43 additions and 6 deletions.
src/render/markdown.rs (16 additions & 4 deletions)

```diff
@@ -82,13 +82,13 @@ impl MarkdownRender {
             .join("\n")
     }
 
-    pub fn render_with_indent(&mut self, text: &str, padding: usize) -> String {
-        let text = format!("{}{}", " ".repeat(padding), text);
+    pub fn render_with_indent(&mut self, text: &str, indent: usize) -> String {
+        let text = format!("{}{}", " ".repeat(indent), text);
         let output = self.render(&text);
         if output.starts_with('\n') {
             output
         } else {
-            output.chars().skip(padding).collect()
+            output.chars().skip(indent).collect()
         }
     }
 
@@ -186,7 +186,7 @@ impl MarkdownRender {
                 if is_code && !self.options.wrap_code {
                     return line;
                 }
-                textwrap::wrap(&line, width as usize).join("\n")
+                wrap(&line, width as usize)
             } else {
                 line
             }
@@ -203,6 +203,12 @@ impl MarkdownRender {
     }
 }
 
+fn wrap(text: &str, width: usize) -> String {
+    let indent: usize = text.chars().take_while(|c| *c == ' ').count();
+    let wrap_options = textwrap::Options::new(width).initial_indent(&text[0..indent]);
+    textwrap::wrap(&text[indent..], wrap_options).join("\n")
+}
+
 #[derive(Debug, Clone, Default)]
 pub struct RenderOptions {
     pub theme: Option<Theme>,
@@ -381,5 +387,11 @@ std::error::Error>> {
         let expect =
             "To unzip a file in Rust, you can use the\n`zip` crate. Here's an example code";
         assert_eq!(output, expect);
+
+        let input = "Unzip a file";
+        let output = render.render_with_indent(input, 76);
+        let expect = "\nUnzip a file";
+
+        assert_eq!(output, expect);
     }
 }
```
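The new `wrap` helper exists because plain `textwrap::wrap` folds a line's leading spaces into the first word, losing the indentation that `render_with_indent` depends on. Below is a minimal sketch of the same idea outside the codebase, assuming a recent `textwrap` (e.g. 0.16) in Cargo.toml; the sample string and `main` are made up for illustration:

```rust
// Peel off the leading run of spaces and hand it to textwrap as the
// initial_indent, so the first wrapped line keeps its indentation
// instead of having it collapsed by the wrapper.
fn wrap(text: &str, width: usize) -> String {
    let indent: usize = text.chars().take_while(|c| *c == ' ').count();
    let wrap_options = textwrap::Options::new(width).initial_indent(&text[0..indent]);
    textwrap::wrap(&text[indent..], wrap_options).join("\n")
}

fn main() {
    // The 4-space indent survives on the first line; continuation lines
    // start at column 0, since subsequent_indent keeps its default of "".
    println!("{}", wrap("    To unzip a file in Rust, use the zip crate.", 24));
}
```

The test added at the bottom of the file covers the edge case where the indent (76 columns) leaves no room for the text: the rendered output then begins with a newline, and `render_with_indent` returns it unchanged instead of skipping `indent` characters off its front.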
src/utils/mod.rs (27 additions & 2 deletions)

```diff
@@ -23,8 +23,23 @@ pub fn get_env_name(key: &str) -> String {
 
 /// Split text to tokens
 pub fn tokenize(text: &str) -> Vec<String> {
-    let tokens = cl100k_base_singleton().lock().tokenize(text);
-    tokens.into_iter().map(|(_, text)| text).collect()
+    let tokens = cl100k_base_singleton()
+        .lock()
+        .encode_with_special_tokens(text);
+    let token_bytes: Vec<Vec<u8>> = tokens
+        .into_iter()
+        .map(|v| cl100k_base_singleton().lock().decode_bytes(vec![v]))
+        .collect();
+    let mut output = vec![];
+    let mut current_bytes = vec![];
+    for bytes in token_bytes {
+        current_bytes.extend(bytes);
+        if let Ok(v) = std::str::from_utf8(&current_bytes) {
+            output.push(v.to_string());
+            current_bytes.clear();
+        }
+    }
+    output
 }
 
 /// Count how many tokens a piece of text needs to consume
@@ -60,3 +75,13 @@ pub fn init_tokio_runtime() -> anyhow::Result<tokio::runtime::Runtime> {
         .build()
         .with_context(|| "Failed to init tokio")
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_tokenize() {
+        assert_eq!(tokenize("😊 hello world"), ["😊", " hello", " world"])
+    }
+}
```
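The rewritten `tokenize` decodes every token id back to raw bytes and buffers them until the buffer is valid UTF-8, because cl100k_base is a byte-level BPE and can split one multi-byte character (such as the 😊 in the test) across several tokens. Here is a self-contained sketch of just that buffering step; `merge_token_bytes` is a name invented for this sketch, and the byte chunks in `main` are hand-picked to mimic a split emoji rather than actual cl100k_base output:

```rust
// Merge per-token byte chunks into strings, flushing the buffer only
// once it decodes cleanly as UTF-8. A partial multi-byte sequence fails
// the check and keeps accumulating until the character is complete.
fn merge_token_bytes(token_bytes: Vec<Vec<u8>>) -> Vec<String> {
    let mut output = vec![];
    let mut current_bytes: Vec<u8> = vec![];
    for bytes in token_bytes {
        current_bytes.extend(bytes);
        if let Ok(v) = std::str::from_utf8(&current_bytes) {
            output.push(v.to_string());
            current_bytes.clear();
        }
    }
    output
}

fn main() {
    // "😊" is four UTF-8 bytes (F0 9F 98 8A); a byte-level tokenizer may
    // emit them across two tokens, roughly like this:
    let chunks = vec![vec![0xF0, 0x9F], vec![0x98, 0x8A], b" hello".to_vec()];
    assert_eq!(merge_token_bytes(chunks), ["😊", " hello"]);
}
```

The old implementation mapped each token directly to text, which breaks when a token boundary falls inside a multi-byte character; buffering the decoded bytes is what makes the 😊 assertion in `test_tokenize` hold.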
