fix: update the estimate_token_length function to use a character-weighted estimate (ASCII = 1, non-ASCII = 2, divided by 4) instead of per-word counting
This commit is contained in:
+2
-16
@@ -34,7 +34,6 @@ use is_terminal::IsTerminal;
|
||||
use std::borrow::Cow;
|
||||
use std::sync::LazyLock;
|
||||
use std::{cmp, env, path::PathBuf, process};
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
/// Lazily compiled regex that matches a triple-backtick fenced block.
///
/// The `m` and `s` flags are set, so `.` also matches newlines. After the
/// opening fence, `\w*` consumes any word characters (e.g. a language tag), and
/// capture group 1 holds everything up to the closing fence. The capture is
/// greedy, so with several fences in the input it extends to the last one.
pub static CODE_BLOCK_RE: LazyLock<Regex> =
|
||||
LazyLock::new(|| Regex::new(r"(?ms)```\w*(.*)```").unwrap());
|
||||
@@ -74,21 +73,8 @@ pub fn parse_bool(value: &str) -> Option<bool> {
|
||||
}
|
||||
|
||||
/// Estimates how many tokens `text` occupies, without running a tokenizer.
///
/// Every ASCII character contributes one unit of weight and every non-ASCII
/// character contributes two. The total weight is divided by four and rounded
/// up, so four ASCII characters (or two non-ASCII characters) count as one
/// token. An empty string yields `0`.
pub fn estimate_token_length(text: &str) -> usize {
    let weighted: usize = text
        .chars()
        .map(|c| if c.is_ascii() { 1 } else { 2 })
        .sum();
    // Round up so that any non-empty input is estimated as at least one token.
    weighted.div_ceil(4)
}
|
||||
|
||||
pub fn strip_think_tag(text: &str) -> Cow<'_, str> {
|
||||
|
||||
Reference in New Issue
Block a user