fix: update the estimate_token_length function to use a character-weighted estimate (ASCII = 1, non-ASCII = 2, four units per token) instead of per-word weights

This commit is contained in:
2026-05-19 12:25:53 -06:00
parent 04498b96ec
commit e7bb668ac7
+2 -16
View File
@@ -34,7 +34,6 @@ use is_terminal::IsTerminal;
use std::borrow::Cow;
use std::sync::LazyLock;
use std::{cmp, env, path::PathBuf, process};
use unicode_segmentation::UnicodeSegmentation;
/// Lazily compiled regex for a Markdown-style fenced code block.
///
/// The pattern matches an opening triple backtick, any run of word characters
/// directly after it (`\w*`), and then captures everything up to a closing
/// triple backtick in group 1. The `s` flag lets `.` match newlines, and `.*`
/// is greedy, so when the haystack holds several fences the capture extends
/// to the LAST closing fence. The `m` flag is set, but the pattern has no
/// `^`/`$` anchors for it to affect.
///
/// The `unwrap` runs on first access; the pattern is a fixed literal.
pub static CODE_BLOCK_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?ms)```\w*(.*)```").unwrap());
@@ -74,21 +73,8 @@ pub fn parse_bool(value: &str) -> Option<bool> {
}
/// Returns a rough estimate of the token length of `text`.
///
/// This is a heuristic, not a tokenizer: every ASCII character contributes
/// one unit, every non-ASCII character contributes two, and the total is
/// divided by four, rounding up. An empty string therefore yields `0`, and
/// any non-empty string yields at least `1`.
pub fn estimate_token_length(text: &str) -> usize {
    // Non-ASCII characters are weighted double relative to ASCII ones.
    let weighted: usize = text
        .chars()
        .map(|c| if c.is_ascii() { 1 } else { 2 })
        .sum();
    // Four weighted units per token; round up so partial tokens still count.
    weighted.div_ceil(4)
}
pub fn strip_think_tag(text: &str) -> Cow<'_, str> {