fix: update the estimate_token_length function to use a weighted character count (ASCII = 1, non-ASCII = 2, divided by 4 and rounded up) instead of the per-word heuristic

2026-05-19 12:25:53 -06:00
parent 04498b96ec
commit e7bb668ac7
1 changed files with 2 additions and 16 deletions
@@ -34,7 +34,6 @@ use is_terminal::IsTerminal;
 use std::borrow::Cow;
 use std::sync::LazyLock;
 use std::{cmp, env, path::PathBuf, process};
-use unicode_segmentation::UnicodeSegmentation;
 pub static CODE_BLOCK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?ms)```\w*(.*)```").unwrap());
@@ -74,21 +73,8 @@ pub fn parse_bool(value: &str) -> Option<bool> {
 }
 pub fn estimate_token_length(text: &str) -> usize {
-    let words: Vec<&str> = text.unicode_words().collect();
+    let weighted: usize = text.chars().map(|c| if c.is_ascii() { 1 } else { 2 }).sum();
-    let mut output: f32 = 0.0;
+    weighted.div_ceil(4)
-    for word in words {
-        if word.is_ascii() {
-            output += 1.3;
-        } else {
-            let count = word.chars().count();
-            if count == 1 {
-                output += 1.0
-            } else {
-                output += (count as f32) * 0.5;
-            }
-        }
-    }
-    output.ceil() as usize
 }
 pub fn strip_think_tag(text: &str) -> Cow<'_, str> {