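fix: update the estimate_token_length function to use a character-weighted estimate (ASCII chars count as 1, non-ASCII as 2, sum divided by 4 and rounded up) instead of the per-word weighting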
This commit is contained in:
+2
-16
@@ -34,7 +34,6 @@ use is_terminal::IsTerminal;
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::sync::LazyLock;
|
use std::sync::LazyLock;
|
||||||
use std::{cmp, env, path::PathBuf, process};
|
use std::{cmp, env, path::PathBuf, process};
|
||||||
use unicode_segmentation::UnicodeSegmentation;
|
|
||||||
|
|
||||||
pub static CODE_BLOCK_RE: LazyLock<Regex> =
|
pub static CODE_BLOCK_RE: LazyLock<Regex> =
|
||||||
LazyLock::new(|| Regex::new(r"(?ms)```\w*(.*)```").unwrap());
|
LazyLock::new(|| Regex::new(r"(?ms)```\w*(.*)```").unwrap());
|
||||||
@@ -74,21 +73,8 @@ pub fn parse_bool(value: &str) -> Option<bool> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn estimate_token_length(text: &str) -> usize {
|
pub fn estimate_token_length(text: &str) -> usize {
|
||||||
let words: Vec<&str> = text.unicode_words().collect();
|
let weighted: usize = text.chars().map(|c| if c.is_ascii() { 1 } else { 2 }).sum();
|
||||||
let mut output: f32 = 0.0;
|
weighted.div_ceil(4)
|
||||||
for word in words {
|
|
||||||
if word.is_ascii() {
|
|
||||||
output += 1.3;
|
|
||||||
} else {
|
|
||||||
let count = word.chars().count();
|
|
||||||
if count == 1 {
|
|
||||||
output += 1.0
|
|
||||||
} else {
|
|
||||||
output += (count as f32) * 0.5;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
output.ceil() as usize
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn strip_think_tag(text: &str) -> Cow<'_, str> {
|
pub fn strip_think_tag(text: &str) -> Cow<'_, str> {
|
||||||
|
|||||||
Reference in New Issue
Block a user