Baseline project

This commit is contained in:
2025-10-07 10:45:42 -06:00
parent 88288a98b6
commit 650dbd92e0
54 changed files with 18982 additions and 0 deletions
+235
View File
@@ -0,0 +1,235 @@
/// Languages and markup formats that have a dedicated, ordered list of split separators.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// C and C++.
    Cpp,
    /// Go.
    Go,
    /// Java.
    Java,
    /// JavaScript.
    Js,
    /// PHP.
    Php,
    /// Protocol Buffers definitions.
    Proto,
    /// Python.
    Python,
    /// reStructuredText.
    Rst,
    /// Ruby.
    Ruby,
    /// Rust.
    Rust,
    /// Scala.
    Scala,
    /// Swift.
    Swift,
    /// Markdown.
    Markdown,
    /// LaTeX.
    Latex,
    /// HTML.
    Html,
    /// Solidity.
    Sol,
}

impl Language {
    /// Returns the separators to try, in priority order, when splitting text of this language.
    ///
    /// Language-specific separators (declarations, control-flow keywords, headings, tags)
    /// come first; every list ends with `" "` followed by `""` as the last-resort separators.
    ///
    /// All entries are string literals, so the returned slices are `'static` and do not
    /// borrow from `self`.
    pub fn separators(&self) -> Vec<&'static str> {
        let separators: &'static [&'static str] = match self {
            Language::Cpp => &[
                "\nclass ", "\nvoid ", "\nint ", "\nfloat ", "\ndouble ", "\nif ", "\nfor ",
                "\nwhile ", "\nswitch ", "\ncase ", "\n\n", "\n", " ", "",
            ],
            Language::Go => &[
                "\nfunc ", "\nvar ", "\nconst ", "\ntype ", "\nif ", "\nfor ", "\nswitch ",
                "\ncase ", "\n\n", "\n", " ", "",
            ],
            Language::Java => &[
                "\nclass ", "\npublic ", "\nprotected ", "\nprivate ", "\nstatic ", "\nif ",
                "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", "\n\n", "\n", " ", "",
            ],
            Language::Js => &[
                "\nfunction ", "\nconst ", "\nlet ", "\nvar ", "\nclass ", "\nif ", "\nfor ",
                "\nwhile ", "\nswitch ", "\ncase ", "\ndefault ", "\n\n", "\n", " ", "",
            ],
            Language::Php => &[
                "\nfunction ", "\nclass ", "\nif ", "\nforeach ", "\nwhile ", "\ndo ",
                "\nswitch ", "\ncase ", "\n\n", "\n", " ", "",
            ],
            Language::Proto => &[
                "\nmessage ", "\nservice ", "\nenum ", "\noption ", "\nimport ", "\nsyntax ",
                "\n\n", "\n", " ", "",
            ],
            Language::Python => &["\nclass ", "\ndef ", "\n\tdef ", "\n\n", "\n", " ", ""],
            Language::Rst => &[
                "\n===\n", "\n---\n", "\n***\n", "\n.. ", "\n\n", "\n", " ", "",
            ],
            Language::Ruby => &[
                "\ndef ", "\nclass ", "\nif ", "\nunless ", "\nwhile ", "\nfor ", "\ndo ",
                "\nbegin ", "\nrescue ", "\n\n", "\n", " ", "",
            ],
            // "\nconst " used to be listed a second time after "\nmatch "; a repeated
            // separator can never produce a new split, so it is listed only once.
            Language::Rust => &[
                "\nfn ", "\nconst ", "\nlet ", "\nif ", "\nwhile ", "\nfor ", "\nloop ",
                "\nmatch ", "\n\n", "\n", " ", "",
            ],
            Language::Scala => &[
                "\nclass ", "\nobject ", "\ndef ", "\nval ", "\nvar ", "\nif ", "\nfor ",
                "\nwhile ", "\nmatch ", "\ncase ", "\n\n", "\n", " ", "",
            ],
            Language::Swift => &[
                "\nfunc ", "\nclass ", "\nstruct ", "\nenum ", "\nif ", "\nfor ", "\nwhile ",
                "\ndo ", "\nswitch ", "\ncase ", "\n\n", "\n", " ", "",
            ],
            Language::Markdown => &[
                "\n## ",
                "\n### ",
                "\n#### ",
                "\n##### ",
                "\n###### ",
                "```\n\n",
                "\n\n***\n\n",
                "\n\n---\n\n",
                "\n\n___\n\n",
                "\n\n",
                "\n",
                " ",
                "",
            ],
            Language::Latex => &[
                "\n\\chapter{",
                "\n\\section{",
                "\n\\subsection{",
                "\n\\subsubsection{",
                "\n\\begin{enumerate}",
                "\n\\begin{itemize}",
                "\n\\begin{description}",
                "\n\\begin{list}",
                "\n\\begin{quote}",
                "\n\\begin{quotation}",
                "\n\\begin{verse}",
                "\n\\begin{verbatim}",
                "\n\\begin{align}",
                "$$",
                "$",
                "\n\n",
                "\n",
                " ",
                "",
            ],
            Language::Html => &[
                "<body>", "<div>", "<p>", "<br>", "<li>", "<h1>", "<h2>", "<h3>", "<h4>",
                "<h5>", "<h6>", "<span>", "<table>", "<tr>", "<td>", "<th>", "<ul>", "<ol>",
                "<header>", "<footer>", "<nav>", "<head>", "<style>", "<script>", "<meta>",
                "<title>", " ", "",
            ],
            Language::Sol => &[
                "\npragma ",
                "\nusing ",
                "\ncontract ",
                "\ninterface ",
                "\nlibrary ",
                "\nconstructor ",
                "\ntype ",
                "\nfunction ",
                "\nevent ",
                "\nmodifier ",
                "\nerror ",
                "\nstruct ",
                "\nenum ",
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo while ",
                "\nassembly ",
                "\n\n",
                "\n",
                " ",
                "",
            ],
        };
        separators.to_vec()
    }
}
+475
View File
@@ -0,0 +1,475 @@
mod language;
pub use self::language::*;
use super::{DocumentMetadata, RagDocument};
/// Separators used when no language-specific list applies, in priority order:
/// blank line, line break, single space, and finally the empty string.
pub const DEFAULT_SEPARATORS: [&str; 4] = ["\n\n", "\n", " ", ""];
/// Returns the split separators for a file extension.
///
/// `extension` is expected without a leading dot (e.g. `"rs"`, `"md"`). It is matched
/// ASCII case-insensitively, so `"MD"` and `"md"` select the same list. Extensions that
/// are not recognised fall back to [`DEFAULT_SEPARATORS`].
pub fn get_separators(extension: &str) -> Vec<&'static str> {
    match extension.to_ascii_lowercase().as_str() {
        "c" | "cc" | "cpp" => Language::Cpp.separators(),
        "go" => Language::Go.separators(),
        "java" => Language::Java.separators(),
        "js" | "mjs" | "cjs" => Language::Js.separators(),
        "php" => Language::Php.separators(),
        "proto" => Language::Proto.separators(),
        "py" => Language::Python.separators(),
        "rst" => Language::Rst.separators(),
        "rb" => Language::Ruby.separators(),
        "rs" => Language::Rust.separators(),
        "scala" => Language::Scala.separators(),
        "swift" => Language::Swift.separators(),
        "md" | "mkd" => Language::Markdown.separators(),
        "tex" => Language::Latex.separators(),
        "htm" | "html" => Language::Html.separators(),
        "sol" => Language::Sol.separators(),
        _ => DEFAULT_SEPARATORS.to_vec(),
    }
}
/// Splits text into chunks by trying a list of separators in order and recursing
/// into pieces that are still too large.
pub struct RecursiveCharacterTextSplitter {
/// Size threshold for a chunk, measured with `length_function`.
pub chunk_size: usize,
/// Amount of trailing content (measured with `length_function`) that may be carried
/// over from one merged chunk into the next.
pub chunk_overlap: usize,
/// Separators to try, in priority order; the first one found in the text is used and
/// the remaining ones are used when recursing into oversized pieces.
pub separators: Vec<String>,
/// Measures a piece of text; the `Default` implementation uses the byte length.
pub length_function: Box<dyn Fn(&str) -> usize + Send + Sync>,
}
impl Default for RecursiveCharacterTextSplitter {
    /// Builds a splitter with a chunk size of 1000, an overlap of 20, the
    /// `DEFAULT_SEPARATORS`, and the byte length of the text as its measure.
    fn default() -> Self {
        let separators: Vec<String> = DEFAULT_SEPARATORS
            .iter()
            .map(|&sep| String::from(sep))
            .collect();
        let length_function: Box<dyn Fn(&str) -> usize + Send + Sync> =
            Box::new(|text: &str| text.len());
        Self {
            chunk_size: 1000,
            chunk_overlap: 20,
            separators,
            length_function,
        }
    }
}
impl RecursiveCharacterTextSplitter {
pub fn new(chunk_size: usize, chunk_overlap: usize, separators: &[&str]) -> Self {
Self::default()
.with_chunk_size(chunk_size)
.with_chunk_overlap(chunk_overlap)
.with_separators(separators)
}
pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
self.chunk_size = chunk_size;
self
}
pub fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self {
self.chunk_overlap = chunk_overlap;
self
}
pub fn with_separators(mut self, separators: &[&str]) -> Self {
self.separators = separators.iter().map(|v| v.to_string()).collect();
self
}
/// Splits every document that has non-empty content into chunk documents.
///
/// Documents whose `page_content` is empty are skipped; each remaining document's
/// content and metadata are handed to `create_documents`.
pub fn split_documents(
    &self,
    documents: &[RagDocument],
    chunk_header_options: &SplitterChunkHeaderOptions,
) -> Vec<RagDocument> {
    let (texts, metadatas): (Vec<String>, Vec<DocumentMetadata>) = documents
        .iter()
        .filter(|doc| !doc.page_content.is_empty())
        .map(|doc| (doc.page_content.clone(), doc.metadata.clone()))
        .unzip();
    self.create_documents(&texts, &metadatas, chunk_header_options)
}
/// Splits each text into chunks and wraps every chunk in a `RagDocument`.
///
/// Each chunk's content is `chunk_header`, then `chunk_overlap_header` (if set, and
/// only for chunks after the first chunk of a text), then the chunk itself. Every chunk
/// gets a clone of the metadata found at its text's index in `metadatas`.
///
/// # Panics
///
/// Panics if a text that yields at least one chunk has no entry at the same index in
/// `metadatas`.
pub fn create_documents(
    &self,
    texts: &[String],
    metadatas: &[DocumentMetadata],
    chunk_header_options: &SplitterChunkHeaderOptions,
) -> Vec<RagDocument> {
    let SplitterChunkHeaderOptions {
        chunk_header,
        chunk_overlap_header,
    } = chunk_header_options;
    let mut documents = Vec::new();
    for (i, text) in texts.iter().enumerate() {
        for (chunk_index, chunk) in self.split_text(text).into_iter().enumerate() {
            let mut page_content = chunk_header.clone();
            // Only continuation chunks (not the first chunk of a text) get the overlap header.
            if chunk_index > 0 {
                if let Some(overlap_header) = chunk_overlap_header {
                    page_content.push_str(overlap_header);
                }
            }
            page_content.push_str(&chunk);
            // Indexed inside the chunk loop so a text that yields no chunks never needs
            // a metadata entry.
            let metadata = metadatas[i].clone();
            documents.push(RagDocument {
                page_content,
                metadata,
            });
        }
    }
    documents
}
/// Splits `text` into chunks using the configured separators.
///
/// Separators are kept in the output pieces when at least one configured separator
/// contains a non-whitespace character; otherwise they are dropped while splitting.
pub fn split_text(&self, text: &str) -> Vec<String> {
    let has_visible_separator = self
        .separators
        .iter()
        .any(|sep| !sep.chars().all(char::is_whitespace));
    self.split_text_impl(text, &self.separators, has_visible_separator)
}
/// Recursive worker behind `split_text`.
///
/// Picks the first separator in `separators` that is empty or occurs in `text`
/// (falling back to the last one), splits on it, merges pieces shorter than
/// `chunk_size`, and recurses into longer pieces with the separators that follow the
/// chosen one. A long piece with no separators left is emitted as is.
fn split_text_impl(
    &self,
    text: &str,
    separators: &[String],
    keep_separator: bool,
) -> Vec<String> {
    // Choose the separator for this level and remember what is left for deeper levels.
    let mut separator: &str = separators.last().map(String::as_str).unwrap_or("");
    let mut remaining: &[String] = &[];
    for (idx, candidate) in separators.iter().enumerate() {
        if candidate.is_empty() {
            separator = candidate.as_str();
            break;
        }
        if text.contains(candidate.as_str()) {
            separator = candidate.as_str();
            remaining = &separators[idx + 1..];
            break;
        }
    }

    // Kept separators are already part of the pieces, so nothing is re-inserted on merge.
    let joiner = if keep_separator { "" } else { separator };

    let mut chunks: Vec<String> = Vec::new();
    let mut pending: Vec<String> = Vec::new();
    for piece in split_on_separator(text, separator, keep_separator) {
        if (self.length_function)(piece) < self.chunk_size {
            pending.push(piece.to_string());
            continue;
        }
        // An oversized piece interrupts the run of small ones: flush them first.
        if !pending.is_empty() {
            chunks.extend(self.merge_splits(&pending, joiner));
            pending.clear();
        }
        if remaining.is_empty() {
            chunks.push(piece.to_string());
        } else {
            chunks.extend(self.split_text_impl(piece, remaining, keep_separator));
        }
    }
    if !pending.is_empty() {
        chunks.extend(self.merge_splits(&pending, joiner));
    }
    chunks
}
/// Greedily merges consecutive `splits` into documents joined by `separator`.
///
/// Splits are accumulated until adding the next one would push the running size
/// (split lengths plus one separator between neighbours) past `chunk_size`. The
/// accumulated splits are then emitted as one document, and leading splits are dropped
/// until at most `chunk_overlap` worth of content remains and the next split fits.
/// Documents that are empty after trimming are not emitted.
///
/// The splits being accumulated are always a contiguous run of `splits`, so they are
/// tracked as the window `splits[start..end]` instead of being copied into a buffer.
fn merge_splits(&self, splits: &[String], separator: &str) -> Vec<String> {
    let separator_len = separator.len();
    let mut docs = Vec::new();
    let mut start = 0;
    // Sum of `length_function` over `splits[start..end]` (separators not included).
    let mut total = 0;
    for (end, split) in splits.iter().enumerate() {
        let split_len = (self.length_function)(split);
        let overflows = total + split_len + (end - start) * separator_len > self.chunk_size;
        if overflows && end > start {
            if let Some(doc) = self.join_docs(&splits[start..end], separator) {
                docs.push(doc);
            }
            // Keep dropping leading splits while:
            // - more than `chunk_overlap` is still accumulated, or
            // - the next split still does not fit and something is left to drop.
            // `start < end` always holds here when `length_function` is consistent;
            // it only guards the index arithmetic.
            while start < end
                && (total > self.chunk_overlap
                    || (total + split_len + (end - start) * separator_len > self.chunk_size
                        && total > 0))
            {
                total -= (self.length_function)(&splits[start]);
                start += 1;
            }
        }
        total += split_len;
    }
    if let Some(doc) = self.join_docs(&splits[start..], separator) {
        docs.push(doc);
    }
    docs
}
/// Joins `docs` with `separator` and trims surrounding whitespace.
///
/// Returns `None` when nothing but whitespace is left.
fn join_docs(&self, docs: &[String], separator: &str) -> Option<String> {
    let joined = docs.join(separator);
    let trimmed = joined.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
}
/// Text that `create_documents` places in front of the chunks it produces.
///
/// The default is an empty `chunk_header` and no `chunk_overlap_header`.
#[derive(Debug, Clone, Default)]
pub struct SplitterChunkHeaderOptions {
    /// Placed at the very start of every chunk.
    pub chunk_header: String,
    /// When set, inserted after `chunk_header` on every chunk except the first chunk
    /// of each text.
    pub chunk_overlap_header: Option<String>,
}

impl SplitterChunkHeaderOptions {
    /// Returns the options with `chunk_header` set to `header`.
    // NOTE(review): within this file only the tests call this builder; the `allow`
    // is kept so it does not trigger an unused warning.
    #[allow(unused)]
    pub fn with_chunk_header(mut self, header: &str) -> Self {
        self.chunk_header = header.to_string();
        self
    }

    /// Returns the options with `chunk_overlap_header` set to `overlap_header`.
    // NOTE(review): within this file only the tests call this builder; the `allow`
    // is kept so it does not trigger an unused warning.
    #[allow(unused)]
    pub fn with_chunk_overlap_header(mut self, overlap_header: &str) -> Self {
        self.chunk_overlap_header = Some(overlap_header.to_string());
        self
    }
}
/// Splits `text` on `separator` and returns the non-empty pieces as slices of `text`.
///
/// * An empty `separator` splits between every character.
/// * With `keep_separator == false` the separator occurrences are removed.
/// * With `keep_separator == true` every occurrence stays attached to the front of the
///   piece that follows it. A separator at the very end of `text` is returned as a
///   piece of its own, so no input is lost.
fn split_on_separator<'a>(text: &'a str, separator: &str, keep_separator: bool) -> Vec<&'a str> {
    let mut pieces: Vec<&'a str> = Vec::new();
    if separator.is_empty() || !keep_separator {
        // `str::split("")` yields every character (plus empty edge pieces removed below).
        pieces.extend(text.split(separator));
    } else {
        // Cut right before each occurrence so the separator leads the next piece.
        let mut piece_start = 0;
        for (match_start, _) in text.match_indices(separator) {
            pieces.push(&text[piece_start..match_start]);
            piece_start = match_start;
        }
        // The tail is pushed unconditionally: when `text` ends with the separator the
        // tail is that separator, which would otherwise be dropped.
        pieces.push(&text[piece_start..]);
    }
    pieces.retain(|piece| !piece.is_empty());
    pieces
}
#[cfg(test)]
mod tests {
use super::*;
use indexmap::IndexMap;
use pretty_assertions::assert_eq;
use serde_json::{json, Value};
// Builds the JSON object `{ "source": <source> }` used as the expected metadata
// in the assertions of this module.
fn build_metadata(source: &str) -> Value {
json!({ "source": source })
}
#[test]
fn test_split_text() {
    // With a chunk size of 7 two 3-byte words and a space fit into one chunk, and an
    // overlap of 3 carries the last word over into the next chunk.
    let splitter = RecursiveCharacterTextSplitter::new(7, 3, &[" "]);
    let chunks = splitter.split_text("foo bar baz 123");
    assert_eq!(chunks, vec!["foo bar", "bar baz", "baz 123"]);
}
#[test]
fn test_create_document() {
    // Chunk size 3 with no overlap: every word becomes its own document and keeps the
    // metadata of the text it came from.
    let splitter = RecursiveCharacterTextSplitter::new(3, 0, &[" "]);
    let header_options = SplitterChunkHeaderOptions::default();

    let mut first_metadata = IndexMap::new();
    first_metadata.insert("source".into(), "1".into());
    let mut second_metadata = IndexMap::new();
    second_metadata.insert("source".into(), "2".into());

    let texts = ["foo bar".to_string(), "baz".to_string()];
    let documents = splitter.create_documents(
        &texts,
        &[first_metadata, second_metadata],
        &header_options,
    );

    let expected: Vec<Value> = [("foo", "1"), ("bar", "1"), ("baz", "2")]
        .iter()
        .map(|(content, source)| {
            json!({
                "page_content": content,
                "metadata": build_metadata(source),
            })
        })
        .collect();
    assert_eq!(json!(documents), json!(expected));
}
#[test]
fn test_chunk_header() {
    // Every chunk starts with the header; only the second chunk of the first text is a
    // continuation, so only it carries the overlap header.
    let splitter = RecursiveCharacterTextSplitter::new(3, 0, &[" "]);
    let header_options = SplitterChunkHeaderOptions::default()
        .with_chunk_header("SOURCE NAME: testing\n-----\n")
        .with_chunk_overlap_header("(cont'd) ");

    let mut first_metadata = IndexMap::new();
    first_metadata.insert("source".into(), "1".into());
    let mut second_metadata = IndexMap::new();
    second_metadata.insert("source".into(), "2".into());

    let texts = ["foo bar".to_string(), "baz".to_string()];
    let documents = splitter.create_documents(
        &texts,
        &[first_metadata, second_metadata],
        &header_options,
    );

    let expected: Vec<Value> = [
        ("SOURCE NAME: testing\n-----\nfoo", "1"),
        ("SOURCE NAME: testing\n-----\n(cont'd) bar", "1"),
        ("SOURCE NAME: testing\n-----\nbaz", "2"),
    ]
    .iter()
    .map(|(content, source)| {
        json!({
            "page_content": content,
            "metadata": build_metadata(source),
        })
    })
    .collect();
    assert_eq!(json!(documents), json!(expected));
}
// Splits a small Markdown document with the Markdown separators, a chunk size of 100
// and no overlap, and checks the four resulting chunks.
#[test]
fn test_markdown_splitter() {
// Input document: a title, a tagline, a "Quick Install" section with a fenced code
// block, and a closing paragraph.
let text = r#"# 🦜️🔗 LangChain
⚡ Building applications with LLMs through composability ⚡
## Quick Install
```bash
# Hopefully this code block isn't split
pip install langchain
```
As an open source project in a rapidly developing field, we are extremely open to contributions."#;
let splitter =
RecursiveCharacterTextSplitter::new(100, 0, &Language::Markdown.separators());
let output = splitter.split_text(text);
// The closing code fence is expected as a chunk of its own.
let expected_output = vec![
"# 🦜️🔗 LangChain\n\n⚡ Building applications with LLMs through composability ⚡",
"## Quick Install\n\n```bash\n# Hopefully this code block isn't split\npip install langchain",
"```",
"As an open source project in a rapidly developing field, we are extremely open to contributions.",
];
assert_eq!(output, expected_output);
}
// Splits a small HTML page with the HTML separators, a chunk size of 175 and an
// overlap of 20, and checks the five resulting chunks.
#[test]
fn test_html_splitter() {
// Input page: a head with a title and a style block, and a body with two divs.
let text = r#"<!DOCTYPE html>
<html>
<head>
<title>🦜️🔗 LangChain</title>
<style>
body {
font-family: Arial, sans-serif;
}
h1 {
color: darkblue;
}
</style>
</head>
<body>
<div>
<h1>🦜️🔗 LangChain</h1>
<p>⚡ Building applications with LLMs through composability ⚡</p>
</div>
<div>
As an open source project in a rapidly developing field, we are extremely open to contributions.
</div>
</body>
</html>"#;
let splitter = RecursiveCharacterTextSplitter::new(175, 20, &Language::Html.separators());
let output = splitter.split_text(text);
// Each expected chunk after the first starts with the tag it was split on.
let expected_output = vec![
"<!DOCTYPE html>\n<html>",
"<head>\n <title>🦜️🔗 LangChain</title>",
r#"<style>
body {
font-family: Arial, sans-serif;
}
h1 {
color: darkblue;
}
</style>
</head>"#,
r#"<body>
<div>
<h1>🦜️🔗 LangChain</h1>
<p>⚡ Building applications with LLMs through composability ⚡</p>
</div>"#,
r#"<div>
As an open source project in a rapidly developing field, we are extremely open to contributions.
</div>
</body>
</html>"#,
];
assert_eq!(output, expected_output);
}
}