Baseline project
This commit is contained in:
@@ -0,0 +1,125 @@
|
||||
use super::*;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use indexmap::IndexMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub const EXTENSION_METADATA: &str = "__extension__";
|
||||
|
||||
pub type DocumentMetadata = IndexMap<String, String>;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LoadedDocument {
|
||||
pub path: String,
|
||||
pub contents: String,
|
||||
#[serde(default)]
|
||||
pub metadata: DocumentMetadata,
|
||||
}
|
||||
|
||||
impl LoadedDocument {
|
||||
pub fn new(path: String, contents: String, metadata: DocumentMetadata) -> Self {
|
||||
Self {
|
||||
path,
|
||||
contents,
|
||||
metadata,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn load_recursive_url(
|
||||
loaders: &HashMap<String, String>,
|
||||
path: &str,
|
||||
) -> Result<Vec<LoadedDocument>> {
|
||||
let extension = RECURSIVE_URL_LOADER;
|
||||
let pages: Vec<Page> = match loaders.get(extension) {
|
||||
Some(loader_command) => {
|
||||
let contents = run_loader_command(path, extension, loader_command)?;
|
||||
serde_json::from_str(&contents).context(r#"The crawler response is invalid. It should follow the JSON format: `[{"path":"...", "text":"..."}]`."#)?
|
||||
}
|
||||
None => {
|
||||
let options = CrawlOptions::preset(path);
|
||||
crawl_website(path, options).await?
|
||||
}
|
||||
};
|
||||
let output = pages
|
||||
.into_iter()
|
||||
.map(|v| {
|
||||
let Page { path, text } = v;
|
||||
let mut metadata: DocumentMetadata = Default::default();
|
||||
metadata.insert(EXTENSION_METADATA.into(), "md".into());
|
||||
LoadedDocument::new(path, text, metadata)
|
||||
})
|
||||
.collect();
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
pub async fn load_file(loaders: &HashMap<String, String>, path: &str) -> Result<LoadedDocument> {
|
||||
let extension = get_patch_extension(path).unwrap_or_else(|| DEFAULT_EXTENSION.into());
|
||||
match loaders.get(&extension) {
|
||||
Some(loader_command) => load_with_command(path, &extension, loader_command),
|
||||
None => load_plain(path, &extension).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn load_url(loaders: &HashMap<String, String>, path: &str) -> Result<LoadedDocument> {
|
||||
let (contents, extension) = fetch_with_loaders(loaders, path, false).await?;
|
||||
let mut metadata: DocumentMetadata = Default::default();
|
||||
metadata.insert(EXTENSION_METADATA.into(), extension);
|
||||
Ok(LoadedDocument::new(path.into(), contents, metadata))
|
||||
}
|
||||
|
||||
async fn load_plain(path: &str, extension: &str) -> Result<LoadedDocument> {
|
||||
let contents = tokio::fs::read_to_string(path).await?;
|
||||
let mut metadata: DocumentMetadata = Default::default();
|
||||
metadata.insert(EXTENSION_METADATA.into(), extension.to_string());
|
||||
Ok(LoadedDocument::new(path.into(), contents, metadata))
|
||||
}
|
||||
|
||||
fn load_with_command(path: &str, extension: &str, loader_command: &str) -> Result<LoadedDocument> {
|
||||
let contents = run_loader_command(path, extension, loader_command)?;
|
||||
let mut metadata: DocumentMetadata = Default::default();
|
||||
metadata.insert(EXTENSION_METADATA.into(), DEFAULT_EXTENSION.to_string());
|
||||
Ok(LoadedDocument::new(path.into(), contents, metadata))
|
||||
}
|
||||
|
||||
pub fn is_loader_protocol(loaders: &HashMap<String, String>, path: &str) -> bool {
|
||||
match path.split_once(':') {
|
||||
Some((protocol, _)) => loaders.contains_key(protocol),
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn load_protocol_path(
|
||||
loaders: &HashMap<String, String>,
|
||||
path: &str,
|
||||
) -> Result<Vec<LoadedDocument>> {
|
||||
let (protocol, loader_command, new_path) = path
|
||||
.split_once(':')
|
||||
.and_then(|(protocol, path)| {
|
||||
let loader_command = loaders.get(protocol)?;
|
||||
Some((protocol, loader_command, path))
|
||||
})
|
||||
.ok_or_else(|| anyhow!("No document loader for '{}'", path))?;
|
||||
let contents = run_loader_command(new_path, protocol, loader_command)?;
|
||||
let output = if let Ok(list) = serde_json::from_str::<Vec<LoadedDocument>>(&contents) {
|
||||
list.into_iter()
|
||||
.map(|mut v| {
|
||||
if v.path.starts_with(path) {
|
||||
} else if v.path.starts_with(new_path) {
|
||||
v.path = format!("{}:{}", protocol, v.path);
|
||||
} else {
|
||||
v.path = format!("{}/{}", path, v.path);
|
||||
}
|
||||
v
|
||||
})
|
||||
.collect()
|
||||
} else {
|
||||
vec![LoadedDocument::new(
|
||||
path.into(),
|
||||
contents,
|
||||
Default::default(),
|
||||
)]
|
||||
};
|
||||
Ok(output)
|
||||
}
|
||||
Reference in New Issue
Block a user