Baseline project

This commit is contained in:
2025-10-07 10:45:42 -06:00
parent 88288a98b6
commit 650dbd92e0
54 changed files with 18982 additions and 0 deletions
+88
View File
@@ -0,0 +1,88 @@
use anyhow::Result;
use crossterm::event::{self, Event, KeyCode, KeyModifiers};
use std::{
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
time::Duration,
};
/// Shared handle to the abort flags; cheap to clone across tasks/threads.
pub type AbortSignal = Arc<AbortSignalInner>;

/// Records whether Ctrl+C / Ctrl+D were pressed, each as an atomic flag.
pub struct AbortSignalInner {
    ctrlc: AtomicBool, // set once Ctrl+C has been observed
    ctrld: AtomicBool, // set once Ctrl+D has been observed
}
/// Convenience constructor for a fresh, un-triggered `AbortSignal`.
pub fn create_abort_signal() -> AbortSignal {
    AbortSignalInner::new()
}
impl AbortSignalInner {
    /// Creates a fresh signal with both flags cleared, behind an `Arc`.
    pub fn new() -> AbortSignal {
        Arc::new(Self {
            ctrlc: AtomicBool::new(false),
            ctrld: AtomicBool::new(false),
        })
    }

    /// True when either Ctrl+C or Ctrl+D has been recorded.
    pub fn aborted(&self) -> bool {
        self.aborted_ctrlc() || self.aborted_ctrld()
    }

    /// True when Ctrl+C has been recorded.
    pub fn aborted_ctrlc(&self) -> bool {
        self.ctrlc.load(Ordering::SeqCst)
    }

    /// True when Ctrl+D has been recorded.
    pub fn aborted_ctrld(&self) -> bool {
        self.ctrld.load(Ordering::SeqCst)
    }

    /// Clears both flags so the signal can be reused.
    pub fn reset(&self) {
        self.ctrlc.store(false, Ordering::SeqCst);
        self.ctrld.store(false, Ordering::SeqCst);
    }

    /// Marks a Ctrl+C press.
    pub fn set_ctrlc(&self) {
        self.ctrlc.store(true, Ordering::SeqCst);
    }

    /// Marks a Ctrl+D press.
    pub fn set_ctrld(&self) {
        self.ctrld.store(true, Ordering::SeqCst);
    }
}
/// Asynchronously waits until the abort signal fires, polling every 25ms.
pub async fn wait_abort_signal(abort_signal: &AbortSignal) {
    while !abort_signal.aborted() {
        tokio::time::sleep(Duration::from_millis(25)).await;
    }
}
/// Polls the terminal for up to 25ms; records Ctrl+C / Ctrl+D on the
/// signal and returns `Ok(true)` when one was seen, `Ok(false)` otherwise.
pub fn poll_abort_signal(abort_signal: &AbortSignal) -> Result<bool> {
    if event::poll(Duration::from_millis(25))? {
        if let Event::Key(key) = event::read()? {
            match key.code {
                KeyCode::Char('c') if key.modifiers == KeyModifiers::CONTROL => {
                    abort_signal.set_ctrlc();
                    return Ok(true);
                }
                KeyCode::Char('d') if key.modifiers == KeyModifiers::CONTROL => {
                    abort_signal.set_ctrld();
                    return Ok(true);
                }
                // All other key events are ignored.
                _ => {}
            }
        }
    }
    Ok(false)
}
+49
View File
@@ -0,0 +1,49 @@
use anyhow::Context;
#[cfg(not(any(target_os = "android", target_os = "emscripten")))]
mod internal {
    use arboard::Clipboard;
    use base64::{engine::general_purpose::STANDARD, Engine as _};
    use std::sync::{LazyLock, Mutex};

    // Clipboard handle created once; None when no system clipboard is
    // available (e.g. headless session), in which case OSC52 is used.
    static CLIPBOARD: LazyLock<Mutex<Option<Clipboard>>> =
        LazyLock::new(|| Mutex::new(Clipboard::new().ok()));

    /// Copies `text` to the system clipboard, falling back to OSC52.
    pub fn set_text(text: &str) -> anyhow::Result<()> {
        let mut clipboard = CLIPBOARD.lock().unwrap();
        match clipboard.as_mut() {
            Some(clipboard) => {
                clipboard.set_text(text)?;
                // presumably gives the X11/Wayland selection owner a moment
                // before this process might exit — TODO confirm
                #[cfg(target_os = "linux")]
                std::thread::sleep(std::time::Duration::from_millis(50));
                Ok(())
            }
            None => set_text_osc52(text),
        }
    }

    /// Attempts to set text to clipboard with OSC52 escape sequence
    /// Works in many modern terminals, including over SSH.
    fn set_text_osc52(text: &str) -> anyhow::Result<()> {
        let encoded = STANDARD.encode(text);
        let seq = format!("\x1b]52;c;{encoded}\x07");
        if let Err(e) = std::io::Write::write_all(&mut std::io::stdout(), seq.as_bytes()) {
            return Err(anyhow::anyhow!("Failed to send OSC52 sequence").context(e));
        }
        if let Err(e) = std::io::Write::flush(&mut std::io::stdout()) {
            return Err(anyhow::anyhow!("Failed to flush OSC52 sequence").context(e));
        }
        Ok(())
    }
}
// Platforms with no clipboard support at all: always report failure.
#[cfg(any(target_os = "android", target_os = "emscripten"))]
mod internal {
    pub fn set_text(_text: &str) -> anyhow::Result<()> {
        Err(anyhow::anyhow!("No clipboard available"))
    }
}
/// Public entry point: copies `text` to the clipboard (platform-dependent).
pub fn set_text(text: &str) -> anyhow::Result<()> {
    internal::set_text(text).context("Failed to copy")
}
+242
View File
@@ -0,0 +1,242 @@
use super::*;
use std::{
collections::HashMap,
env,
ffi::OsStr,
fs::OpenOptions,
io::{self, Write},
path::{Path, PathBuf},
process::Command,
};
use anyhow::{anyhow, bail, Context, Result};
use dirs::home_dir;
use std::sync::LazyLock;
/// The detected shell, computed once on first access.
pub static SHELL: LazyLock<Shell> = LazyLock::new(detect_shell);

/// Description of the user's shell: display name, executable path, and the
/// flag used to pass a command string (e.g. `-c`, `/C`, `-Command`).
pub struct Shell {
    pub name: String,
    pub cmd: String,
    pub arg: String,
}
impl Shell {
    /// Builds a `Shell` from borrowed parts, taking owned copies.
    pub fn new(name: &str, cmd: &str, arg: &str) -> Self {
        Self {
            name: name.into(),
            cmd: cmd.into(),
            arg: arg.into(),
        }
    }
}
/// Determines the user's shell: first from the `<CRATE>_SHELL` env var,
/// then platform detection (`PSModulePath` on Windows, `$SHELL` on Unix),
/// finally falling back to `cmd.exe` / `/bin/sh`.
pub fn detect_shell() -> Shell {
    let cmd = env::var(get_env_name("shell")).ok().or_else(|| {
        if cfg!(windows) {
            // A PSModulePath starting under the user profile indicates we
            // were launched from PowerShell; the "\powershell\7\" segment
            // distinguishes pwsh 7 from Windows PowerShell.
            if let Ok(ps_module_path) = env::var("PSModulePath") {
                let ps_module_path = ps_module_path.to_lowercase();
                if ps_module_path.starts_with(r"c:\users") {
                    return if ps_module_path.contains(r"\powershell\7\") {
                        Some("pwsh.exe".to_string())
                    } else {
                        Some("powershell.exe".to_string())
                    };
                }
            }
            None
        } else {
            env::var("SHELL").ok()
        }
    });
    // Derive the short name from the executable's file stem; "nu" is
    // renamed to "nushell".
    let name = cmd
        .as_ref()
        .and_then(|v| Path::new(v).file_stem().and_then(|v| v.to_str()))
        .map(|v| {
            if v == "nu" {
                "nushell".into()
            } else {
                v.to_lowercase()
            }
        });
    let (cmd, name) = match (cmd.as_deref(), name.as_deref()) {
        (Some(cmd), Some(name)) => (cmd, name),
        _ => {
            if cfg!(windows) {
                ("cmd.exe", "cmd")
            } else {
                ("/bin/sh", "sh")
            }
        }
    };
    // Flag the shell uses to accept a command string.
    let shell_arg = match name {
        "powershell" => "-Command",
        "cmd" => "/C",
        _ => "-c",
    };
    Shell::new(name, cmd, shell_arg)
}
/// Spawns `cmd` with inherited stdio and waits for it, returning the exit
/// code (0 when unavailable, e.g. the process was killed by a signal).
pub fn run_command<T: AsRef<OsStr>>(
    cmd: &str,
    args: &[T],
    envs: Option<HashMap<String, String>>,
) -> Result<i32> {
    let mut command = Command::new(cmd);
    command.args(args.iter());
    if let Some(envs) = envs {
        command.envs(envs);
    }
    let status = command.status()?;
    Ok(status.code().unwrap_or_default())
}
/// Runs `cmd` capturing stdout/stderr; returns `(success, stdout, stderr)`.
///
/// # Errors
/// Fails when the process cannot be spawned or its output is not UTF-8.
pub fn run_command_with_output<T: AsRef<OsStr>>(
    cmd: &str,
    args: &[T],
    envs: Option<HashMap<String, String>>,
) -> Result<(bool, String, String)> {
    let output = Command::new(cmd)
        .args(args.iter())
        .envs(envs.unwrap_or_default())
        .output()?;
    let status = output.status;
    let stdout = std::str::from_utf8(&output.stdout).context("Invalid UTF-8 in stdout")?;
    let stderr = std::str::from_utf8(&output.stderr).context("Invalid UTF-8 in stderr")?;
    // Fix: the debug messages were attached to the wrong conditions — a
    // non-empty stdout logged "exited with non-zero. stderr" and a
    // non-empty stderr logged "executed successfully. stdout".
    if !status.success() {
        debug!("Command `{cmd}` exited with non-zero: {status}");
        if !stderr.is_empty() {
            debug!("Command `{cmd}` exited with non-zero. stderr: {stderr}");
        }
    } else if !stdout.is_empty() {
        debug!("Command `{cmd}` executed successfully. stdout: {stdout}");
    }
    Ok((status.success(), stdout.to_string(), stderr.to_string()))
}
/// Runs a configured document-loader command for `path`.
///
/// In `loader_command`, `$1` is replaced with the input path and `$2`
/// (optional) with a temp output file path; when `$2` appears, the loader's
/// result is read from that file, otherwise from its stdout.
pub fn run_loader_command(path: &str, extension: &str, loader_command: &str) -> Result<String> {
    let cmd_args = shell_words::split(loader_command)
        .with_context(|| anyhow!("Invalid document loader '{extension}': `{loader_command}`"))?;
    let mut use_stdout = true;
    let outpath = temp_file("-output-", "").display().to_string();
    let cmd_args: Vec<_> = cmd_args
        .into_iter()
        .map(|mut v| {
            if v.contains("$1") {
                v = v.replace("$1", path);
            }
            if v.contains("$2") {
                // The loader writes to a file instead of stdout.
                use_stdout = false;
                v = v.replace("$2", &outpath);
            }
            v
        })
        .collect();
    let cmd_eval = shell_words::join(&cmd_args);
    debug!("run `{cmd_eval}`");
    // NOTE(review): split_at(1) panics on an empty loader_command — confirm
    // callers never pass an empty/whitespace-only command.
    let (cmd, args) = cmd_args.split_at(1);
    let cmd = &cmd[0];
    if use_stdout {
        let (success, stdout, stderr) =
            run_command_with_output(cmd, args, None).with_context(|| {
                format!("Unable to run `{cmd_eval}`, Perhaps '{cmd}' is not installed?")
            })?;
        if !success {
            // Prefer the loader's own stderr as the error message.
            let err = if !stderr.is_empty() {
                stderr
            } else {
                format!("The command `{cmd_eval}` exited with non-zero.")
            };
            bail!("{err}")
        }
        Ok(stdout)
    } else {
        let status = run_command(cmd, args, None).with_context(|| {
            format!("Unable to run `{cmd_eval}`, Perhaps '{cmd}' is not installed?")
        })?;
        if status != 0 {
            bail!("The command `{cmd_eval}` exited with non-zero.")
        }
        let contents = std::fs::read_to_string(&outpath)
            .context("Failed to read file generated by the loader")?;
        Ok(contents)
    }
}
/// Launches `editor` on `path` and blocks until the editor exits.
/// The editor's exit status is intentionally ignored.
pub fn edit_file(editor: &str, path: &Path) -> Result<()> {
    Command::new(editor).arg(path).spawn()?.wait()?;
    Ok(())
}
/// Appends `command` to the history file of `shell` (if one is known),
/// using the shell's native history entry format. Embedded newlines are
/// flattened to spaces; unknown shells are silently skipped.
pub fn append_to_shell_history(shell: &str, command: &str, exit_code: i32) -> io::Result<()> {
    if let Some(history_file) = get_history_file(shell) {
        let command = command.replace('\n', " ");
        let now = now_timestamp();
        let history_txt = if shell == "fish" {
            format!("- cmd: {command}\n when: {now}")
        } else if shell == "zsh" {
            // zsh extended history: ": <timestamp>:<n>;<command>".
            // NOTE(review): zsh normally stores elapsed seconds in the
            // second field, not the exit code — confirm intended.
            format!(": {now}:{exit_code};{command}",)
        } else {
            command
        };
        let mut file = OpenOptions::new()
            .create(true)
            .append(true)
            .open(&history_file)?;
        writeln!(file, "{history_txt}")?;
    }
    Ok(())
}
/// Maps a shell name to its history file path, honoring `HISTFILE` for
/// bash/sh/zsh; returns None for unknown shells or when the relevant
/// base directory cannot be determined.
fn get_history_file(shell: &str) -> Option<PathBuf> {
    match shell {
        "bash" | "sh" => env::var("HISTFILE")
            .ok()
            .map(PathBuf::from)
            .or(Some(home_dir()?.join(".bash_history"))),
        "zsh" => env::var("HISTFILE")
            .ok()
            .map(PathBuf::from)
            .or(Some(home_dir()?.join(".zsh_history"))),
        "nushell" => Some(dirs::config_dir()?.join("nushell").join("history.txt")),
        "fish" => Some(
            home_dir()?
                .join(".local")
                .join("share")
                .join("fish")
                .join("fish_history"),
        ),
        "powershell" | "pwsh" => {
            // PSReadLine history lives in a different base dir per platform.
            #[cfg(not(windows))]
            {
                Some(
                    home_dir()?
                        .join(".local")
                        .join("share")
                        .join("powershell")
                        .join("PSReadLine")
                        .join("ConsoleHost_history.txt"),
                )
            }
            #[cfg(windows)]
            {
                Some(
                    dirs::data_dir()?
                        .join("Microsoft")
                        .join("Windows")
                        .join("PowerShell")
                        .join("PSReadLine")
                        .join("ConsoleHost_history.txt"),
                )
            }
        }
        "ksh" => Some(home_dir()?.join(".ksh_history")),
        "tcsh" => Some(home_dir()?.join(".history")),
        _ => None,
    }
}
+35
View File
@@ -0,0 +1,35 @@
use base64::{engine::general_purpose::STANDARD, Engine};
use hmac::{Hmac, Mac};
use sha2::{Digest, Sha256};
/// Hex-encoded (lowercase) SHA-256 digest of `input`.
pub fn sha256(input: &str) -> String {
    format!("{:x}", Sha256::digest(input))
}
/// HMAC-SHA256 of `msg` keyed with `key`; returns the raw 32-byte MAC.
pub fn hmac_sha256(key: &[u8], msg: &str) -> Vec<u8> {
    // HMAC accepts keys of any length, so this expect cannot fire.
    let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("HMAC can take key of any size");
    mac.update(msg.as_bytes());
    mac.finalize().into_bytes().to_vec()
}
/// Lowercase hex encoding of `bytes`.
pub fn hex_encode(bytes: &[u8]) -> String {
    use std::fmt::Write;
    // Build into one pre-sized buffer; the previous fold allocated a fresh
    // String per byte via format! (O(n²) copying).
    let mut out = String::with_capacity(bytes.len() * 2);
    for b in bytes {
        // Writing to a String is infallible.
        let _ = write!(out, "{b:02x}");
    }
    out
}
/// Percent-encodes each path segment of `uri`, preserving the `/`
/// separators themselves.
pub fn encode_uri(uri: &str) -> String {
    uri.split('/')
        .map(|v| urlencoding::encode(v))
        .collect::<Vec<_>>()
        .join("/")
}
/// Standard (padded) base64 encoding.
pub fn base64_encode<T: AsRef<[u8]>>(input: T) -> String {
    STANDARD.encode(input)
}

/// Standard (padded) base64 decoding.
pub fn base64_decode<T: AsRef<[u8]>>(input: T) -> Result<Vec<u8>, base64::DecodeError> {
    STANDARD.decode(input)
}
+18
View File
@@ -0,0 +1,18 @@
use std::{cell::RefCell, rc::Rc};
use html_to_markdown::{markdown, TagHandler};
/// Converts HTML to Markdown with a standard set of tag handlers; on
/// conversion failure the original HTML is returned unchanged.
pub fn html_to_md(html: &str) -> String {
    let mut handlers: Vec<TagHandler> = vec![
        Rc::new(RefCell::new(markdown::ParagraphHandler)),
        Rc::new(RefCell::new(markdown::HeadingHandler)),
        Rc::new(RefCell::new(markdown::ListHandler)),
        Rc::new(RefCell::new(markdown::TableHandler::new())),
        Rc::new(RefCell::new(markdown::StyledTextHandler)),
        Rc::new(RefCell::new(markdown::CodeHandler)),
        // Strips navigation/boilerplate chrome from scraped web pages.
        Rc::new(RefCell::new(markdown::WebpageChromeRemover)),
    ];
    html_to_markdown::convert_html_to_markdown(html.as_bytes(), &mut handlers)
        .unwrap_or_else(|_| html.to_string())
}
+47
View File
@@ -0,0 +1,47 @@
use anyhow::Result;
use crossterm::event::{self, Event, KeyCode, KeyEvent, KeyModifiers};
use crossterm::terminal::{disable_raw_mode, enable_raw_mode};
use std::io::{stdout, Write};
/// Reads a single character from stdin without requiring Enter
/// Returns the character if it's one of the valid options, or the default if Enter is pressed
///
/// # Errors
/// Returns "Interrupted" when Ctrl+C is pressed; raw mode is restored
/// before returning in every case.
pub fn read_single_key(valid_chars: &[char], default: char, prompt: &str) -> Result<char> {
    print!("{prompt}");
    stdout().flush()?;
    // Raw mode so key presses arrive unbuffered; disabled before returning.
    enable_raw_mode()?;
    let result = loop {
        // NOTE(review): an Err from event::read() is silently ignored and
        // the loop retries — confirm this cannot spin on a persistent error.
        if let Ok(Event::Key(KeyEvent {
            code, modifiers, ..
        })) = event::read()
        {
            match code {
                KeyCode::Char('c') if modifiers.contains(KeyModifiers::CONTROL) => {
                    break Err(anyhow::anyhow!("Interrupted"));
                }
                KeyCode::Char(c) => {
                    if valid_chars.contains(&c) {
                        break Ok(c);
                    }
                    // Invalid character, continue loop
                }
                KeyCode::Enter => {
                    break Ok(default);
                }
                _ => {
                    // Other keys are ignored, continue loop
                }
            }
        }
    };
    disable_raw_mode()?;
    // Print the chosen character and newline for clean output
    if let Ok(chosen) = &result {
        println!("{chosen}");
    }
    result
}
+125
View File
@@ -0,0 +1,125 @@
use super::*;
use anyhow::{anyhow, Context, Result};
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Metadata key holding the (possibly remapped) document extension.
pub const EXTENSION_METADATA: &str = "__extension__";

/// Ordered key/value metadata attached to a loaded document.
pub type DocumentMetadata = IndexMap<String, String>;

/// A document loaded from a file, URL, or loader command.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LoadedDocument {
    pub path: String,
    pub contents: String,
    // Defaults to empty when absent from serialized input.
    #[serde(default)]
    pub metadata: DocumentMetadata,
}
impl LoadedDocument {
    /// Bundles the parts into a `LoadedDocument`.
    pub fn new(path: String, contents: String, metadata: DocumentMetadata) -> Self {
        Self {
            path,
            contents,
            metadata,
        }
    }
}
/// Crawls a website (or delegates to a user-configured recursive-url
/// loader) and returns one Markdown document per page.
pub async fn load_recursive_url(
    loaders: &HashMap<String, String>,
    path: &str,
) -> Result<Vec<LoadedDocument>> {
    let extension = RECURSIVE_URL_LOADER;
    let pages: Vec<Page> = match loaders.get(extension) {
        Some(loader_command) => {
            // Custom crawler: its stdout must be a JSON array of pages.
            let contents = run_loader_command(path, extension, loader_command)?;
            serde_json::from_str(&contents).context(r#"The crawler response is invalid. It should follow the JSON format: `[{"path":"...", "text":"..."}]`."#)?
        }
        None => {
            // No custom loader configured: use the built-in crawler.
            let options = CrawlOptions::preset(path);
            crawl_website(path, options).await?
        }
    };
    let output = pages
        .into_iter()
        .map(|v| {
            let Page { path, text } = v;
            // Crawled pages are always treated as Markdown.
            let mut metadata: DocumentMetadata = Default::default();
            metadata.insert(EXTENSION_METADATA.into(), "md".into());
            LoadedDocument::new(path, text, metadata)
        })
        .collect();
    Ok(output)
}
/// Loads a local file, using a configured loader for its extension when
/// one exists, otherwise reading it as plain UTF-8 text.
pub async fn load_file(loaders: &HashMap<String, String>, path: &str) -> Result<LoadedDocument> {
    let extension = get_patch_extension(path).unwrap_or_else(|| DEFAULT_EXTENSION.into());
    match loaders.get(&extension) {
        Some(loader_command) => load_with_command(path, &extension, loader_command),
        None => load_plain(path, &extension).await,
    }
}
/// Fetches a URL (honoring configured loaders) and records the detected
/// extension in the document metadata.
pub async fn load_url(loaders: &HashMap<String, String>, path: &str) -> Result<LoadedDocument> {
    let (contents, extension) = fetch_with_loaders(loaders, path, false).await?;
    let mut metadata: DocumentMetadata = Default::default();
    metadata.insert(EXTENSION_METADATA.into(), extension);
    Ok(LoadedDocument::new(path.into(), contents, metadata))
}
/// Reads a UTF-8 text file verbatim, tagging it with `extension`.
async fn load_plain(path: &str, extension: &str) -> Result<LoadedDocument> {
    let contents = tokio::fs::read_to_string(path).await?;
    let mut metadata: DocumentMetadata = Default::default();
    metadata.insert(EXTENSION_METADATA.into(), extension.to_string());
    Ok(LoadedDocument::new(path.into(), contents, metadata))
}
/// Runs the configured loader for `path`; loader output is tagged with the
/// default extension regardless of the input's extension.
fn load_with_command(path: &str, extension: &str, loader_command: &str) -> Result<LoadedDocument> {
    let contents = run_loader_command(path, extension, loader_command)?;
    let mut metadata: DocumentMetadata = Default::default();
    metadata.insert(EXTENSION_METADATA.into(), DEFAULT_EXTENSION.to_string());
    Ok(LoadedDocument::new(path.into(), contents, metadata))
}
/// True when `path` looks like `<protocol>:<rest>` and a loader is
/// registered for `<protocol>`.
pub fn is_loader_protocol(loaders: &HashMap<String, String>, path: &str) -> bool {
    path.split_once(':')
        .is_some_and(|(protocol, _)| loaders.contains_key(protocol))
}
/// Loads documents via a protocol-style loader (`<protocol>:<path>`).
///
/// If the loader prints a JSON array of documents, each document's path is
/// normalized to include the protocol prefix; otherwise the raw output
/// becomes a single document.
pub fn load_protocol_path(
    loaders: &HashMap<String, String>,
    path: &str,
) -> Result<Vec<LoadedDocument>> {
    let (protocol, loader_command, new_path) = path
        .split_once(':')
        .and_then(|(protocol, path)| {
            let loader_command = loaders.get(protocol)?;
            Some((protocol, loader_command, path))
        })
        .ok_or_else(|| anyhow!("No document loader for '{}'", path))?;
    let contents = run_loader_command(new_path, protocol, loader_command)?;
    let output = if let Ok(list) = serde_json::from_str::<Vec<LoadedDocument>>(&contents) {
        list.into_iter()
            .map(|mut v| {
                if v.path.starts_with(path) {
                    // Already fully qualified (protocol included): keep as-is.
                } else if v.path.starts_with(new_path) {
                    // Re-attach the protocol prefix.
                    v.path = format!("{}:{}", protocol, v.path);
                } else {
                    // Relative path: qualify with the full request path.
                    v.path = format!("{}/{}", path, v.path);
                }
                v
            })
            .collect()
    } else {
        // Non-JSON output: treat the whole thing as one document.
        vec![LoadedDocument::new(
            path.into(),
            contents,
            Default::default(),
        )]
    };
    Ok(output)
}
+63
View File
@@ -0,0 +1,63 @@
use crate::config::Config;
use colored::Colorize;
use fancy_regex::Regex;
use std::fs::File;
use std::io::{BufRead, BufReader, Seek, SeekFrom};
use std::process;
pub async fn tail_logs(no_color: bool) {
let re = Regex::new(r"^(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\s+<(?P<opid>[^\s>]+)>\s+\[(?P<level>[A-Z]+)\]\s+(?P<logger>[^:]+):(?P<line>\d+)\s+-\s+(?P<message>.*)$").unwrap();
let file_path = Config::log_path();
let file = File::open(&file_path).expect("Cannot open file");
let mut reader = BufReader::new(file);
if let Err(e) = reader.seek(SeekFrom::End(0)) {
eprintln!("Unable to tail log file: {e:?}");
process::exit(1);
};
let mut lines = reader.lines();
loop {
if let Some(Ok(line)) = lines.next() {
if no_color {
println!("{line}");
} else {
let colored_line = colorize_log_line(&line, &re);
println!("{colored_line}");
}
}
}
}
/// Applies level-based colors to a log line matching the standard layout;
/// lines that do not match are returned unchanged.
fn colorize_log_line(line: &str, re: &Regex) -> String {
    // NOTE(review): fancy_regex captures() returns Result; this expect
    // panics if the engine errors on a line — confirm that is acceptable
    // for a log viewer.
    if let Some(caps) = re.captures(line).expect("Failed to capture log line") {
        let level = &caps["level"];
        let message = &caps["message"];
        let colored_message = match level {
            "ERROR" => message.red(),
            "WARN" => message.yellow(),
            "INFO" => message.green(),
            "DEBUG" => message.blue(),
            _ => message.normal(),
        };
        let timestamp = &caps["timestamp"];
        let opid = &caps["opid"];
        let logger = &caps["logger"];
        let line_number = &caps["line"];
        // Reassemble the line in the original layout with per-field colors.
        format!(
            "{} <{}> [{}] {}:{} - {}",
            timestamp.white(),
            opid.cyan(),
            level.bold(),
            logger.magenta(),
            line_number.bold(),
            colored_message
        )
    } else {
        line.to_string()
    }
}
+252
View File
@@ -0,0 +1,252 @@
mod abort_signal;
mod clipboard;
mod command;
mod crypto;
mod html_to_md;
mod input;
mod loader;
mod logs;
pub mod native;
mod path;
mod render_prompt;
mod request;
mod spinner;
mod variables;
pub use self::abort_signal::*;
pub use self::clipboard::set_text;
pub use self::command::*;
pub use self::crypto::*;
pub use self::html_to_md::*;
pub use self::input::*;
pub use self::loader::*;
pub use self::logs::*;
pub use self::path::*;
pub use self::render_prompt::render_prompt;
pub use self::request::*;
pub use self::spinner::*;
pub use self::variables::*;
use anyhow::{Context, Result};
use fancy_regex::Regex;
use fuzzy_matcher::{skim::SkimMatcherV2, FuzzyMatcher};
use is_terminal::IsTerminal;
use std::borrow::Cow;
use std::sync::LazyLock;
use std::{env, path::PathBuf, process};
use unicode_segmentation::UnicodeSegmentation;
// Matches a fenced code block and captures its contents.
pub static CODE_BLOCK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?ms)```\w*(.*)```").unwrap());
// Matches a leading <think>...</think> block (reasoning-model output).
pub static THINK_TAG_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?s)^\s*<think>.*?</think>(\s*|$)").unwrap());
// Whether stdout is attached to a terminal.
pub static IS_STDOUT_TERMINAL: LazyLock<bool> = LazyLock::new(|| std::io::stdout().is_terminal());
// Color output is disabled when NO_COLOR is truthy or stdout is not a tty.
pub static NO_COLOR: LazyLock<bool> = LazyLock::new(|| {
    env::var("NO_COLOR")
        .ok()
        .and_then(|v| parse_bool(&v))
        .unwrap_or_default()
        || !*IS_STDOUT_TERMINAL
});
/// Local time as RFC 3339 with seconds precision (offset included).
pub fn now() -> String {
    chrono::Local::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, false)
}

/// Current Unix timestamp in seconds.
pub fn now_timestamp() -> i64 {
    chrono::Local::now().timestamp()
}
/// Builds the crate-scoped environment variable name for `key`,
/// e.g. key "shell" -> "<CRATE>_SHELL".
pub fn get_env_name(key: &str) -> String {
    let prefix = env!("CARGO_CRATE_NAME");
    format!("{prefix}_{key}").to_ascii_uppercase()
}
/// Normalizes a name for env-var use: `-` becomes `_`, everything is
/// upper-cased (single pass over the characters).
pub fn normalize_env_name(value: &str) -> String {
    value
        .chars()
        .map(|c| if c == '-' { '_' } else { c.to_ascii_uppercase() })
        .collect()
}
/// Parses "1"/"true" as true and "0"/"false" as false; anything else is None.
pub fn parse_bool(value: &str) -> Option<bool> {
    if matches!(value, "1" | "true") {
        Some(true)
    } else if matches!(value, "0" | "false") {
        Some(false)
    } else {
        None
    }
}
/// Rough token-count heuristic: ~1.3 tokens per ASCII word; non-ASCII
/// words cost 0.5 per character (1.0 when a single character).
pub fn estimate_token_length(text: &str) -> usize {
    let total: f32 = text
        .unicode_words()
        .map(|word| {
            if word.is_ascii() {
                1.3
            } else {
                match word.chars().count() {
                    1 => 1.0,
                    count => (count as f32) * 0.5,
                }
            }
        })
        .sum();
    total.ceil() as usize
}
/// Removes a leading `<think>...</think>` block, borrowing when nothing
/// needed to change.
pub fn strip_think_tag(text: &str) -> Cow<'_, str> {
    THINK_TAG_RE.replace_all(text, "")
}

/// Returns the trimmed contents of the first fenced code block, or the
/// whole text when no fence is found.
pub fn extract_code_block(text: &str) -> &str {
    CODE_BLOCK_RE
        .captures(text)
        .ok()
        .and_then(|v| v?.get(1).map(|v| v.as_str().trim()))
        .unwrap_or(text)
}
/// Maps the empty string to None, anything else to an owned Some.
pub fn convert_option_string(value: &str) -> Option<String> {
    (!value.is_empty()).then(|| value.to_string())
}
/// Filters `values` by fuzzy-matching `pattern` against `get(value)`,
/// returning the survivors sorted by descending match score.
pub fn fuzzy_filter<T, F>(values: Vec<T>, get: F, pattern: &str) -> Vec<T>
where
    F: Fn(&T) -> &str,
{
    let matcher = SkimMatcherV2::default();
    let mut list: Vec<(T, i64)> = values
        .into_iter()
        .filter_map(|v| {
            // Non-matches are dropped.
            let score = matcher.fuzzy_match(get(&v), pattern)?;
            Some((v, score))
        })
        .collect();
    // Highest score first.
    list.sort_unstable_by(|a, b| b.1.cmp(&a.1));
    list.into_iter().map(|(v, _)| v).collect()
}
/// Formats an anyhow error with its cause chain, mirroring anyhow's
/// multi-line "Caused by:" layout.
pub fn pretty_error(err: &anyhow::Error) -> String {
    let mut output = vec![];
    output.push(format!("Error: {err}"));
    // Skip the top-level error itself; the rest are causes.
    let causes: Vec<_> = err.chain().skip(1).collect();
    let causes_len = causes.len();
    if causes_len > 0 {
        output.push("\nCaused by:".to_string());
        if causes_len == 1 {
            output.push(format!(" {}", indent_text(causes[0], 4).trim()));
        } else {
            // Numbered list when there are multiple causes.
            for (i, cause) in causes.into_iter().enumerate() {
                output.push(format!("{i:5}: {}", indent_text(cause, 7).trim()));
            }
        }
    }
    output.join("\n")
}
/// Prefixes every line of `s` (split on '\n', empty segments preserved)
/// with `size` spaces.
pub fn indent_text<T: ToString>(s: T, size: usize) -> String {
    let pad = " ".repeat(size);
    let text = s.to_string();
    let mut out: Vec<String> = Vec::new();
    for line in text.split('\n') {
        out.push(format!("{pad}{line}"));
    }
    out.join("\n")
}
/// Red text for errors (plain when colors are disabled).
pub fn error_text(input: &str) -> String {
    color_text(input, nu_ansi_term::Color::Red)
}

/// Yellow text for warnings (plain when colors are disabled).
pub fn warning_text(input: &str) -> String {
    color_text(input, nu_ansi_term::Color::Yellow)
}

/// Paints `input` with `color`, unless NO_COLOR is in effect.
pub fn color_text(input: &str, color: nu_ansi_term::Color) -> String {
    if *NO_COLOR {
        return input.to_string();
    }
    nu_ansi_term::Style::new()
        .fg(color)
        .paint(input)
        .to_string()
}

/// Dimmed text, unless NO_COLOR is in effect.
pub fn dimmed_text(input: &str) -> String {
    if *NO_COLOR {
        return input.to_string();
    }
    nu_ansi_term::Style::new().dimmed().paint(input).to_string()
}
/// Prefixes continuation lines with ".. " (REPL-style); the first line is
/// left untouched.
pub fn multiline_text(input: &str) -> String {
    let mut out = String::new();
    for (i, line) in input.split('\n').enumerate() {
        if i > 0 {
            out.push_str("\n.. ");
        }
        out.push_str(line);
    }
    out
}
/// Builds a unique temp-file path: `<crate>-<pid>{prefix}<uuid>{suffix}`.
/// The file itself is not created.
pub fn temp_file(prefix: &str, suffix: &str) -> PathBuf {
    env::temp_dir().join(format!(
        "{}-{}{prefix}{}{suffix}",
        env!("CARGO_CRATE_NAME").to_lowercase(),
        process::id(),
        uuid::Uuid::new_v4()
    ))
}
/// True for http/https URLs only; other schemes are not treated as URLs.
pub fn is_url(path: &str) -> bool {
    ["http://", "https://"].iter().any(|p| path.starts_with(p))
}
/// Configures the reqwest builder's proxy: "" or "-" disables proxying
/// entirely; any other value is used as the proxy URL for all traffic.
pub fn set_proxy(
    mut builder: reqwest::ClientBuilder,
    proxy: &str,
) -> Result<reqwest::ClientBuilder> {
    // Ignore system/env proxies; only the explicit setting applies.
    builder = builder.no_proxy();
    if !proxy.is_empty() && proxy != "-" {
        builder = builder
            .proxy(reqwest::Proxy::all(proxy).with_context(|| format!("Invalid proxy `{proxy}`"))?);
    };
    Ok(builder)
}
/// Deserializes a bincode payload (legacy config) into `T`, discarding the
/// consumed-bytes count.
pub fn decode_bin<T: serde::de::DeserializeOwned>(data: &[u8]) -> Result<T> {
    let (v, _) = bincode::serde::decode_from_slice(data, bincode::config::legacy())?;
    Ok(v)
}
#[cfg(test)]
mod tests {
    use super::*;

    // safe_join_path must confine results to the base directory, using each
    // platform's native path syntax.
    #[test]
    #[cfg(not(target_os = "windows"))]
    fn test_safe_join_path() {
        assert_eq!(
            safe_join_path("/home/user/dir1", "files/file1"),
            Some(PathBuf::from("/home/user/dir1/files/file1"))
        );
        assert!(safe_join_path("/home/user/dir1", "/files/file1").is_none());
        assert!(safe_join_path("/home/user/dir1", "../file1").is_none());
    }

    #[test]
    #[cfg(target_os = "windows")]
    fn test_safe_join_path() {
        assert_eq!(
            safe_join_path("C:\\Users\\user\\dir1", "files/file1"),
            Some(PathBuf::from("C:\\Users\\user\\dir1\\files\\file1"))
        );
        assert!(safe_join_path("C:\\Users\\user\\dir1", "/files/file1").is_none());
        assert!(safe_join_path("C:\\Users\\user\\dir1", "../file1").is_none());
    }
}
+46
View File
@@ -0,0 +1,46 @@
// Windows-only helpers for locating a usable bash (from Git for Windows).
#[cfg(windows)]
pub mod runtime {
    use std::path::Path;

    /// Locates Git-for-Windows bash: first the default install path, then
    /// relative to the `git` executable found on PATH.
    pub fn bash_path() -> Option<String> {
        let bash_path = "C:\\Program Files\\Git\\bin\\bash.exe";
        if exist_path(bash_path) {
            return Some(bash_path.into());
        }
        let git_path = which("git")?;
        let git_parent_path = parent_path(&git_path)?;
        // <git-root>\bin\bash.exe (git.exe typically lives one level down).
        let bash_path = join_path(&parent_path(&git_parent_path)?, &["bin", "bash.exe"]);
        if exist_path(&bash_path) {
            return Some(bash_path);
        }
        // bash.exe directly next to git.exe.
        let bash_path = join_path(&git_parent_path, &["bash.exe"]);
        if exist_path(&bash_path) {
            return Some(bash_path);
        }
        None
    }

    fn exist_path(path: &str) -> bool {
        Path::new(path).exists()
    }

    /// Resolves `name` on PATH (via the `which` crate) to a lossy string.
    pub fn which(name: &str) -> Option<String> {
        which::which(name)
            .ok()
            .map(|path| path.to_string_lossy().into())
    }

    fn parent_path(path: &str) -> Option<String> {
        Path::new(path)
            .parent()
            .map(|path| path.to_string_lossy().into())
    }

    fn join_path(path: &str, parts: &[&str]) -> String {
        let mut path = Path::new(path).to_path_buf();
        for part in parts {
            path = path.join(part);
        }
        path.to_string_lossy().into()
    }
}
+356
View File
@@ -0,0 +1,356 @@
use std::fs;
use std::path::{Component, Path, PathBuf};
use anyhow::{bail, Result};
use fancy_regex::Regex;
use indexmap::IndexSet;
use path_absolutize::Absolutize;
// (base_path, extensions, current_only, depth) — see `parse_glob`.
type ParseGlobResult = (String, Option<Vec<String>>, bool, Option<usize>);
/// Joins `sub_path` under `base_path`, rejecting absolute sub-paths and
/// any `..` component so the result can never escape the base directory.
/// Returns None when the join would be unsafe.
pub fn safe_join_path<T1: AsRef<Path>, T2: AsRef<Path>>(
    base_path: T1,
    sub_path: T2,
) -> Option<PathBuf> {
    let base = base_path.as_ref();
    let sub = sub_path.as_ref();
    if sub.is_absolute() {
        return None;
    }
    if sub.components().any(|c| c == Component::ParentDir) {
        return None;
    }
    let joined = sub.components().fold(base.to_path_buf(), |mut acc, c| {
        acc.push(c);
        acc
    });
    // Defense in depth: the result must still sit under the base.
    joined.starts_with(base).then_some(joined)
}
/// Expands each input's glob pattern (see `parse_glob`) into the set of
/// matching files, preserving first-seen order across inputs.
pub async fn expand_glob_paths<T: AsRef<str>>(
    paths: &[T],
    bail_non_exist: bool,
) -> Result<IndexSet<String>> {
    let mut new_paths = IndexSet::new();
    for path in paths {
        let (path_str, suffixes, current_only, depth) = parse_glob(path.as_ref())?;
        list_files(
            &mut new_paths,
            Path::new(&path_str),
            suffixes.as_ref(),
            current_only,
            bail_non_exist,
            depth,
        )
        .await?;
    }
    Ok(new_paths)
}
/// Deletes every entry inside `dir` (files and whole subtrees), leaving
/// `dir` itself in place.
pub fn clear_dir(dir: &Path) -> Result<()> {
    for entry in fs::read_dir(dir)? {
        let path = entry?.path();
        let removed = if path.is_dir() {
            fs::remove_dir_all(&path)
        } else {
            fs::remove_file(&path)
        };
        removed?;
    }
    Ok(())
}
/// Lists entries in `dir` whose names end with `ext`, returning the names
/// with the suffix stripped, sorted ascending. Unreadable directories
/// yield an empty list.
pub fn list_file_names<T: AsRef<Path>>(dir: T, ext: &str) -> Vec<String> {
    let Ok(rd) = fs::read_dir(dir.as_ref()) else {
        return vec![];
    };
    let mut names: Vec<String> = rd
        .flatten()
        .filter_map(|entry| {
            entry
                .file_name()
                .to_string_lossy()
                .strip_suffix(ext)
                .map(|name| name.to_string())
        })
        .collect();
    names.sort_unstable();
    names
}
/// Lower-cased file extension of `path`, if it has one.
pub fn get_patch_extension(path: &str) -> Option<String> {
    let ext = Path::new(&path).extension()?;
    Some(ext.to_string_lossy().to_lowercase())
}
/// Absolutizes `path` against the current directory (purely lexical — no
/// symlink resolution, and the path need not exist).
pub fn to_absolute_path(path: &str) -> Result<String> {
    Ok(Path::new(&path).absolutize()?.display().to_string())
}
/// Expands a leading `~/` (or `~\`) to the user's home directory; any
/// other path — or an unknown home dir — is returned unchanged.
pub fn resolve_home_dir(path: &str) -> String {
    let mut path = path.to_string();
    if path.starts_with("~/") || path.starts_with("~\\") {
        if let Some(home_dir) = dirs::home_dir() {
            // Replace only the leading "~", keeping the separator.
            path.replace_range(..1, &home_dir.display().to_string());
        }
    }
    path
}
/// Splits a path that may contain a glob into
/// `(base_path, extensions, current_only, depth)`.
///
/// Recognized forms: `dir/**` (whole tree, no filter), `dir/**/*.ext`,
/// `**/*.ext`, `dir/**/name.ext`, `dir/*/name.ext` (depth 1), `dir/*.ext`,
/// `*.ext`, plus `{a,b}` extension lists. Plain paths pass through as-is.
fn parse_glob(path_str: &str) -> Result<ParseGlobResult> {
    let globbed_single_subdir_regex = Regex::new(r"\*/[^/]+\.[^/]+$").expect("invalid regex");
    let globbed_recursive_subdir_regex = Regex::new(r"\*\*/[^/]+\.[^/]+$").expect("invalid regex");
    // Each arm yields (glob start index, bytes to skip before the
    // extension text, current_only, depth).
    let glob_result =
        if let Some(start) = path_str.find("/**/*.").or_else(|| path_str.find(r"\**\*.")) {
            Some((start, 6, false, None))
        } else if let Some(start) = path_str.find("**/*.").or_else(|| path_str.find(r"**\*.")) {
            // Only valid at the very start of the string.
            if start == 0 {
                Some((start, 5, false, None))
            } else {
                None
            }
        } else if let Some(m) = globbed_recursive_subdir_regex.find(path_str)? {
            // `dir/**/name.ext` — a specific file name anywhere in the tree.
            Some((m.start(), 3, false, None))
        } else if let Some(m) = globbed_single_subdir_regex.find(path_str)? {
            // `dir/*/name.ext` — exactly one directory level deep.
            Some((m.start(), 2, false, Some(1usize)))
        } else if let Some(start) = path_str.find("/*.").or_else(|| path_str.find(r"\*.")) {
            Some((start, 3, true, None))
        } else if let Some(start) = path_str.find("*.") {
            if start == 0 {
                Some((start, 2, true, None))
            } else {
                None
            }
        } else {
            None
        };
    if let Some((start, offset, current_only, depth)) = glob_result {
        let mut base_path = path_str[..start].to_string();
        if base_path.is_empty() {
            // Glob at the start of the string: base is "/" or the cwd.
            base_path = if path_str
                .chars()
                .next()
                .map(|v| v == '/')
                .unwrap_or_default()
            {
                "/"
            } else {
                "."
            }
            .into();
        }
        // `{a,b}` extension list, or a single extension after the glob.
        let extensions = if let Some(curly_brace_end) = path_str[start..].find('}') {
            let end = start + curly_brace_end;
            let extensions_str = &path_str[start + offset..end + 1];
            if extensions_str.starts_with('{') && extensions_str.ends_with('}') {
                extensions_str[1..extensions_str.len() - 1]
                    .split(',')
                    .map(|s| s.to_string())
                    .collect::<Vec<String>>()
            } else {
                bail!("Invalid path '{path_str}'");
            }
        } else {
            let extensions_str = &path_str[start + offset..];
            vec![extensions_str.to_string()]
        };
        let extensions = if extensions.is_empty() {
            None
        } else {
            Some(extensions)
        };
        Ok((base_path, extensions, current_only, depth))
    } else if path_str.ends_with("/**") || path_str.ends_with(r"\**") {
        // `dir/**` — everything under dir, no extension filter.
        Ok((
            path_str[0..path_str.len() - 3].to_string(),
            None,
            false,
            None,
        ))
    } else {
        Ok((path_str.to_string(), None, false, None))
    }
}
/// Recursively collects file paths under `entry_path` into `files`.
///
/// `suffixes` filters by extension/file name (see `is_valid_extension`),
/// `current_only` disables recursion entirely, `depth` bounds recursion
/// depth when set, and `bail_non_exist` turns a missing entry into an error.
#[async_recursion::async_recursion]
async fn list_files(
    files: &mut IndexSet<String>,
    entry_path: &Path,
    suffixes: Option<&Vec<String>>,
    current_only: bool,
    bail_non_exist: bool,
    depth: Option<usize>,
) -> Result<()> {
    if !entry_path.exists() {
        if bail_non_exist {
            bail!("Not found '{}'", entry_path.display());
        } else {
            return Ok(());
        }
    }
    if entry_path.is_dir() {
        let mut reader = tokio::fs::read_dir(entry_path).await?;
        while let Some(entry) = reader.next_entry().await? {
            let path = entry.path();
            if path.is_dir() {
                if !current_only {
                    if let Some(remaining_depth) = depth {
                        // Bounded recursion (e.g. `dir/*/file.ext`).
                        if remaining_depth > 0 {
                            list_files(
                                files,
                                &path,
                                suffixes,
                                current_only,
                                bail_non_exist,
                                Some(remaining_depth - 1),
                            )
                            .await?;
                        }
                    } else {
                        // Unbounded recursion.
                        list_files(files, &path, suffixes, current_only, bail_non_exist, None)
                            .await?;
                    }
                }
            } else {
                add_file(files, suffixes, &path);
            }
        }
    } else {
        // A plain file was passed directly.
        add_file(files, suffixes, entry_path);
    }
    Ok(())
}
/// Adds `path` to `files` when it passes the suffix filter.
fn add_file(files: &mut IndexSet<String>, suffixes: Option<&Vec<String>>, path: &Path) {
    // IndexSet::insert is already a no-op for existing entries, so the
    // previous contains() pre-check was a redundant second hash lookup.
    if is_valid_extension(suffixes, path) {
        files.insert(path.display().to_string());
    }
}
/// Returns true when `path` passes the suffix filter.
///
/// A `None` or empty filter accepts everything. Suffixes containing a dot
/// are treated as full file names ("test.md"); bare suffixes are treated
/// as extensions ("md").
///
/// Fix: the previous check compiled the regex `^.+\.*` per call and
/// matched it against the joined suffixes — but `\.*` means *zero or more*
/// dots, so the pattern matches any non-empty string. The extension branch
/// was therefore unreachable and bare extensions like "md" never matched
/// any file. The dot test below restores the intended split.
fn is_valid_extension(suffixes: Option<&Vec<String>>, path: &Path) -> bool {
    let Some(suffixes) = suffixes else {
        return true;
    };
    if suffixes.is_empty() {
        return true;
    }
    if suffixes.iter().any(|s| s.contains('.')) {
        // Full-file-name mode; non-UTF-8 names simply don't match instead
        // of panicking as before.
        path.file_name()
            .and_then(|v| v.to_str())
            .map(|name| suffixes.iter().any(|s| s == name))
            .unwrap_or(false)
    } else if let Some(extension) = path.extension().map(|v| v.to_string_lossy().to_string()) {
        suffixes.contains(&extension)
    } else {
        false
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Covers every glob form parse_glob recognizes, on both Unix and
    // Windows separators.
    #[test]
    fn test_parse_glob() {
        assert_eq!(
            parse_glob("dir").unwrap(),
            ("dir".into(), None, false, None)
        );
        assert_eq!(
            parse_glob("dir/**").unwrap(),
            ("dir".into(), None, false, None)
        );
        assert_eq!(
            parse_glob("dir/file.md").unwrap(),
            ("dir/file.md".into(), None, false, None)
        );
        assert_eq!(
            parse_glob("**/*.md").unwrap(),
            (".".into(), Some(vec!["md".into()]), false, None)
        );
        assert_eq!(
            parse_glob("/**/*.md").unwrap(),
            ("/".into(), Some(vec!["md".into()]), false, None)
        );
        assert_eq!(
            parse_glob("dir/**/*.md").unwrap(),
            ("dir".into(), Some(vec!["md".into()]), false, None)
        );
        assert_eq!(
            parse_glob("dir/**/test.md").unwrap(),
            ("dir/".into(), Some(vec!["test.md".into()]), false, None)
        );
        assert_eq!(
            parse_glob("dir/*/test.md").unwrap(),
            (
                "dir/".into(),
                Some(vec!["test.md".into()]),
                false,
                Some(1usize)
            )
        );
        assert_eq!(
            parse_glob("dir/**/*.{md,txt}").unwrap(),
            (
                "dir".into(),
                Some(vec!["md".into(), "txt".into()]),
                false,
                None
            )
        );
        assert_eq!(
            parse_glob("C:\\dir\\**\\*.{md,txt}").unwrap(),
            (
                "C:\\dir".into(),
                Some(vec!["md".into(), "txt".into()]),
                false,
                None
            )
        );
        assert_eq!(
            parse_glob("*.md").unwrap(),
            (".".into(), Some(vec!["md".into()]), true, None)
        );
        assert_eq!(
            parse_glob("/*.md").unwrap(),
            ("/".into(), Some(vec!["md".into()]), true, None)
        );
        assert_eq!(
            parse_glob("dir/*.md").unwrap(),
            ("dir".into(), Some(vec!["md".into()]), true, None)
        );
        assert_eq!(
            parse_glob("dir/*.{md,txt}").unwrap(),
            (
                "dir".into(),
                Some(vec!["md".into(), "txt".into()]),
                true,
                None
            )
        );
        assert_eq!(
            parse_glob("C:\\dir\\*.{md,txt}").unwrap(),
            (
                "C:\\dir".into(),
                Some(vec!["md".into(), "txt".into()]),
                true,
                None
            )
        );
    }
}
+155
View File
@@ -0,0 +1,155 @@
use std::collections::HashMap;
/// Render REPL prompt
///
/// The template comprises plain text and `{...}`.
///
/// The syntax of `{...}`:
/// - `{var}` - When `var` has a value, replace `var` with the value and eval `template`
/// - `{?var <template>}` - Eval `template` when `var` is evaluated as true
/// - `{!var <template>}` - Eval `template` when `var` is evaluated as false
pub fn render_prompt(template: &str, variables: &HashMap<&str, String>) -> String {
    eval_exprs(&parse_template(template), variables)
}
/// Tokenizes `template` into text/variable/block expressions, tracking
/// `{`/`}` nesting so a block body may itself contain `{...}` groups.
fn parse_template(template: &str) -> Vec<Expr> {
    let chars: Vec<char> = template.chars().collect();
    let mut exprs = vec![];
    let mut current = vec![];
    // Stack of unmatched '{'; non-empty means we are inside a group.
    let mut balances = vec![];
    for ch in chars.iter().cloned() {
        if !balances.is_empty() {
            if ch == '}' {
                balances.pop();
                if balances.is_empty() {
                    // Outermost group closed: parse its collected contents.
                    if !current.is_empty() {
                        let block = parse_block(&mut current);
                        exprs.push(block)
                    }
                } else {
                    current.push(ch);
                }
            } else if ch == '{' {
                balances.push(ch);
                current.push(ch);
            } else {
                current.push(ch);
            }
        } else if ch == '{' {
            // Entering a group: flush pending literal text first.
            balances.push(ch);
            add_text(&mut exprs, &mut current);
        } else {
            current.push(ch)
        }
    }
    // Flush any trailing literal text.
    add_text(&mut exprs, &mut current);
    exprs
}
/// Interpret the drained body of a `{...}` group.
///
/// - `name` alone → `Expr::Variable`
/// - `?name tail` / `!name tail` → conditional `Expr::Block`, with `tail`
///   parsed recursively as a template
/// - anything else with a space → emitted back as literal `{...}` text
fn parse_block(current: &mut Vec<char>) -> Expr {
    let value: String = current.drain(..).collect();
    let Some((name, tail)) = value.split_once(' ') else {
        return Expr::Variable(value);
    };
    if let Some(stripped) = name.strip_prefix('?') {
        Expr::Block(BlockType::Yes, stripped.to_string(), parse_template(tail))
    } else if let Some(stripped) = name.strip_prefix('!') {
        Expr::Block(BlockType::No, stripped.to_string(), parse_template(tail))
    } else {
        Expr::Text(format!("{{{value}}}"))
    }
}
/// Evaluate a parsed expression list against `variables`, concatenating the
/// rendered pieces. Missing variables evaluate to the empty string.
fn eval_exprs(exprs: &[Expr], variables: &HashMap<&str, String>) -> String {
    // Missing keys resolve to "" so conditionals treat them as falsy.
    let lookup = |name: &str| variables.get(name).cloned().unwrap_or_default();
    let mut output = String::new();
    for expr in exprs {
        match expr {
            Expr::Text(text) => output.push_str(text),
            Expr::Variable(name) => output.push_str(&lookup(name)),
            Expr::Block(typ, name, inner) => {
                let is_truthy = truly(&lookup(name));
                let render = match typ {
                    BlockType::Yes => is_truthy,
                    BlockType::No => !is_truthy,
                };
                if render {
                    output.push_str(&eval_exprs(inner, variables));
                }
            }
        }
    }
    output
}
/// Flush the accumulated characters into `exprs` as a text node; a no-op
/// when nothing is pending.
fn add_text(exprs: &mut Vec<Expr>, current: &mut Vec<char>) {
    if !current.is_empty() {
        let text: String = current.drain(..).collect();
        exprs.push(Expr::Text(text));
    }
}
/// Truthiness for template variables: empty, `"0"` and `"false"` are falsy,
/// everything else (including `"False"`, case-sensitive) is truthy.
fn truly(value: &str) -> bool {
    !matches!(value, "" | "0" | "false")
}
/// A parsed template fragment.
#[derive(Debug)]
enum Expr {
    /// Literal text emitted as-is.
    Text(String),
    /// `{var}`: substituted with the variable's value (empty when unset).
    Variable(String),
    /// `{?var ...}` / `{!var ...}`: conditional sub-template.
    Block(BlockType, String, Vec<Expr>),
}
/// Polarity of a conditional block.
#[derive(Debug)]
enum BlockType {
    /// `{?var ...}` — render when the variable is truthy.
    Yes,
    /// `{!var ...}` — render when the variable is falsy.
    No,
}
#[cfg(test)]
mod tests {
    use super::*;
    // Render `$template` with the given key/value pairs and assert the output.
    macro_rules! assert_render {
        ($template:expr, [$(($key:literal, $value:literal),)*], $expect:literal) => {
            let data = HashMap::from([
                $(($key, $value.into()),)*
            ]);
            assert_eq!(render_prompt($template, &data), $expect);
        };
    }
    #[test]
    fn test_render() {
        // Prompt ends in `)` when a session is set, `>` otherwise; the role is
        // prefixed with `session/` when both are present.
        let prompt = "{?session {session}{?role /}}{role}{?session )}{!session >}";
        assert_render!(prompt, [], ">");
        assert_render!(prompt, [("role", "coder"),], "coder>");
        assert_render!(prompt, [("session", "temp"),], "temp)");
        assert_render!(
            prompt,
            [("session", "temp"), ("role", "coder"),],
            "temp/coder)"
        );
    }
}
+464
View File
@@ -0,0 +1,464 @@
use super::*;
use anyhow::{anyhow, bail, Context, Result};
use fancy_regex::Regex;
use futures_util::{stream, StreamExt};
use http::header::CONTENT_TYPE;
use reqwest::Url;
use scraper::{Html, Selector};
use serde::Deserialize;
use serde_json::Value;
use std::sync::LazyLock;
use std::{
collections::{HashMap, HashSet},
sync::Arc,
time::Duration,
};
use tokio::io::AsyncWriteExt;
use tokio::sync::Semaphore;
/// Loader key: an external command that handles any URL.
pub const URL_LOADER: &str = "url";
/// Loader key for recursive website crawling.
pub const RECURSIVE_URL_LOADER: &str = "recursive_url";
/// Pseudo-extension marking image/video/audio content returned as data URLs.
pub const MEDIA_URL_EXTENSION: &str = "media_url";
/// Fallback extension when none can be derived.
pub const DEFAULT_EXTENSION: &str = "txt";
// Maximum number of pages fetched concurrently while crawling.
const MAX_CRAWLS: usize = 5;
// When true, a single failed page aborts the whole crawl;
// otherwise page errors are logged and skipped.
const BREAK_ON_ERROR: bool = false;
// User-Agent sent with crawl and GitHub API requests.
const USER_AGENT: &str = "curl/8.6.0";
// Shared HTTP client (16s timeout). Stored as a Result so a builder failure
// surfaces as an error at first use instead of panicking at init.
static CLIENT: LazyLock<Result<reqwest::Client>> = LazyLock::new(|| {
    let builder = reqwest::ClientBuilder::new().timeout(Duration::from_secs(16));
    let client = builder.build()?;
    Ok(client)
});
// Per-site crawl presets keyed by URL pattern (GitHub trees and wikis).
static PRESET: LazyLock<Vec<(Regex, CrawlOptions)>> = LazyLock::new(|| {
    vec![
        (
            Regex::new(r"github.com/([^/]+)/([^/]+)/tree/([^/]+)").unwrap(),
            CrawlOptions {
                exclude: vec!["changelog".into(), "changes".into(), "license".into()],
                ..Default::default()
            },
        ),
        (
            Regex::new(r"github.com/([^/]+)/([^/]+)/wiki").unwrap(),
            CrawlOptions {
                // Skip wiki page-history links; extract only the wiki body.
                exclude: vec!["_history".into()],
                extract: Some("#wiki-body".into()),
                ..Default::default()
            },
        ),
    ]
});
// Matches a trailing `.ext` file extension.
static EXTENSION_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\.[^.]+$").unwrap());
// Matches a GitHub tree URL: https://github.com/<owner>/<repo>/tree/<branch>.
static GITHUB_REPO_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^https://github\.com/([^/]+)/([^/]+)/tree/([^/]+)").unwrap());
pub async fn fetch(url: &str) -> Result<String> {
let client = match *CLIENT {
Ok(ref client) => client,
Err(ref err) => bail!("{err}"),
};
let res = client.get(url).send().await?;
let output = res.text().await?;
Ok(output)
}
/// Fetch `path` (a URL) and return `(contents, extension)`.
///
/// Resolution order:
/// 1. A configured `url` loader fetches everything via an external command.
/// 2. Otherwise the response MIME type (or, failing that, the URL's own
///    extension) determines a file extension; image/video/audio types are
///    returned as base64 `data:` URLs when `allow_media` is true and
///    rejected otherwise.
/// 3. A loader configured for the derived extension gets the body streamed to
///    a temp file and run through the loader command.
/// 4. Otherwise the body is returned as text; HTML is converted to markdown.
pub async fn fetch_with_loaders(
    loaders: &HashMap<String, String>,
    path: &str,
    allow_media: bool,
) -> Result<(String, String)> {
    if let Some(loader_command) = loaders.get(URL_LOADER) {
        let contents = run_loader_command(path, URL_LOADER, loader_command)?;
        return Ok((contents, DEFAULT_EXTENSION.into()));
    }
    let client = match *CLIENT {
        Ok(ref client) => client,
        Err(ref err) => bail!("{err}"),
    };
    let mut res = client.get(path).send().await?;
    if !res.status().is_success() {
        bail!("Invalid status: {}", res.status());
    }
    // MIME type with parameters stripped (e.g. `; charset=utf-8`); when the
    // header is missing, synthesize a `_/<ext>` pseudo type from the URL.
    let content_type = res
        .headers()
        .get(CONTENT_TYPE)
        .and_then(|v| v.to_str().ok())
        .map(|v| match v.split_once(';') {
            Some((mime, _)) => mime.trim(),
            None => v,
        })
        .map(|v| v.to_string())
        .unwrap_or_else(|| {
            format!(
                "_/{}",
                // presumably derives the extension from the URL path —
                // confirm `get_patch_extension` semantics (name looks like a
                // typo for "path extension").
                get_patch_extension(path).unwrap_or_else(|| DEFAULT_EXTENSION.into())
            )
        });
    let mut is_media = false;
    // Map known document MIME types to extensions; for everything else use
    // the MIME subtype, flagging image/video/audio as media.
    let extension = match content_type.as_str() {
        "application/pdf" => "pdf".into(),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => "docx".into(),
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => "xlsx".into(),
        "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
            "pptx".into()
        }
        "application/vnd.oasis.opendocument.text" => "odt".into(),
        "application/vnd.oasis.opendocument.spreadsheet" => "ods".into(),
        "application/vnd.oasis.opendocument.presentation" => "odp".into(),
        "application/rtf" => "rtf".into(),
        "text/javascript" => "js".into(),
        "text/html" => "html".into(),
        _ => content_type
            .rsplit_once('/')
            .map(|(first, last)| {
                if ["image", "video", "audio"].contains(&first) {
                    is_media = true;
                    MEDIA_URL_EXTENSION.into()
                } else {
                    last.to_lowercase()
                }
            })
            .unwrap_or_else(|| DEFAULT_EXTENSION.into()),
    };
    let result = if is_media {
        if !allow_media {
            bail!("Unexpected media type")
        }
        // Return media inline as a `data:<mime>;base64,...` URL.
        let image_bytes = res.bytes().await?;
        let image_base64 = base64_encode(&image_bytes);
        let contents = format!("data:{content_type};base64,{image_base64}");
        (contents, extension)
    } else {
        match loaders.get(&extension) {
            Some(loader_command) => {
                // Stream the body to a temp file, then run the loader on it.
                let save_path = temp_file("-download-", &format!(".{extension}"))
                    .display()
                    .to_string();
                let mut save_file = tokio::fs::File::create(&save_path).await?;
                let mut size = 0;
                while let Some(chunk) = res.chunk().await? {
                    size += chunk.len();
                    save_file.write_all(&chunk).await?;
                }
                let contents = if size == 0 {
                    // Empty body: warn instead of invoking the loader.
                    println!("{}", warning_text(&format!("No content at '{path}'")));
                    String::new()
                } else {
                    run_loader_command(&save_path, &extension, loader_command)?
                };
                (contents, DEFAULT_EXTENSION.into())
            }
            None => {
                let contents = res.text().await?;
                if extension == "html" {
                    (html_to_md(&contents), "md".into())
                } else {
                    (contents, extension)
                }
            }
        }
    };
    Ok(result)
}
pub async fn fetch_models(api_base: &str, api_key: Option<&str>) -> Result<Vec<String>> {
let client = match *CLIENT {
Ok(ref client) => client,
Err(ref err) => bail!("{err}"),
};
let mut builder = client.get(format!("{}/models", api_base.trim_end_matches('/')));
if let Some(api_key) = api_key {
builder = builder.bearer_auth(api_key);
}
let res_body: Value = builder.send().await?.json().await?;
let mut result: Vec<String> = res_body
.get("data")
.and_then(|v| v.as_array())
.map(|v| {
v.iter()
.filter_map(|v| v.get("id").and_then(|v| v.as_str().map(|v| v.to_string())))
.collect()
})
.unwrap_or_default();
if result.is_empty() {
bail!("No valid models")
}
result.sort_unstable();
Ok(result)
}
/// Options controlling a website crawl.
#[derive(Debug, Clone, Default)]
pub struct CrawlOptions {
    // CSS selector; when set, only matching elements are extracted as text.
    extract: Option<String>,
    // Link names (with or without extension) to skip while crawling.
    exclude: Vec<String>,
    // Suppress progress/error printing.
    no_log: bool,
}
impl CrawlOptions {
    /// Return the first preset whose URL pattern matches `start_url`,
    /// falling back to the defaults when nothing matches.
    pub fn preset(start_url: &str) -> CrawlOptions {
        PRESET
            .iter()
            .find(|(re, _)| matches!(re.is_match(start_url), Ok(true)))
            .map(|(_, options)| options.clone())
            .unwrap_or_default()
    }
}
/// Crawl a website starting at `start_url` and collect its pages.
///
/// GitHub `tree` URLs are first expanded through the GitHub API into a list
/// of raw markdown file URLs. Pages are fetched in batches of `MAX_CRAWLS`
/// concurrent requests; links discovered on each page extend the work list
/// until no new paths remain.
///
/// # Errors
/// Fails on an invalid start URL, and on a page error when `BREAK_ON_ERROR`
/// is set (otherwise page errors are logged and skipped).
pub async fn crawl_website(start_url: &str, options: CrawlOptions) -> Result<Vec<Page>> {
    let start_url = Url::parse(start_url)?;
    let mut paths = vec![start_url.path().to_string()];
    // Base URL (query/fragment stripped, path truncated to its directory)
    // used to resolve relative crawl paths.
    let normalized_start_url = normalize_start_url(&start_url);
    if !options.no_log {
        println!(
            "Start crawling url={start_url} exclude={} extract={}",
            options.exclude.join(","),
            options.extract.as_deref().unwrap_or_default()
        );
    }
    if let Ok(true) = GITHUB_REPO_RE.is_match(start_url.as_str()) {
        paths = crawl_gh_tree(&start_url, &options.exclude)
            .await
            // Fixed typo in the error message: "craw" -> "crawl".
            .with_context(|| "Failed to crawl github repo".to_string())?;
    }
    let semaphore = Arc::new(Semaphore::new(MAX_CRAWLS));
    let mut result_pages = Vec::new();
    let mut index = 0;
    while index < paths.len() {
        // Take the next batch of up to MAX_CRAWLS pending paths.
        let batch = paths[index..std::cmp::min(index + MAX_CRAWLS, paths.len())].to_vec();
        let tasks: Vec<_> = batch
            .iter()
            .map(|path| {
                let options = options.clone();
                let permit = semaphore.clone().acquire_owned(); // acquire a permit for concurrency control
                let normalized_start_url = normalized_start_url.clone();
                let path = path.clone();
                async move {
                    let _permit = permit.await?;
                    let url = normalized_start_url
                        .join(&path)
                        .map_err(|_| anyhow!("Invalid crawl page at {}", path))?;
                    let mut page = crawl_page(&normalized_start_url, &path, options)
                        .await
                        .with_context(|| format!("Failed to crawl {}", url.as_str()))?;
                    // Replace the relative path with the absolute URL.
                    page.0 = url.as_str().to_string();
                    Ok(page)
                }
            })
            .collect();
        let results = stream::iter(tasks)
            .buffer_unordered(MAX_CRAWLS)
            .collect::<Vec<_>>()
            .await;
        let mut new_paths = Vec::new();
        for res in results {
            match res {
                Ok((path, text, links)) => {
                    if !options.no_log {
                        println!("Crawled {path}");
                    }
                    if !text.is_empty() {
                        result_pages.push(Page { path, text });
                    }
                    // Queue links not already scheduled for crawling.
                    for link in links {
                        if !paths.iter().any(|p| match_link(p, &link)) {
                            new_paths.push(link);
                        }
                    }
                }
                Err(err) => {
                    if BREAK_ON_ERROR {
                        return Err(err);
                    } else if !options.no_log {
                        println!("{}", error_text(&pretty_error(&err)));
                    }
                }
            }
        }
        paths.extend(new_paths);
        index += batch.len();
    }
    Ok(result_pages)
}
/// A crawled page.
#[derive(Debug, Deserialize)]
pub struct Page {
    // Page location; `crawl_website` rewrites this to the absolute URL.
    pub path: String,
    // Extracted text (markdown for HTML pages).
    pub text: String,
}
/// List markdown files in a GitHub repo subtree via the GitHub REST API.
///
/// `start_url` must look like
/// `https://github.com/<owner>/<repo>/tree/<branch>[/<path>]`; returns
/// `raw.githubusercontent.com` URLs for every non-excluded `.md`/`.MD` blob
/// under the subtree.
///
/// # Errors
/// Fails when the URL lacks a branch segment, the shared client failed to
/// build, the branch/tag cannot be resolved, or the tree response is invalid.
async fn crawl_gh_tree(start_url: &Url, exclude: &[String]) -> Result<Vec<String>> {
    let path_segs: Vec<&str> = start_url.path().split('/').collect();
    // Need at least ["", owner, repo, "tree", branch]. The previous `< 4`
    // check let a URL without a branch segment through, and indexing
    // `path_segs[4]` below then panicked.
    if path_segs.len() < 5 {
        bail!("Invalid gh tree {}", start_url.as_str());
    }
    let client = match *CLIENT {
        Ok(ref client) => client,
        Err(ref err) => bail!("{err}"),
    };
    let owner = path_segs[1];
    let repo = path_segs[2];
    let branch = path_segs[4];
    let root_path = path_segs[5..].join("/");
    // Resolve the branch head to a commit SHA.
    let url = format!("https://api.github.com/repos/{owner}/{repo}/git/ref/heads/{branch}");
    let res_body: Value = client
        .get(&url)
        .header("User-Agent", USER_AGENT)
        .header("Accept", "application/vnd.github+json")
        .header("X-GitHub-Api-Version", "2022-11-28")
        .send()
        .await?
        .json()
        .await?;
    let sha = res_body["object"]["sha"]
        .as_str()
        .ok_or_else(|| anyhow!("Not found branch or tag"))?;
    // Fetch the full recursive tree for that commit.
    let url = format!("https://api.github.com/repos/{owner}/{repo}/git/trees/{sha}?recursive=true");
    let res_body: Value = client
        .get(&url)
        .header("User-Agent", USER_AGENT)
        .header("Accept", "application/vnd.github+json")
        .header("X-GitHub-Api-Version", "2022-11-28")
        .send()
        .await?
        .json()
        .await?;
    let tree = res_body["tree"]
        .as_array()
        .ok_or_else(|| anyhow!("Invalid github repo tree"))?;
    // Keep markdown blobs under the requested subtree, mapped to raw URLs.
    let paths = tree
        .iter()
        .filter_map(|v| {
            let typ = v["type"].as_str()?;
            let path = v["path"].as_str()?;
            if typ == "blob"
                && (path.ends_with(".md") || path.ends_with(".MD"))
                && path.starts_with(&root_path)
                && !should_exclude_link(path, exclude)
            {
                Some(format!(
                    "https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
                ))
            } else {
                None
            }
        })
        .collect();
    Ok(paths)
}
/// Fetch a single page and return `(path, text, links)`.
///
/// For GitHub tree crawls the raw body is returned with no link discovery.
/// Otherwise `<a href>` targets under the current location are collected
/// (excluded names filtered via `should_exclude_link`), and the page body —
/// or only the `options.extract` CSS selection — is converted to markdown.
async fn crawl_page(
    start_url: &Url,
    path: &str,
    options: CrawlOptions,
) -> Result<(String, String, Vec<String>)> {
    let client = match *CLIENT {
        Ok(ref client) => client,
        Err(ref err) => bail!("{err}"),
    };
    let location = start_url.join(path)?;
    let response = client
        .get(location.as_str())
        .header("User-Agent", USER_AGENT)
        .send()
        .await?;
    let body = response.text().await?;
    // GitHub tree crawl: paths are raw file URLs; return contents as-is.
    if let Ok(true) = GITHUB_REPO_RE.is_match(start_url.as_str()) {
        return Ok((path.to_string(), body, vec![]));
    }
    let mut links = HashSet::new();
    let document = Html::parse_document(&body);
    let selector = Selector::parse("a").map_err(|err| anyhow!("Invalid link selector, {}", err))?;
    for element in document.select(&selector) {
        if let Some(href) = element.value().attr("href") {
            // Absolute hrefs parse directly; relative ones resolve against
            // the current page's location.
            let href = Url::parse(href).ok().or_else(|| location.join(href).ok());
            match href {
                None => continue,
                Some(href) => {
                    // Only follow links that stay under the current location.
                    if href.as_str().starts_with(location.as_str())
                        && !should_exclude_link(href.path(), &options.exclude)
                    {
                        links.insert(href.path().to_string());
                    }
                }
            }
        }
    }
    let text = if let Some(selector) = &options.extract {
        // Extract only the selected elements, joined as markdown sections.
        let selector = Selector::parse(selector)
            .map_err(|err| anyhow!("Invalid extract selector, {}", err))?;
        document
            .select(&selector)
            .map(|v| html_to_md(&v.html()))
            .collect::<Vec<String>>()
            .join("\n\n")
    } else {
        html_to_md(&body)
    };
    Ok((path.to_string(), text, links.into_iter().collect()))
}
/// Decide whether a link should be skipped while crawling.
///
/// Fragment links are always skipped. Otherwise the final path segment is
/// compared case-insensitively against each exclusion: an exclusion with an
/// extension must match the full file name; one without matches the name
/// with its extension stripped.
fn should_exclude_link(link: &str, exclude: &[String]) -> bool {
    // `contains('#')` (char) instead of `contains("#")` — same behavior,
    // idiomatic and cheaper.
    if link.contains('#') {
        return true;
    }
    let parts: Vec<&str> = link.trim_end_matches('/').split('/').collect();
    // Already lowercased here, so the comparisons below need no further
    // `to_lowercase()` on `name` (the original redundantly re-lowercased it).
    let name = parts.last().unwrap_or(&"").to_lowercase();
    exclude.iter().any(|exclude_name| {
        match EXTENSION_RE.is_match(exclude_name) {
            // Exclusion has an extension: match the whole file name.
            Ok(true) => exclude_name.to_lowercase() == name,
            // Otherwise match the name with its extension stripped.
            _ => exclude_name.to_lowercase() == EXTENSION_RE.replace(&name, "").as_ref(),
        }
    })
}
/// Strip the query and fragment from `start_url` and truncate its path to the
/// containing directory (everything up to and including the last `/`).
fn normalize_start_url(start_url: &Url) -> Url {
    let mut normalized = start_url.clone();
    normalized.set_query(None);
    normalized.set_fragment(None);
    let new_path = {
        let path = normalized.path();
        match path.rfind('/') {
            Some(idx) => path[..idx + 1].to_string(),
            None => path.to_string(),
        }
    };
    normalized.set_path(&new_path);
    normalized
}
/// True when `path` already covers `link`, treating `dir` and
/// `dir/index.html` / `dir/index.htm` as the same page.
fn match_link(path: &str, link: &str) -> bool {
    if path == link {
        return true;
    }
    let canonical = link
        .trim_end_matches("/index.html")
        .trim_end_matches("/index.htm");
    path == canonical
}
+217
View File
@@ -0,0 +1,217 @@
use super::{poll_abort_signal, wait_abort_signal, AbortSignal, IS_STDOUT_TERMINAL};
use anyhow::{bail, Result};
use crossterm::{cursor, queue, style, terminal};
use std::{
future::Future,
io::{stdout, Write},
time::Duration,
};
use tokio::{
sync::{
mpsc::{self, UnboundedReceiver},
oneshot,
},
time::interval,
};
/// Internal state of the terminal spinner animation.
#[derive(Debug, Default)]
pub struct SpinnerInner {
    // Frame counter; drives the glyph cycle and the trailing dots.
    index: usize,
    // Message shown next to the spinner; empty means "not spinning".
    message: String,
}
impl SpinnerInner {
    // Animation frames, cycled one per step.
    // NOTE(review): the frames appear as empty strings here — presumably
    // unicode spinner glyphs were lost in text extraction; confirm upstream.
    const DATA: [&'static str; 10] = ["", "", "", "", "", "", "", "", "", ""];
    /// Draw one animation frame: glyph + message + up to three trailing dots.
    /// No-op when stdout is not a terminal or no message is set.
    fn step(&mut self) -> Result<()> {
        if !*IS_STDOUT_TERMINAL || self.message.is_empty() {
            return Ok(());
        }
        let mut writer = stdout();
        let frame = Self::DATA[self.index % Self::DATA.len()];
        // Dots grow every 5 frames, cycling through 0..=3.
        let dots = ".".repeat((self.index / 5) % 4);
        let line = format!("{frame}{}{:<3}", self.message, dots);
        queue!(writer, cursor::MoveToColumn(0), style::Print(line),)?;
        if self.index == 0 {
            // Hide the cursor on the very first frame; `clear_message`
            // restores it with `cursor::Show`.
            queue!(writer, cursor::Hide)?;
        }
        writer.flush()?;
        self.index += 1;
        Ok(())
    }
    /// Replace the spinner message (erasing the old line first).
    /// An empty message leaves the spinner idle.
    fn set_message(&mut self, message: String) -> Result<()> {
        self.clear_message()?;
        if !message.is_empty() {
            // Leading space separates the glyph from the message.
            self.message = format!(" {message}");
        }
        Ok(())
    }
    /// Erase the spinner line and restore the cursor; no-op when idle or
    /// when stdout is not a terminal.
    fn clear_message(&mut self) -> Result<()> {
        if !*IS_STDOUT_TERMINAL || self.message.is_empty() {
            return Ok(());
        }
        self.message.clear();
        let mut writer = stdout();
        queue!(
            writer,
            cursor::MoveToColumn(0),
            terminal::Clear(terminal::ClearType::FromCursorDown),
            cursor::Show
        )?;
        writer.flush()?;
        Ok(())
    }
}
/// Cloneable handle that controls a spinner render loop over a channel.
#[derive(Clone)]
pub struct Spinner(mpsc::UnboundedSender<SpinnerEvent>);
impl Spinner {
    /// Create a handle plus the receiver end that a render loop
    /// (`spawn_spinner` / `abortable_run_with_spinner_rx`) consumes.
    pub fn create(message: &str) -> (Self, UnboundedReceiver<SpinnerEvent>) {
        let (tx, spinner_rx) = mpsc::unbounded_channel();
        let spinner = Spinner(tx);
        let _ = spinner.set_message(message.to_string());
        (spinner, spinner_rx)
    }
    /// Update the spinner message.
    pub fn set_message(&self, message: String) -> Result<()> {
        self.0.send(SpinnerEvent::SetMessage(message))?;
        // Brief pause so the render loop can process the event before the
        // caller prints anything. NOTE(review): this blocks the current
        // thread even in async contexts — confirm this is intentional.
        std::thread::sleep(Duration::from_millis(10));
        Ok(())
    }
    /// Stop the spinner; send errors are ignored (the render loop may
    /// already have exited).
    pub fn stop(&self) {
        let _ = self.0.send(SpinnerEvent::Stop);
        std::thread::sleep(Duration::from_millis(10));
    }
}
/// Messages understood by the spinner render loops.
pub enum SpinnerEvent {
    SetMessage(String),
    Stop,
}
/// Spawn a background task that animates the spinner every 50ms until a
/// `SpinnerEvent::Stop` arrives; returns the controlling handle.
pub fn spawn_spinner(message: &str) -> Spinner {
    let (spinner, mut spinner_rx) = Spinner::create(message);
    tokio::spawn(async move {
        let mut spinner = SpinnerInner::default();
        let mut interval = interval(Duration::from_millis(50));
        loop {
            tokio::select! {
                evt = spinner_rx.recv() => {
                    // Only an explicit Stop breaks the loop.
                    // NOTE(review): once every sender is dropped, `recv()`
                    // resolves to `None` immediately, so this branch can spin
                    // without ever breaking — confirm senders outlive the task.
                    if let Some(evt) = evt {
                        match evt {
                            SpinnerEvent::SetMessage(message) => {
                                spinner.set_message(message)?;
                            }
                            SpinnerEvent::Stop => {
                                spinner.clear_message()?;
                                break;
                            }
                        }
                    }
                }
                _ = interval.tick() => {
                    // Render errors are ignored; a dropped frame is harmless.
                    let _ = spinner.step();
                }
            }
        }
        Ok::<(), anyhow::Error>(())
    });
    spinner
}
/// Run `task` with a spinner showing `message`, stopping early when the
/// abort signal fires.
///
/// Convenience wrapper around `abortable_run_with_spinner_rx` that creates a
/// fresh spinner channel (the sender half is dropped immediately, so the
/// message cannot be updated afterwards).
pub async fn abortable_run_with_spinner<F, T>(
    task: F,
    message: &str,
    abort_signal: AbortSignal,
) -> Result<T>
where
    F: Future<Output = Result<T>>,
{
    let (_, rx) = Spinner::create(message);
    abortable_run_with_spinner_rx(task, rx, abort_signal).await
}
/// Drive `task` and a spinner render loop concurrently, stopping both when
/// the task finishes, ctrl-c is received, or the abort signal fires.
///
/// When stdout is not a terminal the spinner is skipped entirely and `task`
/// runs directly.
pub async fn abortable_run_with_spinner_rx<F, T>(
    task: F,
    spinner_rx: UnboundedReceiver<SpinnerEvent>,
    abort_signal: AbortSignal,
) -> Result<T>
where
    F: Future<Output = Result<T>>,
{
    if *IS_STDOUT_TERMINAL {
        // One-shot channel telling the spinner loop the task is done.
        let (done_tx, done_rx) = oneshot::channel();
        let run_task = async {
            tokio::select! {
                ret = task => {
                    let _ = done_tx.send(());
                    ret
                }
                _ = tokio::signal::ctrl_c() => {
                    abort_signal.set_ctrlc();
                    let _ = done_tx.send(());
                    bail!("Aborted!")
                },
                _ = wait_abort_signal(&abort_signal) => {
                    let _ = done_tx.send(());
                    bail!("Aborted.");
                },
            }
        };
        // Join (not select) so the spinner always cleans up the terminal
        // before this function returns; a spinner error takes precedence.
        let (task_ret, spinner_ret) = tokio::join!(
            run_task,
            run_abortable_spinner(spinner_rx, done_rx, abort_signal.clone())
        );
        spinner_ret?;
        task_ret
    } else {
        task.await
    }
}
/// Spinner render loop used by `abortable_run_with_spinner_rx`.
///
/// Every ~25ms it checks, in order: the abort flags, task completion
/// (`done_rx`), pending spinner events, and ctrl-c/ctrl-d key presses, then
/// renders one frame. Always clears the spinner line before returning.
async fn run_abortable_spinner(
    mut spinner_rx: UnboundedReceiver<SpinnerEvent>,
    mut done_rx: oneshot::Receiver<()>,
    abort_signal: AbortSignal,
) -> Result<()> {
    let mut spinner = SpinnerInner::default();
    loop {
        if abort_signal.aborted() {
            break;
        }
        tokio::time::sleep(Duration::from_millis(25)).await;
        // The task signalled completion (or its sender was dropped).
        match done_rx.try_recv() {
            Ok(_) | Err(oneshot::error::TryRecvError::Closed) => {
                break;
            }
            _ => {}
        }
        // Apply at most one pending spinner event per iteration; channel
        // errors (empty or disconnected) are ignored.
        match spinner_rx.try_recv() {
            Ok(SpinnerEvent::SetMessage(message)) => {
                spinner.set_message(message)?;
            }
            Ok(SpinnerEvent::Stop) => {
                spinner.clear_message()?;
            }
            Err(_) => {}
        }
        // Polls terminal key events and sets the abort flags on ctrl-c/ctrl-d.
        if poll_abort_signal(&abort_signal)? {
            break;
        }
        spinner.step()?;
    }
    // Leave the terminal clean (line erased, cursor restored).
    spinner.clear_message()?;
    Ok(())
}
+32
View File
@@ -0,0 +1,32 @@
use super::*;
use fancy_regex::{Captures, Regex};
use std::sync::LazyLock;
/// Matches `{{variable_name}}` placeholders; capture 1 is the bare name.
pub static RE_VARIABLE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{\{(\w+)\}\}").unwrap());
/// Expand `{{...}}` placeholders in `text` in place.
///
/// Recognized names (`__os__`, `__os_distro__`, `__os_family__`, `__arch__`,
/// `__shell__`, `__locale__`, `__now__`, `__cwd__`) are replaced with the
/// corresponding system value; unknown placeholders are left unchanged.
pub fn interpolate_variables(text: &mut String) {
    let replaced = RE_VARIABLE
        .replace_all(text, |captures: &Captures<'_>| {
            let name = &captures[1];
            match name {
                "__os__" => env::consts::OS.to_string(),
                "__os_distro__" => {
                    let info = os_info::get();
                    // Tag linux distros so the OS family stays visible.
                    if env::consts::OS == "linux" {
                        format!("{info} (linux)")
                    } else {
                        info.to_string()
                    }
                }
                "__os_family__" => env::consts::FAMILY.to_string(),
                "__arch__" => env::consts::ARCH.to_string(),
                "__shell__" => SHELL.name.clone(),
                "__locale__" => sys_locale::get_locale().unwrap_or_default(),
                "__now__" => now(),
                "__cwd__" => env::current_dir()
                    .map(|dir| dir.display().to_string())
                    .unwrap_or_default(),
                // Unknown name: emit the placeholder back verbatim.
                other => format!("{{{{{other}}}}}"),
            }
        })
        .to_string();
    *text = replaced;
}