feat: implemented the frontier-based scheduling for the graph executor with simplified state management (gotta love .clone)

This commit is contained in:
2026-05-20 13:48:55 -06:00
parent 5e4d3ff011
commit 36ac924d77
6 changed files with 341 additions and 136 deletions
+50
View File
@@ -149,6 +149,56 @@ impl RequestContext {
})
}
/// Builds the per-branch copy of this context used when a graph super-step
/// runs several nodes in parallel.
///
/// The copy is a sibling of the SAME logical agent: identity
/// (`self_agent_id`), supervisor handles and `current_depth` are carried over
/// unchanged. `new_for_child` is the counterpart for spawning a sub-agent.
///
/// Whatever the branch does to its copy (role swap, `last_message`, todo
/// list, ...) stays in that copy. Results that must outlive the branch are
/// expected to travel through the graph's `StateManager`
/// (`fork_for_branch_state` + `diff_against` + `apply_branch_writes`), not
/// through `RequestContext`.
///
/// How each field is propagated:
/// - `app` is shared through `Arc::clone`.
/// - `rag`, `supervisor`, `parent_supervisor`, `inbox` and
///   `escalation_queue` are duplicated with `.clone()`. They are described as
///   `Arc`-wrapped, so this should only share the underlying value —
///   NOTE(review): confirm against the struct definition.
/// - `macro_flag`, `info_flag`, `working_mode` and `current_depth` are copied
///   by value.
/// - `model`, `agent_variables`, `role`, `session`, `agent`, `last_message`,
///   `tool_scope`, `self_agent_id` and `todo_list` are cloned so the branch
///   owns independent values.
/// - `auto_continue_count` restarts at `0` and `last_continuation_response`
///   at `None`, giving every branch a fresh continuation budget.
#[allow(dead_code)]
pub fn fork_for_branch(&self) -> Self {
    Self {
        // Shared handles.
        app: Arc::clone(&self.app),
        rag: self.rag.clone(),
        supervisor: self.supervisor.clone(),
        parent_supervisor: self.parent_supervisor.clone(),
        inbox: self.inbox.clone(),
        escalation_queue: self.escalation_queue.clone(),

        // Plain values copied as-is.
        macro_flag: self.macro_flag,
        info_flag: self.info_flag,
        working_mode: self.working_mode,
        current_depth: self.current_depth,

        // Branch-owned clones: safe to mutate without affecting `self`.
        model: self.model.clone(),
        agent_variables: self.agent_variables.clone(),
        role: self.role.clone(),
        session: self.session.clone(),
        agent: self.agent.clone(),
        last_message: self.last_message.clone(),
        tool_scope: self.tool_scope.clone(),
        self_agent_id: self.self_agent_id.clone(),
        todo_list: self.todo_list.clone(),

        // Continuation bookkeeping starts over for every branch.
        auto_continue_count: 0,
        last_continuation_response: None,
    }
}
pub fn new_for_child(
app: Arc<AppState>,
parent: &Self,
+2 -1
View File
@@ -8,6 +8,7 @@ use serde_json::{Value, json};
use std::collections::HashMap;
use std::sync::Arc;
#[derive(Clone)]
pub struct ToolScope {
pub functions: Functions,
pub mcp_runtime: McpRuntime,
@@ -24,7 +25,7 @@ impl Default for ToolScope {
}
}
#[derive(Default)]
/// Holds the MCP servers that are currently connected.
///
/// `Clone` copies the map itself; each entry is an `Arc`, so a cloned runtime
/// refers to the same `ConnectedServer` instances as the original.
#[derive(Default, Clone)]
pub struct McpRuntime {
/// Connected servers by `String` key — presumably the server name; confirm
/// against the code that populates this map.
pub servers: HashMap<String, Arc<ConnectedServer>>,
}
+170 -45
View File
@@ -3,6 +3,7 @@ use super::llm::LlmNodeExecutor;
use super::logging::GraphLogger;
use super::rag::RagNodeExecutor;
use super::script::ScriptExecutor;
use super::staging::BranchWrites;
use super::state::StateManager;
use super::types::{EndNode, Graph, Node, NodeType};
use super::user_interaction::{ApprovalNodeExecutor, InputNodeExecutor};
@@ -10,11 +11,13 @@ use super::validator::{AgentValidationContext, GraphValidator};
use crate::config::RequestContext;
use crate::utils::AbortSignal;
use anyhow::{Context, Result, anyhow, bail};
use futures_util::future::join_all;
use serde_json::Value;
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::Semaphore;
pub struct GraphExecutor {
graph: Graph,
@@ -70,74 +73,196 @@ impl GraphExecutor {
let script_executor = ScriptExecutor::new(&base_dir);
let max_iterations = graph.settings.max_loop_iterations;
let graph_timeout = graph.settings.timeout.map(Duration::from_secs);
let max_concurrency = graph.settings.max_concurrency;
let start = Instant::now();
let mut current = graph.start.clone();
logger.graph_start(&current, graph.nodes.len());
let mut frontier: HashSet<String> = HashSet::from([graph.start.clone()]);
logger.graph_start(&graph.start, graph.nodes.len());
loop {
if frontier.is_empty() {
bail!(
"Graph '{}' frontier emptied without reaching an End node",
graph.name
);
}
let output = loop {
if abort_signal.aborted() {
bail!("Graph '{}' aborted at '{}'", graph.name, current);
bail!(
"Graph '{}' aborted before super-step with frontier {:?}",
graph.name,
sorted_frontier(&frontier)
);
}
if let Some(t) = graph_timeout
&& start.elapsed() > t
{
bail!(
"Graph '{}' timed out after {}s at '{}'",
"Graph '{}' timed out after {}s before super-step with frontier {:?}",
graph.name,
t.as_secs(),
current
sorted_frontier(&frontier)
);
}
state.state_mut().visit_node(&current);
let visits = state.state().loop_count(&current);
if visits > max_iterations {
// Loop-count and visit tracking on live state, BEFORE forking.
// This counts every entry to a node toward max_loop_iterations
// regardless of how many parallel branches converged on it.
for node_id in &frontier {
state.state_mut().visit_node(node_id);
let visits = state.state().loop_count(node_id);
if visits > max_iterations {
bail!(
"Node '{}' visited {} times (max_loop_iterations={}). \
Possible infinite loop.",
node_id,
visits,
max_iterations
);
}
}
for node_id in &frontier {
let node = graph.get_node(node_id).ok_or_else(|| {
anyhow!("Node '{}' not found in graph '{}'", node_id, graph.name)
})?;
let visits = state.state().loop_count(node_id);
logger.node_entry(node, visits);
}
let snapshot_label = if frontier.len() == 1 {
frontier.iter().next().cloned().unwrap_or_default()
} else {
format!("super-step {{{}}}", sorted_frontier(&frontier).join(","))
};
logger.state_snapshot(&snapshot_label, &state);
let snapshot = state.read_snapshot();
let semaphore = Arc::new(Semaphore::new(max_concurrency));
let mut branch_tasks = Vec::with_capacity(frontier.len());
for node_id in &frontier {
let node = graph
.get_node(node_id)
.ok_or_else(|| {
anyhow!("Node '{}' not found in graph '{}'", node_id, graph.name)
})?
.clone();
let branch_state = state.fork_for_branch_state();
let branch_ctx = ctx.fork_for_branch();
let script_exec_clone = script_executor.clone();
let graph_name = graph.name.clone();
let current = node_id.clone();
let sem_clone = semaphore.clone();
let abort_clone = abort_signal.clone();
let task = tokio::spawn(async move {
let _permit = sem_clone
.acquire()
.await
.expect("semaphore should not be closed");
if abort_clone.aborted() {
return (
current.clone(),
branch_state,
Err(anyhow!("branch aborted")),
Duration::default(),
);
}
let node_start = Instant::now();
let mut state = branch_state;
let mut ctx = branch_ctx;
let result = step(
&node,
&mut state,
&mut ctx,
&script_exec_clone,
&graph_name,
&current,
)
.await;
let elapsed = node_start.elapsed();
(current, state, result, elapsed)
});
branch_tasks.push(task);
}
let joined = join_all(branch_tasks).await;
let mut branch_writes: Vec<BranchWrites> = Vec::new();
let mut next_frontier: HashSet<String> = HashSet::new();
let mut end_results: Vec<(String, StateManager, String)> = Vec::new();
for join_result in joined {
let (node_id, branch_state, step_result, elapsed) =
join_result.map_err(|e| anyhow!("Branch task panicked: {e}"))?;
logger.record_timing(&node_id, elapsed);
let step_outcome = step_result.with_context(|| format!("at node '{node_id}'"))?;
match step_outcome {
StepResult::Continue(target) => {
logger.routing(&node_id, &target);
let diff = branch_state.diff_against(snapshot.as_ref());
branch_writes.push(BranchWrites {
node_id: node_id.clone(),
invocation_index: 0,
writes: diff,
});
next_frontier.insert(target);
}
StepResult::End(output) => {
end_results.push((node_id.clone(), branch_state, output));
}
}
}
if end_results.len() > 1 {
let mut ids: Vec<String> =
end_results.iter().map(|(id, _, _)| id.clone()).collect();
ids.sort();
bail!(
"Node '{}' visited {} times (max_loop_iterations={}). \
Possible infinite loop.",
current,
visits,
max_iterations
"super-step ended with multiple End targets ({}). \
Fan-out branches must converge at a join node before \
terminating. To fix: route all parallel branches to a \
single shared next-node, then terminate from there.",
ids.join(", ")
);
}
let node = graph
.get_node(&current)
.ok_or_else(|| anyhow!("Node '{}' not found in graph '{}'", current, graph.name))?;
// Sort by (node_id, invocation_index) so non-commutative reducers
// like Concat/Merge produce deterministic output across runs.
branch_writes.sort_by(|a, b| {
a.node_id
.cmp(&b.node_id)
.then(a.invocation_index.cmp(&b.invocation_index))
});
state.apply_branch_writes(branch_writes, &graph.reducers)?;
logger.node_entry(node, visits);
logger.state_snapshot(&current, &state);
let node_start = Instant::now();
let step_result = step(
node,
&mut state,
ctx,
&script_executor,
&graph.name,
&current,
)
.await;
logger.record_timing(&current, node_start.elapsed());
let next = step_result.with_context(|| format!("at node '{current}'"))?;
match next {
StepResult::Continue(next_id) => {
logger.routing(&current, &next_id);
current = next_id;
}
StepResult::End(out) => {
logger.graph_complete(&current, start.elapsed());
break out;
}
if let Some((node_id, end_state, output)) = end_results.into_iter().next() {
let diff = end_state.diff_against(snapshot.as_ref());
state.apply_branch_writes(
vec![BranchWrites {
node_id: node_id.clone(),
invocation_index: 0,
writes: diff,
}],
&graph.reducers,
)?;
logger.graph_complete(&node_id, start.elapsed());
return Ok(output);
}
};
Ok(output)
frontier = next_frontier;
}
}
}
/// Returns the node ids in `frontier` as an ascending, lexicographically
/// sorted `Vec`.
///
/// A `HashSet` iterates in arbitrary order, so error messages and snapshot
/// labels go through this helper to stay stable from run to run.
fn sorted_frontier(frontier: &HashSet<String>) -> Vec<String> {
    let mut ids = Vec::with_capacity(frontier.len());
    ids.extend(frontier.iter().cloned());
    // Set members are unique, so an unstable sort yields the same order as a
    // stable one.
    ids.sort_unstable();
    ids
}
enum StepResult {
Continue(String),
End(String),
+1
View File
@@ -10,6 +10,7 @@ use std::time::Duration;
use tokio::process::Command;
use tokio::time::timeout;
/// Executor for script nodes.
///
/// Deriving `Clone` lets a caller hand an independent copy to each concurrently
/// running task; the only state copied is `base_dir`.
#[derive(Clone)]
pub struct ScriptExecutor {
// Directory this executor was configured with — presumably the root against
// which script paths are resolved; confirm in `ScriptExecutor::new`.
base_dir: PathBuf,
}
-82
View File
@@ -1,38 +1,6 @@
use serde_json::Value;
use std::collections::HashMap;
/// Buffer of pending key/value writes.
///
/// Writing the same key twice keeps only the most recent value.
#[derive(Debug, Default, Clone)]
pub struct StagingArea {
    writes: HashMap<String, Value>,
}

#[allow(dead_code)]
impl StagingArea {
    /// Creates an empty staging area.
    pub fn new() -> Self {
        Self {
            writes: HashMap::default(),
        }
    }

    /// Stages `value` under `key`, replacing any value staged earlier for it.
    pub fn write(&mut self, key: impl Into<String>, value: Value) {
        let owned_key: String = key.into();
        self.writes.insert(owned_key, value);
    }

    /// Looks up the value currently staged for `key`, if any.
    pub fn get(&self, key: &str) -> Option<&Value> {
        self.writes.get(key)
    }

    /// Reports whether nothing has been staged.
    pub fn is_empty(&self) -> bool {
        self.writes.is_empty()
    }

    /// Number of distinct keys staged.
    pub fn len(&self) -> usize {
        self.writes.len()
    }

    /// Consumes the staging area and hands back the staged map.
    pub fn into_writes(self) -> HashMap<String, Value> {
        let Self { writes } = self;
        writes
    }
}
/// Published form of one branch's writes for the super-step merge phase.
/// Callers assemble these into a deterministically-ordered `Vec` keyed by
/// `(node_id, invocation_index)` before passing to
@@ -40,58 +8,8 @@ impl StagingArea {
/// branches and the input-list position for map sub-branches — so multiple
/// invocations of the same `branch:` node by a `map` are still totally ordered.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct BranchWrites {
/// Id of the graph node whose branch produced these writes; primary sort key
/// when a batch of `BranchWrites` is put into deterministic order.
pub node_id: String,
/// Secondary sort key that distinguishes several invocations of the same
/// node within one merge.
pub invocation_index: usize,
/// State keys the branch wrote, mapped to the values it wrote.
pub writes: HashMap<String, Value>,
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// A freshly constructed area reports itself as empty.
    #[test]
    fn new_staging_area_is_empty() {
        let area = StagingArea::new();
        assert!(area.is_empty());
        assert_eq!(area.len(), 0);
    }

    /// A single write becomes readable under its key and is counted.
    #[test]
    fn write_stores_value_under_key() {
        let mut area = StagingArea::new();
        area.write("key", json!("value"));

        let stored = area.get("key");
        assert_eq!(stored, Some(&json!("value")));
        assert_eq!(area.len(), 1);
        assert!(!area.is_empty());
    }

    /// A second write to the same key replaces the first instead of adding an entry.
    #[test]
    fn write_overwrites_existing_key() {
        let mut area = StagingArea::new();
        for version in [1, 2] {
            area.write("k", json!(version));
        }

        assert_eq!(area.get("k"), Some(&json!(2)));
        assert_eq!(area.len(), 1);
    }

    /// `into_writes` surrenders every staged entry as a plain map.
    #[test]
    fn into_writes_consumes_and_yields_map() {
        let mut area = StagingArea::new();
        area.write("a", json!(1));
        area.write("b", json!(2));

        let staged = area.into_writes();
        assert_eq!(staged.len(), 2);
        assert_eq!(staged.get("a"), Some(&json!(1)));
        assert_eq!(staged.get("b"), Some(&json!(2)));
    }
}
+118 -8
View File
@@ -159,13 +159,44 @@ impl StateManager {
}
}
/// Returns an `Arc`-wrapped snapshot of the current graph state. Each branch
/// in a parallel super-step shares this snapshot for reads; their writes
/// accumulate into per-branch `StagingArea` instances, which are merged via
/// `apply_branch_writes` at the end of the super-step.
/// Forks state for a parallel branch: returns a fully-owned `StateManager`
/// seeded from the current state's data. The branch mutates its fork
/// freely; callers extract its writes via `diff_against` after the branch
/// completes, then merge them via `apply_branch_writes`.
///
/// Distinct from the older `snapshot()` method (returns a `HashMap` clone of
/// the data only — used by `script_executor` to ship state to child processes).
/// Distinct from `read_snapshot` (returns a shared `Arc<GraphState>` for
/// reads) — `fork_for_branch_state` returns a writable owned clone.
pub fn fork_for_branch_state(&self) -> Self {
    // Clone the graph state so the fork can be mutated without touching `self`.
    let state = self.state.clone();

    // The fork starts without a temp file of its own rather than taking over
    // the one held by `self`.
    Self {
        state,
        temp_file: None,
    }
}
/// Computes what this manager's state changed relative to `snapshot`.
///
/// The result maps each key to its current value when the key is either
/// missing from `snapshot` or holds a different value there. Keys whose value
/// matches the snapshot are left out, so a value that was changed and then
/// restored does not show up.
///
/// Keys that exist only in `snapshot` are never reported: deletions cannot be
/// expressed in the returned map.
///
/// Typical use: call on a branch's forked state once the branch has finished
/// and feed the result to `apply_branch_writes`.
pub fn diff_against(&self, snapshot: &GraphState) -> HashMap<String, Value> {
    self.state
        .data()
        .into_iter()
        .filter(|&(key, value)| snapshot.get(key) != Some(value))
        .map(|(key, value)| (key.clone(), value.clone()))
        .collect()
}
/// Returns an `Arc`-wrapped snapshot of the current graph state. Each
/// branch in a parallel super-step uses this snapshot as the baseline for
/// its `diff_against` call at branch end. The executor extracts each
/// branch's writes (the diff) and merges them via `apply_branch_writes` at
/// the super-step boundary.
///
/// Distinct from the older `snapshot()` method (returns a `HashMap` clone
/// of the data only — used by `script_executor` to ship state to child
/// processes).
#[allow(dead_code)]
pub fn read_snapshot(&self) -> Arc<GraphState> {
Arc::new(self.state.clone())
@@ -936,12 +967,91 @@ mod tests {
#[test]
fn interpolate_raw_inner_spaces_treated_as_mixed() {
    // Spaces inside the braces disqualify `{{ k }}` as a pure reference. The
    // string-interpolation fallback does not recognise it either, so the
    // literal text is expected to come back untouched.
    let mgr = manager_with(&[("k", json!("v"))]);
    let rendered = mgr.interpolate_raw("{{ k }}").unwrap();
    let expected = json!("{{ k }}");
    assert_eq!(rendered, expected);
}
#[test]
fn fork_for_branch_state_copies_data() {
    // Every key present in the source must be readable from the fork.
    let source = manager_with(&[("a", json!(1)), ("b", json!("x"))]);
    let branch = source.fork_for_branch_state();

    let forked = branch.state();
    assert_eq!(forked.get("a"), Some(&json!(1)));
    assert_eq!(forked.get("b"), Some(&json!("x")));
}
#[test]
fn fork_for_branch_state_isolates_writes_from_parent() {
    let source = manager_with(&[("count", json!(10))]);
    let mut branch = source.fork_for_branch_state();

    branch.state_mut().set("count".into(), json!(999));

    // The fork observes its own write; the source keeps the original value.
    let branch_value = branch.state().get("count");
    assert_eq!(branch_value, Some(&json!(999)));
    let source_value = source.state().get("count");
    assert_eq!(source_value, Some(&json!(10)));
}
#[test]
fn fork_for_branch_state_does_not_share_temp_file_lifecycle() {
    let source = manager_with(&[("k", json!("v"))]);
    let branch = source.fork_for_branch_state();

    // A fork never carries a temp file of its own.
    assert!(branch.temp_file.is_none());

    // Tearing the fork down must leave the source's data intact.
    drop(branch);
    let surviving = source.state().get("k");
    assert_eq!(surviving, Some(&json!("v")));
}
#[test]
fn diff_against_returns_empty_when_unchanged() {
    // An untouched fork has nothing to report against its source.
    let baseline = manager_with(&[("a", json!(1)), ("b", json!(2))]);
    let branch = baseline.fork_for_branch_state();

    let changes = branch.diff_against(baseline.state());
    assert!(changes.is_empty());
}
#[test]
fn diff_against_reports_newly_written_keys() {
    // Start from empty state so the only possible difference is the new key.
    let baseline = manager_with(&[]);
    let mut branch = baseline.fork_for_branch_state();
    branch.state_mut().set("new".into(), json!(42));

    let changes = branch.diff_against(baseline.state());
    assert_eq!(changes.len(), 1);
    assert_eq!(changes.get("new"), Some(&json!(42)));
}
#[test]
fn diff_against_reports_changed_values_only() {
    let baseline = manager_with(&[("a", json!(1)), ("b", json!(2)), ("c", json!(3))]);
    let mut branch = baseline.fork_for_branch_state();

    // Only `b` is modified; `a` and `c` keep their baseline values.
    branch.state_mut().set("b".into(), json!(99));

    let changes = branch.diff_against(baseline.state());
    assert_eq!(changes.len(), 1);
    assert_eq!(changes.get("b"), Some(&json!(99)));
    assert!(!changes.contains_key("a"));
    assert!(!changes.contains_key("c"));
}
#[test]
fn diff_against_does_not_report_reverted_writes() {
    let baseline = manager_with(&[("x", json!("initial"))]);
    let mut branch = baseline.fork_for_branch_state();

    // Change the value, then restore it: the net effect on `x` is nothing.
    branch.state_mut().set("x".into(), json!("modified"));
    branch.state_mut().set("x".into(), json!("initial"));

    let changes = branch.diff_against(baseline.state());
    assert!(changes.is_empty(), "reverted write should not appear in diff");
}
}
}