feat: Created the step-runner graph agent for more deterministic coding workflows to produce even more reliable and higher-quality results
This commit is contained in:
@@ -0,0 +1,599 @@
|
||||
# Identity of this graph agent plus a one-paragraph summary of its pipeline.
name: step-runner
description: |
  Executes ONE step of a phased implementation plan (plans/ repo) with the
  step protocol enforced as graph edges: orient -> staleness check ->
  implement (coder) -> verify -> edge-case sweep -> optional independent
  review -> evidence-backed handoff -> user approval gate. Designed to be
  delegated to by sisyphus.
# Quoted so the value stays the string "1.0" instead of the float 1.0.
version: "1.0"
|
||||
|
||||
# Tool scripts declared for the whole graph. The llm nodes below reference
# them by the same names without the `.sh` suffix in their own `tools` lists.
global_tools:
  - fs_cat.sh
  - fs_ls.sh
  - fs_write.sh
  - fs_patch.sh
  - execute_command.sh
|
||||
|
||||
# Graph-level skill switch and the skills declared for this agent. Each llm
# node repeats the subset it loads under its own `enabled_skills`.
skills_enabled: true
enabled_skills:
  - step-implementation
  - handoff-protocol
  - code-review
  - ai-slop-remover
|
||||
|
||||
# Caller-supplied inputs. Every entry carries a default, so the agent can be
# invoked with no arguments from the project root.
variables:
  # Root of the code base the step operates on.
  - name: project_dir
    description: |
      Absolute path to the project directory. Defaults to "." (the directory
      coyote was invoked from). The coder sub-agent resolves its own
      project_dir the same way, so invoke step-runner FROM the project root
      unless you override this for both.
    default: "."
  # Location of the plan repo (steps/, handoffs/, NOTES.md).
  - name: plans_dir
    description: |
      Path to the plan repo. Relative paths resolve against project_dir.
      Expected layout: <plans_dir>/steps/NN-<slug>.md,
      <plans_dir>/handoffs/, <plans_dir>/NOTES.md.
    default: "plans"
  # Step selector: an explicit number, or "next" for automatic selection.
  - name: step
    description: |
      Which step to execute: a step number, or "next" to pick the first
      in-progress (resume) or pending step plan.
    default: "next"
|
||||
|
||||
# Engine-level run settings.
# NOTE(review): the unit of `timeout` is not stated in this file (presumably
# seconds) - confirm. The implement node alone is allowed 3600 and can run
# again via the fix loop, so verify this overall ceiling leaves enough room.
settings:
  max_loop_iterations: 20
  log_state_snapshots: true
  validate_before_run: true
  timeout: 7200
|
||||
|
||||
# Starting values for every state key the nodes read through {{...}}
# templates. The grouping comments below are a reading aid only.
initial_state:
  # Step resolution (presumably filled in by scripts/resolve_step.sh - confirm).
  project_dir: ""
  plans_dir: ""
  step_number: 0
  step_slug: ""
  step_title: ""
  step_plan_path: ""
  step_plan: ""
  prev_handoff_path: "(none)"
  prev_handoff: "(none - this is the first step)"
  notes_path: ""
  notes: "(none)"
  handoff_path: ""
  blocking_reason: ""
  # Orientation results (same names as the orient node's output_schema).
  plan_summary: ""
  implementation_brief: ""
  staleness_report: ""
  has_major_deviation: false
  deviation_summary: ""
  # User input captured at the approval and input gates.
  user_feedback: ""
  # Step-level fix loop bookkeeping.
  fix_instructions: ""
  fix_attempts: 0
  max_fix_attempts: 2
  # Implementation and verification evidence.
  coder_result: ""
  format_output: ""
  lint_ok: true
  lint_output: ""
  build_ok: true
  build_output: ""
  tests_ok: true
  tests_output: ""
  # Edge-case sweep results (same names as edge_case_sweep's output_schema).
  edge_case_report: ""
  downstream_updates: ""
  needs_independent_review: false
  # Independent review bookkeeping.
  review_report: ""
  review_attempts: 0
  max_review_attempts: 1
  # Handoff writing and validation.
  handoff_attempts: 0
  handoff_fix: ""
  step_summary: ""
|
||||
|
||||
# Entry node of the graph.
start: resolve_step
|
||||
|
||||
nodes:
|
||||
# Script node: resolves which step to run and loads its inputs.
# On the normal path it continues to orient; `fallback` names end_failure.
resolve_step:
  id: resolve_step
  type: script
  description: |
    Locate the step plan, previous handoff, and NOTES.md; parse frontmatter;
    check depends_on satisfaction against existing handoffs; mark the plan
    in-progress. Routes to gate_blocked when dependencies are unsatisfied.
  script: scripts/resolve_step.sh
  timeout: 30
  fallback: end_failure
  next: orient
|
||||
|
||||
# Approval node: asks the user whether to continue despite unmet dependencies.
# The option labels are quoted so "yes"/"no" stay strings, not booleans.
# NOTE(review): `on_other` is placed as a sibling of `routes`; the nesting was
# reconstructed from flattened text - confirm against the engine schema.
gate_blocked:
  id: gate_blocked
  type: approval
  description: Escalate unsatisfied dependencies instead of building on missing ground.
  question: |
    Step {{step_number}} ({{step_title}}) is BLOCKED:

    {{blocking_reason}}

    Proceed anyway?
  options:
    - "yes"
    - "no"
  routes:
    "yes": orient
    "no": end_blocked
  on_other: end_blocked
|
||||
|
||||
# LLM node: read-only orientation and staleness check before any edit.
# Its structured output (see output_schema) supplies the brief and the
# deviation flag; the node then continues to route_staleness.
orient:
  id: orient
  type: llm
  description: |
    Read-only orientation and staleness check: merge the previous handoff's
    directives with the step plan, then verify the plan's assumptions
    against the CURRENT codebase before any edit.
  skills_enabled: true
  enabled_skills:
    - step-implementation
  instructions: |
    You are orienting for one step of a phased implementation plan. Load
    `step-implementation` and apply its Orient and Staleness-check phases.
    You are READ-ONLY in this node: no edits, no fixes.

    1. Read the previous handoff (below). Note directives aimed at this
    step, deviations that changed the codebase, and bare assertions
    that need re-verification.
    2. Staleness-check the step plan against the code at {{project_dir}}:
    grep the symbols it references (via execute_command), read its
    Context snippets at their claimed locations with fs_cat, confirm
    its Test commands exist.
    3. Classify discrepancies per the skill's deviation table: minor
    (mechanics differ; correct silently in the brief) vs major (scope,
    approach, interfaces, or a later step's assumptions affected).

    Produce `implementation_brief`: the corrected, self-contained marching
    orders for the implementer - plan tasks in order, handoff directives
    applied, minor staleness corrections folded in, acceptance criteria
    restated. The implementer sees ONLY the step plan plus your brief.
  prompt: |
    ## Step plan ({{step_plan_path}})
    {{step_plan}}

    ## Previous handoff ({{prev_handoff_path}})
    {{prev_handoff}}

    ## Rolling project notes
    {{notes}}
  # No write tools are listed; execute_command is the only tool here that
  # could modify files, so the read-only rule relies on the instructions.
  tools:
    - fs_cat
    - fs_ls
    - execute_command
  max_iterations: 20
  output_schema:
    type: object
    properties:
      plan_summary:
        type: string
        description: 1-3 sentences summarizing what this step delivers
      implementation_brief:
        type: string
        description: Corrected, self-contained instructions for the implementer
      staleness_report:
        type: string
        description: Findings from checking plan assumptions against current code; "clean" if none
      has_major_deviation:
        type: boolean
        description: True when a discrepancy changes scope, approach, or interfaces
      deviation_summary:
        type: string
        description: Major deviations only, with the plan claim vs current reality. Empty when none
    # Every property is mandatory.
    required:
      - plan_summary
      - implementation_brief
      - staleness_report
      - has_major_deviation
      - deviation_summary
  fallback: end_failure
  next: route_staleness
|
||||
|
||||
# Script node: chooses between the deviation gate and implementation.
# There is no `next`; the script (not visible here) selects the route.
# NOTE(review): `fallback: implement` means that if this router falls back,
# the run goes to implement without passing gate_deviation. Confirm that is
# intended, given that gate's "never silently absorbed" rule.
route_staleness:
  id: route_staleness
  type: script
  description: Major deviation -> user gate; otherwise straight to implement.
  script: scripts/route_staleness.sh
  timeout: 5
  fallback: implement
|
||||
|
||||
# Approval node: the user decides how to handle a major plan/code mismatch.
# Any free-form answer also routes to implement, and the raw answer is stored
# in user_feedback (so "proceed" itself is stored there as well).
# NOTE(review): `on_other` and `state_updates` are placed as siblings of
# `routes`; the nesting was reconstructed - confirm against the engine schema.
gate_deviation:
  id: gate_deviation
  type: approval
  description: Major deviations are never silently absorbed - the user decides.
  question: |
    Step {{step_number}} ({{step_title}}): the plan no longer matches the
    codebase in a way that changes scope or approach.

    {{deviation_summary}}

    Staleness report:
    {{staleness_report}}

    Proceed with the corrected brief? (Answer with anything else to give
    your own guidance to the implementer.)
  options:
    - "proceed"
    - "abort"
  routes:
    "proceed": implement
    "abort": end_rejected
  on_other: implement
  state_updates:
    user_feedback: "{{choice}}"
|
||||
|
||||
# Agent node: hands the step to the `coder` agent with the plan, the
# orientation brief, any user guidance and any fix-loop instructions.
# The agent's output is stored in coder_result for route_coder_result.
# NOTE(review): unlike the llm and script nodes, no `fallback` is declared
# here - confirm how the engine handles an agent failure or timeout.
implement:
  id: implement
  type: agent
  description: |
    Delegate implementation to the coder graph agent, which runs its own
    plan -> implement -> build -> tests -> self-review fix-loop internally.
  agent: coder
  prompt: |
    ## TASK
    Execute step {{step_number}} ({{step_title}}) of a phased implementation
    plan for the project at {{project_dir}}.

    ## EXPECTED OUTCOME
    Every task in the step plan below is implemented and its acceptance
    criteria are met. Tests are derived from the Acceptance criteria
    section (not from the implementation). Build and full test suite pass.

    ## MUST DO
    - Follow the Orientation brief below - it supersedes the raw plan where
    they disagree (it folds in corrections from the staleness check).
    - Match the patterns pasted in the step plan's Context section.
    - Derive tests from the plan's Acceptance criteria.

    ## MUST NOT DO
    - Do not touch anything listed in the plan's Out of scope section.
    - Do not modify files under {{plans_dir}}.
    - Do not implement work belonging to other steps.

    ## CONTEXT
    ### Step plan
    {{step_plan}}

    ### Orientation brief (handoff directives + staleness corrections applied)
    {{implementation_brief}}

    ### User guidance (if any)
    {{user_feedback}}

    ### Fix loop status (empty on first attempt)
    {{fix_instructions}}
  timeout: 3600
  state_updates:
    coder_result: "{{output}}"
  next: route_coder_result
|
||||
|
||||
# Script node: routes on the sentinel in the coder's output.
# There is no `next`; the script selects the route, and `fallback` names
# end_failure.
route_coder_result:
  id: route_coder_result
  type: script
  description: Route on the coder sentinel - COMPLETE verifies, REJECTED/FAILED terminate.
  script: scripts/route_coder_result.sh
  timeout: 5
  fallback: end_failure
|
||||
|
||||
# Script node: first verification stage (format, then lint).
# `fallback` names fix_loop_gate; the success route is chosen by the script.
verify_format_lint:
  id: verify_format_lint
  type: script
  description: |
    Format BEFORE evidence collection (FORMAT_CMD override or per-type
    heuristic), then lint (LINT_CMD, when configured). Lint failure routes
    to the fix loop.
  script: scripts/verify_format_lint.sh
  timeout: 300
  fallback: fix_loop_gate
|
||||
|
||||
# Script node: second verification stage (build/typecheck).
# `fallback` names fix_loop_gate; the success route is chosen by the script.
verify_build:
  id: verify_build
  type: script
  description: Step-level build/typecheck evidence, collected AFTER formatting.
  script: scripts/verify_build.sh
  timeout: 600
  fallback: fix_loop_gate
|
||||
|
||||
# Script node: third verification stage (full test suite). It has the
# largest budget of the three verify nodes.
# `fallback` names fix_loop_gate; the success route is chosen by the script.
verify_tests:
  id: verify_tests
  type: script
  description: FULL test suite - regressions in untouched code fail the step too.
  script: scripts/verify_tests.sh
  timeout: 1200
  fallback: fix_loop_gate
|
||||
|
||||
# Script node: enforces the step-level fix budget after a failed verify stage.
# Presumably compares fix_attempts with max_fix_attempts from state - confirm
# in the script. `fallback` names end_failure.
fix_loop_gate:
  id: fix_loop_gate
  type: script
  description: |
    Step-level fix budget (the coder already ran its own internal fix
    loop). Loops to implement with fix_instructions, or ends as failure.
  script: scripts/fix_loop_gate.sh
  timeout: 5
  fallback: end_failure
|
||||
|
||||
# LLM node: post-verification sweep for missed edge cases and effects on
# later plans. It may patch later plans' Edge cases sections (fs_patch is
# granted) and also decides whether an independent review is needed.
# `fallback` names write_handoff, so a failed sweep still leads to a handoff.
edge_case_sweep:
  id: edge_case_sweep
  type: llm
  description: |
    Post-implementation sweep: missed spots, edge cases, downstream plan
    implications. May annotate downstream plans' Edge cases sections
    (annotate vs propose per handoff-protocol). Also judges whether the
    change warrants an independent review pass.
  skills_enabled: true
  enabled_skills:
    - step-implementation
    - handoff-protocol
  instructions: |
    The implementation for this step just passed build and tests. Load
    `step-implementation` (edge-case sweep phase) and `handoff-protocol`
    (annotate-vs-propose rules), then:

    1. Read the changed code (the coder result below names the files).
    Look for edge cases the plan missed: empty inputs, error paths,
    concurrency, partial failure, compat.
    2. For each edge case belonging to a LATER step: check that step's
    plan under {{plans_dir}}/steps/. If its Edge cases section already
    covers it, done. If not, append an entry to that section via
    fs_patch - touch NOTHING else in the file.
    3. NEVER edit a later plan's Objective, Tasks, Acceptance criteria,
    or Out of scope. Scope-affecting changes become proposed diffs in
    `downstream_updates` instead.
    4. Set needs_independent_review=true when the change touched 5+ files
    or crosses architectural boundaries (auth, public APIs, schema,
    security-sensitive paths).

    Be terse. Findings, not prose.
  prompt: |
    ## Coder result
    {{coder_result}}

    ## Step plan
    {{step_plan}}

    ## Staleness report from orientation
    {{staleness_report}}
  tools:
    - fs_cat
    - fs_ls
    - fs_patch
    - execute_command
  max_iterations: 20
  output_schema:
    type: object
    properties:
      edge_case_report:
        type: string
        description: Edge cases discovered - both handled and punted, one per line. "none" if empty
      downstream_updates:
        type: string
        description: Annotations made (plan file + section) and proposed diffs for scope-affecting changes. "none" if empty
      needs_independent_review:
        type: boolean
    # Every property is mandatory.
    required:
      - edge_case_report
      - downstream_updates
      - needs_independent_review
  fallback: write_handoff
  next: route_sweep
|
||||
|
||||
# Script node: decides whether the change gets an independent review.
# There is no `next`; the script selects the route, and `fallback` names
# write_handoff.
route_sweep:
  id: route_sweep
  type: script
  description: Broad or boundary-crossing changes get an independent reviewer.
  script: scripts/route_sweep.sh
  timeout: 5
  fallback: write_handoff
|
||||
|
||||
# Agent node: hands the finished change to the `code-reviewer` agent.
# The prompt carries the full step plan (and its path) because the reviewer
# is told to judge the change against the plan's acceptance criteria; the
# short plan_summary alone does not contain them.
# The agent's output is stored in review_report for route_review.
independent_review:
  id: independent_review
  type: agent
  description: Independent review pass - the author's self-review cannot catch its own rationalizations.
  agent: code-reviewer
  prompt: |
    Review the changes produced for step {{step_number}} ({{step_title}})
    of a phased implementation plan in {{project_dir}}.

    What the step was supposed to do:
    {{plan_summary}}

    Step plan ({{step_plan_path}}) - its Acceptance criteria section is the
    review baseline:
    {{step_plan}}

    Coder summary (names the modified/created files):
    {{coder_result}}

    Review the changed files against the step plan's acceptance criteria.
    Preserve severity tags in your findings.
  timeout: 1200
  state_updates:
    review_report: "{{output}}"
  next: route_review
|
||||
|
||||
# Script node: routes on the independent review's findings.
# There is no `next`; the script selects the route, and `fallback` names
# write_handoff.
route_review:
  id: route_review
  type: script
  description: Critical findings loop back to implement (bounded); otherwise proceed to handoff.
  script: scripts/route_review.sh
  timeout: 5
  fallback: write_handoff
|
||||
|
||||
# LLM node: writes the handoff document and appends durable facts to the
# notes file. The prompt feeds in every piece of evidence gathered so far,
# and the node returns only a short summary for the user approval gate.
# Validation of the written document happens in check_handoff.
write_handoff:
  id: write_handoff
  type: llm
  description: |
    Write the evidence-backed handoff per handoff-protocol and append
    durable facts to NOTES.md. The completion gate (check_handoff)
    verifies the document afterward.
  skills_enabled: true
  enabled_skills:
    - handoff-protocol
    - ai-slop-remover
  instructions: |
    Load `handoff-protocol` and follow its writer schema EXACTLY: the
    frontmatter (step, title, result) and all eight sections, writing
    "None" rather than omitting a section.

    Write the handoff to {{handoff_path}} with fs_write. Paste the
    verification evidence below verbatim into the Evidence section -
    commands, exit codes, decisive output lines. Deviations come from the
    staleness report, gate decisions, and fix loop history. Downstream
    plan updates come from the sweep results.

    Then append durable, step-independent facts (if any) to {{notes_path}}
    - create the file if missing, never rewrite existing entries.

    If "Gate feedback" below is non-empty, a previous handoff attempt
    failed validation - fix exactly what it lists.
  prompt: |
    ## Step
    {{step_number}} ({{step_title}}) - plan at {{step_plan_path}}

    ## Plan summary
    {{plan_summary}}

    ## Coder result
    {{coder_result}}

    ## Staleness report / deviations
    {{staleness_report}}

    Major deviation summary (if any): {{deviation_summary}}
    User guidance given (if any): {{user_feedback}}
    Fix loop attempts used: {{fix_attempts}} of {{max_fix_attempts}}

    ## Edge cases discovered
    {{edge_case_report}}

    ## Downstream plan updates
    {{downstream_updates}}

    ## Independent review report (if any)
    {{review_report}}

    ## Verification evidence (paste verbatim)
    ### Format
    {{format_output}}
    ### Lint
    {{lint_output}}
    ### Build
    {{build_output}}
    ### Tests
    {{tests_output}}

    ## Gate feedback
    {{handoff_fix}}
  # File tools only; no command execution is granted to this node.
  tools:
    - fs_cat
    - fs_ls
    - fs_write
    - fs_patch
  max_iterations: 15
  output_schema:
    type: object
    properties:
      step_summary:
        type: string
        description: 3-6 sentence summary of the step for the user's approval decision - what was done, deviations, anything needing their attention
    required:
      - step_summary
  fallback: end_failure
  next: check_handoff
|
||||
|
||||
# Script node: validates the handoff that write_handoff produced.
# There is no `next`; the script selects the route, and `fallback` names
# end_failure.
# NOTE(review): per the description the plan is marked complete here, which
# is before the user approval gate is mentioned anywhere - confirm the order
# is intended.
check_handoff:
  id: check_handoff
  type: script
  description: |
    Deterministic completion gate - handoff exists with frontmatter and all
    required sections. On success, marks the step plan status complete.
    One retry back to write_handoff, then failure.
  script: scripts/check_handoff.sh
  timeout: 10
  fallback: end_failure
|
||||
|
||||
# Approval node: final user sign-off for the step.
# "approve" ends the run, "revise" asks for instructions via get_revision,
# and any other answer goes to revise_from_choice.
# NOTE(review): user_feedback is overwritten with the raw answer, including
# the literal "approve"/"revise", which replaces any guidance captured at an
# earlier gate - confirm that is intended.
# NOTE(review): `on_other` and `state_updates` are placed as siblings of
# `routes`; the nesting was reconstructed - confirm against the engine schema.
gate_user_review:
  id: gate_user_review
  type: approval
  description: The hard stop - the next step never starts without explicit approval.
  question: |
    ## Step {{step_number}} ({{step_title}}) - ready for review

    {{step_summary}}

    Handoff: {{handoff_path}}
    Build: {{build_ok}} | Tests: {{tests_ok}} | Fix attempts: {{fix_attempts}}/{{max_fix_attempts}}

    Approve this step? (Answer with anything else to send revision
    instructions straight to the implementer.)
  options:
    - "approve"
    - "revise"
  routes:
    "approve": end_success
    "revise": get_revision
  on_other: revise_from_choice
  state_updates:
    user_feedback: "{{choice}}"
|
||||
|
||||
# Input node: collects free-text revision instructions from the user.
# The validation expression rejects empty input; the text is stored in
# fix_instructions and the run continues at implement.
get_revision:
  id: get_revision
  type: input
  description: Collect revision instructions, then loop back through implement -> verify -> handoff.
  question: "What should change? Your comments go to the implementer verbatim."
  validation: "len(input) > 0"
  state_updates:
    fix_instructions: "{{input}}"
  next: implement
|
||||
|
||||
# Script node: handles a free-form answer given at gate_user_review.
# There is no `next`; the script selects the route, and `fallback` names
# get_revision.
revise_from_choice:
  id: revise_from_choice
  type: script
  description: Free-form approval answers are treated as revision instructions.
  script: scripts/revise_from_choice.sh
  timeout: 5
  fallback: get_revision
|
||||
|
||||
# End node: final report for an approved step. The first output line is the
# STEP_COMPLETE sentinel.
# NOTE(review): "Build: passed | Tests: passed" is literal text, whereas
# gate_user_review prints {{build_ok}} / {{tests_ok}} - confirm this node is
# only reachable when both actually passed.
end_success:
  id: end_success
  type: end
  output: |
    STEP_COMPLETE
    Step: {{step_number}} ({{step_title}})
    Plan: {{step_plan_path}}
    Handoff: {{handoff_path}}
    Build: passed | Tests: passed | Fix attempts: {{fix_attempts}}/{{max_fix_attempts}}

    {{step_summary}}

    Downstream plan updates:
    {{downstream_updates}}
|
||||
|
||||
# End node: reached from gate_blocked when the user declines to proceed.
# The first output line is the STEP_BLOCKED sentinel.
end_blocked:
  id: end_blocked
  type: end
  output: |
    STEP_BLOCKED
    Step: {{step_number}} ({{step_title}})
    Reason:
    {{blocking_reason}}
|
||||
|
||||
# End node: the step was rejected. gate_deviation routes "abort" here; the
# output text also names the coder approval gate as a source.
# The first output line is the STEP_REJECTED sentinel.
end_rejected:
  id: end_rejected
  type: end
  output: |
    STEP_REJECTED
    Step: {{step_number}} ({{step_title}})
    Rejected at: deviation gate or coder approval gate.
    Deviation summary:
    {{deviation_summary}}
    Coder result (if it ran):
    {{coder_result}}
|
||||
|
||||
# End node: terminal failure report. The first output line is the
# STEP_FAILED sentinel and is unchanged.
# The report includes the last lint output next to build and tests output,
# because lint failures also route into the fix loop and can end here.
end_failure:
  id: end_failure
  type: end
  output: |
    STEP_FAILED
    Step: {{step_number}} ({{step_title}})
    Fix attempts: {{fix_attempts}}/{{max_fix_attempts}}
    Blocking reason (if resolution failed): {{blocking_reason}}

    Coder result:
    {{coder_result}}

    Last lint output:
    {{lint_output}}

    Last build output:
    {{build_output}}

    Last tests output:
    {{tests_output}}
|
||||
Reference in New Issue
Block a user