# Listing metadata from the source viewer: 600 lines, 18 KiB, YAML
# Identity of this graph definition: name, prose summary of the step
# protocol, and a version kept as a quoted string (so it is not read as the
# float 1.0).
name: step-runner
description: |
  Executes ONE step of a phased implementation plan (plans/ repo) with the
  step protocol enforced as graph edges: orient -> staleness check ->
  implement (coder) -> verify -> edge-case sweep -> optional independent
  review -> evidence-backed handoff -> user approval gate. Designed to be
  delegated to by sisyphus.
version: '1.0'

# Tool scripts declared at graph level. Node-level `tools` lists in this file
# use the same base names without the .sh suffix.
global_tools:
  - fs_cat.sh
  - fs_ls.sh
  - fs_write.sh
  - fs_patch.sh
  - execute_command.sh

# Graph-level skill switch and the skills made available. Individual llm
# nodes in this file repeat `skills_enabled` with their own subset.
skills_enabled: true
enabled_skills:
  - step-implementation
  - handoff-protocol
  - code-review
  - ai-slop-remover

# Caller-supplied inputs. Each entry carries a name, a description, and a
# string default.
variables:
  # Root of the project being modified.
  - name: project_dir
    description: |
      Absolute path to the project directory. Defaults to "." (the directory
      coyote was invoked from). The coder sub-agent resolves its own
      project_dir the same way, so invoke step-runner FROM the project root
      unless you override this for both.
    default: '.'
  # Location of the plan repo (steps/, handoffs/, NOTES.md).
  - name: plans_dir
    description: |
      Path to the plan repo. Relative paths resolve against project_dir.
      Expected layout: <plans_dir>/steps/NN-<slug>.md,
      <plans_dir>/handoffs/, <plans_dir>/NOTES.md.
    default: 'plans'
  # Step selector: a number, or the literal "next".
  - name: step
    description: |
      Which step to execute: a step number, or "next" to pick the first
      in-progress (resume) or pending step plan.
    default: 'next'

# Run-wide engine settings.
# NOTE(review): the unit of `timeout` is not stated in this file - presumably
# seconds (7200 = 2 h); confirm against the graph engine.
settings:
  max_loop_iterations: 20
  log_state_snapshots: true
  validate_before_run: true
  timeout: 7200

# Starting values for every state key that the node templates below
# interpolate as {{key}}. Empty strings, zero counters and "(none)"
# placeholders are the defaults before any node has run.
initial_state:
  # Step identity and resolved paths/contents.
  project_dir: ''
  plans_dir: ''
  step_number: 0
  step_slug: ''
  step_title: ''
  step_plan_path: ''
  step_plan: ''
  prev_handoff_path: '(none)'
  prev_handoff: '(none - this is the first step)'
  notes_path: ''
  notes: '(none)'
  handoff_path: ''
  blocking_reason: ''
  # Same names as the properties in the orient node's output_schema.
  plan_summary: ''
  implementation_brief: ''
  staleness_report: ''
  has_major_deviation: false
  deviation_summary: ''
  # Gate answers and fix-loop bookkeeping.
  user_feedback: ''
  fix_instructions: ''
  fix_attempts: 0
  max_fix_attempts: 2
  coder_result: ''
  # Verification evidence; the *_ok flags start out true.
  format_output: ''
  lint_ok: true
  lint_output: ''
  build_ok: true
  build_output: ''
  tests_ok: true
  tests_output: ''
  # Same names as the properties in the edge_case_sweep output_schema.
  edge_case_report: ''
  downstream_updates: ''
  needs_independent_review: false
  # Independent review and handoff bookkeeping.
  review_report: ''
  review_attempts: 0
  max_review_attempts: 1
  handoff_attempts: 0
  handoff_fix: ''
  step_summary: ''

# Id of the node the graph begins at; must match a key under `nodes`.
start: resolve_step

# Graph nodes, keyed by node id (each node repeats its key in `id`).
# Script nodes without a `next` declare only a `fallback`; their normal
# routing is presumably decided by the script itself - the scripts are not
# part of this file.
# NOTE(review): nesting was reconstructed from a listing that had lost its
# indentation; `on_other`, `state_updates`, `fallback` and `next` are placed
# as node-level keys - confirm against the engine's schema.
nodes:
  # Entry node (see `start`): resolves the step's inputs, then continues to
  # orient.
  resolve_step:
    id: resolve_step
    type: script
    description: |
      Locate the step plan, previous handoff, and NOTES.md; parse frontmatter;
      check depends_on satisfaction against existing handoffs; mark the plan
      in-progress. Routes to gate_blocked when dependencies are unsatisfied.
    script: scripts/resolve_step.sh
    timeout: 30
    fallback: end_failure
    next: orient

  # Approval gate for unsatisfied dependencies: "yes" continues to orient,
  # "no" and `on_other` both end at end_blocked.
  gate_blocked:
    id: gate_blocked
    type: approval
    description: >-
      Escalate unsatisfied dependencies instead of building on missing
      ground.
    question: |
      Step {{step_number}} ({{step_title}}) is BLOCKED:

      {{blocking_reason}}

      Proceed anyway?
    options:
      - 'yes'
      - 'no'
    routes:
      'yes': orient
      'no': end_blocked
    on_other: end_blocked

  # LLM node given only fs_cat, fs_ls and execute_command (no write/patch
  # tools). Its output_schema properties share names with initial_state keys.
  orient:
    id: orient
    type: llm
    description: |
      Read-only orientation and staleness check: merge the previous handoff's
      directives with the step plan, then verify the plan's assumptions
      against the CURRENT codebase before any edit.
    skills_enabled: true
    enabled_skills:
      - step-implementation
    instructions: |
      You are orienting for one step of a phased implementation plan. Load
      `step-implementation` and apply its Orient and Staleness-check phases.
      You are READ-ONLY in this node: no edits, no fixes.

      1. Read the previous handoff (below). Note directives aimed at this
      step, deviations that changed the codebase, and bare assertions
      that need re-verification.
      2. Staleness-check the step plan against the code at {{project_dir}}:
      grep the symbols it references (via execute_command), read its
      Context snippets at their claimed locations with fs_cat, confirm
      its Test commands exist.
      3. Classify discrepancies per the skill's deviation table: minor
      (mechanics differ; correct silently in the brief) vs major (scope,
      approach, interfaces, or a later step's assumptions affected).

      Produce `implementation_brief`: the corrected, self-contained marching
      orders for the implementer - plan tasks in order, handoff directives
      applied, minor staleness corrections folded in, acceptance criteria
      restated. The implementer sees ONLY the step plan plus your brief.
    prompt: |
      ## Step plan ({{step_plan_path}})
      {{step_plan}}

      ## Previous handoff ({{prev_handoff_path}})
      {{prev_handoff}}

      ## Rolling project notes
      {{notes}}
    tools:
      - fs_cat
      - fs_ls
      - execute_command
    max_iterations: 20
    output_schema:
      type: object
      properties:
        plan_summary:
          type: string
          description: 1-3 sentences summarizing what this step delivers
        implementation_brief:
          type: string
          description: >-
            Corrected, self-contained instructions for the implementer
        staleness_report:
          type: string
          description: >-
            Findings from checking plan assumptions against current code;
            "clean" if none
        has_major_deviation:
          type: boolean
          description: >-
            True when a discrepancy changes scope, approach, or interfaces
        deviation_summary:
          type: string
          description: >-
            Major deviations only, with the plan claim vs current reality.
            Empty when none
      required:
        - plan_summary
        - implementation_brief
        - staleness_report
        - has_major_deviation
        - deviation_summary
    fallback: end_failure
    next: route_staleness

  # Router between gate_deviation and implement.
  # NOTE(review): `fallback` is implement. If fallback means "the script
  # failed", a failing router skips the deviation gate - confirm this
  # fail-open behaviour is intended.
  route_staleness:
    id: route_staleness
    type: script
    description: Major deviation -> user gate; otherwise straight to implement.
    script: scripts/route_staleness.sh
    timeout: 5
    fallback: implement

  # Approval gate for major deviations. The raw answer is stored in
  # user_feedback, which the implement prompt prints under "User guidance";
  # "proceed" and `on_other` both lead to implement.
  gate_deviation:
    id: gate_deviation
    type: approval
    description: >-
      Major deviations are never silently absorbed - the user decides.
    question: |
      Step {{step_number}} ({{step_title}}): the plan no longer matches the
      codebase in a way that changes scope or approach.

      {{deviation_summary}}

      Staleness report:
      {{staleness_report}}

      Proceed with the corrected brief? (Answer with anything else to give
      your own guidance to the implementer.)
    options:
      - 'proceed'
      - 'abort'
    routes:
      'proceed': implement
      'abort': end_rejected
    on_other: implement
    state_updates:
      user_feedback: '{{choice}}'

  # Agent node delegating to the `coder` agent; the agent's output is stored
  # as coder_result. No `fallback` is declared on this node.
  implement:
    id: implement
    type: agent
    description: |
      Delegate implementation to the coder graph agent, which runs its own
      plan -> implement -> build -> tests -> self-review fix-loop internally.
    agent: coder
    prompt: |
      ## TASK
      Execute step {{step_number}} ({{step_title}}) of a phased implementation
      plan for the project at {{project_dir}}.

      ## EXPECTED OUTCOME
      Every task in the step plan below is implemented and its acceptance
      criteria are met. Tests are derived from the Acceptance criteria
      section (not from the implementation). Build and full test suite pass.

      ## MUST DO
      - Follow the Orientation brief below - it supersedes the raw plan where
      they disagree (it folds in corrections from the staleness check).
      - Match the patterns pasted in the step plan's Context section.
      - Derive tests from the plan's Acceptance criteria.

      ## MUST NOT DO
      - Do not touch anything listed in the plan's Out of scope section.
      - Do not modify files under {{plans_dir}}.
      - Do not implement work belonging to other steps.

      ## CONTEXT
      ### Step plan
      {{step_plan}}

      ### Orientation brief (handoff directives + staleness corrections applied)
      {{implementation_brief}}

      ### User guidance (if any)
      {{user_feedback}}

      ### Fix loop status (empty on first attempt)
      {{fix_instructions}}
    timeout: 3600
    state_updates:
      coder_result: '{{output}}'
    next: route_coder_result

  # Router on the coder's result; falls back to end_failure.
  route_coder_result:
    id: route_coder_result
    type: script
    description: >-
      Route on the coder sentinel - COMPLETE verifies, REJECTED/FAILED
      terminate.
    script: scripts/route_coder_result.sh
    timeout: 5
    fallback: end_failure

  # Verification stage 1 of 3 (format + lint); falls back to fix_loop_gate.
  verify_format_lint:
    id: verify_format_lint
    type: script
    description: |
      Format BEFORE evidence collection (FORMAT_CMD override or per-type
      heuristic), then lint (LINT_CMD, when configured). Lint failure routes
      to the fix loop.
    script: scripts/verify_format_lint.sh
    timeout: 300
    fallback: fix_loop_gate

  # Verification stage 2 of 3 (build/typecheck); falls back to fix_loop_gate.
  verify_build:
    id: verify_build
    type: script
    description: >-
      Step-level build/typecheck evidence, collected AFTER formatting.
    script: scripts/verify_build.sh
    timeout: 600
    fallback: fix_loop_gate

  # Verification stage 3 of 3 (full test suite); falls back to fix_loop_gate.
  verify_tests:
    id: verify_tests
    type: script
    description: >-
      FULL test suite - regressions in untouched code fail the step too.
    script: scripts/verify_tests.sh
    timeout: 1200
    fallback: fix_loop_gate

  # Decides between another implement pass and failure; falls back to
  # end_failure. Presumably compares fix_attempts with max_fix_attempts -
  # the script is not visible here.
  fix_loop_gate:
    id: fix_loop_gate
    type: script
    description: |
      Step-level fix budget (the coder already ran its own internal fix
      loop). Loops to implement with fix_instructions, or ends as failure.
    script: scripts/fix_loop_gate.sh
    timeout: 5
    fallback: end_failure

  # LLM node that, unlike orient, also has fs_patch (used to annotate later
  # step plans). Falls back to write_handoff.
  edge_case_sweep:
    id: edge_case_sweep
    type: llm
    description: |
      Post-implementation sweep: missed spots, edge cases, downstream plan
      implications. May annotate downstream plans' Edge cases sections
      (annotate vs propose per handoff-protocol). Also judges whether the
      change warrants an independent review pass.
    skills_enabled: true
    enabled_skills:
      - step-implementation
      - handoff-protocol
    instructions: |
      The implementation for this step just passed build and tests. Load
      `step-implementation` (edge-case sweep phase) and `handoff-protocol`
      (annotate-vs-propose rules), then:

      1. Read the changed code (the coder result below names the files).
      Look for edge cases the plan missed: empty inputs, error paths,
      concurrency, partial failure, compat.
      2. For each edge case belonging to a LATER step: check that step's
      plan under {{plans_dir}}/steps/. If its Edge cases section already
      covers it, done. If not, append an entry to that section via
      fs_patch - touch NOTHING else in the file.
      3. NEVER edit a later plan's Objective, Tasks, Acceptance criteria,
      or Out of scope. Scope-affecting changes become proposed diffs in
      `downstream_updates` instead.
      4. Set needs_independent_review=true when the change touched 5+ files
      or crosses architectural boundaries (auth, public APIs, schema,
      security-sensitive paths).

      Be terse. Findings, not prose.
    prompt: |
      ## Coder result
      {{coder_result}}

      ## Step plan
      {{step_plan}}

      ## Staleness report from orientation
      {{staleness_report}}
    tools:
      - fs_cat
      - fs_ls
      - fs_patch
      - execute_command
    max_iterations: 20
    output_schema:
      type: object
      properties:
        edge_case_report:
          type: string
          description: >-
            Edge cases discovered - both handled and punted, one per line.
            "none" if empty
        downstream_updates:
          type: string
          description: >-
            Annotations made (plan file + section) and proposed diffs for
            scope-affecting changes. "none" if empty
        needs_independent_review:
          type: boolean
      required:
        - edge_case_report
        - downstream_updates
        - needs_independent_review
    fallback: write_handoff
    next: route_sweep

  # Router between independent_review and write_handoff.
  # NOTE(review): `fallback` is write_handoff, so a failing router would skip
  # the independent review - confirm this is intended.
  route_sweep:
    id: route_sweep
    type: script
    description: >-
      Broad or boundary-crossing changes get an independent reviewer.
    script: scripts/route_sweep.sh
    timeout: 5
    fallback: write_handoff

  # Agent node delegating to `code-reviewer`; its output is stored as
  # review_report. No `fallback` is declared on this node.
  independent_review:
    id: independent_review
    type: agent
    description: >-
      Independent review pass - the author's self-review cannot catch its
      own rationalizations.
    agent: code-reviewer
    prompt: |
      Review the changes produced for step {{step_number}} ({{step_title}})
      of a phased implementation plan in {{project_dir}}.

      What the step was supposed to do:
      {{plan_summary}}

      Coder summary (names the modified/created files):
      {{coder_result}}

      Review the changed files against the step plan's acceptance criteria.
      Preserve severity tags in your findings.
    timeout: 1200
    state_updates:
      review_report: '{{output}}'
    next: route_review

  # Router after the independent review; falls back to write_handoff.
  route_review:
    id: route_review
    type: script
    description: >-
      Critical findings loop back to implement (bounded); otherwise proceed
      to handoff.
    script: scripts/route_review.sh
    timeout: 5
    fallback: write_handoff

  # LLM node with fs_write and fs_patch that writes the handoff document and
  # appends to the notes file; continues to check_handoff.
  write_handoff:
    id: write_handoff
    type: llm
    description: |
      Write the evidence-backed handoff per handoff-protocol and append
      durable facts to NOTES.md. The completion gate (check_handoff)
      verifies the document afterward.
    skills_enabled: true
    enabled_skills:
      - handoff-protocol
      - ai-slop-remover
    instructions: |
      Load `handoff-protocol` and follow its writer schema EXACTLY: the
      frontmatter (step, title, result) and all eight sections, writing
      "None" rather than omitting a section.

      Write the handoff to {{handoff_path}} with fs_write. Paste the
      verification evidence below verbatim into the Evidence section -
      commands, exit codes, decisive output lines. Deviations come from the
      staleness report, gate decisions, and fix loop history. Downstream
      plan updates come from the sweep results.

      Then append durable, step-independent facts (if any) to {{notes_path}}
      - create the file if missing, never rewrite existing entries.

      If "Gate feedback" below is non-empty, a previous handoff attempt
      failed validation - fix exactly what it lists.
    prompt: |
      ## Step
      {{step_number}} ({{step_title}}) - plan at {{step_plan_path}}

      ## Plan summary
      {{plan_summary}}

      ## Coder result
      {{coder_result}}

      ## Staleness report / deviations
      {{staleness_report}}

      Major deviation summary (if any): {{deviation_summary}}
      User guidance given (if any): {{user_feedback}}
      Fix loop attempts used: {{fix_attempts}} of {{max_fix_attempts}}

      ## Edge cases discovered
      {{edge_case_report}}

      ## Downstream plan updates
      {{downstream_updates}}

      ## Independent review report (if any)
      {{review_report}}

      ## Verification evidence (paste verbatim)
      ### Format
      {{format_output}}
      ### Lint
      {{lint_output}}
      ### Build
      {{build_output}}
      ### Tests
      {{tests_output}}

      ## Gate feedback
      {{handoff_fix}}
    tools:
      - fs_cat
      - fs_ls
      - fs_write
      - fs_patch
    max_iterations: 15
    output_schema:
      type: object
      properties:
        step_summary:
          type: string
          description: >-
            3-6 sentence summary of the step for the user's approval
            decision - what was done, deviations, anything needing their
            attention
      required:
        - step_summary
    fallback: end_failure
    next: check_handoff

  # Script gate over the written handoff; falls back to end_failure.
  check_handoff:
    id: check_handoff
    type: script
    description: |
      Deterministic completion gate - handoff exists with frontmatter and all
      required sections. On success, marks the step plan status complete.
      One retry back to write_handoff, then failure.
    script: scripts/check_handoff.sh
    timeout: 10
    fallback: end_failure

  # Final approval gate: "approve" ends at end_success, "revise" asks for
  # instructions via get_revision, and `on_other` goes to revise_from_choice.
  # The raw answer is stored in user_feedback.
  gate_user_review:
    id: gate_user_review
    type: approval
    description: >-
      The hard stop - the next step never starts without explicit approval.
    question: |
      ## Step {{step_number}} ({{step_title}}) - ready for review

      {{step_summary}}

      Handoff: {{handoff_path}}
      Build: {{build_ok}} | Tests: {{tests_ok}} | Fix attempts: {{fix_attempts}}/{{max_fix_attempts}}

      Approve this step? (Answer with anything else to send revision
      instructions straight to the implementer.)
    options:
      - 'approve'
      - 'revise'
    routes:
      'approve': end_success
      'revise': get_revision
    on_other: revise_from_choice
    state_updates:
      user_feedback: '{{choice}}'

  # Input node: the validated (non-empty) answer is stored as
  # fix_instructions and the graph returns to implement.
  get_revision:
    id: get_revision
    type: input
    description: >-
      Collect revision instructions, then loop back through implement ->
      verify -> handoff.
    question: >-
      What should change? Your comments go to the implementer verbatim.
    validation: 'len(input) > 0'
    state_updates:
      fix_instructions: '{{input}}'
    next: implement

  # Script handling free-form answers from gate_user_review; falls back to
  # get_revision.
  revise_from_choice:
    id: revise_from_choice
    type: script
    description: >-
      Free-form approval answers are treated as revision instructions.
    script: scripts/revise_from_choice.sh
    timeout: 5
    fallback: get_revision

  # Terminal nodes. Each output starts with a distinct sentinel line
  # (STEP_COMPLETE / STEP_BLOCKED / STEP_REJECTED / STEP_FAILED).
  end_success:
    id: end_success
    type: end
    output: |
      STEP_COMPLETE
      Step: {{step_number}} ({{step_title}})
      Plan: {{step_plan_path}}
      Handoff: {{handoff_path}}
      Build: passed | Tests: passed | Fix attempts: {{fix_attempts}}/{{max_fix_attempts}}

      {{step_summary}}

      Downstream plan updates:
      {{downstream_updates}}

  end_blocked:
    id: end_blocked
    type: end
    output: |
      STEP_BLOCKED
      Step: {{step_number}} ({{step_title}})
      Reason:
      {{blocking_reason}}

  end_rejected:
    id: end_rejected
    type: end
    output: |
      STEP_REJECTED
      Step: {{step_number}} ({{step_title}})
      Rejected at: deviation gate or coder approval gate.
      Deviation summary:
      {{deviation_summary}}
      Coder result (if it ran):
      {{coder_result}}

  end_failure:
    id: end_failure
    type: end
    output: |
      STEP_FAILED
      Step: {{step_number}} ({{step_title}})
      Fix attempts: {{fix_attempts}}/{{max_fix_attempts}}
      Blocking reason (if resolution failed): {{blocking_reason}}

      Coder result:
      {{coder_result}}

      Last build output:
      {{build_output}}

      Last tests output:
      {{tests_output}}