feat: Refactored the sisyphus agent system to utilize the new skills system to improve performance and reliability

2026-06-02 13:14:25 -06:00
parent b1782b614f
commit c17db05f39
10 changed files with 790 additions and 261 deletions
@@ -9,7 +9,15 @@ global_tools:
  - fs_ls.sh
  - fs_write.sh
  - fs_patch.sh
-  - fs_mkdir.sh
+  - execute_command.sh
+
+skills_enabled: true
+enabled_skills:
+  - ai-slop-remover
+  - code-review
+  - git-master
+  - frontend-ui-ux
+  - verification-gates

 variables:
  - name: project_dir
@@ -38,6 +46,10 @@ initial_state:
  files_to_create: []
  risks: []
  complexity_score: 0
+  review_attempts: 0
+  max_review_attempts: 1
+  review_clean: true
+  review_notes: ""

 start: resolve_paths

@@ -143,10 +155,24 @@ nodes:
    id: implement
    type: llm
    description: Write code via fs tools. Bounded tool-call loop.
+    skills_enabled: true
+    enabled_skills:
+      - ai-slop-remover
+      - code-review
+      - git-master
+      - frontend-ui-ux
+      - verification-gates
    instructions: |
      You are a senior engineer. Implement the plan by writing code via
      tools. Follow existing patterns in the codebase.

+      ## Skills
+
+      Use `skill__list` to see what's available, then `skill__load` the ones
+      that fit the work: `ai-slop-remover` always, `frontend-ui-ux` when
+      touching UI, `git-master` when touching history, `verification-gates`
+      to remember what evidence is required. Unload when a phase ends.
+
      ## Writing code

      1. Use `fs_patch` for surgical edits to existing files.
@@ -239,6 +265,73 @@ nodes:
    timeout: 5
    fallback: end_failure

+  # LLM node that reviews the files named in files_to_modify / files_to_create
+  # against plan_summary and returns a structured verdict (review_clean,
+  # review_notes). `next` hands control to route_review_result; `fallback`
+  # points at end_success.
+  self_review:
+    id: self_review
+    type: llm
+    description: Skill-driven self-review of the diff. Catches AI slop, dishonest naming, suppressed errors. Bounded to max_review_attempts.
+    # Only the two review-oriented skills are enabled for this node.
+    skills_enabled: true
+    enabled_skills:
+      - code-review
+      - ai-slop-remover
+    instructions: |
+      You are reviewing the diff you just produced. Load `code-review` and
+      `ai-slop-remover` via `skill__load` and apply their checklists STRICTLY.
+
+      Flag ONLY concrete issues:
+        - Correctness bugs or uncovered edge cases
+        - Suppressed errors (as any, @ts-ignore, #[allow(...)] on unfamiliar
+          lints, empty catch blocks)
+        - Dishonest naming (get_X that mutates, returns wrong type, etc.)
+        - Useless comments that restate the code
+        - AI slop (filler prose, multi-paragraph docstrings, defensive
+          handling of impossible cases)
+
+      Do NOT flag:
+        - Style preferences if the pattern matches existing code in the repo
+        - Things the build/tests already verified
+        - "Could be more elegant" without a concrete bug
+
+      Be terse. The orchestrator wants signal, not noise. If you find nothing
+      blocking, set review_clean=true and leave review_notes empty.
+
+      Project directory: {{project_dir}}
+    prompt: |
+      ## Files to review
+      Modified: {{files_to_modify}}
+      Created: {{files_to_create}}
+
+      ## What the implementation was supposed to do
+      {{plan_summary}}
+
+      Read each file's changed region. Apply the review skills. Output your verdict.
+    tools:
+      - fs_cat
+      - fs_ls
+      - execute_command
+    max_iterations: 15
+    # Verdict shape: both fields are required, so a clean review still returns
+    # review_notes (as an empty string, per the instructions above).
+    output_schema:
+      type: object
+      properties:
+        review_clean:
+          type: boolean
+          description: True if no blocker issues were found.
+        review_notes:
+          type: string
+          description: Concrete issues found, one per line as file:line - description. Empty when review_clean is true.
+      required: [review_clean, review_notes]
+    # NOTE(review): only last_node_output is written to state here, while the
+    # routing script added in this change reads .review_clean and .review_notes
+    # from state. Confirm the engine merges output_schema fields into state on
+    # its own; otherwise the router only ever sees the initial_state values.
+    state_updates:
+      last_node_output: "{{output}}"
+    fallback: end_success
+    next: route_review_result
+
+  # Script node that chooses the step after self_review by running
+  # scripts/route_review_result.sh. It has no `next` key; presumably the
+  # script's JSON output supplies the target via "_next" - confirm against
+  # the engine. `fallback` points at end_success.
+  # NOTE(review): the unit of `timeout` is not visible here - presumably
+  # seconds; confirm.
+  route_review_result:
+    id: route_review_result
+    type: script
+    description: Routes based on review_clean and review_attempts budget. End on clean or budget exhausted; loop to implement otherwise.
+    script: scripts/route_review_result.sh
+    timeout: 5
+    fallback: end_success
+
  end_success:
    id: end_success
    type: end
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [[ -n "${GRAPH_STATE_FILE:-}" ]]; then
+  state=$(cat "$GRAPH_STATE_FILE")
+elif [[ -n "${GRAPH_STATE:-}" ]]; then
+  state="$GRAPH_STATE"
+else
+  state='{}'
+fi
+
+review_clean=$(echo "$state" | jq -r '.review_clean // true')
+review_attempts=$(echo "$state" | jq -r '.review_attempts // 0')
+max_review_attempts=$(echo "$state" | jq -r '.max_review_attempts // 1')
+review_notes=$(echo "$state" | jq -r '.review_notes // ""')
+
+if [[ "$review_clean" == "true" ]]; then
+  jq -nc '{"_next": "end_success"}'
+  exit 0
+fi
+
+if (( review_attempts >= max_review_attempts )); then
+  jq -nc \
+    --arg n "$review_notes" \
+    '{
+      "_next": "end_success",
+      "review_notes_unresolved": ("Shipped with unresolved review notes (budget exhausted):\n" + $n)
+    }'
+  exit 0
+fi
+
+next_review=$((review_attempts + 1))
+fix_instr=$(printf '## Self-review feedback (attempt %d of %d)\n\nThe code review found concrete issues. Address them with minimal edits. Do not refactor unrelated code.\n\n%s' \
+  "$next_review" "$max_review_attempts" "$review_notes")
+
+jq -nc \
+  --argjson n "$next_review" \
+  --arg fi "$fix_instr" \
+  '{
+    "review_attempts": $n,
+    "fix_instructions": $fi,
+    "_next": "implement"
+  }'
@@ -25,7 +25,7 @@ if [[ -z "$cmd" || "$cmd" == "null" ]]; then
  jq -nc '{
    "tests_ok": true,
    "tests_output": "(no test command available for this project type)",
-    "_next": "end_success"
+    "_next": "self_review"
  }'
  exit 0
 fi
@@ -40,7 +40,7 @@ if (( exit_code == 0 )); then
    '{
      "tests_ok": true,
      "tests_output": ("Ran: " + $cmd + "\n\n" + $out),
-      "_next": "end_success"
+      "_next": "self_review"
    }'
 else
  jq -nc \