coyote/assets/agents/deep-research/scripts/verify_sources.py

#!/usr/bin/env python3
"""Check that the sources cited in the research report are reachable.

Scans the final report for URLs and DOIs, probes each with a HEAD
request, and writes a `source_check` summary into state so the human
reviewer sees broken citations at the approval step.

Times out per request so a slow source cannot stall the graph.
"""
import json
import os
import re
import urllib.error
import urllib.request

DOI_RE = re.compile(r"\b(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
URL_RE = re.compile(r"https?://[^\s)\]\}\"'>]+")


def load_state():
    path = os.environ.get("GRAPH_STATE_FILE")
    if path:
        with open(path) as f:
            return json.load(f)
    return json.loads(os.environ.get("GRAPH_STATE", "{}"))


def reachable(url, timeout=5.0):
    req = urllib.request.Request(url, method="HEAD")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return 200 <= resp.status < 400
    except urllib.error.HTTPError as e:
        return 200 <= e.code < 400
    except Exception:
        return False


def main():
    state = load_state()
    report = state.get("report") or ""

    urls = sorted({u.rstrip(".,;)") for u in URL_RE.findall(report)})
    dois = sorted(set(DOI_RE.findall(report)))

    results = []
    for url in urls:
        ok = reachable(url)
        results.append(f"  {'OK' if ok else 'UNREACHABLE'}  {url}")
    for doi in dois:
        url = f"https://doi.org/{doi}"
        if url in urls:
            continue
        ok = reachable(url)
        results.append(f"  {'OK' if ok else 'UNREACHABLE'}  DOI {doi} ({url})")

    if not results:
        summary = "No web sources were cited in the report."
    else:
        summary = (
            f"Source reachability ({len(results)} checked):\n"
            + "\n".join(results)
        )

    print(json.dumps({"source_check": summary}))


if __name__ == "__main__":
    main()