70 lines
1.9 KiB
Python
70 lines
1.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Check that the sources cited in the research report are reachable.
|
|
|
|
Scans the final report for URLs and DOIs, probes each with a HEAD
|
|
request, and writes a `source_check` summary into state so the human
|
|
reviewer sees broken citations at the approval step.
|
|
|
|
Times out per request so a slow source cannot stall the graph.
|
|
"""
|
|
import json
|
|
import os
|
|
import re
|
|
import urllib.error
|
|
import urllib.request
|
|
|
|
DOI_RE = re.compile(r"\b(10\.\d{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)
|
|
URL_RE = re.compile(r"https?://[^\s)\]\}\"'>]+")
|
|
|
|
|
|
def load_state():
|
|
path = os.environ.get("GRAPH_STATE_FILE")
|
|
if path:
|
|
with open(path) as f:
|
|
return json.load(f)
|
|
return json.loads(os.environ.get("GRAPH_STATE", "{}"))
|
|
|
|
|
|
def reachable(url, timeout=5.0):
|
|
req = urllib.request.Request(url, method="HEAD")
|
|
try:
|
|
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
|
return 200 <= resp.status < 400
|
|
except urllib.error.HTTPError as e:
|
|
return 200 <= e.code < 400
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
def main():
|
|
state = load_state()
|
|
report = state.get("report") or ""
|
|
|
|
urls = sorted({u.rstrip(".,;)") for u in URL_RE.findall(report)})
|
|
dois = sorted(set(DOI_RE.findall(report)))
|
|
|
|
results = []
|
|
for url in urls:
|
|
ok = reachable(url)
|
|
results.append(f" {'OK' if ok else 'UNREACHABLE'} {url}")
|
|
for doi in dois:
|
|
url = f"https://doi.org/{doi}"
|
|
if url in urls:
|
|
continue
|
|
ok = reachable(url)
|
|
results.append(f" {'OK' if ok else 'UNREACHABLE'} DOI {doi} ({url})")
|
|
|
|
if not results:
|
|
summary = "No web sources were cited in the report."
|
|
else:
|
|
summary = (
|
|
f"Source reachability ({len(results)} checked):\n"
|
|
+ "\n".join(results)
|
|
)
|
|
|
|
print(json.dumps({"source_check": summary}))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|