Skip to content

Commit 74eddab

Browse files
authored
fix: fail close topic brief provider shells (#129)
1 parent ef91447 commit 74eddab

3 files changed

Lines changed: 165 additions & 4 deletions

File tree

apps/orchestrator/src/openvibecoding_orch/scheduler/tool_execution_pipeline.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from tooling.page_brief_pipeline import write_page_brief_evidence_bundle, write_page_brief_result
1414
from tooling.search.ai_verifier import verify_search_results_ai
1515
from tooling.search_pipeline import (
16+
_has_public_source_hits,
17+
_summarize_public_source_failure,
1618
write_ai_verification,
1719
write_evidence_bundle,
1820
write_news_digest_result,
@@ -115,6 +117,10 @@ def _summarize_page_brief_failure(result: dict[str, Any]) -> str:
115117
return "页面抓取失败:浏览器任务未返回可用结果。"
116118

117119

120+
def _summarize_public_source_receipt_failure(results: list[dict[str, Any]]) -> str:
    """Return the Chinese-language failure reason for search runs whose hits
    never produced a public-source receipt.

    Thin alias over the shared search-pipeline helper so this module exposes
    a receipt-specific name alongside its other ``_summarize_*`` helpers.
    """
    reason = _summarize_public_source_failure(results)
    return reason
122+
123+
118124
def _write_public_task_result(
119125
run_id: str,
120126
request: dict[str, Any],
@@ -287,13 +293,15 @@ def _run_search_task(job: tuple[str, str, dict[str, Any] | None, str]) -> dict[s
287293
domain_counts[domain] = domain_counts.get(domain, 0) + 1
288294
consensus_domains = [domain for domain, count in domain_counts.items() if count >= 2]
289295

296+
public_source_receipt_missing = not _has_public_source_hits(results)
290297
verification = {
291298
"queries": queries,
292299
"runs": len(results),
293300
"providers": provider_counts,
294-
"consensus_domains": consensus_domains,
301+
"consensus_domains": [] if public_source_receipt_missing else consensus_domains,
295302
"verification_runs": len(verify_results),
296-
"all_consistent": all(r.get("verification", {}).get("consistent") for r in results),
303+
"all_consistent": (not public_source_receipt_missing) and all(r.get("verification", {}).get("consistent") for r in results),
304+
"public_source_receipt_missing": public_source_receipt_missing,
297305
}
298306
if policy_adjustments:
299307
verification["policy_adjustments"] = policy_adjustments
@@ -339,21 +347,26 @@ def _run_search_task(job: tuple[str, str, dict[str, Any] | None, str]) -> dict[s
339347
verify_failures = [item for item in verify_results if isinstance(item, dict) and not item.get("ok", True)]
340348
verification["failure_count"] = len(failures)
341349
verification["verify_failure_count"] = len(verify_failures)
342-
if failures or verify_failures:
350+
if failures or verify_failures or public_source_receipt_missing:
343351
_write_public_task_result(
344352
run_id,
345353
request,
346354
results,
347355
store=store,
348356
status_override="FAILED",
349-
failure_reason_zh=_summarize_news_digest_failure(failures, verify_failures),
357+
failure_reason_zh=(
358+
_summarize_news_digest_failure(failures, verify_failures)
359+
if (failures or verify_failures)
360+
else _summarize_public_source_receipt_failure(results)
361+
),
350362
)
351363
return {
352364
"ok": False,
353365
"runs": len(results),
354366
"verification_runs": len(verify_results),
355367
"failures": failures,
356368
"verify_failures": verify_failures,
369+
"public_source_receipt_missing": public_source_receipt_missing,
357370
}
358371
_write_public_task_result(run_id, request, results, store=store)
359372
return {"ok": True, "runs": len(results), "verification_runs": len(verify_results)}

apps/orchestrator/tests/test_news_digest_template.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,44 @@ def test_topic_brief_intake_and_result_builder() -> None:
153153
assert search_payload["topic_brief_result"] == {"name": "topic_brief_result.json"}
154154

155155

156+
def test_topic_brief_fail_closes_when_only_provider_homepages_are_captured() -> None:
    """The topic brief must fail closed when every hit stays on a provider shell page."""
    request = {
        "task_template": "topic_brief",
        "template_payload": {"topic": "Seattle AI", "time_range": "24h", "max_results": 3},
    }

    def _shell_entry(provider: str, href: str, snippet: str) -> dict:
        # One provider result whose only hit is the provider's own homepage.
        return {
            "provider": provider,
            "results": [{"title": f"{provider} response", "href": href, "snippet": snippet}],
        }

    results = [
        _shell_entry("gemini_web", "https://gemini.google.com/", "Gemini 与 Gemini 对话 你说 Seattle AI"),
        _shell_entry("grok_web", "https://grok.com/", "登录 注册 Seattle AI"),
    ]

    brief = search_pipeline.build_topic_brief_result(request, results)
    assert brief is not None
    assert brief["status"] == "FAILED"
    assert "provider outputs stayed on provider shell pages" in brief["summary"]
    assert "provider 壳页" in brief["failure_reason_zh"]
156194
def test_page_brief_intake_builds_browser_contract_artifact(monkeypatch, tmp_path: Path) -> None:
157195
runtime_root = tmp_path / "runtime"
158196
monkeypatch.setenv("OPENVIBECODING_RUNTIME_ROOT", str(runtime_root))
@@ -370,3 +408,53 @@ def run_search(self, query: str, provider: str | None = None, browser_policy=Non
370408
assert digest_payload["status"] == "FAILED"
371409
assert digest_payload["summary"].startswith("The news digest for 'Seattle AI' did not complete successfully.")
372410
assert "来源链路失败" in digest_payload["failure_reason_zh"]
411+
412+
413+
def test_topic_brief_provider_homepage_only_results_write_failed_report(monkeypatch, tmp_path: Path) -> None:
    """End-to-end: shell-page-only provider hits must write a FAILED topic brief report."""
    runtime_root = tmp_path / "runtime"
    runs_root = runtime_root / "runs"
    monkeypatch.setenv("OPENVIBECODING_RUNTIME_ROOT", str(runtime_root))
    monkeypatch.setenv("OPENVIBECODING_RUNS_ROOT", str(runs_root))

    store = RunStore()
    run_id = store.create_run("topic-brief-provider-homepage-only")

    class ShellPageOnlyRunner(ToolRunner):
        """Tool runner whose only hits are the chat providers' own homepages."""

        def __init__(self) -> None:
            super().__init__(run_id=run_id, store=store)

        def run_search(self, query: str, provider: str | None = None, browser_policy=None, policy_audit=None) -> dict:
            # chatgpt_web is mapped onto gemini_web, matching the original fixture.
            name = "gemini_web" if provider == "chatgpt_web" else str(provider or "")
            shell_href = "https://gemini.google.com/" if name == "gemini_web" else "https://grok.com/"
            hit = {"title": f"{name} response", "href": shell_href, "snippet": f"{query} provider shell"}
            return {"ok": True, "provider": name, "results": [hit], "verification": {"consistent": True}}

    request = {
        "queries": ["Seattle AI"],
        "providers": ["chatgpt_web", "grok_web"],
        "verify": {"providers": ["chatgpt_web"], "repeat": 1},
        "task_template": "topic_brief",
        "template_payload": {"topic": "Seattle AI", "time_range": "24h", "max_results": 3},
    }

    outcome = run_search_pipeline(
        run_id,
        ShellPageOnlyRunner(),
        store,
        request,
        requested_by={"role": "PM", "agent_id": "pm-1"},
    )
    assert outcome["ok"] is False
    assert outcome["public_source_receipt_missing"] is True

    report_path = runs_root / run_id / "reports" / "topic_brief_result.json"
    payload = json.loads(report_path.read_text(encoding="utf-8"))
    assert payload["status"] == "FAILED"
    assert payload["summary"].startswith("The topic brief for 'Seattle AI' did not complete successfully.")
    assert "provider 壳页" in payload["failure_reason_zh"]

tooling/search_pipeline.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@
1010
from openvibecoding_orch.store.run_store import RunStore
1111
from openvibecoding_orch.contract.validator import ContractValidator
1212

13+
# Hostnames that identify each chat provider's own web UI.  A search hit whose
# href resolves to one of these hosts is a "provider shell" page rather than a
# public, auditable source, and is filtered out / fails the receipt check.
_CHAT_PROVIDER_HOSTS: dict[str, set[str]] = {
    "gemini_web": {"gemini.google.com"},
    "grok_web": {"grok.com"},
    "chatgpt_web": {"chatgpt.com", "chat.openai.com"},
}
18+
1319

1420
def _now_ts() -> str:
1521
return datetime.now(timezone.utc).isoformat()
@@ -42,6 +48,47 @@ def _domain_from_href(href: str) -> str:
4248
return ""
4349

4450

51+
def _is_chat_provider_homepage(provider: Any, href: Any) -> bool:
    """Return True when *href* points at *provider*'s own shell host.

    Unknown providers have no registered shell hosts, so any href for them
    yields False; an empty/unparseable href also yields False.
    """
    name = str(provider or "").strip().lower()
    shell_hosts = _CHAT_PROVIDER_HOSTS.get(name, set())
    host = _domain_from_href(str(href or "")).strip().lower()
    if not host:
        return False
    return host in shell_hosts
55+
56+
57+
def _has_public_source_hits(results: list[dict[str, Any]]) -> bool:
    """Return True when at least one hit links outside the provider shell.

    Scans every provider entry's hits; a non-empty href that is not the
    provider's own homepage counts as a public-source receipt.  Non-dict
    entries/hits and missing result lists are ignored.
    """
    for entry in results:
        if not isinstance(entry, dict):
            continue
        provider = entry.get("provider") or entry.get("resolved_provider") or entry.get("mode") or "unknown"
        raw_hits = entry.get("results")
        if not isinstance(raw_hits, list):
            continue
        for hit in raw_hits:
            if not isinstance(hit, dict):
                continue
            href = str(hit.get("href") or "").strip()
            if not href:
                continue
            if not _is_chat_provider_homepage(provider, href):
                return True
    return False
70+
71+
72+
def _summarize_public_source_failure(results: list[dict[str, Any]]) -> str:
    """Build the Chinese failure reason for a missing public-source receipt.

    Lists each distinct "provider -> shell domain" pair found in *results*,
    preserving first-seen order; falls back to a generic message when no
    shell-page hits are present at all.
    """
    # dict keys double as an insertion-ordered, de-duplicated set.
    offenders: dict[str, None] = {}
    for entry in results:
        if not isinstance(entry, dict):
            continue
        provider = entry.get("provider") or entry.get("resolved_provider") or entry.get("mode") or "unknown"
        raw_hits = entry.get("results")
        if not isinstance(raw_hits, list):
            continue
        for hit in raw_hits:
            if not isinstance(hit, dict):
                continue
            href = str(hit.get("href") or "").strip()
            if not href or not _is_chat_provider_homepage(provider, href):
                continue
            offenders[f"{str(provider).strip()} -> {_domain_from_href(href)}"] = None
    if offenders:
        return f"来源链路失败:当前结果仍停在 provider 壳页而不是公开来源页面({', '.join(offenders)})。"
    return "来源链路失败:当前结果没有产出可公开审计的来源页面。"
90+
91+
4592
def _build_sources(results: list[dict]) -> list[dict]:
4693
sources: list[dict] = []
4794
retrieved_at = _now_ts()
@@ -117,6 +164,8 @@ def _purify_results(results: list[dict], verification: dict | None = None) -> di
117164
if not href:
118165
missing_href += 1
119166
continue
167+
if _is_chat_provider_homepage(provider, href):
168+
continue
120169
domain = _domain_from_href(str(href))
121170
if domain:
122171
domain_counts[domain] = domain_counts.get(domain, 0) + 1
@@ -242,6 +291,7 @@ def _build_digest_result(
242291
) -> dict[str, Any]:
243292
digest_sources: list[dict[str, Any]] = []
244293
seen_urls: set[str] = set()
294+
public_source_hit_found = False
245295
for provider_entry in results:
246296
if not isinstance(provider_entry, dict):
247297
continue
@@ -256,10 +306,13 @@ def _build_digest_result(
256306
if not isinstance(hit, dict):
257307
continue
258308
href = str(hit.get("href") or "").strip()
309+
if href and _is_chat_provider_homepage(provider, href):
310+
continue
259311
if href and href in seen_urls:
260312
continue
261313
if href:
262314
seen_urls.add(href)
315+
public_source_hit_found = True
263316
digest_sources.append(
264317
{
265318
"title": str(hit.get("title") or hit.get("name") or href or "result").strip() or "result",
@@ -283,6 +336,13 @@ def _build_digest_result(
283336
" Review failure_reason_zh and the evidence bundle for the detailed provider failure context."
284337
).strip()
285338
status = "FAILED"
339+
elif not public_source_hit_found and results:
340+
summary = (
341+
f"The {template_label} for '{topic}' did not produce a trustworthy public-source receipt."
342+
" The current provider outputs stayed on provider shell pages instead of auditable source URLs."
343+
)
344+
status = "FAILED"
345+
failure_reason_zh = failure_reason_zh or _summarize_public_source_failure(results)
286346
elif digest_sources:
287347
preview = ", ".join(item["title"] for item in digest_sources[:3])
288348
summary = (

0 commit comments

Comments
 (0)