From 59680eec0f8b283724f56dc9cdbb491cb8592e9d Mon Sep 17 00:00:00 2001
From: Bo Moon <54k@users.noreply.github.com>
Date: Tue, 2 Jun 2026 00:55:30 +0200
Subject: [PATCH 1/2] feat(models): add Ollama backend with localhost support
 and configurable inference params
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a new model backend for Ollama (local + cloud-hosted) using the standard
/v1/chat/completions endpoint. No API key required for local instances.

Key features:
- Auto-skips API key check on localhost endpoints
- ollama_extra_body config key passes arbitrary params (reasoning_effort,
  num_ctx, seed, etc.) directly to the Ollama API payload
- Adds shared chat-completions serialization helpers in _chat_utils.py
- Four ready-to-use config presets: default, fast (no reasoning),
  reason (high reasoning), qwen (Qwen3-Coder 80B)

Files:
- src/webwright/models/_chat_utils.py — shared serialization helpers
- src/webwright/models/ollama_model.py — Ollama backend implementation
- src/webwright/models/__init__.py — register ollama in MODEL_MAPPING
- src/webwright/config/model_ollama*.yaml — config presets
---
 src/webwright/config/model_ollama.yaml        |  20 ++++
 src/webwright/config/model_ollama_fast.yaml   |  21 ++++
 src/webwright/config/model_ollama_qwen.yaml   |  17 +++
 src/webwright/config/model_ollama_reason.yaml |  21 ++++
 src/webwright/models/__init__.py              |   1 +
 src/webwright/models/_chat_utils.py           | 106 ++++++++++++++++++
 src/webwright/models/ollama_model.py          |  99 ++++++++++++++++
 7 files changed, 285 insertions(+)
 create mode 100644 src/webwright/config/model_ollama.yaml
 create mode 100644 src/webwright/config/model_ollama_fast.yaml
 create mode 100644 src/webwright/config/model_ollama_qwen.yaml
 create mode 100644 src/webwright/config/model_ollama_reason.yaml
 create mode 100644 src/webwright/models/_chat_utils.py
 create mode 100644 src/webwright/models/ollama_model.py

diff --git a/src/webwright/config/model_ollama.yaml b/src/webwright/config/model_ollama.yaml
new file mode 100644
index 0000000..64145e3
--- /dev/null
+++ b/src/webwright/config/model_ollama.yaml
@@ -0,0 +1,20 @@
+# Model modifier — Ollama variant.
+#
+# Stack on top of base.yaml:
+#   python -m webwright.run.cli -c base.yaml -c model_ollama.yaml ...
+#
+# Connects to a local or cloud-hosted Ollama instance via /v1/chat/completions.
+# No API key needed for local Ollama (http://localhost:11434).
+# Cloud-hosted Ollama models (e.g. deepseek-v4-pro:cloud) may require
+# OLLAMA_API_KEY env var for authentication.
+#
+# Available models on your host:
+#   deepseek-v4-pro:cloud   (default — what Sisyphus itself runs on)
+#   qwen3-coder-next:cloud  (80B, FP8)
+#   kimi-k2.6:cloud         (1T, int4)
+#   glm-5.1:cloud
+
+model:
+  model_class: ollama
+  model_name: deepseek-v4-pro:cloud
+  ollama_endpoint: http://localhost:11434/v1/chat/completions
diff --git a/src/webwright/config/model_ollama_fast.yaml b/src/webwright/config/model_ollama_fast.yaml
new file mode 100644
index 0000000..c85c10b
--- /dev/null
+++ b/src/webwright/config/model_ollama_fast.yaml
@@ -0,0 +1,21 @@
+# Model modifier — Ollama (fast mode — no reasoning).
+#
+# Stack on top of base.yaml:
+#   python -m webwright.run.cli -c base.yaml -c model_ollama_fast.yaml ...
+#
+# Sets reasoning_effort=none for models that support it (deepseek, qwen3, etc.),
+# trading quality for speed — ~10x faster iterations than default reasoning mode.
+# Override via OLLAMA_MODEL env var or edit model_name inline.
+#
+# Available models on your host:
+#   deepseek-v4-pro:cloud   (1.6T, FP8)
+#   qwen3-coder-next:cloud  (80B, FP8)
+#   kimi-k2.6:cloud         (1T, int4)
+#   glm-5.1:cloud
+
+model:
+  model_class: ollama
+  model_name: deepseek-v4-pro:cloud
+  ollama_endpoint: http://localhost:11434/v1/chat/completions
+  ollama_extra_body:
+    reasoning_effort: none
diff --git a/src/webwright/config/model_ollama_qwen.yaml b/src/webwright/config/model_ollama_qwen.yaml
new file mode 100644
index 0000000..c9f0e4b
--- /dev/null
+++ b/src/webwright/config/model_ollama_qwen.yaml
@@ -0,0 +1,17 @@
+# Model modifier — Ollama with Qwen Coder (80B, FP8).
+#
+# Stack on top of base.yaml:
+#   python -m webwright.run.cli -c base.yaml -c model_ollama_qwen.yaml ...
+#
+# Qwen3-Coder 80B is a good balance — fast reasoning=none and strong code gen.
+# Good for: quick exploration, simple tasks, low-latency iteration.
+#
+# All ollama_extra_body keys are forwarded directly to the API payload.
+# Common keys: reasoning_effort (none/low/medium/high), num_ctx, num_predict, seed.
+
+model:
+  model_class: ollama
+  model_name: qwen3-coder-next:cloud
+  ollama_endpoint: http://localhost:11434/v1/chat/completions
+  ollama_extra_body:
+    reasoning_effort: none
diff --git a/src/webwright/config/model_ollama_reason.yaml b/src/webwright/config/model_ollama_reason.yaml
new file mode 100644
index 0000000..73640bd
--- /dev/null
+++ b/src/webwright/config/model_ollama_reason.yaml
@@ -0,0 +1,21 @@
+# Model modifier — Ollama (reasoning mode — max intelligence, slower).
+#
+# Stack on top of base.yaml:
+#   python -m webwright.run.cli -c base.yaml -c model_ollama_reason.yaml ...
+#
+# Leaves reasoning at model default (max) for highest quality on hard tasks.
+# Each iteration takes 2-10 minutes but produces much better plans.
+# Override via OLLAMA_MODEL env var or edit model_name inline.
+#
+# Available models on your host:
+#   deepseek-v4-pro:cloud   (1.6T, FP8)
+#   qwen3-coder-next:cloud  (80B, FP8)
+#   kimi-k2.6:cloud         (1T, int4)
+#   glm-5.1:cloud
+
+model:
+  model_class: ollama
+  model_name: deepseek-v4-pro:cloud
+  ollama_endpoint: http://localhost:11434/v1/chat/completions
+  ollama_extra_body:
+    reasoning_effort: high
diff --git a/src/webwright/models/__init__.py b/src/webwright/models/__init__.py
index 6631ac0..3d07c67 100644
--- a/src/webwright/models/__init__.py
+++ b/src/webwright/models/__init__.py
@@ -9,6 +9,7 @@
     "openai": "webwright.models.openai_model.OpenAIModel",
     "anthropic": "webwright.models.anthropic_model.AnthropicModel",
     "openrouter": "webwright.models.openrouter_model.OpenRouterModel",
+    "ollama": "webwright.models.ollama_model.OllamaModel",
 }
 
 
diff --git a/src/webwright/models/_chat_utils.py b/src/webwright/models/_chat_utils.py
new file mode 100644
index 0000000..3fb00c9
--- /dev/null
+++ b/src/webwright/models/_chat_utils.py
@@ -0,0 +1,147 @@
+"""Shared chat-completions serialization helpers for model backends such as Ollama.
+
+The helpers convert between the internal message shape (``input_text`` /
+``output_text`` / ``input_image`` content parts) and the
+``/v1/chat/completions`` wire shape (``text`` / ``image_url`` content parts).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from webwright.models.base import _safe_int
+
+
+def serialize_chat_content_part(part: dict[str, Any]) -> dict[str, Any] | None:
+    """Convert one internal content part to its chat-completions form.
+
+    Text parts become ``{"type": "text"}`` and ``input_image`` parts become
+    ``{"type": "image_url"}`` (``detail`` defaults to ``"high"``).  Returns
+    ``None`` for any other part type so that callers can drop it.
+    """
+    part_type = part.get("type")
+    if part_type in {"input_text", "output_text"}:
+        return {"type": "text", "text": str(part.get("text", "") or "")}
+    if part_type == "input_image":
+        return {
+            "type": "image_url",
+            "image_url": {
+                "url": str(part.get("image_url", "") or ""),
+                "detail": str(part.get("detail", "high") or "high"),
+            },
+        }
+    return None
+
+
+def serialize_chat_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Convert internal messages to a chat-completions ``messages`` list.
+
+    ``exit`` messages are skipped; ``system`` and ``assistant`` roles are kept
+    and every other role is sent as ``user``.  Assistant messages and
+    text-only messages are flattened to one newline-joined string; any other
+    message keeps its list of serialized parts.  ``None`` content is treated
+    as an empty string.
+    """
+    serialized: list[dict[str, Any]] = []
+    for message in messages:
+        role = message["role"]
+        if role == "exit":
+            continue
+        mapped_role = role if role in ("system", "assistant") else "user"
+        content = message.get("content", "")
+        if content is None:
+            # An explicit None would otherwise raise TypeError in the part loop.
+            content = ""
+        if isinstance(content, str):
+            serialized.append({"role": mapped_role, "content": content})
+            continue
+        parts = [
+            serialized_part
+            for part in content
+            if isinstance(part, dict)
+            for serialized_part in [serialize_chat_content_part(part)]
+            if serialized_part is not None
+        ]
+        if mapped_role == "assistant" or all(part.get("type") == "text" for part in parts):
+            serialized.append(
+                {
+                    "role": mapped_role,
+                    "content": "\n".join(str(part.get("text", "") or "") for part in parts),
+                }
+            )
+        else:
+            serialized.append({"role": mapped_role, "content": parts})
+    return serialized
+
+
+def metrics_input_from_chat_messages(chat_messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Rebuild the metrics input from already-serialized chat messages.
+
+    Text is kept as ``input_text`` parts; images are reduced to a bare
+    ``{"type": "input_image"}`` marker (the URL is dropped).  ``None`` content
+    is treated as an empty string.
+    """
+    metrics_input: list[dict[str, Any]] = []
+    for message in chat_messages:
+        content = message.get("content", "")
+        if content is None:
+            # Same guard as serialize_chat_messages: None is not iterable.
+            content = ""
+        if isinstance(content, str):
+            metrics_input.append({"content": [{"type": "input_text", "text": content}]})
+            continue
+        parts: list[dict[str, Any]] = []
+        for part in content:
+            if not isinstance(part, dict):
+                continue
+            if part.get("type") == "text":
+                parts.append({"type": "input_text", "text": str(part.get("text", "") or "")})
+            elif part.get("type") == "image_url":
+                parts.append({"type": "input_image"})
+        metrics_input.append({"content": parts})
+    return metrics_input
+
+
+def extract_chat_completions_text(payload: dict[str, Any]) -> str:
+    """Return the text of the first choice in a chat-completions response.
+
+    Any missing or malformed level (``choices``, first choice, ``message``,
+    ``content``) yields ``""`` instead of raising.  List content is reduced
+    to its ``text`` parts joined by newlines.
+    """
+    choices = payload.get("choices")
+    if not isinstance(choices, list) or not choices:
+        return ""
+    first_choice = choices[0]
+    if not isinstance(first_choice, dict):
+        return ""
+    message = first_choice.get("message", {})
+    if not isinstance(message, dict):
+        return ""
+    content = message.get("content", "")
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return "\n".join(
+            str(part.get("text", "") or "")
+            for part in content
+            if isinstance(part, dict) and part.get("type") == "text"
+        )
+    return ""
+
+
+def usage_metrics_from_chat_completions(payload: dict[str, Any]) -> dict[str, int]:
+    """Map the response ``usage`` object onto the metrics token counters.
+
+    Cached-input and reasoning-output counts are always reported as 0 here.
+    """
+    usage = payload.get("usage")
+    if not isinstance(usage, dict):
+        usage = {}
+    return {
+        "input_tokens": _safe_int(usage.get("prompt_tokens")),
+        "output_tokens": _safe_int(usage.get("completion_tokens")),
+        "total_tokens": _safe_int(usage.get("total_tokens")),
+        "cached_input_tokens": 0,
+        "reasoning_output_tokens": 0,
+    }
diff --git a/src/webwright/models/ollama_model.py b/src/webwright/models/ollama_model.py
new file mode 100644
index 0000000..7d4e3f6
--- /dev/null
+++ b/src/webwright/models/ollama_model.py
@@ -0,0 +1,134 @@
+"""Ollama chat completions model backend.
+
+Drop-in replacement for OpenAI/Anthropic/OpenRouter — uses local or cloud-hosted
+Ollama models via the standard /v1/chat/completions endpoint.  No API key
+required for local instances; set OLLAMA_API_KEY env var for cloud-hosted
+Ollama endpoints that require authentication.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from urllib.parse import urlparse
+
+from webwright.models._chat_utils import (
+    extract_chat_completions_text,
+    metrics_input_from_chat_messages,
+    serialize_chat_messages,
+    usage_metrics_from_chat_completions,
+)
+from webwright.models.base import (
+    BaseModel,
+    BaseModelConfig,
+    OptStr,
+)
+
+__all__ = [
+    "OllamaModel",
+    "OllamaModelConfig",
+]
+
+
+class OllamaModelConfig(BaseModelConfig):
+    model_name: OptStr = "deepseek-v4-pro:cloud"
+    ollama_api_key: OptStr = ""
+    ollama_endpoint: OptStr = "http://localhost:11434/v1/chat/completions"
+    ollama_extra_body: dict[str, Any] = {}
+
+
+# Placeholder key injected for localhost endpoints; never sent as a Bearer token.
+_NOOP_API_KEY = "sk-noop"
+
+
+def _is_localhost(endpoint: str) -> bool:
+    """Return True when the endpoint host is ``localhost``, ``127.0.0.1`` or ``::1``."""
+    host = (urlparse(endpoint).hostname or "").lower()
+    return host in ("localhost", "127.0.0.1", "::1")
+
+
+def _default_endpoint(config_class: type) -> str:
+    """Best-effort lookup of the ``ollama_endpoint`` default declared on *config_class*.
+
+    Looks in pydantic-style field maps (``model_fields`` / ``__fields__``) first,
+    then at a plain class attribute.  Returns ``""`` when no string default exists.
+    """
+    for registry_name in ("model_fields", "__fields__"):
+        registry = getattr(config_class, registry_name, None)
+        if isinstance(registry, dict) and "ollama_endpoint" in registry:
+            default = getattr(registry["ollama_endpoint"], "default", None)
+            return default if isinstance(default, str) else ""
+    default = getattr(config_class, "ollama_endpoint", "")
+    return default if isinstance(default, str) else ""
+
+
+class OllamaModel(BaseModel):
+    """Model backend that posts chat-completions requests to an Ollama endpoint."""
+
+    _API_KEY_FIELD = "ollama_api_key"
+    _ENV_VAR = "OLLAMA_API_KEY"
+    _LOG_SOURCE = "ollama"
+    _MAX_RATE_LIMIT_RETRIES = 5
+    _MAX_TRANSIENT_RETRIES = 5
+    _DEFAULT_CONFIG_CLASS = OllamaModelConfig
+
+    def __init__(self, *, config_class: type | None = None, **kwargs):
+        """Create the backend, injecting a placeholder API key for localhost endpoints.
+
+        The placeholder is only added when no non-empty ``ollama_api_key`` was passed.
+        """
+        # An omitted endpoint falls back to the config default (localhost out of the
+        # box), so resolve that default rather than treating "missing" as "remote".
+        endpoint = kwargs.get("ollama_endpoint") or _default_endpoint(
+            config_class or self._DEFAULT_CONFIG_CLASS
+        )
+        if endpoint and _is_localhost(endpoint) and not kwargs.get("ollama_api_key"):
+            kwargs["ollama_api_key"] = _NOOP_API_KEY
+        super().__init__(config_class=config_class, **kwargs)
+
+    def _request_headers(self) -> dict[str, str]:
+        """Return JSON headers, adding ``Authorization`` only for a real API key."""
+        headers = {"Content-Type": "application/json"}
+        api_key = self.config.ollama_api_key
+        if api_key and api_key != _NOOP_API_KEY:
+            headers["Authorization"] = f"Bearer {api_key}"
+        return headers
+
+    def _post_url(self) -> str:
+        """Return the configured chat-completions URL."""
+        return self.config.ollama_endpoint
+
+    def _chat_payload(self, messages: list[dict[str, Any]], **fixed: Any) -> dict[str, Any]:
+        """Build the request body shared by the JSON and plain-text payloads.
+
+        ``ollama_extra_body`` is merged last, so its keys override the ones set here.
+        """
+        payload: dict[str, Any] = {
+            "model": self.config.model_name,
+            "messages": serialize_chat_messages(messages),
+            "stream": False,
+            **fixed,
+            "max_tokens": self.config.max_output_tokens,
+        }
+        if self.config.ollama_extra_body:
+            payload.update(self.config.ollama_extra_body)
+        return payload
+
+    def _build_payload(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
+        """Build a request body that asks for a JSON-object response."""
+        return self._chat_payload(messages, response_format={"type": "json_object"})
+
+    def _build_text_payload(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
+        """Build a request body without a ``response_format`` constraint."""
+        return self._chat_payload(messages)
+
+    def _request_metrics_input(self, payload: dict[str, Any]) -> list[dict[str, Any]]:
+        """Derive the metrics input from the ``messages`` of a request payload."""
+        return metrics_input_from_chat_messages(payload.get("messages") or [])
+
+    def _extract_text(self, payload: dict[str, Any]) -> str:
+        """Return the assistant text from a chat-completions response payload."""
+        return extract_chat_completions_text(payload)
+
+    def _usage_metrics_from_payload(self, payload: dict[str, Any]) -> dict[str, int]:
+        """Return token-usage counters from a chat-completions response payload."""
+        return usage_metrics_from_chat_completions(payload)

From 35c1033be26c5d1de766e8eb9d95d441bd012e22 Mon Sep 17 00:00:00 2001
From: Bo Moon <54k@users.noreply.github.com>
Date: Tue, 2 Jun 2026 13:32:29 +0200
Subject: [PATCH 2/2] cleanup: remove model-specific mentions and host-specific
 comments

---
 src/webwright/config/model_ollama.yaml        | 15 ++++-----------
 src/webwright/config/model_ollama_fast.yaml   | 14 ++------------
 src/webwright/config/model_ollama_qwen.yaml   | 17 -----------------
 src/webwright/config/model_ollama_reason.yaml | 17 ++---------------
 src/webwright/models/ollama_model.py          |  2 +-
 5 files changed, 9 insertions(+), 56 deletions(-)
 delete mode 100644 src/webwright/config/model_ollama_qwen.yaml

diff --git a/src/webwright/config/model_ollama.yaml b/src/webwright/config/model_ollama.yaml
index 64145e3..efbaa49 100644
--- a/src/webwright/config/model_ollama.yaml
+++ b/src/webwright/config/model_ollama.yaml
@@ -1,20 +1,13 @@
-# Model modifier — Ollama variant.
+# Ollama model configuration.
 #
 # Stack on top of base.yaml:
 #   python -m webwright.run.cli -c base.yaml -c model_ollama.yaml ...
 #
 # Connects to a local or cloud-hosted Ollama instance via /v1/chat/completions.
-# No API key needed for local Ollama (http://localhost:11434).
-# Cloud-hosted Ollama models (e.g. deepseek-v4-pro:cloud) may require
-# OLLAMA_API_KEY env var for authentication.
-#
-# Available models on your host:
-#   deepseek-v4-pro:cloud   (default — what Sisyphus itself runs on)
-#   qwen3-coder-next:cloud  (80B, FP8)
-#   kimi-k2.6:cloud         (1T, int4)
-#   glm-5.1:cloud
+# Set OLLAMA_API_KEY env var for cloud-hosted endpoints that require authentication.
+# To use a different model, edit model_name below (OLLAMA_MODEL is not read by ollama_model.py).
 
 model:
   model_class: ollama
-  model_name: deepseek-v4-pro:cloud
+  model_name: llama3.2
   ollama_endpoint: http://localhost:11434/v1/chat/completions
diff --git a/src/webwright/config/model_ollama_fast.yaml b/src/webwright/config/model_ollama_fast.yaml
index c85c10b..c878467 100644
--- a/src/webwright/config/model_ollama_fast.yaml
+++ b/src/webwright/config/model_ollama_fast.yaml
@@ -1,21 +1,11 @@
-# Model modifier — Ollama (fast mode — no reasoning).
+# Ollama fast preset — no reasoning, maximum speed.
 #
 # Stack on top of base.yaml:
 #   python -m webwright.run.cli -c base.yaml -c model_ollama_fast.yaml ...
-#
-# Sets reasoning_effort=none for models that support it (deepseek, qwen3, etc.),
-# trading quality for speed — ~10x faster iterations than default reasoning mode.
-# Override via OLLAMA_MODEL env var or edit model_name inline.
-#
-# Available models on your host:
-#   deepseek-v4-pro:cloud   (1.6T, FP8)
-#   qwen3-coder-next:cloud  (80B, FP8)
-#   kimi-k2.6:cloud         (1T, int4)
-#   glm-5.1:cloud
 
 model:
   model_class: ollama
-  model_name: deepseek-v4-pro:cloud
+  model_name: llama3.2
   ollama_endpoint: http://localhost:11434/v1/chat/completions
   ollama_extra_body:
     reasoning_effort: none
diff --git a/src/webwright/config/model_ollama_qwen.yaml b/src/webwright/config/model_ollama_qwen.yaml
deleted file mode 100644
index c9f0e4b..0000000
--- a/src/webwright/config/model_ollama_qwen.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-# Model modifier — Ollama with Qwen Coder (80B, FP8).
-#
-# Stack on top of base.yaml:
-#   python -m webwright.run.cli -c base.yaml -c model_ollama_qwen.yaml ...
-#
-# Qwen3-Coder 80B is a good balance — fast reasoning=none and strong code gen.
-# Good for: quick exploration, simple tasks, low-latency iteration.
-#
-# All ollama_extra_body keys are forwarded directly to the API payload.
-# Common keys: reasoning_effort (none/low/medium/high), num_ctx, num_predict, seed.
-
-model:
-  model_class: ollama
-  model_name: qwen3-coder-next:cloud
-  ollama_endpoint: http://localhost:11434/v1/chat/completions
-  ollama_extra_body:
-    reasoning_effort: none
diff --git a/src/webwright/config/model_ollama_reason.yaml b/src/webwright/config/model_ollama_reason.yaml
index 73640bd..e5d49e5 100644
--- a/src/webwright/config/model_ollama_reason.yaml
+++ b/src/webwright/config/model_ollama_reason.yaml
@@ -1,21 +1,8 @@
-# Model modifier — Ollama (reasoning mode — max intelligence, slower).
-#
-# Stack on top of base.yaml:
-#   python -m webwright.run.cli -c base.yaml -c model_ollama_reason.yaml ...
-#
-# Leaves reasoning at model default (max) for highest quality on hard tasks.
-# Each iteration takes 2-10 minutes but produces much better plans.
-# Override via OLLAMA_MODEL env var or edit model_name inline.
-#
-# Available models on your host:
-#   deepseek-v4-pro:cloud   (1.6T, FP8)
-#   qwen3-coder-next:cloud  (80B, FP8)
-#   kimi-k2.6:cloud         (1T, int4)
-#   glm-5.1:cloud
+# Ollama reasoning preset — highest quality, slower iteration.
 
 model:
   model_class: ollama
-  model_name: deepseek-v4-pro:cloud
+  model_name: llama3.2
   ollama_endpoint: http://localhost:11434/v1/chat/completions
   ollama_extra_body:
     reasoning_effort: high
diff --git a/src/webwright/models/ollama_model.py b/src/webwright/models/ollama_model.py
index 7d4e3f6..2e82211 100644
--- a/src/webwright/models/ollama_model.py
+++ b/src/webwright/models/ollama_model.py
@@ -30,7 +30,7 @@
 
 
 class OllamaModelConfig(BaseModelConfig):
-    model_name: OptStr = "deepseek-v4-pro:cloud"
+    model_name: OptStr = "llama3.2"
     ollama_api_key: OptStr = ""
     ollama_endpoint: OptStr = "http://localhost:11434/v1/chat/completions"
     ollama_extra_body: dict[str, Any] = {}