From 59680eec0f8b283724f56dc9cdbb491cb8592e9d Mon Sep 17 00:00:00 2001 From: Bo Moon <54k@users.noreply.github.com> Date: Tue, 2 Jun 2026 00:55:30 +0200 Subject: [PATCH 1/2] feat(models): add Ollama backend with localhost support and configurable inference params MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new model backend for Ollama (local + cloud-hosted) using the standard /v1/chat/completions endpoint. No API key required for local instances. Key features: - Auto-skips API key check on localhost endpoints - ollama_extra_body config key passes arbitrary params (reasoning_effort, num_ctx, seed, etc.) directly to the Ollama API payload - Reuses shared chat-completions serialization helpers in _chat_utils.py - Four ready-to-use config presets: default, fast (no reasoning), reason (high reasoning), qwen (Qwen3-Coder 80B) Files: - src/webwright/models/_chat_utils.py — shared serialization helpers - src/webwright/models/ollama_model.py — Ollama backend implementation - src/webwright/models/__init__.py — register ollama in MODEL_MAPPING - src/webwright/config/model_ollama*.yaml — config presets --- src/webwright/config/model_ollama.yaml | 20 ++++ src/webwright/config/model_ollama_fast.yaml | 21 ++++ src/webwright/config/model_ollama_qwen.yaml | 17 +++ src/webwright/config/model_ollama_reason.yaml | 21 ++++ src/webwright/models/__init__.py | 1 + src/webwright/models/_chat_utils.py | 106 ++++++++++++++++++ src/webwright/models/ollama_model.py | 99 ++++++++++++++++ 7 files changed, 285 insertions(+) create mode 100644 src/webwright/config/model_ollama.yaml create mode 100644 src/webwright/config/model_ollama_fast.yaml create mode 100644 src/webwright/config/model_ollama_qwen.yaml create mode 100644 src/webwright/config/model_ollama_reason.yaml create mode 100644 src/webwright/models/_chat_utils.py create mode 100644 src/webwright/models/ollama_model.py diff --git a/src/webwright/config/model_ollama.yaml 
b/src/webwright/config/model_ollama.yaml new file mode 100644 index 0000000..64145e3 --- /dev/null +++ b/src/webwright/config/model_ollama.yaml @@ -0,0 +1,20 @@ +# Model modifier — Ollama variant. +# +# Stack on top of base.yaml: +# python -m webwright.run.cli -c base.yaml -c model_ollama.yaml ... +# +# Connects to a local or cloud-hosted Ollama instance via /v1/chat/completions. +# No API key needed for local Ollama (http://localhost:11434). +# Cloud-hosted Ollama models (e.g. deepseek-v4-pro:cloud) may require +# OLLAMA_API_KEY env var for authentication. +# +# Available models on your host: +# deepseek-v4-pro:cloud (default — what Sisyphus itself runs on) +# qwen3-coder-next:cloud (80B, FP8) +# kimi-k2.6:cloud (1T, int4) +# glm-5.1:cloud + +model: + model_class: ollama + model_name: deepseek-v4-pro:cloud + ollama_endpoint: http://localhost:11434/v1/chat/completions diff --git a/src/webwright/config/model_ollama_fast.yaml b/src/webwright/config/model_ollama_fast.yaml new file mode 100644 index 0000000..c85c10b --- /dev/null +++ b/src/webwright/config/model_ollama_fast.yaml @@ -0,0 +1,21 @@ +# Model modifier — Ollama (fast mode — no reasoning). +# +# Stack on top of base.yaml: +# python -m webwright.run.cli -c base.yaml -c model_ollama_fast.yaml ... +# +# Sets reasoning_effort=none for models that support it (deepseek, qwen3, etc.), +# trading quality for speed — ~10x faster iterations than default reasoning mode. +# Override via OLLAMA_MODEL env var or edit model_name inline. 
+# +# Available models on your host: +# deepseek-v4-pro:cloud (1.6T, FP8) +# qwen3-coder-next:cloud (80B, FP8) +# kimi-k2.6:cloud (1T, int4) +# glm-5.1:cloud + +model: + model_class: ollama + model_name: deepseek-v4-pro:cloud + ollama_endpoint: http://localhost:11434/v1/chat/completions + ollama_extra_body: + reasoning_effort: none diff --git a/src/webwright/config/model_ollama_qwen.yaml b/src/webwright/config/model_ollama_qwen.yaml new file mode 100644 index 0000000..c9f0e4b --- /dev/null +++ b/src/webwright/config/model_ollama_qwen.yaml @@ -0,0 +1,17 @@ +# Model modifier — Ollama with Qwen Coder (80B, FP8). +# +# Stack on top of base.yaml: +# python -m webwright.run.cli -c base.yaml -c model_ollama_qwen.yaml ... +# +# Qwen3-Coder 80B is a good balance — fast reasoning=none and strong code gen. +# Good for: quick exploration, simple tasks, low-latency iteration. +# +# All ollama_extra_body keys are forwarded directly to the API payload. +# Common keys: reasoning_effort (none/low/medium/high), num_ctx, num_predict, seed. + +model: + model_class: ollama + model_name: qwen3-coder-next:cloud + ollama_endpoint: http://localhost:11434/v1/chat/completions + ollama_extra_body: + reasoning_effort: none diff --git a/src/webwright/config/model_ollama_reason.yaml b/src/webwright/config/model_ollama_reason.yaml new file mode 100644 index 0000000..73640bd --- /dev/null +++ b/src/webwright/config/model_ollama_reason.yaml @@ -0,0 +1,21 @@ +# Model modifier — Ollama (reasoning mode — max intelligence, slower). +# +# Stack on top of base.yaml: +# python -m webwright.run.cli -c base.yaml -c model_ollama_reason.yaml ... +# +# Leaves reasoning at model default (max) for highest quality on hard tasks. +# Each iteration takes 2-10 minutes but produces much better plans. +# Override via OLLAMA_MODEL env var or edit model_name inline. 
+#
+# Available models on your host:
+#   deepseek-v4-pro:cloud (1.6T, FP8)
+#   qwen3-coder-next:cloud (80B, FP8)
+#   kimi-k2.6:cloud (1T, int4)
+#   glm-5.1:cloud
+
+model:
+  model_class: ollama
+  model_name: deepseek-v4-pro:cloud
+  ollama_endpoint: http://localhost:11434/v1/chat/completions
+  ollama_extra_body:
+    reasoning_effort: high
diff --git a/src/webwright/models/__init__.py b/src/webwright/models/__init__.py
index 6631ac0..3d07c67 100644
--- a/src/webwright/models/__init__.py
+++ b/src/webwright/models/__init__.py
@@ -9,6 +9,7 @@
     "openai": "webwright.models.openai_model.OpenAIModel",
     "anthropic": "webwright.models.anthropic_model.AnthropicModel",
     "openrouter": "webwright.models.openrouter_model.OpenRouterModel",
+    "ollama": "webwright.models.ollama_model.OllamaModel",
 }
 
 
diff --git a/src/webwright/models/_chat_utils.py b/src/webwright/models/_chat_utils.py
new file mode 100644
index 0000000..3fb00c9
--- /dev/null
+++ b/src/webwright/models/_chat_utils.py
@@ -0,0 +1,147 @@
+"""Shared chat-completions serialization helpers used by Ollama and OpenRouter model backends."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from webwright.models.base import _safe_int
+
+
+def serialize_chat_content_part(part: dict[str, Any]) -> dict[str, Any] | None:
+    """Convert one internal content part to a chat-completions content part.
+
+    ``input_text``/``output_text`` parts become ``text`` parts and
+    ``input_image`` parts become ``image_url`` parts. Any other part type
+    returns ``None`` so the caller can drop it.
+    """
+    part_type = part.get("type")
+    if part_type in {"input_text", "output_text"}:
+        return {"type": "text", "text": str(part.get("text", "") or "")}
+    if part_type == "input_image":
+        return {
+            "type": "image_url",
+            "image_url": {
+                "url": str(part.get("image_url", "") or ""),
+                "detail": str(part.get("detail", "high") or "high"),
+            },
+        }
+    return None
+
+
+def serialize_chat_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Convert internal messages to chat-completions ``messages`` entries.
+
+    ``exit`` messages are skipped and every role other than ``system`` or
+    ``assistant`` is sent as ``user``. Assistant messages and text-only
+    messages are flattened to one newline-joined string; other messages keep
+    their list of content parts.
+    """
+    serialized: list[dict[str, Any]] = []
+    for message in messages:
+        role = message["role"]
+        if role == "exit":
+            continue
+        mapped_role = role if role in ("system", "assistant") else "user"
+        content = message.get("content", "")
+        if content is None:
+            # A null content is serialized like an empty string.
+            content = ""
+        if isinstance(content, str):
+            serialized.append({"role": mapped_role, "content": content})
+            continue
+        parts = [
+            serialized_part
+            for part in content
+            if isinstance(part, dict)
+            for serialized_part in [serialize_chat_content_part(part)]
+            if serialized_part is not None
+        ]
+        if mapped_role == "assistant" or all(part.get("type") == "text" for part in parts):
+            # Join text parts only: image parts have no "text" and would
+            # otherwise add empty lines to a flattened assistant message.
+            texts = [part["text"] for part in parts if part.get("type") == "text"]
+            serialized.append({"role": mapped_role, "content": "\n".join(texts)})
+        else:
+            serialized.append({"role": mapped_role, "content": parts})
+    return serialized
+
+
+def metrics_input_from_chat_messages(chat_messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Rebuild metrics input entries from serialized chat messages.
+
+    Text is kept as ``input_text`` parts; ``image_url`` parts are reduced to a
+    bare ``input_image`` marker without the image payload.
+    """
+    metrics_input: list[dict[str, Any]] = []
+    for message in chat_messages:
+        content = message.get("content", "")
+        if content is None:
+            # A null content is counted like an empty string.
+            content = ""
+        if isinstance(content, str):
+            metrics_input.append({"content": [{"type": "input_text", "text": content}]})
+            continue
+        parts: list[dict[str, Any]] = []
+        for part in content:
+            if not isinstance(part, dict):
+                continue
+            if part.get("type") == "text":
+                parts.append({"type": "input_text", "text": str(part.get("text", "") or "")})
+            elif part.get("type") == "image_url":
+                parts.append({"type": "input_image"})
+        metrics_input.append({"content": parts})
+    return metrics_input
+
+
+def extract_chat_completions_text(payload: dict[str, Any]) -> str:
+    """Return the text of the first choice in a chat-completions response.
+
+    String content is returned as is, a list of parts is reduced to its
+    ``text`` parts joined by newlines, and anything else (including a missing
+    or malformed ``choices``/``message``) yields an empty string.
+    """
+    choices = payload.get("choices")
+    if not isinstance(choices, list) or not choices:
+        return ""
+    first_choice = choices[0]
+    if not isinstance(first_choice, dict):
+        return ""
+    message = first_choice.get("message", {})
+    if not isinstance(message, dict):
+        return ""
+    content = message.get("content", "")
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return "\n".join(
+            str(part.get("text", "") or "")
+            for part in content
+            if isinstance(part, dict) and part.get("type") == "text"
+        )
+    return ""
+
+
+def usage_metrics_from_chat_completions(payload: dict[str, Any]) -> dict[str, int]:
+    """Extract token usage counters from a chat-completions response.
+
+    A missing or malformed ``usage`` object is treated as empty. Cached-input
+    and reasoning-output counts come from the optional
+    ``prompt_tokens_details``/``completion_tokens_details`` objects and stay 0
+    when the server does not report them.
+    """
+    usage = payload.get("usage")
+    if not isinstance(usage, dict):
+        usage = {}
+    prompt_details = usage.get("prompt_tokens_details")
+    cached = prompt_details.get("cached_tokens") if isinstance(prompt_details, dict) else None
+    completion_details = usage.get("completion_tokens_details")
+    reasoning = (
+        completion_details.get("reasoning_tokens") if isinstance(completion_details, dict) else None
+    )
+    return {
+        "input_tokens": _safe_int(usage.get("prompt_tokens")),
+        "output_tokens": _safe_int(usage.get("completion_tokens")),
+        "total_tokens": _safe_int(usage.get("total_tokens")),
+        "cached_input_tokens": 0 if cached is None else _safe_int(cached),
+        "reasoning_output_tokens": 0 if reasoning is None else _safe_int(reasoning),
+    }
diff --git a/src/webwright/models/ollama_model.py b/src/webwright/models/ollama_model.py
new file mode 100644
index 0000000..7d4e3f6
--- /dev/null
+++ b/src/webwright/models/ollama_model.py
@@ -0,0 +1,142 @@
+"""Ollama chat completions model backend.
+
+Drop-in replacement for OpenAI/Anthropic/OpenRouter — uses local or cloud-hosted
+Ollama models via the standard /v1/chat/completions endpoint. No API key
+required for local instances; set OLLAMA_API_KEY env var for cloud-hosted
+Ollama endpoints that require authentication.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from urllib.parse import urlparse
+
+from webwright.models._chat_utils import (
+    extract_chat_completions_text,
+    metrics_input_from_chat_messages,
+    serialize_chat_messages,
+    usage_metrics_from_chat_completions,
+)
+from webwright.models.base import (
+    BaseModel,
+    BaseModelConfig,
+    OptStr,
+)
+
+__all__ = [
+    "OllamaModel",
+    "OllamaModelConfig",
+]
+
+
+class OllamaModelConfig(BaseModelConfig):
+    model_name: OptStr = "deepseek-v4-pro:cloud"
+    ollama_api_key: OptStr = ""
+    ollama_endpoint: OptStr = "http://localhost:11434/v1/chat/completions"
+    ollama_extra_body: dict[str, Any] = {}
+
+
+# Placeholder key injected for loopback endpoints so the base-class key check
+# passes; it is never sent to the server.
+_NOOP_API_KEY = "sk-noop"
+
+# Endpoint assumed when the caller does not pass ``ollama_endpoint``. Keep in
+# sync with the ``OllamaModelConfig.ollama_endpoint`` default.
+_DEFAULT_OLLAMA_ENDPOINT = "http://localhost:11434/v1/chat/completions"
+
+
+def _is_localhost(endpoint: str) -> bool:
+    """Return True when *endpoint* points at the local machine.
+
+    Matches the ``localhost`` name and any loopback IP literal
+    (``127.0.0.0/8`` or ``::1``). Unparsable URLs and all other hosts return
+    False.
+    """
+    import ipaddress  # function-scope import keeps the module import block untouched
+
+    try:
+        host = (urlparse(endpoint).hostname or "").lower()
+    except ValueError:  # e.g. an unbalanced IPv6 bracket
+        return False
+    if host == "localhost":
+        return True
+    try:
+        return ipaddress.ip_address(host).is_loopback
+    except ValueError:  # not an IP literal
+        return False
+
+
+class OllamaModel(BaseModel):
+    """Model backend for an Ollama ``/v1/chat/completions`` endpoint."""
+
+    _API_KEY_FIELD = "ollama_api_key"
+    _ENV_VAR = "OLLAMA_API_KEY"
+    _LOG_SOURCE = "ollama"
+    _MAX_RATE_LIMIT_RETRIES = 5
+    _MAX_TRANSIENT_RETRIES = 5
+    _DEFAULT_CONFIG_CLASS = OllamaModelConfig
+
+    def __init__(self, *, config_class: type | None = None, **kwargs):
+        """Create the model, waiving the API key for localhost endpoints.
+
+        Localhost Ollama doesn't need an API key, so a placeholder key is
+        injected to skip the key check when none was supplied.
+        """
+        endpoint = kwargs.get("ollama_endpoint")
+        if not endpoint and config_class is None:
+            # No endpoint passed: assume the default config class is used and
+            # its (localhost) default endpoint applies.
+            endpoint = _DEFAULT_OLLAMA_ENDPOINT
+        # An explicitly empty or None key counts as "no key supplied".
+        if endpoint and _is_localhost(endpoint) and not kwargs.get("ollama_api_key"):
+            kwargs["ollama_api_key"] = _NOOP_API_KEY
+        super().__init__(config_class=config_class, **kwargs)
+
+    def _request_headers(self) -> dict[str, str]:
+        """Return JSON headers, adding a bearer token only for a real API key."""
+        headers = {"Content-Type": "application/json"}
+        api_key = self.config.ollama_api_key
+        if api_key and api_key != _NOOP_API_KEY:
+            headers["Authorization"] = f"Bearer {api_key}"
+        return headers
+
+    def _post_url(self) -> str:
+        """Return the configured chat-completions endpoint URL."""
+        return self.config.ollama_endpoint
+
+    def _ollama_payload(self, messages: list[dict[str, Any]], **fields: Any) -> dict[str, Any]:
+        """Build a non-streaming request body shared by both payload builders.
+
+        ``fields`` are added to the defaults, then ``ollama_extra_body`` is
+        applied last so configured keys override everything else.
+        """
+        payload: dict[str, Any] = {
+            "model": self.config.model_name,
+            "messages": serialize_chat_messages(messages),
+            "stream": False,
+            "max_tokens": self.config.max_output_tokens,
+            **fields,
+        }
+        if self.config.ollama_extra_body:
+            payload.update(self.config.ollama_extra_body)
+        return payload
+
+    def _build_payload(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
+        """Build the request body that asks for a JSON-object response."""
+        return self._ollama_payload(messages, response_format={"type": "json_object"})
+
+    def _build_text_payload(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
+        """Build the request body without a ``response_format`` constraint."""
+        return self._ollama_payload(messages)
+
+    def _request_metrics_input(self, payload: dict[str, Any]) -> list[dict[str, Any]]:
+        """Derive metrics input from the ``messages`` of a request payload."""
+        return metrics_input_from_chat_messages(payload.get("messages") or [])
+
+    def _extract_text(self, payload: dict[str, Any]) -> str:
+        """Return the response text of a chat-completions payload."""
+        return extract_chat_completions_text(payload)
+
+    def _usage_metrics_from_payload(self, payload: dict[str, Any]) -> dict[str, int]:
+        """Return token usage metrics of a chat-completions payload."""
+        return usage_metrics_from_chat_completions(payload)

From 35c1033be26c5d1de766e8eb9d95d441bd012e22 Mon Sep 17 00:00:00 2001
From: Bo Moon <54k@users.noreply.github.com>
Date: Tue, 2 Jun 2026 13:32:29 +0200
Subject: [PATCH 2/2] cleanup: remove model-specific mentions and
host-specific comments --- src/webwright/config/model_ollama.yaml | 15 ++++----------- src/webwright/config/model_ollama_fast.yaml | 14 ++------------ src/webwright/config/model_ollama_qwen.yaml | 17 ----------------- src/webwright/config/model_ollama_reason.yaml | 17 ++--------------- src/webwright/models/ollama_model.py | 2 +- 5 files changed, 9 insertions(+), 56 deletions(-) delete mode 100644 src/webwright/config/model_ollama_qwen.yaml diff --git a/src/webwright/config/model_ollama.yaml b/src/webwright/config/model_ollama.yaml index 64145e3..efbaa49 100644 --- a/src/webwright/config/model_ollama.yaml +++ b/src/webwright/config/model_ollama.yaml @@ -1,20 +1,13 @@ -# Model modifier — Ollama variant. +# Ollama model configuration. # # Stack on top of base.yaml: # python -m webwright.run.cli -c base.yaml -c model_ollama.yaml ... # # Connects to a local or cloud-hosted Ollama instance via /v1/chat/completions. -# No API key needed for local Ollama (http://localhost:11434). -# Cloud-hosted Ollama models (e.g. deepseek-v4-pro:cloud) may require -# OLLAMA_API_KEY env var for authentication. -# -# Available models on your host: -# deepseek-v4-pro:cloud (default — what Sisyphus itself runs on) -# qwen3-coder-next:cloud (80B, FP8) -# kimi-k2.6:cloud (1T, int4) -# glm-5.1:cloud +# Set OLLAMA_API_KEY env var for cloud-hosted endpoints that require authentication. +# Override model_name via OLLAMA_MODEL env var or edit inline. model: model_class: ollama - model_name: deepseek-v4-pro:cloud + model_name: llama3.2 ollama_endpoint: http://localhost:11434/v1/chat/completions diff --git a/src/webwright/config/model_ollama_fast.yaml b/src/webwright/config/model_ollama_fast.yaml index c85c10b..c878467 100644 --- a/src/webwright/config/model_ollama_fast.yaml +++ b/src/webwright/config/model_ollama_fast.yaml @@ -1,21 +1,11 @@ -# Model modifier — Ollama (fast mode — no reasoning). +# Ollama fast preset — no reasoning, maximum speed. 
# # Stack on top of base.yaml: # python -m webwright.run.cli -c base.yaml -c model_ollama_fast.yaml ... -# -# Sets reasoning_effort=none for models that support it (deepseek, qwen3, etc.), -# trading quality for speed — ~10x faster iterations than default reasoning mode. -# Override via OLLAMA_MODEL env var or edit model_name inline. -# -# Available models on your host: -# deepseek-v4-pro:cloud (1.6T, FP8) -# qwen3-coder-next:cloud (80B, FP8) -# kimi-k2.6:cloud (1T, int4) -# glm-5.1:cloud model: model_class: ollama - model_name: deepseek-v4-pro:cloud + model_name: llama3.2 ollama_endpoint: http://localhost:11434/v1/chat/completions ollama_extra_body: reasoning_effort: none diff --git a/src/webwright/config/model_ollama_qwen.yaml b/src/webwright/config/model_ollama_qwen.yaml deleted file mode 100644 index c9f0e4b..0000000 --- a/src/webwright/config/model_ollama_qwen.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Model modifier — Ollama with Qwen Coder (80B, FP8). -# -# Stack on top of base.yaml: -# python -m webwright.run.cli -c base.yaml -c model_ollama_qwen.yaml ... -# -# Qwen3-Coder 80B is a good balance — fast reasoning=none and strong code gen. -# Good for: quick exploration, simple tasks, low-latency iteration. -# -# All ollama_extra_body keys are forwarded directly to the API payload. -# Common keys: reasoning_effort (none/low/medium/high), num_ctx, num_predict, seed. - -model: - model_class: ollama - model_name: qwen3-coder-next:cloud - ollama_endpoint: http://localhost:11434/v1/chat/completions - ollama_extra_body: - reasoning_effort: none diff --git a/src/webwright/config/model_ollama_reason.yaml b/src/webwright/config/model_ollama_reason.yaml index 73640bd..e5d49e5 100644 --- a/src/webwright/config/model_ollama_reason.yaml +++ b/src/webwright/config/model_ollama_reason.yaml @@ -1,21 +1,8 @@ -# Model modifier — Ollama (reasoning mode — max intelligence, slower). 
-# -# Stack on top of base.yaml: -# python -m webwright.run.cli -c base.yaml -c model_ollama_reason.yaml ... -# -# Leaves reasoning at model default (max) for highest quality on hard tasks. -# Each iteration takes 2-10 minutes but produces much better plans. -# Override via OLLAMA_MODEL env var or edit model_name inline. -# -# Available models on your host: -# deepseek-v4-pro:cloud (1.6T, FP8) -# qwen3-coder-next:cloud (80B, FP8) -# kimi-k2.6:cloud (1T, int4) -# glm-5.1:cloud +# Ollama reasoning preset — highest quality, slower iteration. model: model_class: ollama - model_name: deepseek-v4-pro:cloud + model_name: llama3.2 ollama_endpoint: http://localhost:11434/v1/chat/completions ollama_extra_body: reasoning_effort: high diff --git a/src/webwright/models/ollama_model.py b/src/webwright/models/ollama_model.py index 7d4e3f6..2e82211 100644 --- a/src/webwright/models/ollama_model.py +++ b/src/webwright/models/ollama_model.py @@ -30,7 +30,7 @@ class OllamaModelConfig(BaseModelConfig): - model_name: OptStr = "deepseek-v4-pro:cloud" + model_name: OptStr = "llama3.2" ollama_api_key: OptStr = "" ollama_endpoint: OptStr = "http://localhost:11434/v1/chat/completions" ollama_extra_body: dict[str, Any] = {}