diff --git a/.gitignore b/.gitignore
index 4ca2aa69e1..a81359a7ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,10 @@ config/local
 # Build system
 rundir/
 
+# Python
+__pycache__/
+*.py[cod]
+
 # F# / dotnet
 backend/src/LibExecution/package-ref-hashes.txt
 backend/packages/
@@ -54,5 +58,10 @@ clis/
 # Scratch planning/execution notes, not part of any PR.
 scratch/
 
+# Benchmark run output, regenerated by bench/run.py.
+bench/results.csv
+bench/runs/
+bench/report.md
+
 # Claude Code session lock (per-process, regenerated each session).
-.claude/scheduled_tasks.lock
\ No newline at end of file
+.claude/scheduled_tasks.lock
diff --git a/bench/README.md b/bench/README.md
new file mode 100644
index 0000000000..f3e5b8b0f1
--- /dev/null
+++ b/bench/README.md
@@ -0,0 +1,34 @@
+From a terminal, run `bench/run.py`.
+
+# bench
+
+Run `claude -p` against tasks in multiple languages and record
+cost/turns/wall/pass-fail metrics.
+
+## Invocations
+
+| invocation | runs |
+|---|---|
+| `bench/run.py` | every task in `bench/tasks/*.md` × python+dark × 1 trial |
+| `bench/run.py --task url-shortener-cli` | one task × python+dark × 1 trial |
+| `bench/run.py --langs python --trials 3` | every task × python × 3 trials |
+| `bench/run.py --task X --langs dark --trials 5` | one task × dark × 5 trials |
+
+## Adding a task
+
+Drop a markdown file at `bench/tasks/<name>.md` describing what to build.
+The runner picks it up automatically on the next run.
+
+The model gets your task md + a short language note (from `LANG_RULES` in
+`run.py`) + "print PASS on the last line if it works." Pass/fail is graded
+by reading the model's final reply.
+
+## Output
+
+- `bench/runs/<id>/` — one dir per trial: `prompt.md`, `claude_result.json`,
+  `metrics.json`, and a `workspace/` with the seed `./run` shim plus whatever
+  the model wrote. Gitignored.
+- `bench/results.csv` — one row per trial with the curated metrics. Gitignored.
+- `bench/report.md` — overwritten each run when any dark trial surfaces
+  friction notes. The summary table itself is printed to stdout, not
+  written here. Gitignored.
diff --git a/bench/lang/dark/run b/bench/lang/dark/run
new file mode 100755
index 0000000000..19843540bd
--- /dev/null
+++ b/bench/lang/dark/run
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# Seed shim: dispatches CLI args into the Darklang package tree on a
+# bench-specific branch.
+#
+# Required env (the harness sets these):
+#   BENCH_DARK_BRANCH     branch holding the implementation
+#   DARK_REPO             absolute path to the Darklang monorepo
+#
+# Auto-derived:
+#   BENCH_STATE_DIR       <workspace>/dark-state/  (one file per entry)
+#
+# {{namespace}} is substituted by the harness from the task name. Functions
+# live at Bench.{{namespace}}.<subcommand> — one per command in the spec.
+#
+# Do not modify this file.
+set -e
+
+: "${BENCH_DARK_BRANCH:?BENCH_DARK_BRANCH must be set}"
+: "${DARK_REPO:?DARK_REPO must be set}"
+
+CMD="${1:?usage: ./run <subcommand> [args...]}"  # fail fast rather than dispatch to an empty function name
+shift || true
+FN="Bench.{{namespace}}.${CMD}"
+
+WORKSPACE="$(pwd)"
+export BENCH_STATE_DIR="${WORKSPACE}/dark-state"
+mkdir -p "$BENCH_STATE_DIR"
+
+# Build Dark-literal args. Strings here don't contain " or \, so simple
+# double-quoting suffices. Zero-arg commands need explicit unit.
+DARK_ARGS=()
+if [ $# -eq 0 ]; then
+  DARK_ARGS+=("()")
+else
+  for a in "$@"; do
+    DARK_ARGS+=("\"$a\"")
+  done
+fi
+
+# run-cli requires cwd = repo root.
+cd "$DARK_REPO"
+exec ./scripts/run-cli --branch "$BENCH_DARK_BRANCH" run "@$FN" "${DARK_ARGS[@]}"
diff --git a/bench/lang/python/run b/bench/lang/python/run
new file mode 100755
index 0000000000..97155bb111
--- /dev/null
+++ b/bench/lang/python/run
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Seed shim: dispatches CLI args to the Python implementation.
+# The implementation should live in main.py (or a module imported from it).
+# Do not modify this file.
+set -e
+exec python3 "$(dirname "$0")/main.py" "$@"  # main.py is resolved next to this script, not via cwd
diff --git a/bench/lang/ts/run b/bench/lang/ts/run
new file mode 100755
index 0000000000..c701a3052b
--- /dev/null
+++ b/bench/lang/ts/run
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Seed shim: dispatches CLI args to the TypeScript implementation.
+# The implementation should live in main.ts.
+# Do not modify this file.
+set -e
+exec bun "$(dirname "$0")/main.ts" "$@"  # main.ts is resolved next to this script, not via cwd
diff --git a/bench/run.py b/bench/run.py
new file mode 100755
index 0000000000..b9d6f748fe
--- /dev/null
+++ b/bench/run.py
@@ -0,0 +1,385 @@
+#!/usr/bin/env python3
+"""Bench runner: drive `claude -p` per (task, language, trial), grade the
+model's self-reported PASS, and append cost/token/pass-fail/time metrics to
+bench/results.csv.
+
+Usage:
+    bench/run.py --langs python --trials 1
+    bench/run.py --langs dark,python,ts --trials 5
+"""
+from __future__ import annotations
+
+import argparse
+import csv
+import datetime as dt
+import functools
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+import uuid
+from pathlib import Path
+
+HARNESS_VERSION = "0.1"  # recorded in every metrics row
+BENCH_ROOT = Path(__file__).resolve().parent
+REPO_ROOT  = BENCH_ROOT.parent
+TASKS_DIR  = BENCH_ROOT / "tasks"  # one <task>.md per task
+LANG_DIR   = BENCH_ROOT / "lang"  # <lang>/run seed shims
+RUNS_DIR   = BENCH_ROOT / "runs"  # one directory per trial
+RESULTS    = BENCH_ROOT / "results.csv"
+REPORT     = BENCH_ROOT / "report.md"
+
+CSV_FIELDS = [  # column order of results.csv
+    "run_id", "timestamp", "task", "lang", "trial",
+    "passed", "num_turns", "wall_ms",
+    "input_tokens", "output_tokens",
+    "cache_read_tokens", "cache_creation_tokens",
+    "cost_usd", "model", "session_id",
+    "harness_version", "git_sha", "claude_version",
+    "error",
+]
+
+DEFAULT_TIMEOUT_SEC = 900   # 15 min per trial
+DEFAULT_BUDGET_USD  = 5.0  # per trial; forwarded as `claude --max-budget-usd`
+DEFAULT_LANGS       = "python,dark"
+DEFAULT_MODEL       = "claude-opus-4-6"
+
+# Per-language structural rules. Inlined into every prompt so the model
+# knows where the seed shim expects the implementation. The Dark rules
+# are non-negotiable (the shim dispatches on $BENCH_DARK_BRANCH and the
+# Bench.<NS>.<cmd> naming); python/ts only need the filename the shim runs.
+LANG_RULES = {
+    "python": (
+        "Build with Python 3 stdlib only. Put your implementation in `main.py` — "
+        "the seed `./run` runs `python3 main.py \"$@\"`."
+    ),
+    "ts": (
+        "Build with Bun. Put your implementation in `main.ts` — the seed `./run` "
+        "runs `bun main.ts \"$@\"`."
+    ),
+    "dark": (
+        "Build using `$DARK_REPO/scripts/run-cli`. Read `./scripts/run-cli docs for-ai` "
+        "for syntax and workflow.\n"
+        "Bench rules (the seed shim depends on these):\n"
+        "- `cd \"$DARK_REPO\"` before invoking `./scripts/run-cli` (your cwd is the "
+        "workspace, not the repo root).\n"
+        "- Pass `--branch \"$BENCH_DARK_BRANCH\"` on every `run-cli` invocation.\n"
+        "- Build functions named `Bench.{{namespace}}.<subcommand>` — one per "
+        "subcommand in the spec; the shim dispatches by name.\n"
+        "- Read/write persisted state under `$BENCH_STATE_DIR`."
+    ),
+}
+
+
+def shell(cmd: list[str], **kw) -> subprocess.CompletedProcess:
+    return subprocess.run(cmd, capture_output=True, text=True, **kw)  # stdout/stderr captured as text; extra kwargs are forwarded
+
+
+@functools.cache
+def git_sha() -> str:  # short HEAD sha of REPO_ROOT; cached, so git runs at most once per process
+    try:
+        return shell(["git", "-C", str(REPO_ROOT), "rev-parse", "--short", "HEAD"]).stdout.strip()
+    except FileNotFoundError:
+        return ""  # git executable not found
+
+
+@functools.cache
+def claude_version() -> str:  # stripped stdout of `claude --version`; cached for the process lifetime
+    try:
+        return shell(["claude", "--version"]).stdout.strip()
+    except FileNotFoundError:
+        return ""  # claude executable not found
+
+
+def create_dark_branch(branch_name: str) -> None:  # runs `scripts/run-cli branch create <name>` from the repo root
+    r = shell(
+        [str(REPO_ROOT / "scripts" / "run-cli"), "branch", "create", branch_name],
+        cwd=str(REPO_ROOT),
+    )
+    if r.returncode != 0:
+        raise RuntimeError(f"branch create failed: {r.stderr or r.stdout}")  # surfaces the CLI's own output
+
+
+def task_namespace(task: str) -> str:
+    """kebab-case task name → PascalCase Darklang namespace component.
+
+    e.g. "url-shortener-cli" → "UrlShortenerCli". Only each part's first character is upper-cased; empty parts are dropped. The shim prepends "Bench."
+    """
+    return "".join(part[:1].upper() + part[1:] for part in task.split("-") if part)
+
+
+def setup_workspace(workspace: Path, lang: str, namespace: str) -> None:
+    """Create the workspace dir and write the lang-default ./run shim into it, executable (0o755)."""
+    workspace.mkdir(parents=True, exist_ok=True)
+    shim_src = (LANG_DIR / lang / "run").read_text()
+    if lang == "dark":
+        shim_src = shim_src.replace("{{namespace}}", namespace)  # every occurrence, comments included
+    shim_dst = workspace / "run"
+    shim_dst.write_text(shim_src)
+    shim_dst.chmod(0o755)
+
+
+def build_prompt(task_md: str, lang: str, namespace: str) -> str:  # task text + language rules + verify/PASS instructions
+    rules = LANG_RULES[lang].replace("{{namespace}}", namespace)  # KeyError if lang has no LANG_RULES entry
+    where = ("All implementation work happens in the Darklang package tree on "
+             "the branch named in $BENCH_DARK_BRANCH."
+             if lang == "dark"
+             else "Build the implementation in the current working directory.")
+    friction = (  # dark-only request; collect_dark_friction() later extracts this section from the reply
+        "Before the final PASS line, write a section titled `## Dark friction` "
+        "listing 3–5 specific things that slowed you down or surprised you while "
+        "implementing this in Darklang (concrete: name the function, feature, or "
+        "error message; skip generic complaints). Omit the section only if "
+        "nothing notable came up.\n\n"
+        if lang == "dark"
+        else ""
+    )
+    return (
+        f"{task_md.rstrip()}\n\n"
+        f"---\n\n"
+        f"Implement in **{lang}**. {where}\n\n"
+        f"{rules}\n\n"
+        f"When you finish, verify the implementation by exercising `./run`. "
+        f"{friction}"
+        f"Print `PASS` on the last line of your final reply if it meets every "
+        f"requirement, otherwise print a short reason."
+    )
+
+
+def collect_dark_friction(rows: list[dict]) -> list[tuple[str, int, str]]:
+    """For each dark trial, pull the `## Dark friction` section out of the model's final reply
+    (read from runs/<run_id>/claude_result.json). Returns (task, trial, body) tuples; trials with no section are skipped."""
+    out: list[tuple[str, int, str]] = []
+    for r in rows:
+        if r["lang"] != "dark":
+            continue
+        cr = RUNS_DIR / r["run_id"] / "claude_result.json"
+        if not cr.is_file():
+            continue
+        try:
+            data = json.loads(cr.read_text())
+        except json.JSONDecodeError:
+            continue  # unparseable output contributes nothing
+        text = data.get("result") or ""
+        m = re.search(  # body runs until the next "##" heading, a line starting with PASS, or end of text
+            r"##\s*Dark friction\s*\n(.+?)(?=\n##|\nPASS\b|\Z)",
+            text, re.DOTALL | re.IGNORECASE,
+        )
+        if m and m.group(1).strip():
+            out.append((r["task"], r["trial"], m.group(1).strip()))
+    return out
+
+
+def grade(claude_result: dict) -> tuple[bool, str]:
+    """Return (passed, reason): passed iff the reply's last line starts with the word PASS/PASSED (any case); otherwise reason is that line, capped at 200 chars."""
+    text = (claude_result.get("result") or "").strip()
+    last = text.splitlines()[-1].strip() if text else ""
+    if re.match(r"PASS(ED)?\b", last, re.IGNORECASE):  # whole word only: "Password ..." must not grade as a pass
+        return True, ""
+    return False, last[:200]
+
+
+def run_claude(prompt: str, workspace: Path, env: dict, timeout_s: int, budget: float, model: str) -> tuple[dict, str, int]:  # -> (parsed result, raw stdout, exit code)
+    cmd = [
+        "claude", "-p",
+        "--output-format", "json",
+        "--permission-mode", "bypassPermissions",
+        "--add-dir", str(REPO_ROOT),
+        "--max-budget-usd", str(budget),
+        "--model", model,
+    ]
+    started = time.monotonic()
+    try:
+        proc = subprocess.run(
+            cmd,
+            cwd=str(workspace),
+            env=env,
+            input=prompt,  # the prompt is fed on stdin
+            capture_output=True,
+            text=True,
+            timeout=timeout_s,
+        )
+    except subprocess.TimeoutExpired as e:
+        wall_ms = int((time.monotonic() - started) * 1000)
+        partial = e.stdout or b""
+        if isinstance(partial, bytes):
+            partial = partial.decode(errors="replace")
+        return ({"_error": "timeout", "_wall_ms": wall_ms}, partial, -1)  # -1 stands in for the exit code on timeout
+    wall_ms = int((time.monotonic() - started) * 1000)
+    raw = proc.stdout
+    parsed: dict = {}
+    try:
+        parsed = json.loads(raw) if raw.strip() else {}  # blank stdout yields an empty result rather than a parse error
+    except json.JSONDecodeError:
+        parsed = {"_error": "json-parse"}
+    parsed["_wall_ms"]  = wall_ms  # underscore-prefixed keys are added by the harness
+    parsed["_stderr_tail"] = (proc.stderr or "")[-2000:]
+    return parsed, raw, proc.returncode
+
+
+def metrics_from_result(r: dict, requested_model: str) -> dict:  # flatten the claude result into CSV-ready fields
+    usage = r.get("usage") or {}
+    return {  # any field absent from the result comes back as None
+        "num_turns":             r.get("num_turns"),
+        "input_tokens":          usage.get("input_tokens"),
+        "output_tokens":         usage.get("output_tokens"),
+        "cache_read_tokens":     usage.get("cache_read_input_tokens"),
+        "cache_creation_tokens": usage.get("cache_creation_input_tokens"),
+        "cost_usd":              r.get("total_cost_usd"),
+        "model":                 requested_model,  # the model that was requested, not one read back from the result
+        "session_id":            r.get("session_id"),
+    }
+
+
+def append_csv(row: dict) -> None:  # append one row to results.csv, creating the file on first use
+    new = not RESULTS.exists()
+    RESULTS.parent.mkdir(parents=True, exist_ok=True)
+    with RESULTS.open("a", newline="") as f:
+        w = csv.DictWriter(f, fieldnames=CSV_FIELDS)
+        if new:
+            w.writeheader()  # header only when the file did not exist before this call
+        w.writerow({k: row.get(k, "") for k in CSV_FIELDS})  # missing keys become ""; keys outside CSV_FIELDS are dropped
+
+
+def do_trial(task: str, lang: str, trial: int, args: argparse.Namespace) -> dict:  # run one (task, lang, trial); returns its metrics row
+    task_md_path = TASKS_DIR / f"{task}.md"
+    if not task_md_path.is_file():
+        raise SystemExit(f"missing task file: {task_md_path}")
+    if not (LANG_DIR / lang / "run").is_file():
+        raise SystemExit(f"unknown lang '{lang}' (no bench/lang/{lang}/run)")
+
+    task_md = task_md_path.read_text()
+    namespace = task_namespace(task)
+
+    ts        = dt.datetime.now().strftime("%Y%m%dT%H%M%S")  # local time, second resolution
+    run_id    = f"{ts}_{task}_{lang}_t{trial}_{uuid.uuid4().hex[:6]}"
+    run_dir   = RUNS_DIR / run_id
+    workspace = run_dir / "workspace"
+    run_dir.mkdir(parents=True)  # no exist_ok: an existing run dir raises
+
+    setup_workspace(workspace, lang, namespace)
+
+    env = os.environ.copy()
+    if lang == "dark":
+        branch_name = f"bench-{task}-{ts}-t{trial}".replace("_", "-")  # NOTE(review): no uuid suffix, unlike run_id
+        create_dark_branch(branch_name)
+        env["BENCH_DARK_BRANCH"] = branch_name
+        env["DARK_REPO"]         = str(REPO_ROOT)
+
+    prompt = build_prompt(task_md, lang, namespace)
+    (run_dir / "prompt.md").write_text(prompt)
+
+    print(f"[{run_id}] starting claude model={args.model} (timeout={args.timeout}s, budget=${args.budget})...", flush=True)
+    result, raw, rc = run_claude(prompt, workspace, env, args.timeout, args.budget, args.model)
+    (run_dir / "claude_result.json").write_text(raw or "")  # raw stdout as captured, which may be partial after a timeout
+
+    err = result.get("_error", "")
+    if rc != 0 and not err:
+        err = f"claude-exit-{rc}"
+    passed, fail_reason = (False, "")
+    if not err:
+        passed, fail_reason = grade(result)  # only graded when claude itself ran cleanly
+
+    m = metrics_from_result(result, args.model)
+    metrics = {
+        "run_id":          run_id,
+        "timestamp":       dt.datetime.now().isoformat(timespec="seconds"),
+        "task":            task,
+        "lang":            lang,
+        "trial":           trial,
+        "passed":          passed,
+        "wall_ms":         result.get("_wall_ms"),
+        "harness_version": HARNESS_VERSION,
+        "git_sha":         git_sha(),
+        "claude_version":  claude_version(),
+        "error":           err or fail_reason,  # harness error if any, else the model's stated failure reason
+        **m,
+    }
+    (run_dir / "metrics.json").write_text(json.dumps(metrics, indent=2) + "\n")
+    append_csv(metrics)
+
+    status = "PASS" if passed else ("ERR" if err else "FAIL")
+    cost = metrics.get("cost_usd")
+    cost_s = f"${cost:.3f}" if isinstance(cost, (int, float)) else "$?"
+    print(f"[{run_id}] {status}  turns={metrics.get('num_turns')}  "
+          f"wall={metrics.get('wall_ms')}ms  {cost_s}", flush=True)
+    return metrics
+
+
+def write_report(rows: list[dict]) -> bool:
+    """Write bench/report.md with the dark friction notes from this run.
+    Returns True if a report was written, False if there was nothing to say (any existing report is then left untouched)."""
+    sections = collect_dark_friction(rows)
+    if not sections:
+        return False
+    out: list[str] = []
+    out.append("# Dark friction")
+    out.append("")
+    out.append(f"Generated: {dt.datetime.now().isoformat(timespec='seconds')} "
+               f"(git {git_sha() or '?'}, {claude_version() or 'claude ?'})")
+    for task, trial, body in sections:
+        out.append("")
+        out.append(f"## {task} (trial {trial})")
+        out.append("")
+        out.append(body)
+    REPORT.write_text("\n".join(out) + "\n")  # overwrites the previous report
+    return True
+
+
+def print_summary(rows: list[dict]) -> None:  # print one table per task to stdout, one line per language
+    if not rows:
+        return
+    by_task: dict[str, list[dict]] = {}
+    for r in rows:
+        by_task.setdefault(r["task"], []).append(r)
+    for task in sorted(by_task):
+        print()
+        print(task)
+        print(f"  {'lang':<8} {'pass':>5} {'turns':>6} {'wall_s':>8} {'cost':>8}")
+        by_lang: dict[str, list[dict]] = {}
+        for r in by_task[task]:
+            by_lang.setdefault(r["lang"], []).append(r)
+        for lang, rs in by_lang.items():  # languages appear in first-seen order
+            passed     = sum(1 for r in rs if r["passed"])
+            avg_turns  = sum((r.get("num_turns") or 0)  for r in rs) / len(rs)  # missing/None metrics count as 0 in the averages
+            avg_wall_s = sum((r.get("wall_ms")   or 0)  for r in rs) / len(rs) / 1000
+            avg_cost   = sum((r.get("cost_usd")  or 0.0) for r in rs) / len(rs)
+            print(f"  {lang:<8} {passed}/{len(rs):<3} {avg_turns:>6.1f} "
+                  f"{avg_wall_s:>8.1f} ${avg_cost:>6.3f}")
+
+
+def main() -> int:  # CLI entry point: run every task × lang × trial serially, then summarise
+    p = argparse.ArgumentParser()
+    p.add_argument("--task",    default=None,
+                   help="limit to one task (default: every .md under bench/tasks/)")
+    p.add_argument("--langs",   default=DEFAULT_LANGS, help="comma-separated: dark,python,ts")
+    p.add_argument("--trials",  type=int,   default=1)
+    p.add_argument("--timeout", type=int,   default=DEFAULT_TIMEOUT_SEC)
+    p.add_argument("--budget",  type=float, default=DEFAULT_BUDGET_USD)
+    p.add_argument("--model",   default=DEFAULT_MODEL,
+                   help="model name passed to `claude -p --model`")
+    args = p.parse_args()
+
+    if args.task:
+        tasks = [args.task]  # existence is only checked later, inside do_trial
+    else:
+        tasks = sorted(t.stem for t in TASKS_DIR.glob("*.md"))
+        if not tasks:
+            raise SystemExit(f"no tasks found under {TASKS_DIR}")
+
+    langs = [l.strip() for l in args.langs.split(",") if l.strip()]
+    rows: list[dict] = []
+    for task in tasks:
+        for lang in langs:
+            for trial in range(1, args.trials + 1):  # trials are numbered from 1
+                rows.append(do_trial(task, lang, trial, args))  # NOTE(review): an unknown lang/task aborts here, after earlier trials already ran
+    print_summary(rows)
+    if write_report(rows):
+        print(f"\nfriction report: {REPORT}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/bench/tasks/countdown-timer.md b/bench/tasks/countdown-timer.md
new file mode 100644
index 0000000000..7ffa1f4323
--- /dev/null
+++ b/bench/tasks/countdown-timer.md
@@ -0,0 +1,8 @@
+# countdown-timer
+
+Build a countdown timer that takes a number of seconds and counts down to
+zero, printing the remaining time each second. Non-integer or negative
+input prints an error.
+
+When done, verify it works and print `PASS` on the last line. Otherwise
+print a short reason.
diff --git a/bench/tasks/even-or-odd-cli.md b/bench/tasks/even-or-odd-cli.md
new file mode 100644
index 0000000000..107eac5b50
--- /dev/null
+++ b/bench/tasks/even-or-odd-cli.md
@@ -0,0 +1,6 @@
+# even-or-odd-cli
+
+Classify an integer as even or odd. Non-integer input prints an error.
+
+When done, verify it works and print `PASS` on the last line. Otherwise
+print a short reason.
diff --git a/bench/tasks/fizzbuzz.md b/bench/tasks/fizzbuzz.md
new file mode 100644
index 0000000000..9f17e5ffb1
--- /dev/null
+++ b/bench/tasks/fizzbuzz.md
@@ -0,0 +1,9 @@
+# fizzbuzz
+
+Build FizzBuzz: given a positive integer N, print the numbers 1..N, one per
+line, replacing multiples of 3 with `Fizz`, multiples of 5 with `Buzz`, and
+multiples of both with `FizzBuzz`. Non-integer or non-positive input prints
+an error.
+
+When done, verify it works and print `PASS` on the last line. Otherwise
+print a short reason.
diff --git a/bench/tasks/github-stats.md b/bench/tasks/github-stats.md
new file mode 100644
index 0000000000..7e9b619601
--- /dev/null
+++ b/bench/tasks/github-stats.md
@@ -0,0 +1,6 @@
+# github-stats
+
+Print how many open pull requests and open issues a GitHub repository has.
+Test it with `https://github.com/darklang/dark`.
+When done, verify it works and print `PASS` on the last line. Otherwise
+print a short reason.
diff --git a/bench/tasks/hangman.md b/bench/tasks/hangman.md
new file mode 100644
index 0000000000..1f92a98429
--- /dev/null
+++ b/bench/tasks/hangman.md
@@ -0,0 +1,9 @@
+# hangman
+
+Build the Hangman game. Pick a secret word, let the player guess letters
+one at a time, show progress after each guess, and limit wrong guesses to
+6. Announce win or loss at the end. Invalid input (not a single letter)
+prints an error and does not count as a guess.
+
+When done, verify it works and print `PASS` on the last line. Otherwise
+print a short reason.
diff --git a/bench/tasks/password-gen.md b/bench/tasks/password-gen.md
new file mode 100644
index 0000000000..8453b9035f
--- /dev/null
+++ b/bench/tasks/password-gen.md
@@ -0,0 +1,8 @@
+# password-gen
+
+Build a password generator that prints a random
+alphanumeric password of the given length. Different invocations produce
+different passwords. Non-integer input prints an error.
+
+When done, verify it works and print `PASS` on the last line. Otherwise
+print a short reason.
diff --git a/bench/tasks/todo-list-cli.md b/bench/tasks/todo-list-cli.md
new file mode 100644
index 0000000000..4d5f861767
--- /dev/null
+++ b/bench/tasks/todo-list-cli.md
@@ -0,0 +1,7 @@
+# todo-list-cli
+
+Build a to-do list app with persistent state. Support adding items, listing
+them, and marking them done. Operating on an unknown item prints an error.
+
+When done, verify it works and print `PASS` on the last line. Otherwise
+print a short reason.
diff --git a/bench/tasks/unit-convert.md b/bench/tasks/unit-convert.md
new file mode 100644
index 0000000000..b0f9bfc0c4
--- /dev/null
+++ b/bench/tasks/unit-convert.md
@@ -0,0 +1,7 @@
+# unit-convert
+
+Build a unit converter between Celsius and Fahrenheit. Non-numeric input
+prints an error.
+
+When done, verify it works and print `PASS` on the last line. Otherwise
+print a short reason.
diff --git a/bench/tasks/url-shortener-cli.md b/bench/tasks/url-shortener-cli.md
new file mode 100644
index 0000000000..1df570c5ea
--- /dev/null
+++ b/bench/tasks/url-shortener-cli.md
@@ -0,0 +1,7 @@
+# url-shortener-cli
+
+Build a URL shortener with persistent state (same URL → same code).
+Looking up an unknown code prints an error.
+
+When done, verify it works and print `PASS` on the last line. Otherwise
+print a short reason.