diff --git a/kits/llm-eval-harness/.gitignore b/kits/llm-eval-harness/.gitignore new file mode 100644 index 00000000..e916ce5a --- /dev/null +++ b/kits/llm-eval-harness/.gitignore @@ -0,0 +1,5 @@ +.lamatic/ +node_modules/ +.next/ +.env +.env.local diff --git a/kits/llm-eval-harness/README.md b/kits/llm-eval-harness/README.md new file mode 100644 index 00000000..a7dc8fea --- /dev/null +++ b/kits/llm-eval-harness/README.md @@ -0,0 +1,81 @@ +# LLM Eval Harness + +A ready-to-deploy kit that scores an LLM prompt against a **golden set** using an **LLM-as-judge**, then applies a **CI-style pass/fail gate** — so you can catch quality regressions *before* they ship. + +> Point it at any system prompt, give it a handful of test cases with expected criteria, and it tells you whether the prompt's outputs are faithful, relevant, and correct — with a single GATE PASSED / GATE FAILED verdict. + +--- + +## The problem + +When you ship an LLM feature and then tweak a prompt or swap a model, output quality can silently regress — a small wording change makes the model hallucinate, over-promise, or drift off-task, and you don't find out until a user does. Eyeballing a few outputs doesn't scale and isn't repeatable. + +Teams solve this with an **evaluation harness**: a fixed set of representative inputs (a *golden set*), an automated grader, and a quality bar that must be met to ship. This kit packages that pattern as a hosted, reusable tool on Lamatic. + +## The approach + +For each case in the golden set, the kit runs two flows: + +1. **`run-target`** — sends your system-prompt-under-test + the case input to an LLM and captures the output (the *system under test*). +2. **`judge`** — an LLM-as-judge scores that output against the case's `criteria` (and optional `reference`) on three dimensions, **0–5** each: + - **Faithfulness** — is every claim grounded? (hallucination is penalised hard — it's a veto) + - **Relevancy** — does it actually address the input? 
+ - **Correctness** — does it satisfy the case criteria? + +The app aggregates the per-case verdicts into a **pass rate** and compares it to a threshold you set (default **90%**) to produce the gate. A case **passes** only if `overall ≥ 3.5` **and** `faithfulness ≥ 3`. + +``` +golden case ──▶ run-target (LLM) ──▶ output ──▶ judge (LLM-as-judge) ──▶ {scores, pass, reasoning} + │ + all cases ──▶ pass rate vs threshold ──▶ GATE PASS / FAIL +``` + +## Results + +- Runs entirely on **Lamatic flows** (Groq `llama-3.3-70b-versatile`, temperature 0 for deterministic scoring). +- The judge reliably **distinguishes good from bad output** — e.g. it fails a support reply that invents a refund against a "final-sale is non-refundable" policy (faithfulness 0), and passes a correct, grounded reply. +- Per-case results are expandable to show the generated output and the judge's reasoning, so a failure tells you *why*. + +## Tradeoffs & assumptions + +- **Single provider (v1):** the flows use Groq. Lamatic stores model credentials at the project level, so multi-provider / bring-your-own-key was deliberately scoped out of v1 — runtime credential injection is a security tradeoff worth doing properly rather than quickly. +- **App-side loop:** the golden set is iterated in the Next.js server action (3 cases concurrently) rather than inside one flow, which keeps the flows simple and lets the UI surface per-case progress and errors. +- **Gate recomputed in code:** `overall` and `pass` are recomputed from the judge's dimension scores in the app, so the gate is deterministic and not dependent on the model's own arithmetic. +- **Defensive parsing:** judge output is tolerant of code fences and minor formatting; run-target output is HTML-entity-decoded before scoring. + +--- + +## Flows + +| Flow | Input | Output | +|------|-------|--------| +| `judge` | `{ input, output, criteria, reference? 
}` | `{ faithfulness, relevancy, correctness, overall, pass, reasoning }` | +| `run-target` | `{ systemPrompt, input }` | `{ answer }` (the generated output under test) | + +## Setup + +```bash +cd kits/llm-eval-harness/apps +cp .env.example .env.local # then fill in the values below +npm install +npm run dev # http://localhost:3000 +``` + +### Environment variables + +| Variable | Where to find it | +|----------|------------------| +| `JUDGE_FLOW` | Deploy the `judge` flow in Lamatic Studio → copy its Flow ID | +| `RUN_TARGET_FLOW` | Deploy the `run-target` flow → copy its Flow ID | +| `LAMATIC_API_URL` | Studio → Settings / API | +| `LAMATIC_PROJECT_ID` | Studio → Project settings | +| `LAMATIC_API_KEY` | Studio → API Keys | + +## Usage + +1. Paste the **system prompt** you want to evaluate. +2. Provide a **golden set** as JSON — an array of `{ input, criteria, reference? }`. +3. Set a **gate threshold** (default 90%). +4. Click **Run evaluation** — or **Load example** to try a support-agent scenario. + +Built on [Lamatic](https://lamatic.ai). diff --git a/kits/llm-eval-harness/agent.md b/kits/llm-eval-harness/agent.md new file mode 100644 index 00000000..15a8c0bf --- /dev/null +++ b/kits/llm-eval-harness/agent.md @@ -0,0 +1,52 @@ +# LLM Eval Harness + +## Overview +The LLM Eval Harness is a quality-gate agent for other LLM features. Given a system prompt and a golden set of test cases, it runs each case through the prompt-under-test and then grades the output with an LLM-as-judge across faithfulness, relevancy, and correctness, returning per-case scores and a single pass/fail gate. It is invoked by a Next.js web UI that calls two Lamatic flows and aggregates the verdicts. It depends on Lamatic's hosted runtime, project credentials, and a connected text-generation provider (Groq). + +## Purpose +Prompt and model changes can silently regress output quality — a reworded instruction starts hallucinating, over-promising, or drifting off-task. 
This agent makes that measurable and repeatable: a fixed golden set plus an automated judge plus a quality threshold, so a regression is caught as a failed gate rather than by a user. It generalises the eval-harness pattern (golden sets + LLM-as-judge + CI gate) into a hosted, reusable tool. + +## Flows + +### `judge` +- **Trigger:** API request with `{ input, output, criteria, reference? }`. +- **Processing:** a single LLM node (Groq `llama-3.3-70b-versatile`, temperature 0) acts as a strict evaluation judge using the system prompt in `prompts/`. It scores the candidate `output` against the `criteria` and optional `reference`. +- **Response:** JSON `{ faithfulness, relevancy, correctness, overall, pass, reasoning }`, each dimension 0–5. +- **When to use:** to score one already-generated output against case criteria. +- **Dependencies:** Groq text model credential. + +### `run-target` +- **Trigger:** API request with `{ systemPrompt, input }`. +- **Processing:** a single LLM node runs `systemPrompt` (system) + `input` (user) — this is the *system under test*. +- **Response:** `{ answer }`, the generated output. +- **When to use:** to produce the output that `judge` then scores. +- **Dependencies:** Groq text model credential. + +## Guardrails +- The `judge` only scores; it never completes the user's task or rewrites the output. +- It does not reward length, confidence, formatting, or politeness — an eloquent but unsupported answer scores low on faithfulness. +- Faithfulness is a veto: a hallucinated or contradicting answer fails regardless of other scores. +- Scoring is deterministic (temperature 0); identical inputs yield identical scores. + +## Integration Reference +- **Lamatic API runtime** — hosts and executes both flows. Requires `LAMATIC_API_URL`, `LAMATIC_PROJECT_ID`, `LAMATIC_API_KEY` in the calling app. +- **Groq (text generation)** — backs both LLM nodes; configured as a model credential in Lamatic Studio. 
+ +## Environment Setup +- `JUDGE_FLOW` — deployed `judge` flow ID, called by the app. +- `RUN_TARGET_FLOW` — deployed `run-target` flow ID, called by the app. +- `LAMATIC_API_URL`, `LAMATIC_PROJECT_ID`, `LAMATIC_API_KEY` — Lamatic project credentials used by the app to invoke the flows. + +## Quickstart +1. Build and deploy the `judge` and `run-target` flows in Lamatic Studio; copy their Flow IDs. +2. In `apps/`, copy `.env.example` to `.env.local` and fill in the flow IDs + Lamatic credentials. +3. `npm install && npm run dev`, open `http://localhost:3000`. +4. Paste a system prompt + a golden set (or click **Load example**) and run. + +## Common Failure Modes +| Symptom | Likely cause | Fix | +|---|---|---| +| Judge scores look random | Model too small or temperature not 0 | Use `llama-3.3-70b-versatile`, set temperature 0 | +| "No answer returned from flow" | Wrong flow ID or response mapping | Verify `JUDGE_FLOW`/`RUN_TARGET_FLOW` and that the response maps `answer` | +| Auth error on run | Missing/invalid Lamatic credentials | Check `LAMATIC_API_*` in `.env.local` | +| A case shows "error" | run-target or judge failed for that input | Expand the row; the run continues for other cases | diff --git a/kits/llm-eval-harness/apps/.env.example b/kits/llm-eval-harness/apps/.env.example new file mode 100644 index 00000000..c0117785 --- /dev/null +++ b/kits/llm-eval-harness/apps/.env.example @@ -0,0 +1,8 @@ +# Deployed Lamatic flow IDs (Studio → deploy the flow → copy its Flow ID) +JUDGE_FLOW="your-judge-flow-id" +RUN_TARGET_FLOW="your-run-target-flow-id" + +# Lamatic project credentials (Studio → Settings / API) +LAMATIC_API_URL="https://your-project.lamatic.dev" +LAMATIC_PROJECT_ID="your-project-id" +LAMATIC_API_KEY="your-lamatic-api-key" diff --git a/kits/llm-eval-harness/apps/.gitignore b/kits/llm-eval-harness/apps/.gitignore new file mode 100644 index 00000000..a6067b6e --- /dev/null +++ b/kits/llm-eval-harness/apps/.gitignore @@ -0,0 +1,29 @@ +# See 
https://help.github.com/articles/ignoring-files/ for more about ignoring files. + +# dependencies +/node_modules + +# next.js +/.next/ +/out/ + +# production +/build + +# debug +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* + +# env files +.env +.env.local +.env*.local + +# vercel +.vercel + +# typescript +*.tsbuildinfo +next-env.d.ts \ No newline at end of file diff --git a/kits/llm-eval-harness/apps/README.md b/kits/llm-eval-harness/apps/README.md new file mode 100644 index 00000000..463c5982 --- /dev/null +++ b/kits/llm-eval-harness/apps/README.md @@ -0,0 +1,34 @@ +# LLM Eval Harness — App + +Next.js front end for the **LLM Eval Harness** kit. It calls two Lamatic flows +(`run-target` and `judge`) to score a system prompt against a golden set and +render a CI-style pass/fail gate. + +See the [kit README](../README.md) for the full overview. + +## Run locally + +```bash +cp .env.example .env.local # fill in flow IDs + Lamatic credentials +npm install +npm run dev # http://localhost:3000 +``` + +## Environment variables + +| Variable | Source | +|----------|--------| +| `JUDGE_FLOW` | Deployed `judge` flow ID (Lamatic Studio) | +| `RUN_TARGET_FLOW` | Deployed `run-target` flow ID | +| `LAMATIC_API_URL` | Studio → Settings / API | +| `LAMATIC_PROJECT_ID` | Studio → Project settings | +| `LAMATIC_API_KEY` | Studio → API Keys | + +## Structure + +- `actions/orchestrate.ts` — server action: per-case `run-target` → `judge` loop, aggregation, gate +- `lib/lamatic-client.ts` — Lamatic SDK client + flow IDs from env +- `lib/eval.ts` — judge-output parsing, HTML decode, gate computation, bounded concurrency +- `lib/types.ts` — shared data contracts +- `components/gate-banner.tsx`, `components/results-table.tsx` — results UI +- `app/page.tsx` — the harness UI diff --git a/kits/llm-eval-harness/apps/actions/orchestrate.ts b/kits/llm-eval-harness/apps/actions/orchestrate.ts new file mode 100644 index 00000000..6e5f89e6 --- /dev/null +++ 
b/kits/llm-eval-harness/apps/actions/orchestrate.ts
@@ -0,0 +1,135 @@
+"use server"
+
+import lamaticConfig from "../../lamatic.config"
+import { getLamaticClient } from "@/lib/lamatic-client"
+import { computeAggregate, decodeHtmlEntities, mapWithConcurrency, parseJudgeResult } from "@/lib/eval"
+import type { CaseResult, GoldenCase, RunAggregate } from "@/lib/types"
+
+// Bounded concurrency keeps large golden sets from tripping Groq rate limits.
+const CONCURRENCY = 3
+
+/**
+ * Resolve a deployed flow ID from the kit's lamatic.config step definitions.
+ *
+ * @param stepId - `id` of the step to look up in `lamaticConfig.steps`.
+ * @returns The value of the environment variable named by the step's `envKey`.
+ * @throws Error if no step with that id and an `envKey` exists, or if the variable is unset or empty.
+ */
+function resolveFlowId(stepId: string): string {
+  const step = lamaticConfig.steps.find((s) => s.id === stepId)
+  if (!step?.envKey) {
+    throw new Error(`lamatic.config has no step "${stepId}" with an envKey`)
+  }
+  const value = process.env[step.envKey]
+  if (!value) {
+    throw new Error(`Missing environment variable "${step.envKey}" for flow "${stepId}"`)
+  }
+  return value
+}
+
+/**
+ * Execute a flow and pull the `answer` field out of the Lamatic response.
+ *
+ * `result.answer` is preferred; a top-level `answer` is the fallback.
+ *
+ * @param flowId - Deployed flow ID to execute.
+ * @param inputs - Payload passed to `executeFlow` unchanged.
+ * @returns The raw answer; its shape is not validated here, so callers must narrow it.
+ * @throws Error "No answer returned from flow" when both locations are null or undefined.
+ */
+async function getAnswer(flowId: string, inputs: Record<string, unknown>): Promise<unknown> {
+  const resData = await getLamaticClient().executeFlow(flowId, inputs)
+  const envelope = resData as { result?: { answer?: unknown }; answer?: unknown }
+  const answer = envelope?.result?.answer ?? envelope?.answer
+  if (answer === undefined || answer === null) {
+    throw new Error("No answer returned from flow")
+  }
+  return answer
+}
+
+/**
+ * Report whether a value has non-blank string `input` and `criteria` fields.
+ *
+ * Server actions can be invoked without going through the form, so the
+ * client-side schema is not sufficient on its own.
+ */
+function isValidCase(value: unknown): boolean {
+  if (typeof value !== "object" || value === null) return false
+  const { input, criteria } = value as Record<string, unknown>
+  return [input, criteria].every((field) => typeof field === "string" && field.trim() !== "")
+}
+
+/**
+ * Run one golden case through run-target, then score it with the judge.
+ *
+ * Never rejects: a failure in either flow call or in judge parsing is
+ * captured in the returned `error` field so the remaining cases keep running.
+ *
+ * @param systemPrompt - Prompt under test, sent to the run-target flow.
+ * @param testCase - Golden case supplying `input`, `criteria` and an optional `reference`.
+ * @param flows - Deployed flow IDs for the judge and run-target flows.
+ * @returns The case with its decoded output and parsed judge result, or an error entry.
+ */
+async function evaluateCase(
+  systemPrompt: string,
+  testCase: GoldenCase,
+  flows: { judge: string; runTarget: string },
+): Promise<CaseResult> {
+  try {
+    const rawOutput = await getAnswer(flows.runTarget, { systemPrompt, input: testCase.input })
+    // Non-string answers are serialised to JSON before entity decoding.
+    const output = decodeHtmlEntities(typeof rawOutput === "string" ? rawOutput : JSON.stringify(rawOutput))
+
+    const rawJudge = await getAnswer(flows.judge, {
+      input: testCase.input,
+      output,
+      criteria: testCase.criteria,
+      reference: testCase.reference ?? "",
+    })
+
+    return { case: testCase, output, judge: parseJudgeResult(rawJudge) }
+  } catch (error) {
+    return {
+      case: testCase,
+      output: "",
+      judge: null,
+      error: error instanceof Error ? error.message : "Evaluation failed",
+    }
+  }
+}
+
+/**
+ * Evaluate a system prompt against a golden set and return the gate verdict.
+ *
+ * @param systemPrompt - Prompt under test; must be a non-blank string.
+ * @param cases - Golden set; a non-empty array of cases with non-blank `input` and `criteria`.
+ * @param threshold - Gate threshold, a finite number from 0 to 100 inclusive.
+ * @returns `{ success: true, data }` with the aggregate, or `{ success: false, error }`
+ *   when validation or flow-ID resolution fails. Errors are returned, not thrown.
+ */
+export async function runEvaluation(
+  systemPrompt: string,
+  cases: GoldenCase[],
+  threshold: number,
+): Promise<{ success: boolean; data?: RunAggregate; error?: string }> {
+  try {
+    // Re-validate on the server: the declared parameter types are not enforced at runtime.
+    if (typeof systemPrompt !== "string" || !systemPrompt.trim()) throw new Error("A system prompt is required")
+    if (!Array.isArray(cases) || cases.length === 0) throw new Error("Provide at least one test case")
+    const invalidIndex = cases.findIndex((testCase) => !isValidCase(testCase))
+    if (invalidIndex !== -1) {
+      throw new Error(`Test case ${invalidIndex + 1} needs non-empty "input" and "criteria" string fields`)
+    }
+    if (!Number.isFinite(threshold) || threshold < 0 || threshold > 100) {
+      throw new Error("Threshold must be a number between 0 and 100")
+    }
+
+    const flows = { judge: resolveFlowId("judge"), runTarget: resolveFlowId("run-target") }
+    const results = await mapWithConcurrency(cases, CONCURRENCY, (testCase) =>
+      evaluateCase(systemPrompt, testCase, flows),
+    )
+    return { success: true, data: computeAggregate(results, threshold) }
+  } catch (error) {
+    return { success: false, error: error instanceof Error ? error.message : "Evaluation failed" }
+  }
+}
diff --git a/kits/llm-eval-harness/apps/app/globals.css b/kits/llm-eval-harness/apps/app/globals.css new file mode 100644 index 00000000..20b03b5d --- /dev/null +++ b/kits/llm-eval-harness/apps/app/globals.css @@ -0,0 +1,125 @@ +@import 'tailwindcss'; +@import 'tw-animate-css'; + +@custom-variant dark (&:is(.dark *)); + +:root { + --background: oklch(1 0 0); + --foreground: oklch(0.145 0 0); + --card: oklch(1 0 0); + --card-foreground: oklch(0.145 0 0); + --popover: oklch(1 0 0); + --popover-foreground: oklch(0.145 0 0); + --primary: oklch(0.205 0 0); + --primary-foreground: oklch(0.985 0 0); + --secondary: oklch(0.97 0 0); + --secondary-foreground: oklch(0.205 0 0); + --muted: oklch(0.97 0 0); + --muted-foreground: oklch(0.556 0 0); + --accent: oklch(0.97 0 0); + --accent-foreground: oklch(0.205 0 0); + --destructive: oklch(0.577 0.245 27.325); + --destructive-foreground: oklch(0.577 0.245 27.325); + --border: oklch(0.922 0 0); + --input: oklch(0.922 0 0); + --ring: oklch(0.708 0 0); + --chart-1: oklch(0.646 0.222 41.116); + --chart-2: oklch(0.6 0.118 184.704); + --chart-3: oklch(0.398 0.07 227.392); + --chart-4: oklch(0.828 0.189 84.429); + --chart-5: oklch(0.769 0.188 70.08); + --radius: 0.625rem; + --sidebar: oklch(0.985 0 0); + --sidebar-foreground: oklch(0.145 0 0); + --sidebar-primary: oklch(0.205 0 0); + --sidebar-primary-foreground: oklch(0.985 0 0); + --sidebar-accent: oklch(0.97 0 0); + --sidebar-accent-foreground: oklch(0.205 0 0); + --sidebar-border: oklch(0.922 0 0); + --sidebar-ring: oklch(0.708 0 0); +} + +.dark { + --background: oklch(0.145 0 0); + --foreground: oklch(0.985 0 0); + --card: oklch(0.145 0 0); + --card-foreground: oklch(0.985 0 0); + --popover: oklch(0.145 0 0); + --popover-foreground: oklch(0.985 0 0); + --primary: oklch(0.985 0 0); + --primary-foreground: oklch(0.205 0 0); + --secondary: oklch(0.269 0 0); + --secondary-foreground: oklch(0.985 0 0); + --muted: oklch(0.269 0 0); +
--muted-foreground: oklch(0.708 0 0); + --accent: oklch(0.269 0 0); + --accent-foreground: oklch(0.985 0 0); + --destructive: oklch(0.396 0.141 25.723); + --destructive-foreground: oklch(0.637 0.237 25.331); + --border: oklch(0.269 0 0); + --input: oklch(0.269 0 0); + --ring: oklch(0.439 0 0); + --chart-1: oklch(0.488 0.243 264.376); + --chart-2: oklch(0.696 0.17 162.48); + --chart-3: oklch(0.769 0.188 70.08); + --chart-4: oklch(0.627 0.265 303.9); + --chart-5: oklch(0.645 0.246 16.439); + --sidebar: oklch(0.205 0 0); + --sidebar-foreground: oklch(0.985 0 0); + --sidebar-primary: oklch(0.488 0.243 264.376); + --sidebar-primary-foreground: oklch(0.985 0 0); + --sidebar-accent: oklch(0.269 0 0); + --sidebar-accent-foreground: oklch(0.985 0 0); + --sidebar-border: oklch(0.269 0 0); + --sidebar-ring: oklch(0.439 0 0); +} + +@theme inline { + --font-sans: var(--font-geist-sans), ui-sans-serif, system-ui, sans-serif; + --font-mono: var(--font-geist-mono), ui-monospace, monospace; + --color-background: var(--background); + --color-foreground: var(--foreground); + --color-card: var(--card); + --color-card-foreground: var(--card-foreground); + --color-popover: var(--popover); + --color-popover-foreground: var(--popover-foreground); + --color-primary: var(--primary); + --color-primary-foreground: var(--primary-foreground); + --color-secondary: var(--secondary); + --color-secondary-foreground: var(--secondary-foreground); + --color-muted: var(--muted); + --color-muted-foreground: var(--muted-foreground); + --color-accent: var(--accent); + --color-accent-foreground: var(--accent-foreground); + --color-destructive: var(--destructive); + --color-destructive-foreground: var(--destructive-foreground); + --color-border: var(--border); + --color-input: var(--input); + --color-ring: var(--ring); + --color-chart-1: var(--chart-1); + --color-chart-2: var(--chart-2); + --color-chart-3: var(--chart-3); + --color-chart-4: var(--chart-4); + --color-chart-5: var(--chart-5); + --radius-sm: 
calc(var(--radius) - 4px); + --radius-md: calc(var(--radius) - 2px); + --radius-lg: var(--radius); + --radius-xl: calc(var(--radius) + 4px); + --color-sidebar: var(--sidebar); + --color-sidebar-foreground: var(--sidebar-foreground); + --color-sidebar-primary: var(--sidebar-primary); + --color-sidebar-primary-foreground: var(--sidebar-primary-foreground); + --color-sidebar-accent: var(--sidebar-accent); + --color-sidebar-accent-foreground: var(--sidebar-accent-foreground); + --color-sidebar-border: var(--sidebar-border); + --color-sidebar-ring: var(--sidebar-ring); +} + +@layer base { + * { + @apply border-border outline-ring/50; + } + body { + @apply bg-background text-foreground; + } +}
diff --git a/kits/llm-eval-harness/apps/app/layout.tsx b/kits/llm-eval-harness/apps/app/layout.tsx
new file mode 100644
index 00000000..e3da52d8
--- /dev/null
+++ b/kits/llm-eval-harness/apps/app/layout.tsx
@@ -0,0 +1,50 @@
+import type { Metadata } from 'next'
+import { Geist, Geist_Mono } from 'next/font/google'
+import { Analytics } from '@vercel/analytics/next'
+import './globals.css'
+
+// Both fonts are exposed as CSS variables; globals.css maps them onto --font-sans / --font-mono.
+const geistSans = Geist({ subsets: ["latin"], variable: "--font-geist-sans" });
+const geistMono = Geist_Mono({ subsets: ["latin"], variable: "--font-geist-mono" });
+
+export const metadata: Metadata = {
+  title: 'LLM Eval Harness',
+  description: 'Score an LLM prompt against a golden set with an LLM-as-judge and a CI-style gate.',
+  icons: {
+    icon: [
+      {
+        url: '/icon-light-32x32.png',
+        media: '(prefers-color-scheme: light)',
+      },
+      {
+        url: '/icon-dark-32x32.png',
+        media: '(prefers-color-scheme: dark)',
+      },
+      {
+        url: '/icon.svg',
+        type: 'image/svg+xml',
+      },
+    ],
+    apple: '/apple-icon.png',
+  },
+}
+
+/**
+ * Root layout: renders the document shell around every page, applies the
+ * Geist font variables to the body, and mounts Vercel Analytics after the page content.
+ */
+export default function RootLayout({
+  children,
+}: Readonly<{
+  children: React.ReactNode
+}>) {
+  // NOTE(review): element attributes were reconstructed from the imports above; confirm className.
+  return (
+    <html lang="en">
+      <body className={`${geistSans.variable} ${geistMono.variable} font-sans antialiased`}>
+        {children}
+        <Analytics />
+      </body>
+    </html>
+  )
+}
diff --git a/kits/llm-eval-harness/apps/app/page.tsx b/kits/llm-eval-harness/apps/app/page.tsx
new file mode 100644
index 00000000..4d3c6a05
---
/dev/null
+++ b/kits/llm-eval-harness/apps/app/page.tsx
@@ -0,0 +1,287 @@
+"use client"
+
+import { useState } from "react"
+import { useForm } from "react-hook-form"
+import { zodResolver } from "@hookform/resolvers/zod"
+import { z } from "zod"
+import { AlertCircle, CheckCircle2, FlaskConical, Loader2, Play, Sparkles } from "lucide-react"
+import { Button } from "@/components/ui/button"
+import { Textarea } from "@/components/ui/textarea"
+import { Input } from "@/components/ui/input"
+import { Label } from "@/components/ui/label"
+import { GateBanner } from "@/components/gate-banner"
+import { ResultsTable } from "@/components/results-table"
+import { runEvaluation } from "@/actions/orchestrate"
+import { SAMPLE_GOLDEN_SET, SAMPLE_SYSTEM_PROMPT } from "@/lib/eval"
+import { cn } from "@/lib/utils"
+import type { GoldenCase, RunAggregate } from "@/lib/types"
+
+/** One golden-set entry: `input` and `criteria` must be non-blank; `id` and `reference` are optional. */
+const caseSchema = z.object({
+  id: z.string().optional(),
+  input: z.string().trim().min(1),
+  criteria: z.string().trim().min(1),
+  reference: z.string().optional(),
+})
+
+/** Form contract. `goldenSet` stays raw JSON text and is validated by parsing it into an array of cases. */
+const formSchema = z.object({
+  systemPrompt: z.string().trim().min(1, "Enter a system prompt to evaluate."),
+  threshold: z.number().min(0).max(100),
+  goldenSet: z.string().superRefine((value, ctx) => {
+    if (!value.trim()) {
+      ctx.addIssue({ code: z.ZodIssueCode.custom, message: "Add at least one test case." })
+      return
+    }
+    let parsed: unknown
+    try {
+      parsed = JSON.parse(value)
+    } catch {
+      ctx.addIssue({ code: z.ZodIssueCode.custom, message: "Invalid JSON." })
+      return
+    }
+    const result = z.array(caseSchema).min(1, "Add at least one test case.").safeParse(parsed)
+    if (!result.success) {
+      // An empty array fails `.min(1)`; report that specifically instead of the per-case message.
+      const isEmptyArray = Array.isArray(parsed) && parsed.length === 0
+      ctx.addIssue({
+        code: z.ZodIssueCode.custom,
+        message: isEmptyArray
+          ? "Add at least one test case."
+          : 'Each case needs non-empty "input" and "criteria" string fields.',
+      })
+    }
+  }),
+})
+
+type FormValues = z.infer<typeof formSchema>
+
+export default function EvalHarnessPage() {
+  const {
+    register,
+    handleSubmit,
+    watch,
+    setValue,
+    formState: { errors, isValid },
+  } = useForm<FormValues>({
+    resolver: zodResolver(formSchema),
+    mode: "onChange",
+    defaultValues: { systemPrompt: "", goldenSet: "", threshold: 90 },
+  })
+
+  const [isLoading, setIsLoading] = useState(false)
+  const [result, setResult] = useState<RunAggregate | null>(null)
+  const [runError, setRunError] = useState("")
+
+  // Number of cases in the current golden-set text; anything that is not a JSON array counts as zero.
+  const goldenSetValue = watch("goldenSet")
+  const goldenCount = (() => {
+    try {
+      const parsed: unknown = JSON.parse(goldenSetValue)
+      return Array.isArray(parsed) ? parsed.length : 0
+    } catch {
+      return 0
+    }
+  })()
+
+  // Fill the form with the bundled sample prompt and golden set, re-running validation.
+  const loadExample = () => {
+    setValue("systemPrompt", SAMPLE_SYSTEM_PROMPT, { shouldValidate: true })
+    setValue("goldenSet", JSON.stringify(SAMPLE_GOLDEN_SET, null, 2), { shouldValidate: true })
+    setRunError("")
+  }
+
+  // Run the evaluation and store either the aggregate result or an error message.
+  const onSubmit = async (values: FormValues) => {
+    setRunError("")
+    setIsLoading(true)
+    setResult(null)
+    try {
+      const cases = JSON.parse(values.goldenSet) as GoldenCase[]
+      const res = await runEvaluation(values.systemPrompt, cases, values.threshold)
+      if (res.success && res.data) {
+        setResult(res.data)
+      } else {
+        setRunError(res.error || "Evaluation failed.")
+      }
+    } catch (e) {
+      setRunError(e instanceof Error ? e.message : "Evaluation failed.")
+    } finally {
+      setIsLoading(false)
+    }
+  }
+
+  return (
+
+ {/* ambient accent glow */} +
+ + {/* Header */} +
+
+
+
+ +
+
+

LLM Eval Harness

+

LLM-as-judge quality gate for prompts

+
+
+ + powered by Lamatic + +
+
+ +
+ {/* Configuration */} +
+
+

Configuration

+

Define what to test and how to grade it.

+ +
+ {/* System prompt */} +
+ +