From 5ccb8481c2be170805915cb0e2d3020bfc4f8d75 Mon Sep 17 00:00:00 2001 From: Tharun Date: Mon, 22 Jun 2026 20:16:54 +0530 Subject: [PATCH 1/7] feat: add llm-eval-harness kit (flows + Next.js app) LLM-as-judge eval harness: scores a system prompt against a golden set across faithfulness/relevancy/correctness and gates on pass rate. - judge + run-target Lamatic flow prompts - Next.js app: orchestrate loop (run-target -> judge), gate banner, expandable per-case results table - eval utils: HTML-entity decode, defensive JSON parsing, app-side gate recompute, bounded concurrency - add tsconfig.json + postcss.config.mjs (absent in scaffold) WIP: lamatic.config / README / agent.md metadata and exported flow files still to be finalised. --- kits/llm-eval-harness/.gitignore | 5 + kits/llm-eval-harness/README.md | 106 + kits/llm-eval-harness/agent.md | 185 + kits/llm-eval-harness/apps/.env.example | 8 + kits/llm-eval-harness/apps/.gitignore | 29 + kits/llm-eval-harness/apps/README.md | 106 + .../apps/actions/orchestrate.ts | 63 + kits/llm-eval-harness/apps/app/globals.css | 125 + kits/llm-eval-harness/apps/app/layout.tsx | 44 + kits/llm-eval-harness/apps/app/page.tsx | 187 + kits/llm-eval-harness/apps/components.json | 21 + .../apps/components/gate-banner.tsx | 40 + .../apps/components/header.tsx | 47 + .../apps/components/results-table.tsx | 105 + .../apps/components/theme-provider.tsx | 11 + .../apps/components/ui/accordion.tsx | 66 + .../apps/components/ui/alert-dialog.tsx | 157 + .../apps/components/ui/alert.tsx | 66 + .../apps/components/ui/aspect-ratio.tsx | 11 + .../apps/components/ui/avatar.tsx | 53 + .../apps/components/ui/badge.tsx | 46 + .../apps/components/ui/breadcrumb.tsx | 109 + .../apps/components/ui/button-group.tsx | 83 + .../apps/components/ui/button.tsx | 60 + .../apps/components/ui/calendar.tsx | 213 + .../apps/components/ui/card.tsx | 92 + .../apps/components/ui/carousel.tsx | 241 + .../apps/components/ui/chart.tsx | 353 ++ 
.../apps/components/ui/checkbox.tsx | 32 + .../apps/components/ui/collapsible.tsx | 33 + .../apps/components/ui/command.tsx | 184 + .../apps/components/ui/context-menu.tsx | 252 + .../apps/components/ui/dialog.tsx | 143 + .../apps/components/ui/drawer.tsx | 135 + .../apps/components/ui/dropdown-menu.tsx | 257 + .../apps/components/ui/empty.tsx | 104 + .../apps/components/ui/field.tsx | 244 + .../apps/components/ui/form.tsx | 167 + .../apps/components/ui/hover-card.tsx | 44 + .../apps/components/ui/input-group.tsx | 169 + .../apps/components/ui/input-otp.tsx | 77 + .../apps/components/ui/input.tsx | 21 + .../apps/components/ui/item.tsx | 193 + .../apps/components/ui/kbd.tsx | 28 + .../apps/components/ui/label.tsx | 24 + .../apps/components/ui/menubar.tsx | 276 + .../apps/components/ui/navigation-menu.tsx | 166 + .../apps/components/ui/pagination.tsx | 127 + .../apps/components/ui/popover.tsx | 48 + .../apps/components/ui/progress.tsx | 31 + .../apps/components/ui/radio-group.tsx | 45 + .../apps/components/ui/resizable.tsx | 56 + .../apps/components/ui/scroll-area.tsx | 58 + .../apps/components/ui/select.tsx | 185 + .../apps/components/ui/separator.tsx | 28 + .../apps/components/ui/sheet.tsx | 139 + .../apps/components/ui/sidebar.tsx | 726 +++ .../apps/components/ui/skeleton.tsx | 13 + .../apps/components/ui/slider.tsx | 63 + .../apps/components/ui/sonner.tsx | 25 + .../apps/components/ui/spinner.tsx | 16 + .../apps/components/ui/switch.tsx | 31 + .../apps/components/ui/table.tsx | 116 + .../apps/components/ui/tabs.tsx | 66 + .../apps/components/ui/textarea.tsx | 18 + .../apps/components/ui/toast.tsx | 129 + .../apps/components/ui/toaster.tsx | 35 + .../apps/components/ui/toggle-group.tsx | 73 + .../apps/components/ui/toggle.tsx | 47 + .../apps/components/ui/tooltip.tsx | 61 + .../apps/components/ui/use-mobile.tsx | 19 + .../apps/components/ui/use-toast.ts | 191 + .../llm-eval-harness/apps/hooks/use-mobile.ts | 19 + kits/llm-eval-harness/apps/hooks/use-toast.ts | 191 
+ kits/llm-eval-harness/apps/lib/eval.ts | 140 + .../apps/lib/lamatic-client.ts | 31 + kits/llm-eval-harness/apps/lib/types.ts | 48 + kits/llm-eval-harness/apps/lib/utils.ts | 6 + kits/llm-eval-harness/apps/next.config.mjs | 11 + kits/llm-eval-harness/apps/package-lock.json | 5292 +++++++++++++++++ kits/llm-eval-harness/apps/package.json | 80 + kits/llm-eval-harness/apps/postcss.config.mjs | 7 + kits/llm-eval-harness/apps/tsconfig.json | 43 + .../llm-eval-harness/constitutions/default.md | 17 + .../flows/agentic-generate-content.ts | 632 ++ kits/llm-eval-harness/lamatic.config.ts | 21 + ...agentic-generate-content_generate-image.ts | 6 + .../agentic-generate-content_json.ts | 10 + .../agentic-generate-content_text.ts | 10 + ...ic-generate-content_generate-image_user.md | 1 + .../agentic-generate-content_json_user.md | 1 + .../agentic-generate-content_text_user.md | 1 + .../prompts/generate-image-system.md | 1 + kits/llm-eval-harness/prompts/json-system.md | 1 + kits/llm-eval-harness/prompts/judge_system.md | 59 + kits/llm-eval-harness/prompts/text-system.md | 1 + 96 files changed, 14156 insertions(+) create mode 100644 kits/llm-eval-harness/.gitignore create mode 100644 kits/llm-eval-harness/README.md create mode 100644 kits/llm-eval-harness/agent.md create mode 100644 kits/llm-eval-harness/apps/.env.example create mode 100644 kits/llm-eval-harness/apps/.gitignore create mode 100644 kits/llm-eval-harness/apps/README.md create mode 100644 kits/llm-eval-harness/apps/actions/orchestrate.ts create mode 100644 kits/llm-eval-harness/apps/app/globals.css create mode 100644 kits/llm-eval-harness/apps/app/layout.tsx create mode 100644 kits/llm-eval-harness/apps/app/page.tsx create mode 100644 kits/llm-eval-harness/apps/components.json create mode 100644 kits/llm-eval-harness/apps/components/gate-banner.tsx create mode 100644 kits/llm-eval-harness/apps/components/header.tsx create mode 100644 kits/llm-eval-harness/apps/components/results-table.tsx create mode 100644 
kits/llm-eval-harness/apps/components/theme-provider.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/accordion.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/alert-dialog.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/alert.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/aspect-ratio.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/avatar.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/badge.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/breadcrumb.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/button-group.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/button.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/calendar.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/card.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/carousel.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/chart.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/checkbox.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/collapsible.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/command.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/context-menu.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/dialog.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/drawer.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/dropdown-menu.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/empty.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/field.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/form.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/hover-card.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/input-group.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/input-otp.tsx create mode 100644 
kits/llm-eval-harness/apps/components/ui/input.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/item.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/kbd.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/label.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/menubar.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/navigation-menu.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/pagination.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/popover.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/progress.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/radio-group.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/resizable.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/scroll-area.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/select.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/separator.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/sheet.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/sidebar.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/skeleton.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/slider.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/sonner.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/spinner.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/switch.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/table.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/tabs.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/textarea.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/toast.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/toaster.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/toggle-group.tsx create mode 100644 
kits/llm-eval-harness/apps/components/ui/toggle.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/tooltip.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/use-mobile.tsx create mode 100644 kits/llm-eval-harness/apps/components/ui/use-toast.ts create mode 100644 kits/llm-eval-harness/apps/hooks/use-mobile.ts create mode 100644 kits/llm-eval-harness/apps/hooks/use-toast.ts create mode 100644 kits/llm-eval-harness/apps/lib/eval.ts create mode 100644 kits/llm-eval-harness/apps/lib/lamatic-client.ts create mode 100644 kits/llm-eval-harness/apps/lib/types.ts create mode 100644 kits/llm-eval-harness/apps/lib/utils.ts create mode 100644 kits/llm-eval-harness/apps/next.config.mjs create mode 100644 kits/llm-eval-harness/apps/package-lock.json create mode 100644 kits/llm-eval-harness/apps/package.json create mode 100644 kits/llm-eval-harness/apps/postcss.config.mjs create mode 100644 kits/llm-eval-harness/apps/tsconfig.json create mode 100644 kits/llm-eval-harness/constitutions/default.md create mode 100644 kits/llm-eval-harness/flows/agentic-generate-content.ts create mode 100644 kits/llm-eval-harness/lamatic.config.ts create mode 100644 kits/llm-eval-harness/model-configs/agentic-generate-content_generate-image.ts create mode 100644 kits/llm-eval-harness/model-configs/agentic-generate-content_json.ts create mode 100644 kits/llm-eval-harness/model-configs/agentic-generate-content_text.ts create mode 100644 kits/llm-eval-harness/prompts/agentic-generate-content_generate-image_user.md create mode 100644 kits/llm-eval-harness/prompts/agentic-generate-content_json_user.md create mode 100644 kits/llm-eval-harness/prompts/agentic-generate-content_text_user.md create mode 100644 kits/llm-eval-harness/prompts/generate-image-system.md create mode 100644 kits/llm-eval-harness/prompts/json-system.md create mode 100644 kits/llm-eval-harness/prompts/judge_system.md create mode 100644 kits/llm-eval-harness/prompts/text-system.md diff --git 
a/kits/llm-eval-harness/.gitignore b/kits/llm-eval-harness/.gitignore new file mode 100644 index 00000000..e916ce5a --- /dev/null +++ b/kits/llm-eval-harness/.gitignore @@ -0,0 +1,5 @@ +.lamatic/ +node_modules/ +.next/ +.env +.env.local diff --git a/kits/llm-eval-harness/README.md b/kits/llm-eval-harness/README.md new file mode 100644 index 00000000..5943f657 --- /dev/null +++ b/kits/llm-eval-harness/README.md @@ -0,0 +1,106 @@ +# Agent Kit Generation by Lamatic.ai + +

+ + Live Demo + +

+ + +**Agent Kit Generation** is an AI-powered content generation system built with [Lamatic.ai](https://lamatic.ai). It uses intelligent workflows to generate text, images, and JSON content through a modern Next.js interface with markdown rendering support. + +[![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/Lamatic/AgentKit&root-directory=kits/llm-eval-harness/apps&env=JUDGE_FLOW,RUN_TARGET_FLOW,LAMATIC_API_URL,LAMATIC_PROJECT_ID,LAMATIC_API_KEY&envDescription=Your%20Lamatic%20keys%20and%20flow%20IDs%20are%20required.&envLink=https://lamatic.ai/templates/agentkits/agentic/agent-kit-generation) + +--- + +## Lamatic Setup (Pre and Post) + +Before running this project, you must build and deploy the flows in Lamatic, then wire their config into this codebase. + +Pre: Build in Lamatic +1. Sign in or sign up at https://lamatic.ai +2. Create a project (if you don’t have one yet) +3. Click “+ New Flow” and select "Templates" +4. Select the 'Generation' agent kit +5. Configure providers/tools/inputs as prompted +6. Deploy the kit in Lamatic and obtain your .env keys +7. Copy the keys from your studio + +Post: Wire into this repo +1. Create a .env file and set the keys +2. Install and run locally: + - npm install + - npm run dev +3. Deploy (Vercel recommended): + - Import your repo, set the project's Root Directory (if applicable) + - Add env vars in Vercel (same as your .env) + - Deploy and test your live URL + +Notes +- Coming soon: single-click export and "Connect Git" in Lamatic to push config directly to your repo. + +--- + +## 🔑 Setup +## Required Keys and Config + +You’ll need these things to run this project locally: + +1. **.env Keys** → get them from your [Lamatic account](https://lamatic.ai) after deploying the kit. 
+ + +| Item | Purpose | Where to Get It | +| ----------------- | -------------------------------------------- | ----------------------------------------------- | +| .env Key | Authentication for Lamatic AI APIs and Orchestration | [lamatic.ai](https://lamatic.ai) | + +### 1. Environment Variables + +Create `apps/.env.local` (see `apps/.env.example`) with: + +```bash +JUDGE_FLOW="your-judge-flow-id" +RUN_TARGET_FLOW="your-run-target-flow-id" +LAMATIC_API_URL="https://your-project.lamatic.dev" +LAMATIC_PROJECT_ID="your-project-id" +LAMATIC_API_KEY="your-lamatic-api-key" +``` + +### 2. Install & Run + +```bash +npm install +npm run dev +# Open http://localhost:3000 +``` +--- + +## 📂 Repo Structure + +``` +/actions + └── orchestrate.ts # Lamatic workflow orchestration +/app + └── page.tsx # Main generation form UI +/components + ├── header.tsx # Header component with navigation + └── ui # shadcn/ui components +/lib + └── lamatic-client.ts # Lamatic SDK client +/public + └── lamatic-logo.png # Lamatic branding +/flows + └── ... # Lamatic Flows +/package.json # Dependencies & scripts +``` + +--- + +## 🤝 Contributing + +We welcome contributions! Open an issue or PR in this repo. + +--- + +## 📜 License + +MIT License – see [LICENSE](../../LICENSE). diff --git a/kits/llm-eval-harness/agent.md b/kits/llm-eval-harness/agent.md new file mode 100644 index 00000000..7a51f76f --- /dev/null +++ b/kits/llm-eval-harness/agent.md @@ -0,0 +1,185 @@ +# Generative AI + +## Overview +This project solves the problem of turning a single user instruction into ready-to-use generative outputs (long-form text/markdown, structured JSON, and an image) from one consistent API surface. It implements a **single-flow** AgentKit pipeline that routes requests by “mode” and then orchestrates multiple model calls (text, JSON, and image) plus validation/formatting steps. The primary invoker is a Next.js web UI (and any backend service) that calls the flow via Lamatic’s API layer and renders results, including markdown rendering. 
It depends on Lamatic’s hosted runtime and credentials, plus connected LLM and image-generation providers configured in Lamatic. + +--- + +## Purpose +The goal of this agent system is to provide a simple, reliable content-generation endpoint that can produce different kinds of creative/structured artifacts from the same user prompt. After it runs, the caller has a polished result suitable for direct use in an application: readable markdown text, valid machine-consumable JSON, or a generated image prompt/result—without having to manually prompt-engineer, validate, or post-process raw model outputs. + +Operationally, the system centralizes generation logic into one deployed Lamatic flow so product teams can iterate on prompts, models, and formatting in Lamatic Studio while keeping the Next.js app thin. This reduces the surface area for application bugs and keeps model behavior consistent across environments. + +Although there is one runnable flow, it supports multiple output “modes” (text, JSON, image). These modes collectively serve the larger purpose of “agentic generation” by ensuring that user instructions can be transformed into the right artifact type with appropriate parsing and finalization steps. + +## Flows + +### `1. Agentic Generation - Generate Content` + +- **Flow ID / Env key mapping:** `agentic-generate-content` (configured via `AGENTIC_GENERATE_CONTENT`) + +#### Trigger +- **Invocation type:** API request via a GraphQL trigger node (`API Request (graphqlNode)`). +- **Expected input shape (conceptual):** + - `instructions` (string): the user’s instruction/prompt. + - `mode` (string): controls which generation path is taken. 
Supported intents implied by node chain: + - `text` → generate markdown text + - `json` → generate structured JSON + - `image` → generate an image from the instruction (or from intermediate prompt) + - Optional additional fields may be passed through depending on how the Lamatic GraphQL trigger is configured in Studio; the prompts reference `triggerNode_1.output.instructions`, so `instructions` must be present. + +#### What it does +Step-by-step walkthrough of the node chain: + +1. `API Request (graphqlNode)` + - Receives the GraphQL/API payload from the caller (UI/backend). + - Exposes the incoming fields to downstream nodes (notably `instructions`, and a mode selector used by the condition). + +2. `Condition (conditionNode)` + - Routes execution based on the requested generation mode. + - Ensures unsupported/unknown modes do not proceed to model execution. + +3. `Invalid Mode (codeNode)` + - Handles the error path when `mode` is missing or unsupported. + - Produces a safe, deterministic error payload for the API response (instead of attempting generation). + +4. `Text (LLMNode)` + - Generates high-quality, well-structured **markdown** content from the user instruction. + - Uses prompt pair: + - System: `text-system.md` (“You are a Text Generation Assistant… proper markdown…”) + - User: `agentic-generate-content_text_user.md` (`USER INSTRUCTION : {{triggerNode_1.output.instructions}}`) + +5. `JSON (LLMNode)` + - Generates a JSON representation for the same instruction. + - Uses prompt pair: + - System: `json-system.md` (“You are a JSON Generation Assistant… proper JSON form…”) + - User: `agentic-generate-content_json_user.md` (`GENERATE A JSON FOR THIS USER REQUEST : {{triggerNode_1.output.instructions}}`) + +6. `Parse JSON (codeNode)` + - Validates and parses the JSON output from the `JSON (LLMNode)`. + - Normalizes the result into an application-safe structure (e.g., converting a JSON string into an object, handling parse failures). 
+ - This is the main “safety belt” for ensuring the API returns valid JSON even if the model output is slightly malformed. + +7. `Generate Image (ImageGenNode)` + - Produces an image based on the instruction. + - Uses prompt pair: + - System: `generate-image-system.md` (“You are an Image Generation Assistant… high-quality image…”) + - User: `agentic-generate-content_generate-image_user.md` (`CREATE AN IMAGE FOR THIS INSTRUCTION : {{triggerNode_1.output.instructions}}`) + +8. `Finalise Output (codeNode)` + - Consolidates outputs into a single response payload. + - Applies final formatting and ensures a consistent response shape across modes. + +9. `API Response (graphqlResponseNode)` + - Returns the finalized payload to the original API caller. + - This is the contract boundary for the Next.js UI and any other clients. + +#### When to use this flow +Use this flow for any request where a user (or upstream system) supplies a free-form instruction and expects one of the supported generated artifact types: +- “Write”: when you want markdown content suitable for rendering in the UI. +- “Structure”: when you want a machine-readable JSON object derived from an instruction. +- “Visualize”: when you want an image generated from the instruction. + +If the application only has one generation entrypoint, route all generation requests here and set `mode` to select the desired output. + +#### Output +- **Success response:** a JSON response returned by `graphqlResponseNode`. +- **Structure (conceptual):** + - `mode`: the resolved mode. + - `text`: markdown string (present when mode is `text`, and may also be included as auxiliary data depending on finalizer logic). + - `json`: parsed JSON object (present when mode is `json`). + - `image`: image result (present when mode is `image`), typically a URL, base64 payload, or provider-specific image artifact as configured in Lamatic. + - `error`: populated for invalid mode or generation/parse failures. 
+ +Because the final response is assembled in `Finalise Output (codeNode)`, treat the above as the intended contract; confirm exact field names in the deployed flow’s GraphQL schema. + +#### Dependencies +- **Lamatic runtime & project configuration** + - `LAMATIC_API_URL` + - `LAMATIC_PROJECT_ID` + - `LAMATIC_API_KEY` +- **Flow selection / routing** + - `AGENTIC_GENERATE_CONTENT` (the deployed Flow ID for `agentic-generate-content`) +- **Model providers** (configured in Lamatic Studio) + - LLM provider for `Text (LLMNode)` and `JSON (LLMNode)` + - Image generation provider for `Generate Image (ImageGenNode)` +- **Prompts** + - `text-system.md`, `json-system.md`, `generate-image-system.md` + - User prompt templates under `prompts/` prefixed with `agentic-generate-content_*` + +### Flow Interaction +This kit contains a single runnable flow. Internally it behaves like a mode-routed pipeline: the `Condition (conditionNode)` determines whether the request proceeds to the text LLM path, JSON LLM + parse path, or image generation path, and then `Finalise Output (codeNode)` normalizes the result into one API response. + +## Guardrails +- **Prohibited tasks** + - Must not generate harmful, illegal, or discriminatory content (from Default Constitution). + - Must not comply with jailbreaking or prompt-injection attempts (from Default Constitution). + - Must not fabricate facts when uncertain; should acknowledge uncertainty (from Default Constitution). +- **Input constraints** + - `instructions` must be provided and should be treated as adversarial input (from Default Constitution). + - `mode` must be one of the supported values; otherwise the flow must take the `Invalid Mode (codeNode)` path. + - (Inferred) Inputs should remain within the context limits of the chosen LLM/image model; excessively long instructions may be truncated or rejected. 
+- **Output constraints** + - Must not output PII unless explicitly required by the flow; must not log/store/repeat PII (from Default Constitution). + - Must not output raw credentials, API keys, or internal configuration. + - JSON mode must return valid, parseable JSON; malformed JSON should be caught/handled by `Parse JSON (codeNode)`. +- **Operational limits** + - Requires Lamatic environment variables to be present at runtime; without them, invocation will fail. + - (Inferred) Image generation may be slower and more rate-limited than text/JSON generation; callers should implement timeouts and retries. + - (Inferred) Concurrency and rate limits depend on the configured Lamatic plan and underlying model providers. + +## Integration Reference + +| IntegrationType | Purpose | Required Credential / Config Key | +|---|---|---| +| Lamatic Flow Runtime (API) | Execute deployed flow(s) and access Lamatic project resources | `LAMATIC_API_URL`, `LAMATIC_PROJECT_ID`, `LAMATIC_API_KEY` | +| AgentKit Flow ID Routing | Select the deployed flow instance for this kit | `AGENTIC_GENERATE_CONTENT` | +| LLM Provider (via Lamatic) | Generate markdown text and JSON | Configured in Lamatic Studio (provider-specific keys stored in Lamatic) | +| Image Generation Provider (via Lamatic) | Generate images from prompts | Configured in Lamatic Studio (provider-specific keys stored in Lamatic) | +| Next.js App (UI) | User-facing interface with markdown rendering | App runtime config; consumes env vars above | + +## Environment Setup +- `AGENTIC_GENERATE_CONTENT` — Deployed Flow ID for `agentic-generate-content`; obtain from Lamatic Studio after deploying the kit; used by the Next.js app/server to call the correct flow. +- `LAMATIC_API_URL` — Base URL for Lamatic API; obtain from Lamatic; used by all flow invocations. +- `LAMATIC_PROJECT_ID` — Lamatic project identifier; obtain from Lamatic project settings/studio; used by all flow invocations. 
+- `LAMATIC_API_KEY` — API key for accessing the Lamatic project; obtain from Lamatic; used by all flow invocations. +- `lamatic.config.ts` — Kit metadata and wiring (name, version, tags, required steps/env keys, links); used by the kit tooling/build. +- `constitutions/` — Default constitution defining identity/safety/data-handling/tone constraints; governs runtime behavior in Lamatic. +- `prompts/` — System and user prompts used by LLM/Image nodes; changing these alters generation behavior. + +## Quickstart +1. In Lamatic Studio, create a project and deploy the “Generation” agent kit flow; copy the resulting keys and Flow ID. +2. In `apps/`, create `.env` from `.env.example` and set: + - `AGENTIC_GENERATE_CONTENT`, `LAMATIC_API_URL`, `LAMATIC_PROJECT_ID`, `LAMATIC_API_KEY` +3. Install and run the app: + 1. `npm install` + 2. `npm run dev` +4. Invoke the flow via the app UI, or call the GraphQL trigger directly using the shape below (placeholders; align field names with your deployed GraphQL schema): + - **GraphQL (conceptual)** + - Mutation/Query: `agenticGenerateContent` (name varies by deployment) + - Variables: + - `input`: + - `mode`: `"text" | "json" | "image"` + - `instructions`: `"Write a concise product description for a smart water bottle."` + - **Example variables JSON (conceptual):** + - `{"input":{"mode":"text","instructions":"Write a concise product description for a smart water bottle."}}` +5. 
Verify you receive a successful API response and that: + - `mode="text"` returns markdown text + - `mode="json"` returns a parsed JSON object + - `mode="image"` returns an image artifact (often a URL) + +## Common Failure Modes + +| Symptom | Likely Cause | Fix | +|---|---|---| +| Request fails with authentication/401/403 | Missing or incorrect `LAMATIC_API_KEY` / project mismatch | Re-copy keys from Lamatic Studio; ensure `LAMATIC_PROJECT_ID` matches the key scope | +| Flow not found / 404 / “invalid flow id” | `AGENTIC_GENERATE_CONTENT` not set or points to a non-deployed flow | Deploy the flow in Lamatic; update `AGENTIC_GENERATE_CONTENT` with the deployed Flow ID | +| “Invalid mode” response | `mode` missing or not one of the supported values | Send `mode` as `text`, `json`, or `image` (or update the condition node to support more modes) | +| JSON output is empty or parsing fails | Model returned non-JSON text, trailing commentary, or malformed JSON | Tighten `json-system.md` instructions; improve `Parse JSON (codeNode)` error handling; add retries or a “repair JSON” step | +| Image generation fails or is slow | Provider misconfiguration, rate limits, or large/complex prompts | Verify image provider in Lamatic; simplify prompt; add client-side timeout/retry; check Lamatic/provider quotas | +| UI renders raw markdown incorrectly | Markdown rendering configuration or unexpected markdown output | Validate markdown renderer settings; adjust `text-system.md` to constrain formatting | + +## Notes +- This kit is intended to be deployed via Vercel; a one-click deploy link is provided in `lamatic.config.ts` and the app README. +- The recommended workflow is “pre and post”: build and deploy the flow in Lamatic first, then wire the resulting env keys into this repo. +- “Coming soon” items noted by the project: single-click export and “Connect Git” from Lamatic Studio to push config directly into the repo. 
diff --git a/kits/llm-eval-harness/apps/.env.example b/kits/llm-eval-harness/apps/.env.example new file mode 100644 index 00000000..c0117785 --- /dev/null +++ b/kits/llm-eval-harness/apps/.env.example @@ -0,0 +1,8 @@ +# Deployed Lamatic flow IDs (Studio → deploy the flow → copy its Flow ID) +JUDGE_FLOW="your-judge-flow-id" +RUN_TARGET_FLOW="your-run-target-flow-id" + +# Lamatic project credentials (Studio → Settings / API) +LAMATIC_API_URL="https://your-project.lamatic.dev" +LAMATIC_PROJECT_ID="your-project-id" +LAMATIC_API_KEY="your-lamatic-api-key" diff --git a/kits/llm-eval-harness/apps/.gitignore b/kits/llm-eval-harness/apps/.gitignore new file mode 100644 index 00000000..a6067b6e --- /dev/null +++ b/kits/llm-eval-harness/apps/.gitignore @@ -0,0 +1,29 @@ +# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. + +# dependencies +/node_modules + +# next.js +/.next/ +/out/ + +# production +/build + +# debug +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* + +# env files +.env +.env.local +.env*.local + +# vercel +.vercel + +# typescript +*.tsbuildinfo +next-env.d.ts \ No newline at end of file diff --git a/kits/llm-eval-harness/apps/README.md b/kits/llm-eval-harness/apps/README.md new file mode 100644 index 00000000..5943f657 --- /dev/null +++ b/kits/llm-eval-harness/apps/README.md @@ -0,0 +1,106 @@ +# Agent Kit Generation by Lamatic.ai + +

+ + Live Demo + +

+ + +**Agent Kit Generation** is an AI-powered content generation system built with [Lamatic.ai](https://lamatic.ai). It uses intelligent workflows to generate text, images, and JSON content through a modern Next.js interface with markdown rendering support. + +[![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/Lamatic/AgentKit&root-directory=kits/llm-eval-harness/apps&env=JUDGE_FLOW,RUN_TARGET_FLOW,LAMATIC_API_URL,LAMATIC_PROJECT_ID,LAMATIC_API_KEY&envDescription=Your%20Lamatic%20keys%20and%20flow%20IDs%20are%20required.&envLink=https://lamatic.ai/templates/agentkits/agentic/agent-kit-generation) + +--- + +## Lamatic Setup (Pre and Post) + +Before running this project, you must build and deploy the flows in Lamatic, then wire their config into this codebase. + +Pre: Build in Lamatic +1. Sign in or sign up at https://lamatic.ai +2. Create a project (if you don’t have one yet) +3. Click “+ New Flow” and select "Templates" +4. Select the 'Generation' agent kit +5. Configure providers/tools/inputs as prompted +6. Deploy the kit in Lamatic and obtain your .env keys +7. Copy the keys from your studio + +Post: Wire into this repo +1. Create a .env file and set the keys +2. Install and run locally: + - npm install + - npm run dev +3. Deploy (Vercel recommended): + - Import your repo, set the project's Root Directory (if applicable) + - Add env vars in Vercel (same as your .env) + - Deploy and test your live URL + +Notes +- Coming soon: single-click export and "Connect Git" in Lamatic to push config directly to your repo. + +--- + +## 🔑 Setup +## Required Keys and Config + +You’ll need these things to run this project locally: + +1. **.env Keys** → get them from your [Lamatic account](https://lamatic.ai) after deploying the kit. 
+ + +| Item | Purpose | Where to Get It | +| ----------------- | -------------------------------------------- | ----------------------------------------------- | +| .env Key | Authentication for Lamatic AI APIs and Orchestration | [lamatic.ai](https://lamatic.ai) | + +### 1. Environment Variables + +Create `.env.local` with: + +```bash +# Lamatic +AGENTIC_GENERATE_CONTENT = "AGENTIC_GENERATE_CONTENT Flow ID" +LAMATIC_API_URL = "LAMATIC_API_URL" +LAMATIC_PROJECT_ID = "LAMATIC_PROJECT_ID" +LAMATIC_API_KEY = "LAMATIC_API_KEY" +``` + +### 2. Install & Run + +```bash +npm install +npm run dev +# Open http://localhost:3000 +``` +--- + +## 📂 Repo Structure + +``` +/actions + └── orchestrate.ts # Lamatic workflow orchestration +/app + └── page.tsx # Main generation form UI +/components + ├── header.tsx # Header component with navigation + └── ui # shadcn/ui components +/lib + └── lamatic-client.ts # Lamatic SDK client +/public + └── lamatic-logo.png # Lamatic branding +/flows + └── ... # Lamatic Flows +/package.json # Dependencies & scripts +``` + +--- + +## 🤝 Contributing + +We welcome contributions! Open an issue or PR in this repo. + +--- + +## 📜 License + +MIT License – see [LICENSE](../../../LICENSE). diff --git a/kits/llm-eval-harness/apps/actions/orchestrate.ts b/kits/llm-eval-harness/apps/actions/orchestrate.ts new file mode 100644 index 00000000..c99ad1f0 --- /dev/null +++ b/kits/llm-eval-harness/apps/actions/orchestrate.ts @@ -0,0 +1,63 @@ +"use server" + +import { getFlowIds, getLamaticClient } from "@/lib/lamatic-client" +import { computeAggregate, decodeHtmlEntities, mapWithConcurrency, parseJudgeResult } from "@/lib/eval" +import type { CaseResult, GoldenCase, RunAggregate } from "@/lib/types" + +// Bounded concurrency keeps large golden sets from tripping Groq rate limits. +const CONCURRENCY = 3 + +/** Execute a flow and pull the `answer` field out of the Lamatic response. Checks result.answer first, then a top-level answer, and throws an Error when both are null or undefined. The value is returned as unknown because the flow may answer with a string or with structured data; callers narrow it.
*/ +async function getAnswer(flowId: string, inputs: Record<string, unknown>): Promise<unknown> { + const resData = await getLamaticClient().executeFlow(flowId, inputs) + const envelope = resData as { result?: { answer?: unknown }; answer?: unknown } + const answer = envelope?.result?.answer ?? envelope?.answer + if (answer === undefined || answer === null) { + throw new Error("No answer returned from flow") + } + return answer +} + +/** Run one golden case through run-target, then score it with the judge. Non-string run-target answers are JSON-stringified before HTML-entity decoding. Any failure inside the try block (either flow call, or parsing the judge verdict) is folded into the returned result as an empty output, a null judge and an error message, so one bad case does not abort the whole run. Resolving the flow IDs happens before the try block and can still throw. */ +async function evaluateCase(systemPrompt: string, testCase: GoldenCase): Promise<CaseResult> { + const { judge, runTarget } = getFlowIds() + try { + const rawOutput = await getAnswer(runTarget, { systemPrompt, input: testCase.input }) + const output = decodeHtmlEntities(typeof rawOutput === "string" ? rawOutput : JSON.stringify(rawOutput)) + + const rawJudge = await getAnswer(judge, { + input: testCase.input, + output, + criteria: testCase.criteria, + reference: testCase.reference ?? "", + }) + + return { case: testCase, output, judge: parseJudgeResult(rawJudge) } + } catch (error) { + return { + case: testCase, + output: "", + judge: null, + error: error instanceof Error ? error.message : "Evaluation failed", + } + } +} + +/** Evaluate a system prompt against a golden set and return the gate verdict. Rejects a blank prompt and an empty or non-array case list, then hands every case to evaluateCase through mapWithConcurrency with a limit of CONCURRENCY and aggregates the results against threshold. Never throws to the caller: every failure is reported as success false with an error message. */ +export async function runEvaluation( + systemPrompt: string, + cases: GoldenCase[], + threshold: number, +): Promise<{ success: boolean; data?: RunAggregate; error?: string }> { + try { + if (!systemPrompt.trim()) throw new Error("A system prompt is required") + if (!Array.isArray(cases) || cases.length === 0) throw new Error("Provide at least one test case") + + const results = await mapWithConcurrency(cases, CONCURRENCY, (testCase) => + evaluateCase(systemPrompt, testCase), + ) + return { success: true, data: computeAggregate(results, threshold) } + } catch (error) { + return { success: false, error: error instanceof Error ?
error.message : "Evaluation failed" } + } +} diff --git a/kits/llm-eval-harness/apps/app/globals.css b/kits/llm-eval-harness/apps/app/globals.css new file mode 100644 index 00000000..dc2aea17 --- /dev/null +++ b/kits/llm-eval-harness/apps/app/globals.css @@ -0,0 +1,125 @@ +@import 'tailwindcss'; +@import 'tw-animate-css'; + +@custom-variant dark (&:is(.dark *)); + +:root { + --background: oklch(1 0 0); + --foreground: oklch(0.145 0 0); + --card: oklch(1 0 0); + --card-foreground: oklch(0.145 0 0); + --popover: oklch(1 0 0); + --popover-foreground: oklch(0.145 0 0); + --primary: oklch(0.205 0 0); + --primary-foreground: oklch(0.985 0 0); + --secondary: oklch(0.97 0 0); + --secondary-foreground: oklch(0.205 0 0); + --muted: oklch(0.97 0 0); + --muted-foreground: oklch(0.556 0 0); + --accent: oklch(0.97 0 0); + --accent-foreground: oklch(0.205 0 0); + --destructive: oklch(0.577 0.245 27.325); + --destructive-foreground: oklch(0.577 0.245 27.325); + --border: oklch(0.922 0 0); + --input: oklch(0.922 0 0); + --ring: oklch(0.708 0 0); + --chart-1: oklch(0.646 0.222 41.116); + --chart-2: oklch(0.6 0.118 184.704); + --chart-3: oklch(0.398 0.07 227.392); + --chart-4: oklch(0.828 0.189 84.429); + --chart-5: oklch(0.769 0.188 70.08); + --radius: 0.625rem; + --sidebar: oklch(0.985 0 0); + --sidebar-foreground: oklch(0.145 0 0); + --sidebar-primary: oklch(0.205 0 0); + --sidebar-primary-foreground: oklch(0.985 0 0); + --sidebar-accent: oklch(0.97 0 0); + --sidebar-accent-foreground: oklch(0.205 0 0); + --sidebar-border: oklch(0.922 0 0); + --sidebar-ring: oklch(0.708 0 0); +} + +.dark { + --background: oklch(0.145 0 0); + --foreground: oklch(0.985 0 0); + --card: oklch(0.145 0 0); + --card-foreground: oklch(0.985 0 0); + --popover: oklch(0.145 0 0); + --popover-foreground: oklch(0.985 0 0); + --primary: oklch(0.985 0 0); + --primary-foreground: oklch(0.205 0 0); + --secondary: oklch(0.269 0 0); + --secondary-foreground: oklch(0.985 0 0); + --muted: oklch(0.269 0 0); +
--muted-foreground: oklch(0.708 0 0); + --accent: oklch(0.269 0 0); + --accent-foreground: oklch(0.985 0 0); + --destructive: oklch(0.396 0.141 25.723); + --destructive-foreground: oklch(0.637 0.237 25.331); + --border: oklch(0.269 0 0); + --input: oklch(0.269 0 0); + --ring: oklch(0.439 0 0); + --chart-1: oklch(0.488 0.243 264.376); + --chart-2: oklch(0.696 0.17 162.48); + --chart-3: oklch(0.769 0.188 70.08); + --chart-4: oklch(0.627 0.265 303.9); + --chart-5: oklch(0.645 0.246 16.439); + --sidebar: oklch(0.205 0 0); + --sidebar-foreground: oklch(0.985 0 0); + --sidebar-primary: oklch(0.488 0.243 264.376); + --sidebar-primary-foreground: oklch(0.985 0 0); + --sidebar-accent: oklch(0.269 0 0); + --sidebar-accent-foreground: oklch(0.985 0 0); + --sidebar-border: oklch(0.269 0 0); + --sidebar-ring: oklch(0.439 0 0); +} + +@theme inline { + --font-sans: 'Geist', 'Geist Fallback'; + --font-mono: 'Geist Mono', 'Geist Mono Fallback'; + --color-background: var(--background); + --color-foreground: var(--foreground); + --color-card: var(--card); + --color-card-foreground: var(--card-foreground); + --color-popover: var(--popover); + --color-popover-foreground: var(--popover-foreground); + --color-primary: var(--primary); + --color-primary-foreground: var(--primary-foreground); + --color-secondary: var(--secondary); + --color-secondary-foreground: var(--secondary-foreground); + --color-muted: var(--muted); + --color-muted-foreground: var(--muted-foreground); + --color-accent: var(--accent); + --color-accent-foreground: var(--accent-foreground); + --color-destructive: var(--destructive); + --color-destructive-foreground: var(--destructive-foreground); + --color-border: var(--border); + --color-input: var(--input); + --color-ring: var(--ring); + --color-chart-1: var(--chart-1); + --color-chart-2: var(--chart-2); + --color-chart-3: var(--chart-3); + --color-chart-4: var(--chart-4); + --color-chart-5: var(--chart-5); + --radius-sm: calc(var(--radius) - 4px); + --radius-md: 
calc(var(--radius) - 2px); + --radius-lg: var(--radius); + --radius-xl: calc(var(--radius) + 4px); + --color-sidebar: var(--sidebar); + --color-sidebar-foreground: var(--sidebar-foreground); + --color-sidebar-primary: var(--sidebar-primary); + --color-sidebar-primary-foreground: var(--sidebar-primary-foreground); + --color-sidebar-accent: var(--sidebar-accent); + --color-sidebar-accent-foreground: var(--sidebar-accent-foreground); + --color-sidebar-border: var(--sidebar-border); + --color-sidebar-ring: var(--sidebar-ring); +} + +@layer base { + * { + @apply border-border outline-ring/50; + } + body { + @apply bg-background text-foreground; + } +} diff --git a/kits/llm-eval-harness/apps/app/layout.tsx b/kits/llm-eval-harness/apps/app/layout.tsx new file mode 100644 index 00000000..e708e594 --- /dev/null +++ b/kits/llm-eval-harness/apps/app/layout.tsx @@ -0,0 +1,44 @@ +import type { Metadata } from 'next' +import { Geist, Geist_Mono } from 'next/font/google' +import { Analytics } from '@vercel/analytics/next' +import './globals.css' + +/* Instantiate both Geist font loaders with the latin subset. NOTE(review): neither result is referenced in the visible part of this file - confirm whether the layout markup is meant to apply them. */ const _geist = Geist({ subsets: ["latin"] }); +const _geistMono = Geist_Mono({ subsets: ["latin"] }); + +/** Document metadata for the app: title, description, and the icon set (separate 32x32 PNGs selected by the light and dark prefers-color-scheme media queries, an SVG icon, and an Apple touch icon). */ export const metadata: Metadata = { + title: 'LLM Eval Harness', + description: 'Score an LLM prompt against a golden set with an LLM-as-judge and a CI-style gate.', + icons: { + icon: [ + { + url: '/icon-light-32x32.png', + media: '(prefers-color-scheme: light)', + }, + { + url: '/icon-dark-32x32.png', + media: '(prefers-color-scheme: dark)', + }, + { + url: '/icon.svg', + type: 'image/svg+xml', + }, + ], + apple: '/apple-icon.png', + }, +} + +/** Root layout component; renders the routed page tree passed in as children. NOTE(review): the wrapper markup inside the return expression is missing from this copy of the patch (only the children slot survives), so the surrounding elements and any use of the imported Analytics component must be confirmed against the real file. */ export default function RootLayout({ + children, +}: Readonly<{ + children: React.ReactNode +}>) { + return ( + + + {children} + + + + ) +} diff --git a/kits/llm-eval-harness/apps/app/page.tsx b/kits/llm-eval-harness/apps/app/page.tsx new file mode 100644 index 00000000..b31838a2 --- /dev/null +++ b/kits/llm-eval-harness/apps/app/page.tsx @@ -0,0 +1,187 @@ +"use client" + +import { useState }
from "react" +import { FlaskConical, Loader2, Play, Sparkles } from "lucide-react" +import { Button } from "@/components/ui/button" +import { Textarea } from "@/components/ui/textarea" +import { Input } from "@/components/ui/input" +import { Label } from "@/components/ui/label" +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card" +import { GateBanner } from "@/components/gate-banner" +import { ResultsTable } from "@/components/results-table" +import { runEvaluation } from "@/actions/orchestrate" +import { SAMPLE_GOLDEN_SET, SAMPLE_SYSTEM_PROMPT } from "@/lib/eval" +import type { GoldenCase, RunAggregate } from "@/lib/types" + +export default function EvalHarnessPage() { + const [systemPrompt, setSystemPrompt] = useState("") + const [goldenSet, setGoldenSet] = useState("") + const [threshold, setThreshold] = useState(90) + const [isLoading, setIsLoading] = useState(false) + const [result, setResult] = useState(null) + const [error, setError] = useState("") + + const loadExample = () => { + setSystemPrompt(SAMPLE_SYSTEM_PROMPT) + setGoldenSet(JSON.stringify(SAMPLE_GOLDEN_SET, null, 2)) + setError("") + } + + const handleRun = async () => { + setError("") + + if (!systemPrompt.trim()) { + setError("Enter a system prompt to evaluate.") + return + } + + let cases: GoldenCase[] + try { + const parsed: unknown = JSON.parse(goldenSet) + if (!Array.isArray(parsed)) throw new Error("Golden set must be a JSON array.") + for (const item of parsed) { + if (!item || typeof item.input !== "string" || typeof item.criteria !== "string") { + throw new Error('Each case needs at least "input" and "criteria" string fields.') + } + } + cases = parsed as GoldenCase[] + } catch (e) { + setError(e instanceof Error ? 
e.message : "Golden set is not valid JSON.") + return + } + + setIsLoading(true) + setResult(null) + try { + const res = await runEvaluation(systemPrompt, cases, threshold) + if (res.success && res.data) { + setResult(res.data) + } else { + setError(res.error || "Evaluation failed.") + } + } catch (e) { + setError(e instanceof Error ? e.message : "Evaluation failed.") + } finally { + setIsLoading(false) + } + } + + return ( +
+
+
+
+ +
+
+

LLM Eval Harness

+

+ Score a prompt against a golden set with an LLM-as-judge · powered by Lamatic +

+
+
+
+ +
+ {/* Configuration */} +
+ + + System prompt under test + The prompt whose output quality you want to measure. + + +