diff --git a/website/src/pages/agent-optimization.tsx b/website/src/pages/agent-optimization.tsx new file mode 100644 index 0000000000..8050a3c66b --- /dev/null +++ b/website/src/pages/agent-optimization.tsx @@ -0,0 +1,1080 @@ +import { useState } from "react"; +import Head from "@docusaurus/Head"; +import Link from "@docusaurus/Link"; +import { Highlight } from "prism-react-renderer"; +import { Header } from "../components/Header/Header"; +import { SocialLinksFooter } from "../components/SocialLinksFooter/SocialLinksFooter"; +import { ArticleSidebar } from "../components/ArticleSidebar/ArticleSidebar"; +import { MLFLOW_GENAI_DOCS_URL } from "@site/src/constants"; +import { CopyButton } from "../components/CodeSnippet/CopyButton"; +import { customNightOwl, CODE_BG } from "../components/CodeSnippet/codeTheme"; + +const SEO_TITLE = + "Agent Optimization: Debug, Evaluate & Improve AI Agents | MLflow"; +const SEO_DESCRIPTION = + "Learn how to optimize AI agents for quality, cost, and latency. Use MLflow's open-source tracing, evaluation, and prompt optimization to debug and improve agent performance."; + +const faqs: { + question: string; + answer: React.ReactNode; + answerText?: string; +}[] = [ + { + question: "What is agent optimization?", + answer: + "Agent optimization is the practice of improving AI agents across quality, cost, and latency. Unlike optimizing a single LLM call, agent optimization addresses multi-step reasoning chains, tool selection, execution paths, and compounding costs. It includes techniques like tracing agent execution, evaluating decision-making quality with LLM judges, optimizing system prompts with algorithms, and monitoring agent reliability in production.", + }, + { + question: "How does prompt optimization help improve AI agents?", + answer: ( + <> + Agent behavior is heavily influenced by system prompts, tool + descriptions, and few-shot examples.{" "} + Prompt optimization automates + the process of improving these prompts using data-driven algorithms + instead of manual trial-and-error. Optimizers like GEPA evaluate agent + prompts across training examples, analyze failure patterns, generate + improved variants, and repeat until quality converges. MLflow provides a + unified{" "} + + prompt optimization API + {" "} + that tracks every version and metric automatically. + + ), + answerText: + "Agent behavior is heavily influenced by system prompts, tool descriptions, and few-shot examples. Prompt optimization automates the process of improving these prompts using data-driven algorithms instead of manual trial-and-error. Optimizers like GEPA evaluate agent prompts across training examples, analyze failure patterns, generate improved variants, and repeat until quality converges. MLflow provides a unified prompt optimization API that tracks every version and metric automatically.", + }, + { + question: + "How do I optimize an agent with RAG (Retrieval-Augmented Generation)?", + answer: + "Optimizing a RAG agent involves improving both the retrieval and generation stages within the agent's execution. Use MLflow Tracing to see exactly what documents the agent retrieves and how they influence its reasoning and responses. Use MLflow Evaluation with groundedness and relevance judges to measure retrieval quality and answer accuracy. 
Then iterate on the agent's retrieval strategy, chunking approach, embedding model, and generation prompts, measuring the impact of each change across the full agent pipeline.", + }, + { + question: "What is the best tool for AI agent optimization?", + answer: + "The best tool for agent optimization depends on your needs. MLflow is the leading open-source option, providing the complete toolkit: tracing for full execution visibility across reasoning steps and tool calls, evaluation for quality measurement with LLM judges, prompt optimization for algorithmic improvement of system prompts, and an AI Gateway for cost management, compliance, and governance. Unlike proprietary tools, MLflow is 100% free, supports any LLM provider and agent framework (LangGraph, CrewAI, OpenAI Agents, and more), and is backed by the Linux Foundation with over 30 million monthly downloads.", + }, + { + question: "How do I reduce AI agent costs with MLflow?", + answer: ( + <> + Agents make multiple LLM calls per request, so costs compound quickly.{" "} + MLflow Tracing{" "} + captures token counts per span across every reasoning step and tool + call, revealing where tokens are being spent. From there, you can + eliminate redundant reasoning loops, use smaller models for simple + sub-tasks, cache repeated tool calls, shorten system prompts, or route + through an AI Gateway with rate limiting + and budget controls. + + ), + answerText: + "Agents make multiple LLM calls per request, so costs compound quickly. MLflow Tracing captures token counts per span across every reasoning step and tool call, revealing where tokens are being spent. From there, you can eliminate redundant reasoning loops, use smaller models for simple sub-tasks, cache repeated tool calls, shorten system prompts, or route through an AI Gateway with rate limiting and budget controls.", + }, + { + question: "How do I improve agent response quality with MLflow?", + answer: ( + <> + Agent quality depends on correct reasoning, appropriate tool selection, + and accurate final responses.{" "} + + MLflow Evaluation + {" "} + lets you score agent outputs with LLM judges across dimensions like + correctness, relevance, safety, and tool use quality. Once you have a + quality baseline, improve through{" "} + prompt optimization of system + prompts, better tool descriptions, retrieval improvements, or model + upgrades, and measure the impact of each change. + + ), + answerText: + "Agent quality depends on correct reasoning, appropriate tool selection, and accurate final responses. MLflow Evaluation lets you score agent outputs with LLM judges across dimensions like correctness, relevance, safety, and tool use quality. Once you have a quality baseline, improve through prompt optimization of system prompts, better tool descriptions, retrieval improvements, or model upgrades, and measure the impact of each change.", + }, + { + question: "How do I reduce agent latency with MLflow?", + answer: ( + <> + Agents suffer from compounding latency because each reasoning step and + tool call adds time.{" "} + MLflow Tracing{" "} + captures per-span latency across the full execution graph, making it + easy to find bottlenecks. Common optimizations include parallelizing + independent tool calls, caching repeated operations, using faster models + for lower-complexity reasoning steps, reducing the number of reasoning + loops, and streaming intermediate results. + + ), + answerText: + "Agents suffer from compounding latency because each reasoning step and tool call adds time. 
MLflow Tracing captures per-span latency across the full execution graph, making it easy to find bottlenecks. Common optimizations include parallelizing independent tool calls, caching repeated operations, using faster models for lower-complexity reasoning steps, reducing the number of reasoning loops, and streaming intermediate results.", + }, + { + question: "How do I debug a multi-step AI agent with MLflow?", + answer: ( + <> + Multi-step agents are difficult to debug because failures can occur at + any reasoning step, tool call, or decision point.{" "} + MLflow Tracing{" "} + captures the full execution graph, including every LLM invocation, tool + call, input, output, and intermediate reasoning step. This lets you + pinpoint exactly where an agent went wrong: a bad tool selection, a + hallucinated reasoning step, an unnecessary loop, or an incorrect final + synthesis.{" "} + + MLflow Evaluation + {" "} + can then assess agent decision-making quality across many examples. + + ), + answerText: + "Multi-step agents are difficult to debug because failures can occur at any reasoning step, tool call, or decision point. MLflow Tracing captures the full execution graph, including every LLM invocation, tool call, input, output, and intermediate reasoning step. This lets you pinpoint exactly where an agent went wrong: a bad tool selection, a hallucinated reasoning step, an unnecessary loop, or an incorrect final synthesis. MLflow Evaluation can then assess agent decision-making quality across many examples.", + }, + { + question: "How do I measure AI agent performance with MLflow?", + answer: ( + <> + Agent performance is measured across multiple dimensions: quality (using{" "} + + LLM judge scorers + {" "} + for correctness, relevance, tool use quality, and safety), cost (total + token usage across all reasoning steps), and latency (end-to-end + response time including tool calls).{" "} + MLflow Tracing{" "} + captures cost and latency automatically across the full agent execution + graph, while{" "} + + MLflow Evaluation + {" "} + provides automated quality scoring with 70+ built-in judges. + + ), + answerText: + "Agent performance is measured across multiple dimensions: quality (using LLM judge scorers for correctness, relevance, tool use quality, and safety), cost (total token usage across all reasoning steps), and latency (end-to-end response time including tool calls). MLflow Tracing captures cost and latency automatically across the full agent execution graph, while MLflow Evaluation provides automated quality scoring with 70+ built-in judges.", + }, + { + question: "Is MLflow free for agent optimization?", + answer: + "Yes. MLflow is 100% open source under the Apache 2.0 license, backed by the Linux Foundation. You can use all agent optimization features (tracing, evaluation, prompt optimization, AI Gateway) for free, including in commercial applications. There are no per-seat fees, no usage limits, and no vendor lock-in.", + }, + { + question: "How do I get started with agent optimization using MLflow?", + answer: ( + <> + Start by enabling{" "} + + MLflow Tracing + {" "} + with a single line of code to capture the full execution graph for every + agent invocation: reasoning steps, tool calls, token usage, and latency. + This gives you a baseline. Then use{" "} + + MLflow Evaluation + {" "} + to measure agent output quality with LLM judges. 
Once you can see and + measure performance, apply targeted optimizations: improve system + prompts with{" "} + automated prompt optimization, + reduce unnecessary tool calls, route sub-tasks to faster models, or + cache repeated operations. + + ), + answerText: + "Start by enabling MLflow Tracing with a single line of code to capture the full execution graph for every agent invocation: reasoning steps, tool calls, token usage, and latency. This gives you a baseline. Then use MLflow Evaluation to measure agent output quality with LLM judges. Once you can see and measure performance, apply targeted optimizations: improve system prompts with automated prompt optimization, reduce unnecessary tool calls, route sub-tasks to faster models, or cache repeated operations.", + }, +]; + +const faqJsonLd = { + "@context": "https://schema.org", + "@type": "FAQPage", + mainEntity: faqs.map((faq) => ({ + "@type": "Question", + name: faq.question, + acceptedAnswer: { + "@type": "Answer", + text: faq.answerText || faq.answer, + }, + })), +}; + +const softwareJsonLd = { + "@context": "https://schema.org", + "@type": "SoftwareApplication", + name: "MLflow", + applicationCategory: "DeveloperApplication", + operatingSystem: "Cross-platform", + offers: { + "@type": "Offer", + price: "0", + priceCurrency: "USD", + }, + description: + "Open-source platform for optimizing AI agents with tracing, evaluation, prompt optimization, and cost management, compliance, and governance.", + url: "https://mlflow.org", + license: "https://www.apache.org/licenses/LICENSE-2.0", +}; + +const OPTIMIZE_CODE = `import mlflow +from mlflow.genai.optimize import GepaPromptOptimizer +from mlflow.genai.scorers import Correctness + +base_prompt = mlflow.genai.register_prompt( + name="agent-system-prompt", + template="Answer the question based on the context.\\n\\n" + "Context: {{ context }}\\n" + "Question: {{ question }}\\n\\nAnswer:", +) + +result = mlflow.genai.optimize_prompts( + predict_fn=my_predict_fn, + train_data=train_data, # labeled examples + prompt_uris=[base_prompt.uri], + optimizer=GepaPromptOptimizer( + reflection_model="openai:/gpt-5.2", + ), + scorers=[Correctness()], +) + +optimized = mlflow.genai.load_prompt(result.optimized_prompts[0].uri) +print(optimized.template)`; + +export default function AgentOptimization() { + const [openFaqIndex, setOpenFaqIndex] = useState(0); + + return ( + <> + + {SEO_TITLE} + + + + + + + + + + + +
+
+ +
+

Agent Optimization

+ +

+ Agent optimization is the practice of improving AI agents across + quality, cost, and latency. Unlike optimizing a single LLM call, + agents introduce unique challenges: they make multiple LLM calls per + request, invoke tools, follow multi-step reasoning chains, and + produce compounding costs and latency at every step. Debugging why + an agent chose the wrong tool, entered a redundant reasoning loop, + or hallucinated an intermediate step requires specialized tooling + that captures the full execution graph and evaluates decision-making + quality at every level. +

+ +

+ Effective agent optimization starts with visibility.{" "} + Tracing captures the full execution + graph for every agent invocation: reasoning steps, tool calls, token + counts, and latency per span, revealing exactly where costs, + bottlenecks, and failures occur.{" "} + Evaluation measures agent output + quality with LLM judges, providing a baseline to track whether + changes actually improve performance.{" "} + Prompt optimization{" "} + automates the process of improving agent system prompts with + algorithms, replacing manual trial-and-error with data-driven + iteration. +

+ +

+ MLflow provides the complete open-source + toolkit for agent optimization:{" "} + tracing for + full execution visibility,{" "} + + evaluation + {" "} + for quality measurement,{" "} + + prompt optimization + {" "} + for algorithmic prompt improvement, and an{" "} + AI Gateway for cost management, + compliance, and governance across LLM providers. +

+ +
+ +
+ +
+

+ Quick Navigation: +

+ +
+ +

+ Why Agent Optimization Matters +

+ +

+ AI agents face unique optimization challenges that traditional + software profiling and testing can't address. Because agents combine + multi-step reasoning, tool use, and LLM calls into complex execution + paths, standard debugging and monitoring tools fall short: +

+ +
+
+

Compounding Costs

+

+ Problem: Agents make multiple LLM calls per + request across reasoning steps and tool invocations. Token costs + compound at every step, and without visibility, API bills grow + unpredictably. +

+

+ Solution:{" "} + Tracing{" "} + captures token usage per span across the full agent execution + graph, and an AI Gateway{" "} + enforces rate limits and budget controls across providers. +

+
+ +
+

Unreliable Decision-Making

+

+ Problem: Agents can select the wrong tools, + enter redundant reasoning loops, hallucinate intermediate steps, + or produce incorrect final responses. Traditional testing can't + catch these failures. +

+

+ Solution:{" "} + + Evaluation with LLM judges + {" "} + continuously assesses agent quality across correctness, + relevance, tool use, and safety, catching regressions before + users report them. +

+
+ +
+

Cascading Latency

+

+ Problem: Each reasoning step and tool call adds + latency. Agent workflows with sequential LLM calls and external + API requests compound delays, leading to slow response times + that frustrate users. +

+

+ Solution:{" "} + + Per-span latency tracing + {" "} + across the full execution graph identifies bottlenecks so you + can parallelize calls, cache repeated operations, or use faster + models for non-critical steps. +

+
+ +
+

Too Many Knobs to Tune

+

+ Problem: Agents have many interacting surfaces + to optimize: system prompts, tool descriptions, model selection + per step, few-shot examples, and retrieval parameters. Manual + tuning doesn't scale. +

+

+ Solution:{" "} + + Automated prompt optimization + {" "} + uses algorithms to improve agent prompts across hundreds of + examples, replacing manual trial-and-error with data-driven + iteration. +

+
+
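Every solution above starts from the same place: capturing the agent's full execution graph. Below is a minimal sketch of enabling tracing, assuming an OpenAI-based agent (framework autologging such as mlflow.langchain.autolog() plays the same role for LangGraph and similar frameworks); the agent body is illustrative:

import mlflow
from openai import OpenAI

# One line of instrumentation: every LLM call the agent makes is captured
# with prompts, responses, token counts, and per-span latency.
mlflow.openai.autolog()

client = OpenAI()


@mlflow.trace  # group the agent's reasoning steps and tool calls under one trace
def run_agent(question: str) -> str:
    # ... plan, call tools, synthesize; a single LLM call stands in for the loop ...
    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": question}],
    )
    return response.choices[0].message.content


run_agent("Summarize what MLflow Tracing captures for an agent.")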
+ +

Agent Optimization Techniques

+ +

+ Each of the challenges above has a corresponding optimization + approach. Here's what actually works: +

+ +
    +
  • + Cut compounding costs: Use{" "} + + tracing + {" "} + to see how many tokens each reasoning step and tool call uses (see + the sketch after this list). You'll spot the waste fast: redundant + reasoning loops, duplicate tool calls, verbose prompts. Remove them, + and costs drop. Then set up budget policies and alerting through + the{" "} + + AI Gateway + {" "} + to catch cost spikes before they eat through your budget.
  • +
  • + Fix unreliable decision-making: Set up{" "} + + evaluation + {" "} + to score agent outputs with{" "} + + LLM judges + {" "} + and code-based metrics across correctness, relevance, and safety. + Get a quality baseline, make a change (better prompt, different + model, improved tool descriptions), re-evaluate, and see if it + actually helped. +
  • +
  • + Reduce cascading latency: Every sequential step + adds delay. Use{" "} + + per-span latency tracing + {" "} + to find the slowest steps, then parallelize independent tool + calls, cache repeated lookups, and use lighter models for + lower-complexity steps. +
  • +
  • + Automate prompt tuning: System prompts, tool + descriptions, and few-shot examples are too many surfaces to tweak by + hand.{" "} + + Prompt optimization + {" "} + runs your agent across test examples, finds where prompts fall + short, generates better versions, and picks the best one.
  • +
+ +
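A rough sketch of the cost-triage step referenced in the first bullet, assuming traces have already been logged to experiment "1" and that, as in the tracing snippet on the LLM optimization page, search_traces returns trace objects exposing total_tokens and execution_duration_ms:

import mlflow

# Rank recent agent traces by token usage to find the reasoning steps
# and tool calls that dominate cost.
traces = mlflow.search_traces(experiment_ids=["1"])

by_cost = sorted(traces, key=lambda t: t.info.total_tokens or 0, reverse=True)
for trace in by_cost[:5]:
    print(f"{trace.info.total_tokens} tokens, {trace.info.execution_duration_ms} ms")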

Common Use Cases for Agent Optimization

+ +

+ These techniques apply to any agent architecture. Here are the most + common scenarios where teams see the biggest improvements: +

+ +
    +
  • + RAG agents: Agents that retrieve documents before + generating answers have two places to go wrong: bad retrieval and + bad generation. Use evaluation{" "} + with groundedness and relevance judges to figure out which side is + failing, then improve the retrieval strategy or generation prompts + accordingly. +
  • +
  • + Multi-step tool-use agents: Agents that chain + together multiple tool calls are prone to picking the wrong tool, + calling tools unnecessarily, or getting stuck in loops. Use{" "} + tracing to see the full execution + path and find where the agent goes off track, as shown in the + sketch after this list.
  • +
  • + Customer-facing chatbots: When agents talk + directly to users, quality and latency matter most. Set up{" "} + evaluation to catch bad + responses before users see them, and use{" "} + latency tracing to keep response + times fast. +
  • +
  • + Agents running at scale: When you have many + agents or high request volume, costs add up quickly. Use the{" "} + AI Gateway for budget policies and + model routing, and run{" "} + prompt optimization to + get better results from cheaper models. +
  • +
+ +
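For the multi-step tool-use case, it helps to trace each tool explicitly so a wrong tool choice shows up as its own span. A minimal sketch using the @mlflow.trace decorator; the tool names and logic are purely illustrative:

import mlflow


@mlflow.trace(span_type="TOOL")
def search_knowledge_base(query: str) -> list[str]:
    # Illustrative tool: each call becomes its own span with inputs,
    # outputs, and latency, nested under the agent's trace.
    return ["doc-1", "doc-2"]


@mlflow.trace
def run_agent(question: str) -> str:
    # The resulting trace shows which tool the agent chose and what it
    # returned, so wrong selections and unnecessary calls are easy to spot.
    docs = search_knowledge_base(question)
    return f"Answered using {len(docs)} retrieved documents"


run_agent("Where can I find the agent tracing docs?")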

How to Implement Agent Optimization

+ +

+ MLflow provides the complete open-source + toolkit for agent optimization. Start with tracing to gain full + execution visibility, add evaluation to measure agent quality, then + apply targeted optimizations based on what the data shows. +

+ +

+ Optimize agent prompts automatically +

+ +
+
+ python + +
+
+ + {({ style, tokens, getLineProps, getTokenProps }) => ( +
+                    {tokens.map((line, i) => (
+                      
+ {line.map((token, key) => ( + + ))} +
+ ))} +
+ )} +
+
+
+ +

+ Evaluate agent quality with LLM judges +

+ +
+ +
+ Build custom LLM judges in the MLflow UI to score agent traces on + quality dimensions that matter for your use case +
+
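The same judges can be run programmatically. A minimal sketch, reusing the built-in scorers from the evaluation example on the LLM optimization page and assuming agent traces have already been logged to experiment "1":

import mlflow
from mlflow.genai.scorers import Correctness, RelevanceToInput, Safety

# Score previously logged agent traces with built-in LLM judges.
results = mlflow.genai.evaluate(
    data=mlflow.search_traces(experiment_ids=["1"]),
    scorers=[
        Correctness(),       # is the agent's final answer factually correct?
        RelevanceToInput(),  # does the answer address the user's request?
        Safety(),            # is the answer free of harmful content?
    ],
)

print(results.tables["eval_results"])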
+ +
+

+ + MLflow + {" "} + is the largest open-source{" "} + AI engineering platform, with over 30 million + monthly downloads. Thousands of organizations use MLflow to debug, + evaluate, monitor, and optimize production-quality AI agents while + controlling costs and managing access to models and data. Backed + by the Linux Foundation and licensed under Apache 2.0, MLflow + provides a complete agent optimization toolkit with no vendor + lock-in.{" "} + Get started → +

+
+ +

Open Source vs. Proprietary Agent Optimization

+ +

+ When choosing tools for agent optimization, the decision between + open source and proprietary platforms has significant implications + for cost, flexibility, and data ownership. +

+ +

+ + Open Source (MLflow): + {" "} + With MLflow, you maintain complete control over your agent + optimization data and infrastructure. Trace data, evaluation + results, and prompt versions stay on your own systems. There are no + per-seat fees, no usage limits, and no vendor lock-in. MLflow + integrates with any LLM provider and agent framework (LangGraph, + CrewAI, OpenAI Agents, and more) through OpenTelemetry-compatible + tracing. +

+ +

+ Proprietary SaaS Tools: Commercial agent + optimization and observability platforms offer convenience but at + the cost of flexibility and control. They typically charge per seat + or per trace volume, which becomes expensive at scale with agents + that generate many traces per request. Your trace data and + evaluation results are sent to their servers, raising privacy and + compliance concerns. You're locked into their ecosystem, making it + difficult to switch providers or customize workflows. +

+ +

+ Why Teams Choose Open Source: Organizations + building AI agents at scale choose MLflow because it provides + production-grade tracing, evaluation, prompt optimization, and cost + management, compliance, and governance without giving up control of + their data, cost predictability, or flexibility. The Apache 2.0 + license and Linux Foundation backing ensure MLflow remains truly + open and community-driven. +

+ +

Frequently Asked Questions

+ +
+ {faqs.map((faq, index) => ( +
+ + {openFaqIndex === index && ( +
{faq.answer}
+ )} +
+ ))} +
+ +

Related Resources

+ +
    +
  • + + Agent Tracing Documentation + +
  • +
  • + + Agent Evaluation Guide + +
  • +
  • + + Prompt Optimization Documentation + +
  • +
  • + AI Observability Guide +
  • +
  • + LLM Tracing Guide +
  • +
  • + LLM Evaluation Guide +
  • +
  • + AI Gateway Guide +
  • +
  • + Prompt Optimization Guide +
  • +
  • + LLMOps Guide +
  • +
  • + MLflow for Agents and LLMs Overview +
  • +
+
+ + + +
+ + ); +} diff --git a/website/src/pages/faq.tsx b/website/src/pages/faq.tsx index d6930f0fd9..e9af002807 100644 --- a/website/src/pages/faq.tsx +++ b/website/src/pages/faq.tsx @@ -1133,6 +1133,13 @@ export default function FAQ() { AI agents and LLM applications in production.

+ +

Agent Optimization

+

+ Debug, evaluate, and optimize AI agents for quality, cost, and + latency with tracing, evaluation, and prompt optimization. +

+

MLflow for LLMs & Agents

diff --git a/website/src/pages/llm-optimization.tsx b/website/src/pages/llm-optimization.tsx new file mode 100644 index 0000000000..553b6db60a --- /dev/null +++ b/website/src/pages/llm-optimization.tsx @@ -0,0 +1,1120 @@ +import { useState } from "react"; +import Head from "@docusaurus/Head"; +import Link from "@docusaurus/Link"; +import { Highlight } from "prism-react-renderer"; +import { Header } from "../components/Header/Header"; +import { SocialLinksFooter } from "../components/SocialLinksFooter/SocialLinksFooter"; +import { ArticleSidebar } from "../components/ArticleSidebar/ArticleSidebar"; +import { MLFLOW_GENAI_DOCS_URL } from "@site/src/constants"; +import ObservabilityHero from "@site/static/img/GenAI_observability/GenAI_observability_hero.png"; +import { CopyButton } from "../components/CodeSnippet/CopyButton"; +import { customNightOwl, CODE_BG } from "../components/CodeSnippet/codeTheme"; + +const SEO_TITLE = + "LLM Optimization: Reduce Costs & Improve Quality | MLflow AI Platform"; +const SEO_DESCRIPTION = + "Learn how to optimize LLM applications for cost, latency, and quality. Use MLflow's open-source tracing, evaluation, and prompt optimization to systematically improve LLM performance."; + +const faqs: { + question: string; + answer: React.ReactNode; + answerText?: string; +}[] = [ + { + question: "What is LLM optimization?", + answer: + "LLM optimization is the practice of systematically improving LLM applications and agents across dimensions like quality, cost, and latency. It includes techniques like prompt optimization, model selection, token usage reduction, caching, evaluation-driven iteration, and production monitoring to ensure LLM applications and agents perform well and cost-effectively at scale.", + }, + { + question: "How do I reduce LLM API costs with MLflow?", + answer: ( + <> + The most effective way to reduce LLM API costs is to gain visibility + into where tokens are being spent.{" "} + MLflow Tracing{" "} + captures token counts per span, so you can identify expensive + operations, redundant LLM calls, and oversized prompts. From there, you + can apply targeted optimizations: shorten prompts, cache repeated + queries, use smaller models for simpler tasks, or route through an{" "} + AI Gateway with rate limiting and + fallback routing. + + ), + answerText: + "The most effective way to reduce LLM API costs is to gain visibility into where tokens are being spent. MLflow Tracing captures token counts per span, so you can identify expensive operations, redundant LLM calls, and oversized prompts. From there, you can apply targeted optimizations: shorten prompts, cache repeated queries, use smaller models for simpler tasks, or route through an AI Gateway with rate limiting and fallback routing.", + }, + { + question: "How do I improve LLM response quality with MLflow?", + answer: ( + <> + Improving LLM response quality requires measurement and iteration.{" "} + + MLflow Evaluation + {" "} + lets you score outputs with LLM judges across dimensions like + correctness, relevance, safety, and groundedness. Once you have a + quality baseline, you can improve quality through{" "} + prompt optimization, better + retrieval pipelines for RAG, or model upgrades, and measure the impact + of each change. + + ), + answerText: + "Improving LLM response quality requires measurement and iteration. MLflow Evaluation lets you score outputs with LLM judges across dimensions like correctness, relevance, safety, and groundedness. 
Once you have a quality baseline, you can improve quality through prompt optimization, better retrieval pipelines for RAG, or model upgrades, and measure the impact of each change.", + }, + { + question: "How do I reduce LLM latency with MLflow?", + answer: ( + <> + MLflow Tracing{" "} + captures latency per span in your LLM pipeline, making it easy to find + bottlenecks. Common latency optimizations include using streaming + responses, caching frequent queries, parallelizing independent LLM + calls, using smaller or faster models for non-critical steps, and + reducing prompt length to decrease time-to-first-token. + + ), + answerText: + "MLflow Tracing captures latency per span in your LLM pipeline, making it easy to find bottlenecks. Common latency optimizations include using streaming responses, caching frequent queries, parallelizing independent LLM calls, using smaller or faster models for non-critical steps, and reducing prompt length to decrease time-to-first-token.", + }, + { + question: "What is prompt optimization in MLflow and how does it work?", + answer: ( + <> + Prompt optimization automates + the process of improving prompts using data-driven algorithms instead of + manual trial-and-error. Optimizers like GEPA evaluate prompts across + training examples, analyze failure patterns, generate improved variants, + and repeat until quality converges. MLflow provides a unified{" "} + + prompt optimization API + {" "} + that tracks every version and metric automatically. + + ), + answerText: + "Prompt optimization automates the process of improving prompts using data-driven algorithms instead of manual trial-and-error. Optimizers like GEPA evaluate prompts across training examples, analyze failure patterns, generate improved variants, and repeat until quality converges. MLflow provides a unified prompt optimization API that tracks every version and metric automatically.", + }, + { + question: + "How do I optimize a RAG (Retrieval-Augmented Generation) pipeline?", + answer: + "Optimizing a RAG pipeline involves improving both the retrieval and generation stages. Use MLflow Tracing to see exactly what documents are retrieved and how they affect the LLM's response. Use MLflow Evaluation with groundedness and relevance judges to measure retrieval quality. Then iterate on your chunking strategy, embedding model, retrieval parameters, and generation prompts, measuring the impact of each change.", + }, + { + question: "How do I optimize an AI agent with MLflow?", + answer: ( + <> + Agent optimization requires visibility into every reasoning step, tool + call, and LLM invocation.{" "} + MLflow Tracing{" "} + captures the full execution graph so you can identify unnecessary tool + calls, redundant reasoning loops, and expensive LLM invocations.{" "} + + MLflow Evaluation + {" "} + lets you assess agent decision-making quality with LLM judges, and{" "} + prompt optimization can improve + the agent's system prompts algorithmically. + + ), + answerText: + "Agent optimization requires visibility into every reasoning step, tool call, and LLM invocation. MLflow Tracing captures the full execution graph so you can identify unnecessary tool calls, redundant reasoning loops, and expensive LLM invocations. 
MLflow Evaluation lets you assess agent decision-making quality with LLM judges, and prompt optimization can improve the agent's system prompts algorithmically.", + }, + { + question: "What is the best tool for LLM optimization?", + answer: + "The best tool for LLM optimization depends on your needs. MLflow is the leading open-source option, providing the complete toolkit: tracing for cost and latency visibility, evaluation for quality measurement, prompt optimization for algorithmic improvement, and an AI Gateway for cost management, compliance, and governance. Unlike proprietary tools, MLflow is 100% free, supports any LLM provider and agent framework, and is backed by the Linux Foundation with over 30 million monthly downloads.", + }, + { + question: "How do I measure LLM performance with MLflow?", + answer: ( + <> + LLM performance is measured across multiple dimensions: quality (using{" "} + + LLM judge scorers + {" "} + for correctness, relevance, safety, etc.), cost (token usage per + request), and latency (response time per span).{" "} + MLflow Tracing{" "} + captures cost and latency automatically, while{" "} + + MLflow Evaluation + {" "} + provides automated quality scoring with 70+ built-in judges. + + ), + answerText: + "LLM performance is measured across multiple dimensions: quality (using LLM judge scorers for correctness, relevance, safety, etc.), cost (token usage per request), and latency (response time per span). MLflow Tracing captures cost and latency automatically, while MLflow Evaluation provides automated quality scoring with 70+ built-in judges.", + }, + { + question: "Is MLflow free for LLM optimization?", + answer: + "Yes. MLflow is 100% open source under the Apache 2.0 license, backed by the Linux Foundation. You can use all optimization features (tracing, evaluation, prompt optimization, AI Gateway) for free, including in commercial applications. There are no per-seat fees, no usage limits, and no vendor lock-in.", + }, + { + question: "How do I get started with LLM optimization using MLflow?", + answer: ( + <> + Start by enabling{" "} + + MLflow Tracing + {" "} + with a single line of code to capture token usage, latency, and + execution details for every LLM call. This gives you a baseline. Then + use{" "} + + MLflow Evaluation + {" "} + to measure output quality. Once you can see and measure performance, + apply targeted optimizations: shorten prompts, optimize retrieval, + adjust model selection, or run{" "} + automated prompt optimization. + + ), + answerText: + "Start by enabling MLflow Tracing with a single line of code to capture token usage, latency, and execution details for every LLM call. This gives you a baseline. Then use MLflow Evaluation to measure output quality. 
Once you can see and measure performance, apply targeted optimizations: shorten prompts, optimize retrieval, adjust model selection, or run automated prompt optimization.", + }, +]; + +const faqJsonLd = { + "@context": "https://schema.org", + "@type": "FAQPage", + mainEntity: faqs.map((faq) => ({ + "@type": "Question", + name: faq.question, + acceptedAnswer: { + "@type": "Answer", + text: faq.answerText || faq.answer, + }, + })), +}; + +const softwareJsonLd = { + "@context": "https://schema.org", + "@type": "SoftwareApplication", + name: "MLflow", + applicationCategory: "DeveloperApplication", + operatingSystem: "Cross-platform", + offers: { + "@type": "Offer", + price: "0", + priceCurrency: "USD", + }, + description: + "Open-source platform for optimizing LLM applications with tracing, evaluation, prompt optimization, and cost management, compliance, and governance.", + url: "https://mlflow.org", + license: "https://www.apache.org/licenses/LICENSE-2.0", +}; + +const TRACING_CODE = `import mlflow +from openai import OpenAI + +# Enable automatic tracing for OpenAI +mlflow.openai.autolog() + +# Every LLM call is now traced with token counts, +# latency, prompts, and responses +client = OpenAI() +response = client.chat.completions.create( + model="gpt-4.1", + messages=[{"role": "user", "content": "Summarize MLflow"}], +) + +# Search traces to analyze cost and latency patterns +traces = mlflow.search_traces(experiment_ids=["1"]) +for trace in traces: + print(f"Tokens: {trace.info.total_tokens}") + print(f"Latency: {trace.info.execution_duration_ms}ms")`; + +const EVAL_CODE = `import mlflow +from mlflow.genai.scorers import ( + Correctness, + RelevanceToInput, + Safety, +) + +# Evaluate LLM outputs with built-in judges +results = mlflow.genai.evaluate( + data=mlflow.search_traces(experiment_ids=["1"]), + scorers=[ + Correctness(), # Are responses factually correct? + RelevanceToInput(), # Are responses relevant to the query? + Safety(), # Are responses free of harmful content? + ], +) + +# View results in the MLflow UI or programmatically +print(results.tables["eval_results"])`; + +export default function LLMOptimization() { + const [openFaqIndex, setOpenFaqIndex] = useState(0); + + return ( + <> + + {SEO_TITLE} + + + + + + + + + + + +

+
+ +
+

LLM Optimization

+ +

+ LLM optimization is the practice of systematically improving LLM + applications and agents across dimensions like quality, cost, and + latency. For single-turn LLM applications, optimization typically + focuses on prompt quality, model selection, and token efficiency. + Agent optimization goes further: because agents make multiple LLM + calls, invoke tools, and follow multi-step reasoning chains, they + introduce additional challenges around debugging complex execution + paths, reducing compounding latency, and controlling costs that + scale with each reasoning step. Both require specialized tooling + because LLM behavior is non-deterministic, costs scale with token + usage, and quality can only be measured with semantic evaluation + rather than unit tests. +

+ +

+ Effective LLM optimization starts with visibility.{" "} + Tracing captures token counts, + latency, and execution details for every LLM call, revealing where + costs and bottlenecks are.{" "} + Evaluation measures output + quality with LLM judges, providing a baseline to track whether + changes actually improve performance.{" "} + Prompt optimization{" "} + automates the process of improving prompts algorithmically, + replacing manual trial-and-error with systematic, data-driven + iteration. +

+ +

+ MLflow provides the complete open-source + toolkit for LLM optimization:{" "} + tracing for + cost and latency visibility,{" "} + + evaluation + {" "} + for quality measurement,{" "} + + prompt optimization + {" "} + for algorithmic prompt improvement, and an{" "} + AI Gateway for cost management, + compliance, and governance across LLM providers. +

+ +
+ +
+ +
+

+ Quick Navigation: +

+ +
+ +

+ Why LLM Optimization Matters +

+ +

+ LLM applications face unique optimization challenges that + traditional software profiling and testing can't address: +

+ +
+
+

Runaway Costs

+

+ Problem: Token costs scale with usage, and + multi-step agents can make dozens of LLM calls per request. + Without visibility, API bills grow unpredictably. +

+

+ Solution:{" "} + Tracing{" "} + captures token usage per span so you can find expensive + operations, and an AI Gateway{" "} + enforces rate limits and budget controls across providers. +

+
+ +
+

Quality & Reliability

+

+ Problem: LLM applications can produce + hallucinations, irrelevant responses, and degraded outputs that + undermine user trust, and traditional testing can't catch these + issues. +

+

+ Solution:{" "} + + Evaluation with LLM judges + {" "} + continuously assesses quality dimensions like correctness, + relevance, and safety across every response, catching + regressions before users report them. +

+
+ +
+

Slow Response Times

+

+ Problem: LLM calls add hundreds of milliseconds + to seconds of latency per request. Agent workflows with multiple + sequential calls compound this significantly. +

+

+ Solution:{" "} + + Per-span latency tracing + {" "} + identifies bottlenecks so you can parallelize calls, cache + repeated queries, or use faster models for non-critical steps. +

+
+ +
+

Inefficient Iteration

+

+ Problem: Manual prompt engineering is slow, + inconsistent, and plateaus quickly. Engineers can't + systematically identify which prompt changes improve quality. +

+

+ Solution:{" "} + + Automated prompt optimization + {" "} + uses algorithms to systematically improve prompts across + hundreds of examples, replacing guesswork with measured + improvement. +

+
+
+ +

LLM Optimization Techniques

+ +

+ LLM optimization spans several complementary strategies. The most + effective approach combines visibility (tracing), measurement + (evaluation), and systematic improvement (prompt optimization): +

+ +
    +
  • + + Cost Optimization with Tracing + + : Capture token counts per span to identify the most expensive + operations in your pipeline. Common wins include shortening + verbose prompts, eliminating redundant LLM calls in agent loops, + routing simple queries to smaller models, and caching repeated + requests (a caching sketch follows this list).
  • +
  • + + Quality Optimization with Evaluation + + : Use LLM judges to measure output quality across dimensions like + correctness, relevance, safety, and groundedness. Establish a + quality baseline, then measure the impact of each change. + Automated evaluation replaces subjective spot-checking with + systematic measurement. +
  • +
  • + + Prompt Optimization + + : Replace manual prompt engineering with algorithms that + systematically improve prompts across training data. Optimizers + like GEPA analyze failure patterns, generate improved variants, + and select the best performer automatically. +
  • +
  • + + Cost Governance with AI Gateway + + : Route LLM requests through a centralized gateway with rate + limiting, budget controls, fallback routing, and unified + credential management. Track usage and costs across all providers + from a single dashboard. +
  • +
  • + + Production Monitoring + + : Run LLM judges continuously against production traces to catch + quality regressions, cost spikes, and latency degradation before + users report them. +
  • +
+ +
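As a concrete example of the caching win mentioned under cost optimization, here is a minimal in-process cache around an LLM call. functools.lru_cache and temperature=0 are assumptions that make repeated queries safe to reuse, not a prescribed MLflow feature:

import functools

import mlflow
from openai import OpenAI

mlflow.openai.autolog()  # cache hits never reach the API, so they add no tokens
client = OpenAI()


@functools.lru_cache(maxsize=1024)
def cached_completion(prompt: str) -> str:
    # Only the first occurrence of a prompt reaches the API; repeats are
    # served from the in-process cache with zero tokens and near-zero latency.
    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # deterministic output makes reuse reasonable
    )
    return response.choices[0].message.content


cached_completion("Summarize MLflow")  # API call, traced
cached_completion("Summarize MLflow")  # cache hit, no API call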

Common Use Cases for LLM Optimization

+ +

+ LLM optimization applies across the full lifecycle of LLM + applications: +

+ +
    +
  • + Reducing agent costs: Multi-step agents can make + many LLM calls per request. Use{" "} + tracing to identify unnecessary + reasoning loops and redundant tool calls, then optimize the + agent's prompts and logic to reduce token usage. +
  • +
  • + Improving RAG quality: Retrieval-augmented + generation pipelines depend on both retrieval and generation + quality. Use evaluation with + groundedness and relevance judges to measure end-to-end quality, + then iterate on chunking, embedding, and generation prompts. +
  • +
  • + Optimizing prompts at scale: Instead of manually + tweaking prompts for each feature, use{" "} + prompt optimization to + algorithmically improve prompts across hundreds of examples with + full tracking and versioning. +
  • +
  • + Model selection and routing: Not all queries need + the most expensive model. Use tracing and evaluation to identify + which queries can be routed to cheaper, faster models without + sacrificing quality, and implement routing through the{" "} + AI Gateway. +
  • +
  • + Latency optimization: Use{" "} + per-span latency tracing to find + bottlenecks, then apply streaming, caching, parallelization, or + model downsizing to reduce response times. +
  • +
  • + Production quality monitoring: Run{" "} + + LLM judges continuously + {" "} + against production traces to detect quality regressions, + hallucination spikes, and cost anomalies before they impact users. +
  • +
+ +

How to Implement LLM Optimization

+ +

+ MLflow provides the complete open-source + toolkit for LLM optimization. Start with tracing to gain visibility, + add evaluation to measure quality, then apply targeted optimizations + based on what the data shows. +

+ +

+ + Step 1: Enable tracing for cost and latency visibility + +

+ +
+
+ python + +
+
+ + {({ style, tokens, getLineProps, getTokenProps }) => ( +
+                    {tokens.map((line, i) => (
+                      
+ {line.map((token, key) => ( + + ))} +
+ ))} +
+ )} +
+
+
+ +

+ Step 2: Evaluate quality with LLM judges +

+ +
+
+ python + +
+
+ + {({ style, tokens, getLineProps, getTokenProps }) => ( +
+                    {tokens.map((line, i) => (
+                      
+ {line.map((token, key) => ( + + ))} +
+ ))} +
+ )} +
+
+
+ +

+ With tracing and evaluation in place, you have the visibility needed + to apply targeted optimizations: shorten prompts to reduce costs, + run{" "} + + automated prompt optimization + {" "} + to improve quality, route through the{" "} + AI Gateway for cost management, + compliance, and governance, and monitor production quality with + continuous evaluation. +
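For the prompt optimization step, here is a condensed sketch of the same API shown on the agent optimization page: register a baseline prompt, then let an optimizer such as GEPA improve it against labeled examples. The prompt name, my_predict_fn, and train_data are placeholders you would supply:

import mlflow
from mlflow.genai.optimize import GepaPromptOptimizer
from mlflow.genai.scorers import Correctness

# Register a baseline prompt, then optimize it against labeled examples.
prompt = mlflow.genai.register_prompt(
    name="qa-system-prompt",
    template="Answer the question based on the context.\n\n"
    "Context: {{ context }}\nQuestion: {{ question }}\n\nAnswer:",
)

result = mlflow.genai.optimize_prompts(
    predict_fn=my_predict_fn,  # your application's prediction function
    train_data=train_data,     # labeled examples
    prompt_uris=[prompt.uri],
    optimizer=GepaPromptOptimizer(reflection_model="openai:/gpt-5.2"),
    scorers=[Correctness()],
)

print(mlflow.genai.load_prompt(result.optimized_prompts[0].uri).template)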

+ +
+ MLflow UI showing traced LLM calls with token counts, latency, and execution details for optimization +

+ MLflow captures traces with token counts, latency, and full + execution context for every LLM call +

+
+ +
+

+ + MLflow + {" "} + is the largest open-source{" "} + AI engineering platform, with over 30 million + monthly downloads. Thousands of organizations use MLflow to debug, + evaluate, monitor, and optimize production-quality AI agents and + LLM applications while controlling costs and managing access to + models and data. Backed by the Linux Foundation and licensed under + Apache 2.0, MLflow provides a complete LLM optimization toolkit + with no vendor lock-in.{" "} + Get started → +

+
+ +

Open Source vs. Proprietary LLM Optimization

+ +

+ When choosing tools for LLM optimization, the decision between open + source and proprietary platforms has significant implications for + cost, flexibility, and data ownership. +

+ +

+ + Open Source (MLflow): + {" "} + With MLflow, you maintain complete control over your optimization + data and infrastructure. Trace data, evaluation results, and prompt + versions stay on your own systems. There are no per-seat fees, no + usage limits, and no vendor lock-in. MLflow integrates with any LLM + provider and agent framework through OpenTelemetry-compatible + tracing. +

+ +

+ Proprietary SaaS Tools: Commercial optimization and + observability platforms offer convenience but at the cost of + flexibility and control. They typically charge per seat or per trace + volume, which becomes expensive at scale. Your trace data and + evaluation results are sent to their servers, raising privacy and + compliance concerns. You're locked into their ecosystem, making it + difficult to switch providers or customize workflows. +

+ +

+ Why Teams Choose Open Source: Organizations + optimizing LLM applications at scale choose MLflow because it + provides production-grade tracing, evaluation, prompt optimization, + and cost management, compliance, and governance without giving up + control of their data, cost predictability, or flexibility. The + Apache 2.0 license and Linux Foundation backing ensure MLflow + remains truly open and community-driven. +

+ +

Frequently Asked Questions

+ +
+ {faqs.map((faq, index) => ( +
+ + {openFaqIndex === index && ( +
{faq.answer}
+ )} +
+ ))} +
+ +

Related Resources

+ +
    +
  • + + LLM Tracing Documentation + +
  • +
  • + + LLM Evaluation Documentation + +
  • +
  • + + Prompt Optimization Documentation + +
  • +
  • + AI Observability Guide +
  • +
  • + LLM Tracing Guide +
  • +
  • + LLM Evaluation Guide +
  • +
  • + AI Gateway Guide +
  • +
  • + Prompt Optimization Guide +
  • +
  • + LLMOps Guide +
  • +
  • + MLflow for Agents and LLMs Overview +
  • +
+
+ + + +
+ + ); +}