diff --git a/website/src/pages/agent-optimization.tsx b/website/src/pages/agent-optimization.tsx
new file mode 100644
index 0000000000..8050a3c66b
--- /dev/null
+++ b/website/src/pages/agent-optimization.tsx
@@ -0,0 +1,1080 @@
+import { useState } from "react";
+import Head from "@docusaurus/Head";
+import Link from "@docusaurus/Link";
+import { Highlight } from "prism-react-renderer";
+import { Header } from "../components/Header/Header";
+import { SocialLinksFooter } from "../components/SocialLinksFooter/SocialLinksFooter";
+import { ArticleSidebar } from "../components/ArticleSidebar/ArticleSidebar";
+import { MLFLOW_GENAI_DOCS_URL } from "@site/src/constants";
+import { CopyButton } from "../components/CodeSnippet/CopyButton";
+import { customNightOwl, CODE_BG } from "../components/CodeSnippet/codeTheme";
+
+const SEO_TITLE =
+ "Agent Optimization: Debug, Evaluate & Improve AI Agents | MLflow";
+const SEO_DESCRIPTION =
+ "Learn how to optimize AI agents for quality, cost, and latency. Use MLflow's open-source tracing, evaluation, and prompt optimization to debug and improve agent performance.";
+
+const faqs: {
+ question: string;
+ answer: React.ReactNode;
+ answerText?: string;
+}[] = [
+ {
+ question: "What is agent optimization?",
+ answer:
+ "Agent optimization is the practice of improving AI agents across quality, cost, and latency. Unlike optimizing a single LLM call, agent optimization addresses multi-step reasoning chains, tool selection, execution paths, and compounding costs. It includes techniques like tracing agent execution, evaluating decision-making quality with LLM judges, optimizing system prompts with algorithms, and monitoring agent reliability in production.",
+ },
+ {
+ question: "How does prompt optimization help improve AI agents?",
+ answer: (
+ <>
+ Agent behavior is heavily influenced by system prompts, tool
+ descriptions, and few-shot examples.{" "}
+ Prompt optimization automates
+ the process of improving these prompts using data-driven algorithms
+ instead of manual trial-and-error. Optimizers like GEPA evaluate agent
+ prompts across training examples, analyze failure patterns, generate
+ improved variants, and repeat until quality converges. MLflow provides a
+ unified{" "}
+
+ prompt optimization API
+ {" "}
+ that tracks every version and metric automatically.
+ >
+ ),
+ answerText:
+ "Agent behavior is heavily influenced by system prompts, tool descriptions, and few-shot examples. Prompt optimization automates the process of improving these prompts using data-driven algorithms instead of manual trial-and-error. Optimizers like GEPA evaluate agent prompts across training examples, analyze failure patterns, generate improved variants, and repeat until quality converges. MLflow provides a unified prompt optimization API that tracks every version and metric automatically.",
+ },
+ {
+ question:
+ "How do I optimize an agent with RAG (Retrieval-Augmented Generation)?",
+ answer:
+ "Optimizing a RAG agent involves improving both the retrieval and generation stages within the agent's execution. Use MLflow Tracing to see exactly what documents the agent retrieves and how they influence its reasoning and responses. Use MLflow Evaluation with groundedness and relevance judges to measure retrieval quality and answer accuracy. Then iterate on the agent's retrieval strategy, chunking approach, embedding model, and generation prompts, measuring the impact of each change across the full agent pipeline.",
+ },
+ {
+ question: "What is the best tool for AI agent optimization?",
+ answer:
+ "The best tool for agent optimization depends on your needs. MLflow is the leading open-source option, providing the complete toolkit: tracing for full execution visibility across reasoning steps and tool calls, evaluation for quality measurement with LLM judges, prompt optimization for algorithmic improvement of system prompts, and an AI Gateway for cost management, compliance, and governance. Unlike proprietary tools, MLflow is 100% free, supports any LLM provider and agent framework (LangGraph, CrewAI, OpenAI Agents, and more), and is backed by the Linux Foundation with over 30 million monthly downloads.",
+ },
+ {
+ question: "How do I reduce AI agent costs with MLflow?",
+ answer: (
+ <>
+ Agents make multiple LLM calls per request, so costs compound quickly.{" "}
+ MLflow Tracing{" "}
+ captures token counts per span across every reasoning step and tool
+ call, revealing where tokens are being spent. From there, you can
+ eliminate redundant reasoning loops, use smaller models for simple
+ sub-tasks, cache repeated tool calls, shorten system prompts, or route
+ through an AI Gateway with rate limiting
+ and budget controls.
+ >
+ ),
+ answerText:
+ "Agents make multiple LLM calls per request, so costs compound quickly. MLflow Tracing captures token counts per span across every reasoning step and tool call, revealing where tokens are being spent. From there, you can eliminate redundant reasoning loops, use smaller models for simple sub-tasks, cache repeated tool calls, shorten system prompts, or route through an AI Gateway with rate limiting and budget controls.",
+ },
+ {
+ question: "How do I improve agent response quality with MLflow?",
+ answer: (
+ <>
+ Agent quality depends on correct reasoning, appropriate tool selection,
+ and accurate final responses.{" "}
+
+ MLflow Evaluation
+ {" "}
+ lets you score agent outputs with LLM judges across dimensions like
+ correctness, relevance, safety, and tool use quality. Once you have a
+ quality baseline, improve through{" "}
+ prompt optimization of system
+ prompts, better tool descriptions, retrieval improvements, or model
+ upgrades, and measure the impact of each change.
+ >
+ ),
+ answerText:
+ "Agent quality depends on correct reasoning, appropriate tool selection, and accurate final responses. MLflow Evaluation lets you score agent outputs with LLM judges across dimensions like correctness, relevance, safety, and tool use quality. Once you have a quality baseline, improve through prompt optimization of system prompts, better tool descriptions, retrieval improvements, or model upgrades, and measure the impact of each change.",
+ },
+ {
+ question: "How do I reduce agent latency with MLflow?",
+ answer: (
+ <>
+ Agents suffer from compounding latency because each reasoning step and
+ tool call adds time.{" "}
+ MLflow Tracing{" "}
+ captures per-span latency across the full execution graph, making it
+ easy to find bottlenecks. Common optimizations include parallelizing
+ independent tool calls, caching repeated operations, using faster models
+ for lower-complexity reasoning steps, reducing the number of reasoning
+ loops, and streaming intermediate results.
+ >
+ ),
+ answerText:
+ "Agents suffer from compounding latency because each reasoning step and tool call adds time. MLflow Tracing captures per-span latency across the full execution graph, making it easy to find bottlenecks. Common optimizations include parallelizing independent tool calls, caching repeated operations, using faster models for lower-complexity reasoning steps, reducing the number of reasoning loops, and streaming intermediate results.",
+ },
+ {
+ question: "How do I debug a multi-step AI agent with MLflow?",
+ answer: (
+ <>
+ Multi-step agents are difficult to debug because failures can occur at
+ any reasoning step, tool call, or decision point.{" "}
+ MLflow Tracing{" "}
+ captures the full execution graph, including every LLM invocation, tool
+ call, input, output, and intermediate reasoning step. This lets you
+ pinpoint exactly where an agent went wrong: a bad tool selection, a
+ hallucinated reasoning step, an unnecessary loop, or an incorrect final
+ synthesis.{" "}
+
+ MLflow Evaluation
+ {" "}
+ can then assess agent decision-making quality across many examples.
+ >
+ ),
+ answerText:
+ "Multi-step agents are difficult to debug because failures can occur at any reasoning step, tool call, or decision point. MLflow Tracing captures the full execution graph, including every LLM invocation, tool call, input, output, and intermediate reasoning step. This lets you pinpoint exactly where an agent went wrong: a bad tool selection, a hallucinated reasoning step, an unnecessary loop, or an incorrect final synthesis. MLflow Evaluation can then assess agent decision-making quality across many examples.",
+ },
+ {
+ question: "How do I measure AI agent performance with MLflow?",
+ answer: (
+ <>
+ Agent performance is measured across multiple dimensions: quality (using{" "}
+
+ LLM judge scorers
+ {" "}
+ for correctness, relevance, tool use quality, and safety), cost (total
+ token usage across all reasoning steps), and latency (end-to-end
+ response time including tool calls).{" "}
+ MLflow Tracing{" "}
+ captures cost and latency automatically across the full agent execution
+ graph, while{" "}
+
+ MLflow Evaluation
+ {" "}
+ provides automated quality scoring with 70+ built-in judges.
+ >
+ ),
+ answerText:
+ "Agent performance is measured across multiple dimensions: quality (using LLM judge scorers for correctness, relevance, tool use quality, and safety), cost (total token usage across all reasoning steps), and latency (end-to-end response time including tool calls). MLflow Tracing captures cost and latency automatically across the full agent execution graph, while MLflow Evaluation provides automated quality scoring with 70+ built-in judges.",
+ },
+ {
+ question: "Is MLflow free for agent optimization?",
+ answer:
+ "Yes. MLflow is 100% open source under the Apache 2.0 license, backed by the Linux Foundation. You can use all agent optimization features (tracing, evaluation, prompt optimization, AI Gateway) for free, including in commercial applications. There are no per-seat fees, no usage limits, and no vendor lock-in.",
+ },
+ {
+ question: "How do I get started with agent optimization using MLflow?",
+ answer: (
+ <>
+ Start by enabling{" "}
+
+ MLflow Tracing
+ {" "}
+ with a single line of code to capture the full execution graph for every
+ agent invocation: reasoning steps, tool calls, token usage, and latency.
+ This gives you a baseline. Then use{" "}
+
+ MLflow Evaluation
+ {" "}
+ to measure agent output quality with LLM judges. Once you can see and
+ measure performance, apply targeted optimizations: improve system
+ prompts with{" "}
+ automated prompt optimization,
+ reduce unnecessary tool calls, route sub-tasks to faster models, or
+ cache repeated operations.
+ >
+ ),
+ answerText:
+ "Start by enabling MLflow Tracing with a single line of code to capture the full execution graph for every agent invocation: reasoning steps, tool calls, token usage, and latency. This gives you a baseline. Then use MLflow Evaluation to measure agent output quality with LLM judges. Once you can see and measure performance, apply targeted optimizations: improve system prompts with automated prompt optimization, reduce unnecessary tool calls, route sub-tasks to faster models, or cache repeated operations.",
+ },
+];
+
+// Schema.org FAQPage structured data built from `faqs`, for SEO rich results.
+const faqJsonLd = {
+  "@context": "https://schema.org",
+  "@type": "FAQPage",
+  mainEntity: faqs.map((faq) => ({
+    "@type": "Question",
+    name: faq.question,
+    acceptedAnswer: {
+      "@type": "Answer",
+      // JSON-LD requires plain text here. `faq.answer` may be a ReactNode,
+      // which would serialize as "[object Object]" in the emitted JSON; use
+      // the explicit answerText and only fall back to `answer` when it is
+      // already a string.
+      text:
+        faq.answerText ??
+        (typeof faq.answer === "string" ? faq.answer : ""),
+    },
+  })),
+};
+
+// Schema.org SoftwareApplication structured data describing MLflow as a
+// free (price "0"), cross-platform, Apache-2.0-licensed developer tool.
+// Presumably rendered as a JSON-LD <script> in the page <Head> — the page
+// body is not fully visible in this diff; confirm at the render site.
+const softwareJsonLd = {
+  "@context": "https://schema.org",
+  "@type": "SoftwareApplication",
+  name: "MLflow",
+  applicationCategory: "DeveloperApplication",
+  operatingSystem: "Cross-platform",
+  offers: {
+    "@type": "Offer",
+    price: "0",
+    priceCurrency: "USD",
+  },
+  description:
+    "Open-source platform for optimizing AI agents with tracing, evaluation, prompt optimization, and cost management, compliance, and governance.",
+  url: "https://mlflow.org",
+  license: "https://www.apache.org/licenses/LICENSE-2.0",
+};
+
+// Python snippet displayed on the page (via prism-react-renderer): registers
+// a base prompt, runs mlflow.genai.optimize_prompts with the GEPA optimizer
+// scored by Correctness, then loads the optimized prompt. The doubled
+// backslashes (\\n) keep literal "\n" escapes inside the rendered Python
+// string rather than real newlines in this template literal.
+const OPTIMIZE_CODE = `import mlflow
+from mlflow.genai.optimize import GepaPromptOptimizer
+from mlflow.genai.scorers import Correctness
+
+base_prompt = mlflow.genai.register_prompt(
+    name="agent-system-prompt",
+    template="Answer the question based on the context.\\n\\n"
+    "Context: {{ context }}\\n"
+    "Question: {{ question }}\\n\\nAnswer:",
+)
+
+result = mlflow.genai.optimize_prompts(
+    predict_fn=my_predict_fn,
+    train_data=train_data,  # labeled examples
+    prompt_uris=[base_prompt.uri],
+    optimizer=GepaPromptOptimizer(
+        reflection_model="openai:/gpt-5.2",
+    ),
+    scorers=[Correctness()],
+)
+
+optimized = mlflow.genai.load_prompt(result.optimized_prompts[0].uri)
+print(optimized.template)`;
+
+export default function AgentOptimization() {
+ const [openFaqIndex, setOpenFaqIndex] = useState
+ Agent optimization is the practice of improving AI agents across
+ quality, cost, and latency. Unlike optimizing a single LLM call,
+ agents introduce unique challenges: they make multiple LLM calls per
+ request, invoke tools, follow multi-step reasoning chains, and
+ produce compounding costs and latency at every step. Debugging why
+ an agent chose the wrong tool, entered a redundant reasoning loop,
+ or hallucinated an intermediate step requires specialized tooling
+ that captures the full execution graph and evaluates decision-making
+ quality at every level.
+
+ Effective agent optimization starts with visibility.{" "}
+ Tracing captures the full execution
+ graph for every agent invocation: reasoning steps, tool calls, token
+ counts, and latency per span, revealing exactly where costs,
+ bottlenecks, and failures occur.{" "}
+ Evaluation measures agent output
+ quality with LLM judges, providing a baseline to track whether
+ changes actually improve performance.{" "}
+ Prompt optimization{" "}
+ automates the process of improving agent system prompts with
+ algorithms, replacing manual trial-and-error with data-driven
+ iteration.
+
+ MLflow provides the complete open-source
+ toolkit for agent optimization:{" "}
+ tracing for
+ full execution visibility,{" "}
+
+ evaluation
+ {" "}
+ for quality measurement,{" "}
+
+ prompt optimization
+ {" "}
+ for algorithmic prompt improvement, and an{" "}
+ AI Gateway for cost management,
+ compliance, and governance across LLM providers.
+
+ Quick Navigation:
+
+ AI agents face unique optimization challenges that traditional
+ software profiling and testing can't address. Because agents combine
+ multi-step reasoning, tool use, and LLM calls into complex execution
+ paths, standard debugging and monitoring tools fall short:
+
+ Problem: Agents make multiple LLM calls per
+ request across reasoning steps and tool invocations. Token costs
+ compound at every step, and without visibility, API bills grow
+ unpredictably.
+
+ Solution:{" "}
+ Tracing{" "}
+ captures token usage per span across the full agent execution
+ graph, and an AI Gateway{" "}
+ enforces rate limits and budget controls across providers.
+
+ Problem: Agents can select the wrong tools,
+ enter redundant reasoning loops, hallucinate intermediate steps,
+ or produce incorrect final responses. Traditional testing can't
+ catch these failures.
+
+ Solution:{" "}
+
+ Evaluation with LLM judges
+ {" "}
+ continuously assesses agent quality across correctness,
+ relevance, tool use, and safety, catching regressions before
+ users report them.
+
+ Problem: Each reasoning step and tool call adds
+ latency. Agent workflows with sequential LLM calls and external
+ API requests compound delays, leading to slow response times
+ that frustrate users.
+
+ Solution:{" "}
+
+ Per-span latency tracing
+ {" "}
+ across the full execution graph identifies bottlenecks so you
+ can parallelize calls, cache repeated operations, or use faster
+ models for non-critical steps.
+
+ Problem: Agents have many interacting surfaces
+ to optimize: system prompts, tool descriptions, model selection
+ per step, few-shot examples, and retrieval parameters. Manual
+ tuning doesn't scale.
+
+ Solution:{" "}
+
+ Automated prompt optimization
+ {" "}
+ uses algorithms to improve agent prompts across hundreds of
+ examples, replacing manual trial-and-error with data-driven
+ iteration.
+
+ Each of the challenges above has a corresponding optimization
+ approach. Here's what actually works:
+
+ These techniques apply to any agent architecture. Here are the most
+ common scenarios where teams see the biggest improvements:
+
+ MLflow provides the complete open-source
+ toolkit for agent optimization. Start with tracing to gain full
+ execution visibility, add evaluation to measure agent quality, then
+ apply targeted optimizations based on what the data shows.
+
+ Optimize agent prompts automatically
+
+ Evaluate agent quality with LLM judges
+
+
+ MLflow
+ {" "}
+ is the largest open-source{" "}
+ AI engineering platform, with over 30 million
+ monthly downloads. Thousands of organizations use MLflow to debug,
+ evaluate, monitor, and optimize production-quality AI agents while
+ controlling costs and managing access to models and data. Backed
+ by the Linux Foundation and licensed under Apache 2.0, MLflow
+ provides a complete agent optimization toolkit with no vendor
+ lock-in.{" "}
+ Get started →
+
+ When choosing tools for agent optimization, the decision between
+ open source and proprietary platforms has significant implications
+ for cost, flexibility, and data ownership.
+
+
+ Open Source (MLflow):
+ {" "}
+ With MLflow, you maintain complete control over your agent
+ optimization data and infrastructure. Trace data, evaluation
+ results, and prompt versions stay on your own systems. There are no
+ per-seat fees, no usage limits, and no vendor lock-in. MLflow
+ integrates with any LLM provider and agent framework (LangGraph,
+ CrewAI, OpenAI Agents, and more) through OpenTelemetry-compatible
+ tracing.
+
+ Proprietary SaaS Tools: Commercial agent
+ optimization and observability platforms offer convenience but at
+ the cost of flexibility and control. They typically charge per seat
+ or per trace volume, which becomes expensive at scale with agents
+ that generate many traces per request. Your trace data and
+ evaluation results are sent to their servers, raising privacy and
+ compliance concerns. You're locked into their ecosystem, making it
+ difficult to switch providers or customize workflows.
+
+ Why Teams Choose Open Source: Organizations
+ building AI agents at scale choose MLflow because it provides
+ production-grade tracing, evaluation, prompt optimization, and cost
+ management, compliance, and governance without giving up control of
+ their data, cost predictability, or flexibility. The Apache 2.0
+ license and Linux Foundation backing ensure MLflow remains truly
+ open and community-driven.
+ Agent Optimization
+
+
+
+
+ Why Agent Optimization Matters
+
+
+ Compounding Costs
+ Unreliable Decision-Making
+ Cascading Latency
+ Too Many Knobs to Tune
+ Agent Optimization Techniques
+
+
+
+
+ Common Use Cases for Agent Optimization
+
+
+
+
+ How to Implement Agent Optimization
+
+
+ {tokens.map((line, i) => (
+
+ )}
+ Open Source vs. Proprietary Agent Optimization
+
+ Frequently Asked Questions
+
+ Related Resources
+
+
+
+
+          Debug, evaluate, and optimize AI agents for quality, cost, and
+          latency with tracing, evaluation, and prompt optimization.
+
diff --git a/website/src/pages/llm-optimization.tsx b/website/src/pages/llm-optimization.tsx
new file mode 100644
index 0000000000..553b6db60a
--- /dev/null
+++ b/website/src/pages/llm-optimization.tsx
@@ -0,0 +1,1120 @@
+import { useState } from "react";
+import Head from "@docusaurus/Head";
+import Link from "@docusaurus/Link";
+import { Highlight } from "prism-react-renderer";
+import { Header } from "../components/Header/Header";
+import { SocialLinksFooter } from "../components/SocialLinksFooter/SocialLinksFooter";
+import { ArticleSidebar } from "../components/ArticleSidebar/ArticleSidebar";
+import { MLFLOW_GENAI_DOCS_URL } from "@site/src/constants";
+import ObservabilityHero from "@site/static/img/GenAI_observability/GenAI_observability_hero.png";
+import { CopyButton } from "../components/CodeSnippet/CopyButton";
+import { customNightOwl, CODE_BG } from "../components/CodeSnippet/codeTheme";
+
+const SEO_TITLE =
+ "LLM Optimization: Reduce Costs & Improve Quality | MLflow AI Platform";
+const SEO_DESCRIPTION =
+ "Learn how to optimize LLM applications for cost, latency, and quality. Use MLflow's open-source tracing, evaluation, and prompt optimization to systematically improve LLM performance.";
+
+const faqs: {
+ question: string;
+ answer: React.ReactNode;
+ answerText?: string;
+}[] = [
+ {
+ question: "What is LLM optimization?",
+ answer:
+ "LLM optimization is the practice of systematically improving LLM applications and agents across dimensions like quality, cost, and latency. It includes techniques like prompt optimization, model selection, token usage reduction, caching, evaluation-driven iteration, and production monitoring to ensure LLM applications and agents perform well and cost-effectively at scale.",
+ },
+ {
+ question: "How do I reduce LLM API costs with MLflow?",
+ answer: (
+ <>
+ The most effective way to reduce LLM API costs is to gain visibility
+ into where tokens are being spent.{" "}
+ MLflow Tracing{" "}
+ captures token counts per span, so you can identify expensive
+ operations, redundant LLM calls, and oversized prompts. From there, you
+ can apply targeted optimizations: shorten prompts, cache repeated
+ queries, use smaller models for simpler tasks, or route through an{" "}
+ AI Gateway with rate limiting and
+ fallback routing.
+ >
+ ),
+ answerText:
+ "The most effective way to reduce LLM API costs is to gain visibility into where tokens are being spent. MLflow Tracing captures token counts per span, so you can identify expensive operations, redundant LLM calls, and oversized prompts. From there, you can apply targeted optimizations: shorten prompts, cache repeated queries, use smaller models for simpler tasks, or route through an AI Gateway with rate limiting and fallback routing.",
+ },
+ {
+ question: "How do I improve LLM response quality with MLflow?",
+ answer: (
+ <>
+ Improving LLM response quality requires measurement and iteration.{" "}
+
+ MLflow Evaluation
+ {" "}
+ lets you score outputs with LLM judges across dimensions like
+ correctness, relevance, safety, and groundedness. Once you have a
+ quality baseline, you can improve quality through{" "}
+ prompt optimization, better
+ retrieval pipelines for RAG, or model upgrades, and measure the impact
+ of each change.
+ >
+ ),
+ answerText:
+ "Improving LLM response quality requires measurement and iteration. MLflow Evaluation lets you score outputs with LLM judges across dimensions like correctness, relevance, safety, and groundedness. Once you have a quality baseline, you can improve quality through prompt optimization, better retrieval pipelines for RAG, or model upgrades, and measure the impact of each change.",
+ },
+ {
+ question: "How do I reduce LLM latency with MLflow?",
+ answer: (
+ <>
+ MLflow Tracing{" "}
+ captures latency per span in your LLM pipeline, making it easy to find
+ bottlenecks. Common latency optimizations include using streaming
+ responses, caching frequent queries, parallelizing independent LLM
+ calls, using smaller or faster models for non-critical steps, and
+ reducing prompt length to decrease time-to-first-token.
+ >
+ ),
+ answerText:
+ "MLflow Tracing captures latency per span in your LLM pipeline, making it easy to find bottlenecks. Common latency optimizations include using streaming responses, caching frequent queries, parallelizing independent LLM calls, using smaller or faster models for non-critical steps, and reducing prompt length to decrease time-to-first-token.",
+ },
+ {
+ question: "What is prompt optimization in MLflow and how does it work?",
+ answer: (
+ <>
+ Prompt optimization automates
+ the process of improving prompts using data-driven algorithms instead of
+ manual trial-and-error. Optimizers like GEPA evaluate prompts across
+ training examples, analyze failure patterns, generate improved variants,
+ and repeat until quality converges. MLflow provides a unified{" "}
+
+ prompt optimization API
+ {" "}
+ that tracks every version and metric automatically.
+ >
+ ),
+ answerText:
+ "Prompt optimization automates the process of improving prompts using data-driven algorithms instead of manual trial-and-error. Optimizers like GEPA evaluate prompts across training examples, analyze failure patterns, generate improved variants, and repeat until quality converges. MLflow provides a unified prompt optimization API that tracks every version and metric automatically.",
+ },
+ {
+ question:
+ "How do I optimize a RAG (Retrieval-Augmented Generation) pipeline?",
+ answer:
+ "Optimizing a RAG pipeline involves improving both the retrieval and generation stages. Use MLflow Tracing to see exactly what documents are retrieved and how they affect the LLM's response. Use MLflow Evaluation with groundedness and relevance judges to measure retrieval quality. Then iterate on your chunking strategy, embedding model, retrieval parameters, and generation prompts, measuring the impact of each change.",
+ },
+ {
+ question: "How do I optimize an AI agent with MLflow?",
+ answer: (
+ <>
+ Agent optimization requires visibility into every reasoning step, tool
+ call, and LLM invocation.{" "}
+ MLflow Tracing{" "}
+ captures the full execution graph so you can identify unnecessary tool
+ calls, redundant reasoning loops, and expensive LLM invocations.{" "}
+
+ MLflow Evaluation
+ {" "}
+ lets you assess agent decision-making quality with LLM judges, and{" "}
+ prompt optimization can improve
+ the agent's system prompts algorithmically.
+ >
+ ),
+ answerText:
+ "Agent optimization requires visibility into every reasoning step, tool call, and LLM invocation. MLflow Tracing captures the full execution graph so you can identify unnecessary tool calls, redundant reasoning loops, and expensive LLM invocations. MLflow Evaluation lets you assess agent decision-making quality with LLM judges, and prompt optimization can improve the agent's system prompts algorithmically.",
+ },
+ {
+ question: "What is the best tool for LLM optimization?",
+ answer:
+ "The best tool for LLM optimization depends on your needs. MLflow is the leading open-source option, providing the complete toolkit: tracing for cost and latency visibility, evaluation for quality measurement, prompt optimization for algorithmic improvement, and an AI Gateway for cost management, compliance, and governance. Unlike proprietary tools, MLflow is 100% free, supports any LLM provider and agent framework, and is backed by the Linux Foundation with over 30 million monthly downloads.",
+ },
+ {
+ question: "How do I measure LLM performance with MLflow?",
+ answer: (
+ <>
+ LLM performance is measured across multiple dimensions: quality (using{" "}
+
+ LLM judge scorers
+ {" "}
+ for correctness, relevance, safety, etc.), cost (token usage per
+ request), and latency (response time per span).{" "}
+ MLflow Tracing{" "}
+ captures cost and latency automatically, while{" "}
+
+ MLflow Evaluation
+ {" "}
+ provides automated quality scoring with 70+ built-in judges.
+ >
+ ),
+ answerText:
+ "LLM performance is measured across multiple dimensions: quality (using LLM judge scorers for correctness, relevance, safety, etc.), cost (token usage per request), and latency (response time per span). MLflow Tracing captures cost and latency automatically, while MLflow Evaluation provides automated quality scoring with 70+ built-in judges.",
+ },
+ {
+ question: "Is MLflow free for LLM optimization?",
+ answer:
+ "Yes. MLflow is 100% open source under the Apache 2.0 license, backed by the Linux Foundation. You can use all optimization features (tracing, evaluation, prompt optimization, AI Gateway) for free, including in commercial applications. There are no per-seat fees, no usage limits, and no vendor lock-in.",
+ },
+ {
+ question: "How do I get started with LLM optimization using MLflow?",
+ answer: (
+ <>
+ Start by enabling{" "}
+
+ MLflow Tracing
+ {" "}
+ with a single line of code to capture token usage, latency, and
+ execution details for every LLM call. This gives you a baseline. Then
+ use{" "}
+
+ MLflow Evaluation
+ {" "}
+ to measure output quality. Once you can see and measure performance,
+ apply targeted optimizations: shorten prompts, optimize retrieval,
+ adjust model selection, or run{" "}
+ automated prompt optimization.
+ >
+ ),
+ answerText:
+ "Start by enabling MLflow Tracing with a single line of code to capture token usage, latency, and execution details for every LLM call. This gives you a baseline. Then use MLflow Evaluation to measure output quality. Once you can see and measure performance, apply targeted optimizations: shorten prompts, optimize retrieval, adjust model selection, or run automated prompt optimization.",
+ },
+];
+
+// Schema.org FAQPage structured data built from `faqs`, for SEO rich results.
+const faqJsonLd = {
+  "@context": "https://schema.org",
+  "@type": "FAQPage",
+  mainEntity: faqs.map((faq) => ({
+    "@type": "Question",
+    name: faq.question,
+    acceptedAnswer: {
+      "@type": "Answer",
+      // JSON-LD requires plain text here. `faq.answer` may be a ReactNode,
+      // which would serialize as "[object Object]" in the emitted JSON; use
+      // the explicit answerText and only fall back to `answer` when it is
+      // already a string.
+      text:
+        faq.answerText ??
+        (typeof faq.answer === "string" ? faq.answer : ""),
+    },
+  })),
+};
+
+// Schema.org SoftwareApplication structured data describing MLflow as a
+// free (price "0"), cross-platform, Apache-2.0-licensed developer tool.
+// Presumably rendered as a JSON-LD <script> in the page <Head> — the page
+// body is not fully visible in this diff; confirm at the render site.
+const softwareJsonLd = {
+  "@context": "https://schema.org",
+  "@type": "SoftwareApplication",
+  name: "MLflow",
+  applicationCategory: "DeveloperApplication",
+  operatingSystem: "Cross-platform",
+  offers: {
+    "@type": "Offer",
+    price: "0",
+    priceCurrency: "USD",
+  },
+  description:
+    "Open-source platform for optimizing LLM applications with tracing, evaluation, prompt optimization, and cost management, compliance, and governance.",
+  url: "https://mlflow.org",
+  license: "https://www.apache.org/licenses/LICENSE-2.0",
+};
+
+// Python snippet displayed on the page (via prism-react-renderer): enables
+// mlflow.openai.autolog() so every OpenAI call is traced, then searches the
+// resulting traces and prints per-trace token counts and latency.
+const TRACING_CODE = `import mlflow
+from openai import OpenAI
+
+# Enable automatic tracing for OpenAI
+mlflow.openai.autolog()
+
+# Every LLM call is now traced with token counts,
+# latency, prompts, and responses
+client = OpenAI()
+response = client.chat.completions.create(
+    model="gpt-4.1",
+    messages=[{"role": "user", "content": "Summarize MLflow"}],
+)
+
+# Search traces to analyze cost and latency patterns
+traces = mlflow.search_traces(experiment_ids=["1"])
+for trace in traces:
+    print(f"Tokens: {trace.info.total_tokens}")
+    print(f"Latency: {trace.info.execution_duration_ms}ms")`;
+
+// Python snippet displayed on the page (via prism-react-renderer): runs
+// mlflow.genai.evaluate over searched traces with three built-in LLM-judge
+// scorers and prints the evaluation results table.
+const EVAL_CODE = `import mlflow
+from mlflow.genai.scorers import (
+    Correctness,
+    RelevanceToInput,
+    Safety,
+)
+
+# Evaluate LLM outputs with built-in judges
+results = mlflow.genai.evaluate(
+    data=mlflow.search_traces(experiment_ids=["1"]),
+    scorers=[
+        Correctness(),  # Are responses factually correct?
+        RelevanceToInput(),  # Are responses relevant to the query?
+        Safety(),  # Are responses free of harmful content?
+    ],
+)
+
+# View results in the MLflow UI or programmatically
+print(results.tables["eval_results"])`;
+
+export default function LLMOptimization() {
+ const [openFaqIndex, setOpenFaqIndex] = useState
+ LLM optimization is the practice of systematically improving LLM
+ applications and agents across dimensions like quality, cost, and
+ latency. For single-turn LLM applications, optimization typically
+ focuses on prompt quality, model selection, and token efficiency.
+ Agent optimization goes further: because agents make multiple LLM
+ calls, invoke tools, and follow multi-step reasoning chains, they
+ introduce additional challenges around debugging complex execution
+ paths, reducing compounding latency, and controlling costs that
+ scale with each reasoning step. Both require specialized tooling
+ because LLM behavior is non-deterministic, costs scale with token
+ usage, and quality can only be measured with semantic evaluation
+ rather than unit tests.
+
+ Effective LLM optimization starts with visibility.{" "}
+ Tracing captures token counts,
+ latency, and execution details for every LLM call, revealing where
+ costs and bottlenecks are.{" "}
+ Evaluation measures output
+ quality with LLM judges, providing a baseline to track whether
+ changes actually improve performance.{" "}
+ Prompt optimization{" "}
+ automates the process of improving prompts algorithmically,
+ replacing manual trial-and-error with systematic, data-driven
+ iteration.
+
+ MLflow provides the complete open-source
+ toolkit for LLM optimization:{" "}
+ tracing for
+ cost and latency visibility,{" "}
+
+ evaluation
+ {" "}
+ for quality measurement,{" "}
+
+ prompt optimization
+ {" "}
+ for algorithmic prompt improvement, and an{" "}
+ AI Gateway for cost management,
+ compliance, and governance across LLM providers.
+
+ Quick Navigation:
+
+ LLM applications face unique optimization challenges that
+ traditional software profiling and testing can't address:
+
+ Problem: Token costs scale with usage, and
+ multi-step agents can make dozens of LLM calls per request.
+ Without visibility, API bills grow unpredictably.
+
+ Solution:{" "}
+ Tracing{" "}
+ captures token usage per span so you can find expensive
+ operations, and an AI Gateway{" "}
+ enforces rate limits and budget controls across providers.
+
+ Problem: LLM applications can produce
+ hallucinations, irrelevant responses, and degraded outputs that
+ undermine user trust, and traditional testing can't catch these
+ issues.
+
+ Solution:{" "}
+
+ Evaluation with LLM judges
+ {" "}
+ continuously assesses quality dimensions like correctness,
+ relevance, and safety across every response, catching
+ regressions before users report them.
+
+ Problem: LLM calls add hundreds of milliseconds
+ to seconds of latency per request. Agent workflows with multiple
+ sequential calls compound this significantly.
+
+ Solution:{" "}
+
+ Per-span latency tracing
+ {" "}
+ identifies bottlenecks so you can parallelize calls, cache
+ repeated queries, or use faster models for non-critical steps.
+
+ Problem: Manual prompt engineering is slow,
+ inconsistent, and plateaus quickly. Engineers can't
+ systematically identify which prompt changes improve quality.
+
+ Solution:{" "}
+
+ Automated prompt optimization
+ {" "}
+ uses algorithms to systematically improve prompts across
+ hundreds of examples, replacing guesswork with measured
+ improvement.
+
+ LLM optimization spans several complementary strategies. The most
+ effective approach combines visibility (tracing), measurement
+ (evaluation), and systematic improvement (prompt optimization):
+
+ LLM optimization applies across the full lifecycle of LLM
+ applications:
+
+ MLflow provides the complete open-source
+ toolkit for LLM optimization. Start with tracing to gain visibility,
+ add evaluation to measure quality, then apply targeted optimizations
+ based on what the data shows.
+
+
+ Step 1: Enable tracing for cost and latency visibility
+
+
+ Step 2: Evaluate quality with LLM judges
+
+ With tracing and evaluation in place, you have the visibility needed
+ to apply targeted optimizations: shorten prompts to reduce costs,
+ run{" "}
+
+ automated prompt optimization
+ {" "}
+ to improve quality, route through the{" "}
+ AI Gateway for cost management,
+ compliance, and governance, and monitor production quality with
+ continuous evaluation.
+
+ MLflow captures traces with token counts, latency, and full
+ execution context for every LLM call
+
+
+ MLflow
+ {" "}
+ is the largest open-source{" "}
+ AI engineering platform, with over 30 million
+ monthly downloads. Thousands of organizations use MLflow to debug,
+ evaluate, monitor, and optimize production-quality AI agents and
+ LLM applications while controlling costs and managing access to
+ models and data. Backed by the Linux Foundation and licensed under
+ Apache 2.0, MLflow provides a complete LLM optimization toolkit
+ with no vendor lock-in.{" "}
+ Get started →
+
+ When choosing tools for LLM optimization, the decision between open
+ source and proprietary platforms has significant implications for
+ cost, flexibility, and data ownership.
+
+
+ Open Source (MLflow):
+ {" "}
+ With MLflow, you maintain complete control over your optimization
+ data and infrastructure. Trace data, evaluation results, and prompt
+ versions stay on your own systems. There are no per-seat fees, no
+ usage limits, and no vendor lock-in. MLflow integrates with any LLM
+ provider and agent framework through OpenTelemetry-compatible
+ tracing.
+
+ Proprietary SaaS Tools: Commercial optimization and
+ observability platforms offer convenience but at the cost of
+ flexibility and control. They typically charge per seat or per trace
+ volume, which becomes expensive at scale. Your trace data and
+ evaluation results are sent to their servers, raising privacy and
+ compliance concerns. You're locked into their ecosystem, making it
+ difficult to switch providers or customize workflows.
+
+ Why Teams Choose Open Source: Organizations
+ optimizing LLM applications at scale choose MLflow because it
+ provides production-grade tracing, evaluation, prompt optimization,
+ and cost management, compliance, and governance without giving up
+ control of their data, cost predictability, or flexibility. The
+ Apache 2.0 license and Linux Foundation backing ensure MLflow
+ remains truly open and community-driven.
+ LLM Optimization
+
+
+
+
+ Why LLM Optimization Matters
+
+
+ Runaway Costs
+ Quality & Reliability
+ Slow Response Times
+ Inefficient Iteration
+ LLM Optimization Techniques
+
+
+
+
+ Common Use Cases for LLM Optimization
+
+
+
+
+ How to Implement LLM Optimization
+
+
+ {tokens.map((line, i) => (
+
+ )}
+
+ {tokens.map((line, i) => (
+
+ )}
+
+
+ Open Source vs. Proprietary LLM Optimization
+
+ Frequently Asked Questions
+
+ Related Resources
+
+
+
+