diff --git a/website/src/pages/ai-observability.tsx b/website/src/pages/ai-observability.tsx index 36d11682a2..c563440e35 100644 --- a/website/src/pages/ai-observability.tsx +++ b/website/src/pages/ai-observability.tsx @@ -1193,6 +1193,12 @@ const result = await generateText({
  • Agent Evaluation FAQ
  • +
  • + LLMOps Guide +
  • +
  • + AIOps Guide +
  • MLflow for Agents and LLMs Overview
  • diff --git a/website/src/pages/aiops.tsx b/website/src/pages/aiops.tsx new file mode 100644 index 0000000000..d52defb57a --- /dev/null +++ b/website/src/pages/aiops.tsx @@ -0,0 +1,1080 @@ +import { useState } from "react"; +import Head from "@docusaurus/Head"; +import Link from "@docusaurus/Link"; +import { Highlight } from "prism-react-renderer"; +import { Header } from "../components/Header/Header"; +import { SocialLinksFooter } from "../components/SocialLinksFooter/SocialLinksFooter"; +import { ArticleSidebar } from "../components/ArticleSidebar/ArticleSidebar"; +import { MLFLOW_GENAI_DOCS_URL, MLFLOW_DOCS_URL } from "@site/src/constants"; +import ObservabilityHero from "@site/static/img/GenAI_observability/GenAI_observability_hero.png"; +import { CopyButton } from "../components/CodeSnippet/CopyButton"; +import { customNightOwl, CODE_BG } from "../components/CodeSnippet/codeTheme"; + +const SEO_TITLE = "What is AIOps? AI Operations Guide | MLflow AI Platform"; +const SEO_DESCRIPTION = + "Learn AIOps with MLflow, the largest open source AI engineering platform. Trace, evaluate, and monitor LLM agents, RAG systems, and ML models in production."; + +const faqs: { + question: string; + answer: React.ReactNode; + answerText?: string; +}[] = [ + { + question: "What is AIOps?", + answer: + "AIOps (AI Operations) refers to the practices, tools, and workflows for running AI applications in production across their full lifecycle, from development and evaluation through deployment and monitoring. This includes LLM applications, agents, RAG systems, and traditional machine learning models.", + }, + { + question: + "How is modern AIOps different from traditional AIOps (AI for IT)?", + answer: + "Traditionally, 'AIOps' meant using AI to improve IT operations (log analysis, anomaly detection, incident management). Modern AIOps has evolved to also mean operations for AI: the practices and platforms needed to build, deploy, and maintain AI applications in production. 
MLflow focuses on this modern definition: running AI applications at scale.", + }, + { + question: "How is AIOps different from MLOps and LLMOps?", + answer: ( + <> + AIOps is the broadest term, encompassing operations for all AI + applications. MLOps focuses specifically on traditional machine learning + (training, versioning, deploying models).{" "} + LLMOps focuses on LLM-specific challenges + (prompt management, non-deterministic evaluation, token costs). AIOps + unifies both under a single operational discipline, recognizing that + modern AI teams work across ML and LLM workloads. + + ), + answerText: + "AIOps is the broadest term, encompassing operations for all AI applications. MLOps focuses specifically on traditional machine learning (training, versioning, deploying models). LLMOps focuses on LLM-specific challenges (prompt management, non-deterministic evaluation, token costs). AIOps unifies both under a single operational discipline, recognizing that modern AI teams work across ML and LLM workloads.", + }, + { + question: "What are the key capabilities of an AIOps platform?", + answer: ( + <> + An AIOps platform typically provides:{" "} + tracing{" "} + (execution capture for LLM and agent debugging),{" "} + evaluation{" "} + (automated quality assessment),{" "} + experiment tracking{" "} + (for ML and LLM experiments),{" "} + + model registry + {" "} + (versioning and lifecycle management), and{" "} + + production monitoring + + . + + ), + answerText: + "An AIOps platform typically provides: tracing (execution capture for LLM and agent debugging), evaluation (automated quality assessment), experiment tracking (for ML and LLM experiments), model registry (versioning and lifecycle management), and production monitoring.", + }, + { + question: "Do I need AIOps?", + answer: + "Yes, if you're building production AI applications of any kind. 
AIOps helps you manage the full lifecycle, whether you're training traditional ML models, building LLM-powered chatbots, or deploying multi-step agents. Without AIOps practices, teams struggle with reproducibility, debugging, quality assurance, and cost management at scale.", + }, + { + question: "What is AI Ops vs AIOps?", + answer: + "AI Ops and AIOps refer to the same concept: operations for AI applications. 'AIOps' is the more common compound form, following the convention of DevOps and MLOps. Both terms describe the tools and practices needed to operationalize AI applications across their full lifecycle.", + }, + { + question: "What is the best AIOps platform?", + answer: + "The best AIOps platform depends on your needs. MLflow is the leading open-source option, providing a unified platform for both traditional ML operations (experiment tracking, model registry) and modern LLM operations (tracing, evaluation, prompt management). MLflow supports any framework, any model, and any cloud provider, with over 30 million monthly downloads and Linux Foundation backing.", + }, + { + question: "How does MLflow support AIOps?", + answer: ( + <> + MLflow provides a unified AIOps platform covering both traditional ML + and modern LLM workloads:{" "} + automatic tracing{" "} + for LLM debugging,{" "} + + evaluation with LLM judges + + ,{" "} + experiment tracking{" "} + for ML workflows,{" "} + + model registry + {" "} + for versioning, and{" "} + + production monitoring + {" "} + for ongoing quality tracking. + + ), + answerText: + "MLflow provides a unified AIOps platform covering both traditional ML and modern LLM workloads: automatic tracing for LLM debugging, evaluation with LLM judges, experiment tracking for ML workflows, model registry for versioning, and production monitoring for ongoing quality tracking.", + }, + { + question: "Can AIOps handle both ML models and LLM applications?", + answer: + "Yes. 
A modern AIOps platform like MLflow is designed to handle both traditional ML models (scikit-learn, PyTorch, TensorFlow) and LLM applications (OpenAI, Anthropic, open-source models) under a single operational framework. This unified approach prevents tool sprawl and gives teams a consistent workflow across all AI workloads.", + }, + { + question: "Is MLflow free for AIOps?", + answer: + "Yes. MLflow is 100% open source under the Apache 2.0 license, backed by the Linux Foundation. You can use all AIOps features (tracing, evaluation, experiment tracking, model registry, monitoring) for free, including in commercial applications. There are no per-seat fees, no usage limits, and no vendor lock-in.", + }, + { + question: "How do I get started with AIOps?", + answer: ( + <> + Getting started with AIOps using MLflow depends on your workload. For + LLM applications, enable{" "} + + automatic tracing + {" "} + with a single line of code. For traditional ML, start with{" "} + + experiment tracking + {" "} + to log parameters, metrics, and artifacts. MLflow provides a unified + platform so you can adopt both incrementally. + + ), + answerText: + "Getting started with AIOps using MLflow depends on your workload. For LLM applications, enable automatic tracing with a single line of code. For traditional ML, start with experiment tracking to log parameters, metrics, and artifacts. MLflow provides a unified platform so you can adopt both incrementally.", + }, + { + question: "What's the relationship between AIOps and AI observability?", + answer: ( + <> + AI observability is a core + component of AIOps, focused on monitoring and understanding AI system + behavior through tracing, metrics, and evaluation. AIOps is broader, + also encompassing experiment management, model versioning, deployment + workflows, prompt management, and the full operational lifecycle from + development through production. 
+ + ), + answerText: + "AI observability is a core component of AIOps, focused on monitoring and understanding AI system behavior through tracing, metrics, and evaluation. AIOps is broader, also encompassing experiment management, model versioning, deployment workflows, prompt management, and the full operational lifecycle from development through production.", + }, +]; + +const faqJsonLd = { + "@context": "https://schema.org", + "@type": "FAQPage", + mainEntity: faqs.map((faq) => ({ + "@type": "Question", + name: faq.question, + acceptedAnswer: { + "@type": "Answer", + text: faq.answerText || faq.answer, + }, + })), +}; + +const softwareJsonLd = { + "@context": "https://schema.org", + "@type": "SoftwareApplication", + name: "MLflow", + applicationCategory: "DeveloperApplication", + operatingSystem: "Cross-platform", + offers: { + "@type": "Offer", + price: "0", + priceCurrency: "USD", + }, + description: + "Open-source AIOps platform for tracing, evaluating, tracking, and deploying AI applications and ML models.", + url: "https://mlflow.org", + license: "https://www.apache.org/licenses/LICENSE-2.0", +}; + +const TRACING_CODE = `import mlflow +from openai import OpenAI + +# Enable automatic tracing for OpenAI +mlflow.openai.autolog() + +# Every LLM call is now traced with full context +client = OpenAI() +response = client.chat.completions.create( + model="gpt-4.1", + messages=[{"role": "user", "content": "Summarize MLflow"}], +)`; + +const EVAL_CODE = `import mlflow.genai + +# Evaluate traced outputs with LLM judges +results = mlflow.genai.evaluate( + data=mlflow.search_traces(experiment_ids=["1"]), + scorers=[ + mlflow.genai.scorers.Relevance(), + mlflow.genai.scorers.Safety(), + ], +)`; + +const EXPERIMENT_CODE = `import mlflow + +# Track traditional ML experiments +mlflow.set_experiment("my-classification-model") + +with mlflow.start_run(): + mlflow.log_param("learning_rate", 0.01) + mlflow.log_param("epochs", 100) + + # Train your model... 
+ mlflow.log_metric("accuracy", 0.95) + mlflow.log_metric("f1_score", 0.93) + + # Log the model artifact + mlflow.sklearn.log_model(model, "model")`; + +export default function AIOps() { + const [openFaqIndex, setOpenFaqIndex] = useState(0); + + return ( + <> + + {SEO_TITLE} + + + + + + + + + + + +
    +
    + +
    +

    What is AIOps?

    + +

    + AIOps (AI Operations) is the practice of running AI applications in + production across their full lifecycle, from development and + evaluation through deployment and monitoring. This encompasses{" "} + LLM applications and agents, RAG + systems, and{" "} + + traditional machine learning models + + . +

    + +

    + Historically, "AIOps" referred to using AI for IT operations + (automated log analysis, anomaly detection, incident management). + Today, the term has evolved to also describe the{" "} + operations for AI: the practices and platforms + needed to build, deploy, and maintain AI applications in production. + As organizations adopt LLMs, agents, and ML models at scale, AIOps + provides a unified framework to manage all of these workloads. +

    + +

    + MLflow is the most adopted open-source + AIOps platform, providing a unified stack for both LLMOps ( + tracing,{" "} + + evaluation + + ,{" "} + + prompt management + + , AI Gateway) and traditional + ML operations ( + + experiment tracking + + ,{" "} + + model registry + + ). +

    + +
    + +
    + +

    Why AIOps Matters

    + +

    + AI applications, whether LLM-powered agents or traditional ML + models, introduce operational challenges that standard DevOps can't + address: +

    + +
    +
    +

    Fragmented AI Tooling

    +

    + Problem: Teams use separate tools for ML + experiment tracking, LLM tracing, evaluation, and deployment, + creating tool sprawl and fragmented workflows. +

    +

    + Solution: A unified AIOps platform manages all + AI workloads (ML models, LLM apps, and agents) under a single + framework. +

    +
    + +
    +

    Quality at Scale

    +

    + Problem: AI outputs are non-deterministic and + can degrade silently, making it hard to maintain quality across + thousands of daily requests. +

    +

    + Solution: Automated evaluation with LLM judges + and continuous monitoring catch regressions before they reach + users. +

    +
    + +
    +

    Reproducibility

    +

    + Problem: Without systematic tracking of + parameters, data, models, and prompts, AI experiments and + deployments become impossible to reproduce. +

    +

    + Solution: Experiment tracking and model + registries capture every artifact, enabling full reproducibility + across ML and LLM workloads. +

    +
    + +
    +

    Cost & Resource Management

    +

    + Problem: AI workloads consume expensive compute + (GPU training) and incur API costs (LLM tokens) that can spiral + without visibility. +

    +

    + Solution: AIOps platforms track resource usage + across all AI workloads, helping teams optimize costs and + allocate resources effectively. +

    +
    +
    + +

    What Does Modern AIOps Cover?

    + +

    + Modern AIOps is the operational discipline for all AI applications. + It unifies the practices previously split across MLOps (for + traditional ML) and LLMOps (for LLM + applications) into a single framework that covers: +

    + +
      +
    • + LLMOps / AgentOps:{" "} + Tracing,{" "} + + evaluation + + ,{" "} + + prompt management + + , and{" "} + + production monitoring + {" "} + for LLM applications, agents, and RAG systems. +
    • +
    • + ML Operations:{" "} + + Experiment tracking + + ,{" "} + + model registry + + , and model deployment for traditional ML workflows using + frameworks like scikit-learn, PyTorch, and TensorFlow. +
    • +
    • + Cross-Cutting Concerns: Governance, audit trails, + access control, cost tracking, and compliance that apply to all AI + workloads regardless of type. +
    • +
    + +

    + The key insight behind modern AIOps is that organizations rarely + build with just one type of AI. Most teams have a mix of traditional + ML models, LLM-powered features, and increasingly autonomous agents. + AIOps provides a unified platform to operationalize all of these, + preventing tool sprawl and ensuring consistent practices across all + their AI work. +

    + +

    + AIOps is closely related to{" "} + AI observability (the + monitoring and understanding subset) and{" "} + LLMOps (the LLM-specific subset). AIOps + is the broadest term, encompassing both and adding experiment + management, model versioning, and unified deployment. +

    + +

    Key AIOps Capabilities

    + +

    + A comprehensive AIOps platform combines capabilities for both + LLMOps/AgentOps and traditional ML workloads: +

    + +
      +
    • + + Tracing + + : Record every step of LLM and agent execution (prompts, + completions, tool calls, retrieval results, token usage, and + latency) for debugging and monitoring. +
    • +
    • + + Evaluation + + : Assess AI output quality using{" "} + + LLM judges + + , custom scorers, and traditional metrics across all workload + types. +
    • +
    • + + Experiment Tracking + + : Log parameters, metrics, and artifacts for ML experiments and + LLM development, enabling comparison and reproducibility. +
    • +
    • + + Model Registry + + : Version, stage, and manage the lifecycle of ML models and LLM + configurations in a centralized registry. +
    • +
    • + + Prompt Management + + : Version-control prompt templates, track production usage, and + enable safe rollbacks for LLM applications. +
    • +
    • + + AI Gateway + + : Route requests across LLM providers through a single endpoint + with unified authentication, rate limiting, and fallback routing. +
    • +
    • + + Production Monitoring + + : Track quality scores, error rates, costs, and drift over time to + catch regressions across all AI workloads. +
    • +
    + +

    How to Implement AIOps

    + +

    + MLflow provides a complete, open-source + AIOps platform. Here's how teams use MLflow across different AI + workloads: +

    + +

    + LLM Tracing +

    + +
    +
    + python + +
    +
    + + {({ style, tokens, getLineProps, getTokenProps }) => ( +
    +                    {tokens.map((line, i) => (
    +                      
    + {line.map((token, key) => ( + + ))} +
    + ))} +
    + )} +
    +
    +
    + +

    + Evaluation with LLM Judges +

    + +
    +
    + python + +
    +
    + + {({ style, tokens, getLineProps, getTokenProps }) => ( +
    +                    {tokens.map((line, i) => (
    +                      
    + {line.map((token, key) => ( + + ))} +
    + ))} +
    + )} +
    +
    +
    + +

    + ML Experiment Tracking +

    + +
    +
    + python + +
    +
    + + {({ style, tokens, getLineProps, getTokenProps }) => ( +
    +                    {tokens.map((line, i) => (
    +                      
    + {line.map((token, key) => ( + + ))} +
    + ))} +
    + )} +
    +
    +
    + +
    + MLflow UI showing traced AI operations with full execution context for AIOps workflows +

    + MLflow provides unified visibility across all AI operations: LLM + tracing, evaluation, and experiment tracking +

    +
    + +
    +

    + + MLflow + {" "} + is the largest open-source AI platform, with over 30 million + monthly downloads. Backed by the Linux Foundation and licensed + under Apache 2.0, it provides a complete AIOps stack with no + vendor lock-in.{" "} + Get started → +

    +
    + +

    Open Source vs. Proprietary AIOps

    + +

    + When choosing an AIOps platform, the decision between open source + and proprietary SaaS tools has significant long-term implications + for your team, infrastructure, and data ownership. +

    + +

    + + Open Source (MLflow): + {" "} + With MLflow, you maintain complete control over your AIOps + infrastructure and data. Deploy on your own infrastructure or use + managed versions on Databricks, AWS, or other platforms. There are + no per-seat fees, no usage limits, and no vendor lock-in. MLflow + supports any AI framework, from scikit-learn and PyTorch to OpenAI + and LangChain, under a single platform. +

    + +

    + Proprietary SaaS Tools: Commercial AIOps platforms + offer convenience but at the cost of flexibility and control. They + typically charge per seat or per usage volume, which can become + expensive at scale. Your data is sent to their servers, raising + privacy and compliance concerns. Most proprietary tools specialize + in either ML or LLM workloads, not both, leading to tool sprawl. +

    + +

    + Why Teams Choose Open Source: Organizations + building production AI applications increasingly choose MLflow + because it offers production-ready AIOps for both ML and LLM + workloads without giving up control of their data, cost + predictability, or flexibility. The Apache 2.0 license and Linux + Foundation backing ensure MLflow remains truly open and + community-driven. +

    + +

    Frequently Asked Questions

    + +
    + {faqs.map((faq, index) => ( +
    + + {openFaqIndex === index && ( +
    {faq.answer}
    + )} +
    + ))} +
    + +

    Related Resources

    + +
      +
    • + LLMOps Guide +
    • +
    • + AI Observability Guide +
    • +
    • + LLM Tracing Guide +
    • +
    • + + LLM Tracing Documentation + +
    • +
    • + + LLM Evaluation Guide + +
    • +
    • + + Experiment Tracking Documentation + +
    • +
    • + MLflow for Agents and LLMs Overview +
    • +
    • + + MLflow for Agents and LLMs Documentation + +
    • +
    +
    + + + +
    + + ); +} diff --git a/website/src/pages/faq.tsx b/website/src/pages/faq.tsx index b7aec66770..397a49f381 100644 --- a/website/src/pages/faq.tsx +++ b/website/src/pages/faq.tsx @@ -1112,6 +1112,13 @@ export default function FAQ() { management from development to production.

    + +

    AIOps

    +

    + Unified operations for all AI workloads: LLM applications, + agents, and traditional ML models under one platform. +

    +

    MLflow for LLMs & Agents

    diff --git a/website/src/pages/llm-evaluation.tsx b/website/src/pages/llm-evaluation.tsx index 87200191b8..f06f479727 100644 --- a/website/src/pages/llm-evaluation.tsx +++ b/website/src/pages/llm-evaluation.tsx @@ -1791,6 +1791,12 @@ results = mlflow.genai.evaluate(

  • AI Observability FAQ
  • +
  • + LLMOps Guide +
  • +
  • + AIOps Guide +
  • MLflow for Agents and LLMs Overview
  • diff --git a/website/src/pages/llm-tracing.tsx b/website/src/pages/llm-tracing.tsx index 13dbc2f255..fad505aaa7 100644 --- a/website/src/pages/llm-tracing.tsx +++ b/website/src/pages/llm-tracing.tsx @@ -1188,6 +1188,12 @@ const result = await generateText({
  • Agent Evaluation FAQ
  • +
  • + LLMOps Guide +
  • +
  • + AIOps Guide +
  • MLflow for Agents and LLMs Overview
  • diff --git a/website/src/pages/llmops.tsx b/website/src/pages/llmops.tsx index ebdafcd16a..30de8b9e9f 100644 --- a/website/src/pages/llmops.tsx +++ b/website/src/pages/llmops.tsx @@ -696,8 +696,8 @@ export default function LLMOps() {

    - LLMOps is closely related to AIOps (the broader discipline of - running all AI applications in production) and{" "} + LLMOps is closely related to AIOps (the + broader discipline of running all AI applications in production) and{" "} AI observability (the monitoring and debugging subset). LLMOps specifically targets LLM-powered applications, while AIOps also covers traditional ML @@ -1042,6 +1042,9 @@ export default function LLMOps() { Prompt Registry Documentation +

  • + AIOps Guide +
  • AI Observability Guide