Skip to content

Commit ee02b9f

Browse files
xunliulxl0413
and authored
feat: Generate a fallback report upon recursion limit hit (#838)
* finish handle_recursion_limit_fallback * fix * renmae test file * fix * doc --------- Co-authored-by: lxl0413 <lixinling2021@gmail.com>
1 parent: 9a34e32 · commit: ee02b9f

7 files changed

Lines changed: 895 additions & 12 deletions

File tree

docs/configuration_guide.md

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -305,6 +305,31 @@ Or via API request parameter:
305305

306306
---
307307

308+
## Recursion Fallback Configuration
309+
310+
When agents hit the recursion limit, DeerFlow can gracefully generate a summary of accumulated findings instead of failing (enabled by default).
311+
312+
### Configuration
313+
314+
In `conf.yaml`:
315+
```yaml
316+
ENABLE_RECURSION_FALLBACK: true
317+
```
318+
319+
### Recursion Limit
320+
321+
Set the maximum recursion limit via environment variable:
322+
```bash
323+
export AGENT_RECURSION_LIMIT=50 # default: 25
324+
```
325+
326+
Or in `.env`:
327+
```ini
328+
AGENT_RECURSION_LIMIT=50
329+
```
330+
331+
---
332+
308333
## RAG (Retrieval-Augmented Generation) Configuration
309334

310335
DeerFlow supports multiple RAG providers for document retrieval. Configure the RAG provider by setting environment variables.

src/config/configuration.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,9 @@ class Configuration:
6363
interrupt_before_tools: list[str] = field(
6464
default_factory=list
6565
) # List of tool names to interrupt before execution
66+
enable_recursion_fallback: bool = (
67+
True # Enable graceful fallback when recursion limit is reached
68+
)
6669

6770
@classmethod
6871
def from_runnable_config(

src/graph/nodes.py

Lines changed: 123 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,11 @@
77
from functools import partial
88
from typing import Annotated, Any, Literal
99

10-
from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
10+
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
1111
from langchain_core.runnables import RunnableConfig
1212
from langchain_core.tools import tool
1313
from langchain_mcp_adapters.client import MultiServerMCPClient
14+
from langgraph.errors import GraphRecursionError
1415
from langgraph.types import Command, interrupt
1516

1617
from src.agents import create_agent
@@ -19,7 +20,7 @@
1920
from src.config.configuration import Configuration
2021
from src.llms.llm import get_llm_by_type, get_llm_token_limit_by_type
2122
from src.prompts.planner_model import Plan
22-
from src.prompts.template import apply_prompt_template
23+
from src.prompts.template import apply_prompt_template, get_system_prompt_template
2324
from src.tools import (
2425
crawl_tool,
2526
get_retriever_tool,
@@ -929,6 +930,79 @@ def validate_web_search_usage(messages: list, agent_name: str = "agent") -> bool
929930
return web_search_used
930931

931932

933+
async def _handle_recursion_limit_fallback(
934+
messages: list,
935+
agent_name: str,
936+
current_step,
937+
state: State,
938+
) -> list:
939+
"""Handle GraphRecursionError with graceful fallback using LLM summary.
940+
941+
When the agent hits the recursion limit, this function generates a final output
942+
using only the observations already gathered, without calling any tools.
943+
944+
Args:
945+
messages: Messages accumulated during agent execution before hitting limit
946+
agent_name: Name of the agent that hit the limit
947+
current_step: The current step being executed
948+
state: Current workflow state
949+
950+
Returns:
951+
list: Messages including the accumulated messages plus the fallback summary
952+
953+
Raises:
954+
Exception: If the fallback LLM call fails
955+
"""
956+
logger.warning(
957+
f"Recursion limit reached for {agent_name} agent. "
958+
f"Attempting graceful fallback with {len(messages)} accumulated messages."
959+
)
960+
961+
if len(messages) == 0:
962+
return messages
963+
964+
cleared_messages = messages.copy()
965+
while len(cleared_messages) > 0 and cleared_messages[-1].type == "system":
966+
cleared_messages = cleared_messages[:-1]
967+
968+
# Prepare state for prompt template
969+
fallback_state = {
970+
"locale": state.get("locale", "en-US"),
971+
}
972+
973+
# Apply the recursion_fallback prompt template
974+
system_prompt = get_system_prompt_template(agent_name, fallback_state, None, fallback_state.get("locale", "en-US"))
975+
limit_prompt = get_system_prompt_template("recursion_fallback", fallback_state, None, fallback_state.get("locale", "en-US"))
976+
fallback_messages = cleared_messages + [
977+
SystemMessage(content=system_prompt),
978+
SystemMessage(content=limit_prompt)
979+
]
980+
981+
# Get the LLM without tools (strip all tools from binding)
982+
fallback_llm = get_llm_by_type(AGENT_LLM_MAP[agent_name])
983+
984+
# Call the LLM with the updated messages
985+
fallback_response = fallback_llm.invoke(fallback_messages)
986+
fallback_content = fallback_response.content
987+
988+
logger.info(
989+
f"Graceful fallback succeeded for {agent_name} agent. "
990+
f"Generated summary of {len(fallback_content)} characters."
991+
)
992+
993+
# Sanitize response
994+
fallback_content = sanitize_tool_response(str(fallback_content))
995+
996+
# Update the step with the fallback result
997+
current_step.execution_res = fallback_content
998+
999+
# Return the accumulated messages plus the fallback response
1000+
result_messages = list(cleared_messages)
1001+
result_messages.append(AIMessage(content=fallback_content, name=agent_name))
1002+
1003+
return result_messages
1004+
1005+
9321006
async def _execute_agent_step(
9331007
state: State, agent, agent_name: str, config: RunnableConfig = None
9341008
) -> Command[Literal["research_team"]]:
@@ -1049,11 +1123,51 @@ async def _execute_agent_step(
10491123
f"Context compression for {agent_name}: {len(compressed_state.get('messages', []))} messages, "
10501124
f"estimated tokens before: ~{token_count_before}, after: ~{token_count_after}"
10511125
)
1052-
1126+
10531127
try:
1054-
result = await agent.ainvoke(
1055-
input=agent_input, config={"recursion_limit": recursion_limit}
1056-
)
1128+
# Use stream from the start to capture messages in real-time
1129+
# This allows us to retrieve accumulated messages even if recursion limit is hit
1130+
accumulated_messages = []
1131+
for chunk in agent.stream(
1132+
input=agent_input,
1133+
config={"recursion_limit": recursion_limit},
1134+
stream_mode="values",
1135+
):
1136+
if isinstance(chunk, dict) and "messages" in chunk:
1137+
accumulated_messages = chunk["messages"]
1138+
1139+
# If we get here, execution completed successfully
1140+
result = {"messages": accumulated_messages}
1141+
except GraphRecursionError:
1142+
# Check if recursion fallback is enabled
1143+
configurable = Configuration.from_runnable_config(config) if config else Configuration()
1144+
1145+
if configurable.enable_recursion_fallback:
1146+
try:
1147+
# Call fallback with accumulated messages (function returns list of messages)
1148+
response_messages = await _handle_recursion_limit_fallback(
1149+
messages=accumulated_messages,
1150+
agent_name=agent_name,
1151+
current_step=current_step,
1152+
state=state,
1153+
)
1154+
1155+
# Create result dict so the code can continue normally from line 1178
1156+
result = {"messages": response_messages}
1157+
except Exception as fallback_error:
1158+
# If fallback fails, log and fall through to standard error handling
1159+
logger.error(
1160+
f"Recursion fallback failed for {agent_name} agent: {fallback_error}. "
1161+
"Falling back to standard error handling."
1162+
)
1163+
raise
1164+
else:
1165+
# Fallback disabled, let error propagate to standard handler
1166+
logger.info(
1167+
f"Recursion limit reached but graceful fallback is disabled. "
1168+
"Using standard error handling."
1169+
)
1170+
raise
10571171
except Exception as e:
10581172
import traceback
10591173

@@ -1088,8 +1202,10 @@ async def _execute_agent_step(
10881202
goto="research_team",
10891203
)
10901204

1205+
response_messages = result["messages"]
1206+
10911207
# Process the result
1092-
response_content = result["messages"][-1].content
1208+
response_content = response_messages[-1].content
10931209

10941210
# Sanitize response to remove extra tokens and truncate if needed
10951211
response_content = sanitize_tool_response(str(response_content))

src/prompts/recursion_fallback.md

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
---
2+
CURRENT_TIME: {{ CURRENT_TIME }}
3+
locale: {{ locale }}
4+
---
5+
6+
You have reached the maximum number of reasoning steps.
7+
8+
Using ONLY the tool observations already produced,
9+
write the final research report in EXACTLY the same format
10+
as you would normally output at the end of this task.
11+
12+
Do not call any tools.
13+
Do not add new information.
14+
If something is missing, state it explicitly.
15+
16+
Always output in the locale of **{{ locale }}**.

src/prompts/template.py

Lines changed: 26 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
import dataclasses
55
import os
66
from datetime import datetime
7-
87
from jinja2 import Environment, FileSystemLoader, TemplateNotFound, select_autoescape
98
from langchain.agents import AgentState
109

@@ -61,6 +60,28 @@ def apply_prompt_template(
6160
Returns:
6261
List of messages with the system prompt as the first message
6362
"""
63+
try:
64+
system_prompt = get_system_prompt_template(prompt_name, state, configurable, locale)
65+
return [{"role": "system", "content": system_prompt}] + state["messages"]
66+
except Exception as e:
67+
raise ValueError(f"Error applying template {prompt_name} for locale {locale}: {e}")
68+
69+
def get_system_prompt_template(
70+
prompt_name: str, state: AgentState, configurable: Configuration = None, locale: str = "en-US"
71+
) -> str:
72+
"""
73+
Render and return the system prompt template with state and configuration variables.
74+
This function loads a Jinja2-based prompt template (with optional locale-specific
75+
variants), applies variables from the agent state and Configuration object, and
76+
returns the fully rendered system prompt string.
77+
Args:
78+
prompt_name: Name of the prompt template to load (without .md extension).
79+
state: Current agent state containing variables available to the template.
80+
configurable: Optional Configuration object providing additional template variables.
81+
locale: Language locale for template selection (e.g., en-US, zh-CN).
82+
Returns:
83+
The rendered system prompt string after applying all template variables.
84+
"""
6485
# Convert state to dict for template rendering
6586
state_vars = {
6687
"CURRENT_TIME": datetime.now().strftime("%a %b %d %Y %H:%M:%S %z"),
@@ -74,15 +95,15 @@ def apply_prompt_template(
7495
try:
7596
# Normalize locale format
7697
normalized_locale = locale.replace("-", "_") if locale and locale.strip() else "en_US"
77-
98+
7899
# Try locale-specific template first
79100
try:
80101
template = env.get_template(f"{prompt_name}.{normalized_locale}.md")
81102
except TemplateNotFound:
82103
# Fallback to English template
83104
template = env.get_template(f"{prompt_name}.md")
84-
105+
85106
system_prompt = template.render(**state_vars)
86-
return [{"role": "system", "content": system_prompt}] + state["messages"]
107+
return system_prompt
87108
except Exception as e:
88-
raise ValueError(f"Error applying template {prompt_name} for locale {locale}: {e}")
109+
raise ValueError(f"Error loading template {prompt_name} for locale {locale}: {e}")

0 commit comments

Comments (0)