Commit 5887410

fix: EAGLE mix_hidden_states in-place op crash (#1088) (#1104)
### Type of change

- [x] Bug fix (non-breaking change which fixes an issue)

### Description

Fixes #1088 — `RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: IndexPutBackward0` when training with `eagle_mix_hidden_states=True`.

**Root cause:** In `HFEagleModel._eagle_training_forward`, the indexed assignment at lines 991–994 modifies `eagle_input_hiddens` in-place while the tensor is still part of the autograd computation graph.

**Fix:** Clone the tensor before the in-place assignment. This is the same pattern already used in the Megatron backend at `megatron_eagle.py:1201-1202`:

```python
# Clone to avoid inplace modification of view created in no_grad mode
eagle_module_input_hidden_states = eagle_module_input_hidden_states.clone()
```

The HF backend was missing this clone.

### Usage

```python
config["eagle_mix_hidden_states"] = True
config["eagle_ttt_steps"] = 2
mtsp.convert(model, mode=[("eagle", config)])

model.train()
outputs = model(input_ids=input_ids, labels=labels)
outputs.loss.backward()  # no longer crashes
```

### Testing

Added `test_eagle_mix_hidden_states_backward`, parametrized over `eagle_ttt_steps` values `[1, 2]`, which:

- Converts a tiny LLaMA to EAGLE with `eagle_mix_hidden_states=True`
- Runs a forward + backward pass
- Asserts the loss is not None and gradients flow to `eagle_module`

```
pytest tests/unit/torch/speculative/plugins/test_hf_speculative.py::test_eagle_mix_hidden_states_backward -v
```

### Checklist

- [x] I have read the [contributor guidelines](CONTRIBUTING.md) and signed my commits
- [x] I have followed the [security best practices](SECURITY.md)
- [x] This change is backward compatible
- [x] I have followed third-party code and dependency guidelines
- [x] I have added tests that prove my fix is effective

## Summary by CodeRabbit

* **Bug Fixes**
  * Fixed a gradient computation issue in speculative decoding during model training to ensure proper autograd behavior.
* **Tests**
  * Added a regression test to validate gradient computation in speculative decoding scenarios.

Signed-off-by: javierdejesusda <javier.dejesusj9@gmail.com>
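The failure mode described above can be reproduced outside the model entirely. The sketch below is a toy stand-in for the EAGLE hidden-state mixing (the `mix_hidden` helper and its shapes are illustrative assumptions, not the actual model code): an op saves a tensor for backward, a later indexed in-place write invalidates it, and cloning first avoids the crash.

```python
import torch


def mix_hidden(h: torch.Tensor, clone_first: bool) -> torch.Tensor:
    # Hypothetical stand-in for the hidden-state mixing step.
    loss = (h * h).sum()  # autograd saves `h` for the multiply's backward
    if clone_first:
        h = h.clone()     # later writes target a copy, not the saved tensor
    h[0] = 0.0            # in-place indexed assignment (index_put_)
    return loss


x = torch.randn(2, 4, requires_grad=True)

try:
    mix_hidden(x * 2, clone_first=False).backward()
    crashed = False
except RuntimeError:  # "...modified by an inplace operation"
    crashed = True

mix_hidden(x * 2, clone_first=True).backward()  # succeeds with the clone
```

The clone is cheap relative to the forward pass and keeps the saved tensor untouched, which is why the same one-line fix works in both backends.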
1 parent 0a1ca5d commit 5887410

2 files changed

Lines changed: 39 additions & 0 deletions


modelopt/torch/speculative/plugins/transformers.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1080,6 +1080,8 @@ def forward(
                 batch_size, seq_len_s, device=eagle_input_hiddens.device
             ).argsort(dim=1)[:, :num_to_replace]

+            # Clone to avoid inplace modification that breaks autograd
+            eagle_input_hiddens = eagle_input_hiddens.clone()
             batch_indices = torch.arange(batch_size)[:, None]
             eagle_input_hiddens[batch_indices, rand_indices] = eagle_output_hiddens[
                 batch_indices, rand_indices
```

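Why the clone works can be seen through autograd's saved-tensor version counter. This sketch uses the private `_version` attribute purely for illustration (an assumption about PyTorch internals, not part of this change):

```python
import torch

x = torch.randn(3, requires_grad=True)
h = x * 2
v0 = h._version  # 0: no in-place ops on `h` yet
h[0] = 0.0       # index_put_ bumps the version counter
v1 = h._version  # 1: backward through ops that saved `h` would now fail
c = h.clone()    # fresh tensor with its own counter
c[0] = 1.0       # in-place writes to the clone leave `h`'s counter alone
```

Autograd compares a saved tensor's version at backward time against the version recorded at save time; cloning before the write keeps the two in agreement.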
tests/unit/torch/speculative/plugins/test_hf_speculative.py

Lines changed: 37 additions & 0 deletions
```diff
@@ -17,6 +17,7 @@
 from copy import deepcopy

 import pytest
+import torch
 from _test_utils.torch.transformers_models import (
     get_tiny_llama,
     tf_modelopt_state_and_output_tester,
@@ -48,3 +49,39 @@ def test_eagle_model_convert_save_and_restore(tmp_path, eagle_config):
     model_test = AutoModelForCausalLM.from_pretrained(tmp_path / "modelopt_model")
     assert isinstance(model_test, mtsp.plugins.HFEagleModel)
     tf_modelopt_state_and_output_tester(model_ref, model_test)
+
+
+@pytest.mark.parametrize("eagle_config", [EAGLE3_DEFAULT_CFG])
+@pytest.mark.parametrize("eagle_ttt_steps", [1, 2])
+def test_eagle_mix_hidden_states_backward(eagle_config, eagle_ttt_steps):
+    """Regression test for GitHub issue #1088.
+
+    Verifies that the EAGLE training forward+backward pass does not crash with
+    ``eagle_mix_hidden_states=True`` due to an in-place tensor modification
+    breaking autograd.
+    """
+    model = get_tiny_llama(num_hidden_layers=8)
+
+    config = deepcopy(eagle_config["config"])
+    config["eagle_architecture_config"].update(
+        {
+            "draft_vocab_size": model.config.vocab_size,
+            "hidden_size": model.config.hidden_size,
+        }
+    )
+    config["eagle_mix_hidden_states"] = True
+    config["eagle_ttt_steps"] = eagle_ttt_steps
+    config["eagle_use_torch_compile"] = False
+
+    mtsp.convert(model, mode=[("eagle", config)])
+    model.train()
+
+    input_ids = torch.randint(0, model.config.vocab_size, (2, 16))
+    labels = input_ids.clone()
+
+    outputs = model(input_ids=input_ids, labels=labels)
+    assert outputs.loss is not None
+    outputs.loss.backward()
+
+    eagle_grads = [p.grad for p in model.eagle_module.parameters() if p.grad is not None]
+    assert len(eagle_grads) > 0, "Expected gradients to flow to eagle_module"
```
