Commit bdd10c2
h-guo18 and yeyu-nvidia authored
Feat: MLA eagle (#689)
## What does this PR do?

**Type of change:** New feature

**Overview:**

- Add MLA Eagle support.
- Add a new argument, `eagle_decoder_type`, to switch between the llama and kimik2 eagle decoders.
- Add patches to dynamically load the kimik2 model implementation.
- Add a new default config for Kimi K2.
- Refactor eagle export to support multi-layer/multi-type eagle export concisely.
- Rename some modules to simplify the export logic.
- Other minor improvements.

## Usage

```python
# Add a code snippet demonstrating how to use this
```

## Testing

- Tested that Kimi K2 Thinking works with `eagle_decoder_type=kimik2`:
  <img width="1068" height="636" alt="image" src="https://github.com/user-attachments/assets/5557ef87-c719-4fb1-be18-30435f6b3885" />
- Tested that Llama 3.2 1B works with `eagle_decoder_type=llama`:
  <img width="1066" height="634" alt="image" src="https://github.com/user-attachments/assets/633c575c-cc79-43af-aed3-0378a303ebc7" />

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

## Additional Information

---------

Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
Signed-off-by: yeyu-nvidia <yeyu@nvidia.com>
Co-authored-by: yeyu-nvidia <yeyu@nvidia.com>
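The Usage section above is an unfilled template placeholder. Based on the `main.py` changes in this commit, a minimal sketch of the new conversion flow might look like the following; the base model name is illustrative, and the config keys and the `kimik2_eagle_default_config` import are taken from the diff below:

```python
import transformers

import modelopt.torch.speculative as mtsp
from modelopt.torch.speculative.config import kimik2_eagle_default_config

# Illustrative base model; the kimik2 decoder type targets Kimi K2 checkpoints.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype="auto", device_map="cpu"
)

config = {
    "eagle_decoder_type": "kimik2",  # new argument: "llama" or "kimik2"
    "eagle_offline": False,
    "eagle_architecture_config": kimik2_eagle_default_config,
}
mtsp.convert(model, [("eagle", config)])  # attach the eagle draft module
```

`launch_train.sh` forwards the same choice through the new `--eagle_decoder_type` flag (see the script diff below).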
1 parent b286165 · commit bdd10c2

13 files changed: 500 additions & 182 deletions


Lines changed: 0 additions & 1 deletion
```diff
@@ -1,3 +1,2 @@
 {
-    "_attn_implementation": "sdpa"
 }
```

examples/speculative_decoding/eagle_utils.py

Lines changed: 29 additions & 26 deletions
```diff
@@ -259,35 +259,24 @@ def __len__(self):
     def __getitem__(self, i) -> dict[str, torch.Tensor]:
         # Load the conversational data, using the cache
         raw_data, offline_file_path = self.data_entries[i]
-        if i in self.cached_data_dict:
-            preprocessed_base = self.cached_data_dict[i]
-        else:
-            ret = self.preprocess_fn(
-                [raw_data], self.tokenizer, processor=self.vlm_processor, img_dir=self.img_dir
-            )
-            preprocessed_base = {k: ret[k][0] for k in ret}
-            self.cached_data_dict[i] = preprocessed_base
-
         # Extend the data sample with the hidden states from the .pt file
         max_length = self.tokenizer.model_max_length
         offline_data = torch.load(offline_file_path)
         offline_data["input_ids"] = offline_data["input_ids"][:max_length]
         offline_data["hidden_states"] = offline_data["hidden_states"][:max_length, :]
         offline_data["aux_hidden_states"] = offline_data["aux_hidden_states"][:max_length, :]
 
-        # Make sure the input_ids have the same shape
-        if preprocessed_base["input_ids"].shape != offline_data["input_ids"].shape:
-            msg = f"""Input IDs from offline data do not match the preprocessed input IDs
-            for offline data sample at {offline_file_path}."""
-            raise ValueError(msg)
-
-        ret = {**preprocessed_base}  # Shallow copy so we don't accidentally modify the cache
-        ret["input_ids"] = offline_data["input_ids"]
-        ret["kwargs"] = {
-            "base_model_outputs": {
-                "base_model_hidden_states": offline_data["hidden_states"],
-                "aux_hidden_states": offline_data["aux_hidden_states"],
-            }
+        ret = {
+            "input_ids": offline_data["input_ids"],
+            "attention_mask": torch.ones_like(offline_data["input_ids"]),
+            "loss_mask": torch.ones_like(offline_data["input_ids"]),
+            "labels": torch.full_like(offline_data["input_ids"], IGNORE_TOKEN_ID),
+            "kwargs": {
+                "base_model_outputs": {
+                    "base_model_hidden_states": offline_data["hidden_states"],
+                    "aux_hidden_states": offline_data["aux_hidden_states"],
+                }
+            },
         }
         return ret
```
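With the cached tokenizer pass gone, every training signal now comes from the `.pt` file itself, and the shape-consistency check is no longer needed because there is no second copy of `input_ids` to disagree with. A sketch of the sample layout `__getitem__` expects, with hypothetical sizes (`IGNORE_TOKEN_ID` is assumed to be the usual `-100` label-ignore index):

```python
import torch

IGNORE_TOKEN_ID = -100  # assumption: the standard HF ignore index used by eagle_utils

# Hypothetical offline sample, one {conv_id}.pt per conversation.
seq_len, hidden_size = 256, 2048
offline_data = {
    "input_ids": torch.randint(0, 32000, (seq_len,)),
    "hidden_states": torch.randn(seq_len, hidden_size),          # final-layer states
    "aux_hidden_states": torch.randn(seq_len, 3 * hidden_size),  # concatenated aux layers
}
torch.save(offline_data, "conv_0042.pt")

# The new __getitem__ derives everything else from input_ids:
labels = torch.full_like(offline_data["input_ids"], IGNORE_TOKEN_ID)
attention_mask = torch.ones_like(offline_data["input_ids"])
```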

```diff
@@ -338,12 +327,24 @@ def make_eagle_supervised_data_module(
                 "offline_data_path must be provided for offline training."
             )
         offline_data_path = Path(data_args.offline_data_path)
+        # Collect all pt file paths
         all_files = {str(p) for p in offline_data_path.glob("*.pt")}
+        all_files |= {str(p) for p in offline_data_path.glob("**/*.pt")}
         if not all_files:
             raise ValueError(f"No .pt files found in {data_args.offline_data_path}")
 
-        # Filter to conversations that exist in the offline data and in the provided json
+        # Build a map from conv_id to file_path for fast lookup
+        print("building conv_id_to_file map...")
+        conv_id_to_file = {}
+        for pt_path in all_files:
+            pt_name = Path(pt_path).name
+            # Expect conv_id.pt
+            if pt_name.endswith(".pt"):
+                conv_id = pt_name[:-3]
+                conv_id_to_file[conv_id] = pt_path
+
         valid_entries = []
+        print("filtering valid entries...")
         for entry in data_json:
             conv_id = entry.get("conversation_id")
             if conv_id is None:
@@ -352,9 +353,11 @@ def make_eagle_supervised_data_module(
                 conv_id = entry.get("id")
             if conv_id is None:
                 raise ValueError(f"Conversation ID required but not found for entry {entry}")
-            file_path = str(offline_data_path / f"{conv_id}.pt")
-            if file_path in all_files:
-                valid_entries.append((entry, file_path))
+
+            file_path = conv_id_to_file.get(str(conv_id))
+            if file_path is None:
+                continue
+            valid_entries.append((entry, file_path))
 
         if len(valid_entries) == 0:
             msg = """No valid files found in the offline data path that match the conversation IDs

examples/speculative_decoding/launch_train.sh

Lines changed: 6 additions & 0 deletions
```diff
@@ -38,6 +38,10 @@ while [ $# -gt 0 ]; do
       if [[ "$1" != *=* ]]; then shift; fi
       MODE="${1#*=}"
       ;;
+    --eagle_decoder_type*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      EAGLE_DECODER_TYPE="${1#*=}"
+      ;;
     --output_dir*)
       if [[ "$1" != *=* ]]; then shift; fi
       OUTPUT_DIR="${1#*=}"
@@ -115,6 +119,7 @@ DEFAULT_SAVE_STEPS=$((8192 / GPU_COUNT))
 
 MODEL=${MODEL:-"TinyLlama/TinyLlama-1.1B-Chat-v1.0"}
 MODE=${MODE:-"eagle3"}
+EAGLE_DECODER_TYPE=${EAGLE_DECODER_TYPE:-"llama"}
 # Set default OUTPUT_DIR to ckpts/{modelname}, where {modelname} is the last part of the model path
 MODEL_BASENAME=$(basename "$MODEL")
 OUTPUT_DIR=${OUTPUT_DIR:-"ckpts/${MODEL_BASENAME}-$(date +%Y%m%d_%H%M)"}
@@ -174,6 +179,7 @@ fi
 export TOKENIZERS_PARALLELISM=False
 CMD="accelerate launch $MULTI_GPU --mixed_precision bf16 main.py \
     --mode $MODE \
+    --eagle_decoder_type $EAGLE_DECODER_TYPE \
     --model_name_or_path $MODEL \
     --training_seq_len $TRAINING_SEQ_LEN \
     --dataloader_drop_last True \
```

examples/speculative_decoding/main.py

Lines changed: 30 additions & 13 deletions
```diff
@@ -111,6 +111,10 @@ class MedusaArguments:
 @dataclass
 class EagleArguments:
     eagle_config: str = field(default=None, metadata={"help": "Path to eagle_config.json"})
+    eagle_decoder_type: str = field(
+        default="llama",
+        metadata={"help": "The class of eagle decoder to use. Available options: llama, kimik2"},
+    )
 
 
 def train():
```
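For context, the new field becomes a CLI flag the same way the existing ones do. A minimal sketch, assuming `main.py` parses its argument dataclasses with `transformers.HfArgumentParser` (the dataclass body is copied from the hunk above):

```python
from dataclasses import dataclass, field

import transformers


@dataclass
class EagleArguments:
    eagle_config: str = field(default=None, metadata={"help": "Path to eagle_config.json"})
    eagle_decoder_type: str = field(
        default="llama",
        metadata={"help": "The class of eagle decoder to use. Available options: llama, kimik2"},
    )


parser = transformers.HfArgumentParser((EagleArguments,))
(eagle_args,) = parser.parse_args_into_dataclasses(["--eagle_decoder_type", "kimik2"])
assert eagle_args.eagle_decoder_type == "kimik2"
```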
```diff
@@ -144,24 +148,29 @@ def train():
 
     if checkpoint:
         model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto")
-        tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
+        tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
     else:
         # To avoid OOM for large models, we load and convert model on CPU first.
         # Model will be moved to GPU during HF trainer.init().
+        offline_kwargs = {"num_hidden_layers": 0} if use_offline_training else {}
         model = transformers.AutoModelForCausalLM.from_pretrained(
             model_args.model_name_or_path,
             torch_dtype="auto",
             device_map="cpu",
             trust_remote_code=True,
+            **offline_kwargs,
         )
         if use_offline_training:
             # When doing offline training, we need to set num_hidden_layers
             # since we override it when loading the model for space savings
-            model_config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path)
+            model_config = transformers.AutoConfig.from_pretrained(
+                model_args.model_name_or_path, trust_remote_code=True
+            )
             model.config.num_orig_hidden_layers = model_config.num_hidden_layers
         tokenizer = transformers.AutoTokenizer.from_pretrained(
             model_args.model_name_or_path,
             model_max_length=training_args.training_seq_len,
+            trust_remote_code=True,
         )
     if tokenizer.chat_template is None:
         tokenizer.chat_template = (
```
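The `num_hidden_layers=0` override is the space-saving trick the comment refers to: during offline training the base model's decoder stack is never executed (hidden states come from the dumped `.pt` files), so it is not materialized at all, and only the original depth is recorded for the eagle module. A condensed sketch of that load path (model name illustrative):

```python
import transformers

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # illustrative

# Load the model with zero decoder layers; the extra kwarg is forwarded as a config override.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cpu",
    trust_remote_code=True,
    num_hidden_layers=0,  # skip materializing the decoder stack
)

# Recover the real depth from the unmodified config so downstream logic still knows it.
model_config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model.config.num_orig_hidden_layers = model_config.num_hidden_layers
```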
```diff
@@ -179,22 +188,30 @@ def train():
         }
         mtsp.convert(model, [("medusa", config)])
     elif training_args.mode in ["eagle1", "eagle3"]:
-        from modelopt.torch.speculative.config import EAGLE1_DEFAULT_CFG, EAGLE3_DEFAULT_CFG
-
-        # Load default config
-        config = {
-            "eagle1": EAGLE1_DEFAULT_CFG,
-            "eagle3": EAGLE3_DEFAULT_CFG,
-        }[training_args.mode]["config"]
+        from modelopt.torch.speculative.config import (
+            default_eagle_config,
+            eagle3_default_config,
+            kimik2_eagle_default_config,
+        )
 
-        # overwrite config with custom config
-        if use_offline_training:
-            config["eagle_offline"] = True
+        if eagle_args.eagle_decoder_type == "kimik2":
+            eagle_architecture_config = kimik2_eagle_default_config
+        else:
+            eagle_architecture_config = {
+                "eagle1": default_eagle_config,
+                "eagle3": eagle3_default_config,
+            }[training_args.mode]
 
         if eagle_args.eagle_config:
             with open(eagle_args.eagle_config) as f:
                 custom_config = json.load(f)
-            config["eagle_architecture_config"].update(custom_config)
+            eagle_architecture_config.update(custom_config)
+
+        config = {
+            "eagle_decoder_type": eagle_args.eagle_decoder_type,
+            "eagle_offline": use_offline_training,
+            "eagle_architecture_config": eagle_architecture_config,
+        }
 
         mtsp.convert(model, [("eagle", config)])
```
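Since the default architecture configs are plain dicts merged with `dict.update`, a user-supplied `eagle_config.json` only needs the keys it wants to override; everything else falls through from the selected default. A toy illustration of that merge (key names hypothetical):

```python
import json

eagle_architecture_config = {"hidden_size": 4096, "num_eagle_layers": 1}  # stand-in default
custom_config = json.loads('{"num_eagle_layers": 2}')  # e.g. contents of eagle_config.json
eagle_architecture_config.update(custom_config)
assert eagle_architecture_config == {"hidden_size": 4096, "num_eagle_layers": 2}
```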
