Commit d8e5d44

Implement Inverse Power calibration for sparse attention
Signed-off-by: Kai Xu <kaix@nvidia.com>
1 parent e915ff0 commit d8e5d44

11 files changed: 432 additions & 310 deletions
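
The "Inverse Power" in the title is the curve the calibrator now fits: rather than storing a single threshold_scale_factor per phase, calibration produces parameters (k, p) of an inverse power model, scale_factor = k / (1 - sparsity)^p, as the log lines added in calibrate.py below spell out. A minimal sketch of that relation, assuming the parameters are used exactly as the log line states:

    def threshold_scale(k: float, p: float, sparsity: float) -> float:
        """Inverse power model fitted during calibration:
        scale_factor = k / (1 - sparsity)^p.
        Diverges as sparsity -> 1, so sparsity must stay in [0, 1).
        """
        assert 0.0 <= sparsity < 1.0
        return k / (1.0 - sparsity) ** p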

examples/llm_sparsity/attention_sparsity/hf_sa.py

Lines changed: 31 additions & 37 deletions
@@ -17,12 +17,12 @@
 """Example script for applying sparse attention to HuggingFace models."""

 import argparse
+import copy
 import random
 from pathlib import Path

 import numpy as np
 import torch
-from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

 import modelopt.torch.opt as mto
@@ -46,41 +46,13 @@
 }


-def get_narrativeqa_samples(num_samples=3):
-    """Load samples from NarrativeQA dataset for testing.
-
-    Args:
-        num_samples: Number of samples to generate
-
-    Raises:
-        RuntimeError: If dataset loading fails
-        ValueError: If no valid samples could be loaded
-    """
-    # Load NarrativeQA dataset with retry logic
-    try:
-        dataset = load_dataset("narrativeqa", split="test", streaming=True)
-    except Exception as e:
-        raise RuntimeError(f"Failed to load NarrativeQA dataset: {e}")
-
-    samples = []
-    for i, item in enumerate(dataset):
-        if i >= num_samples:
-            break
-
-        # Combine document context and question
-        context = item.get("document", {}).get("text", "")
-        question = item.get("question", {}).get("text", "")
-
-        if context and question:
-            # Use the full context as-is
-            prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
-            samples.append(prompt)
-
-    if not samples:
-        raise ValueError("Could not load NarrativeQA samples")
-
-    print(f"Loaded {len(samples)} NarrativeQA samples")
-    return samples
+def get_test_prompts():
+    """Get simple test prompts for sample output generation."""
+    return [
+        "What is the capital of France? Answer:",
+        "Explain the theory of relativity in simple terms:",
+        "Write a short poem about the ocean:",
+    ]


 def truncate_text(text: str, tokenizer, max_length: int):
@@ -130,7 +102,7 @@ def generate_sample_output(model, tokenizer, args):
         Tuple of (generated_text, input_prompt, input_ids)
     """
     # Load test sample
-    prompts = get_narrativeqa_samples(num_samples=1)
+    prompts = get_test_prompts()
     prompt = prompts[0]

     # Prepare inputs
@@ -198,6 +170,20 @@ def main(args):
     # Apply sparse attention with optional calibration
     print(f"\nApplying sparse attention: {args.sparse_attn}")
     sparse_config = SPARSE_ATTN_CFG_CHOICES[args.sparse_attn]
+
+    # Override target_sparse_ratio if provided via CLI
+    if args.target_sparse_ratio is not None:
+        sparse_config = copy.deepcopy(sparse_config)
+        sparse_cfg = sparse_config.get("sparse_cfg", {})
+        if isinstance(sparse_cfg, dict) and "calibration" in sparse_cfg:
+            calibration_cfg = sparse_cfg["calibration"]
+            if isinstance(calibration_cfg, dict):
+                calibration_cfg["target_sparse_ratio"] = {
+                    "prefill": args.target_sparse_ratio,
+                    "decode": args.target_sparse_ratio,
+                }
+                print(f"Overriding target_sparse_ratio to {args.target_sparse_ratio}")
+
 model = mtsa.sparsify(model, config=sparse_config)
 print("Sparse attention applied successfully!")

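For reference, the nesting the override walks (sparse_cfg -> calibration -> target_sparse_ratio) implies a config shaped roughly as below; the key names follow the lookups in the code above, while the values and elided fields are illustrative only:

    # Illustrative shape only; values are hypothetical.
    sparse_config = {
        "sparse_cfg": {
            "calibration": {
                "target_sparse_ratio": {"prefill": 0.5, "decode": 0.5},
                # ... other calibration fields (samples, max_seqlen, ...)
            },
            # ... method-specific settings
        },
    }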
@@ -287,5 +273,13 @@ def main(args):
         help="Directory to export the model with sparse attention applied",
     )

+    # Calibration arguments
+    parser.add_argument(
+        "--target_sparse_ratio",
+        type=float,
+        default=None,
+        help="Target sparsity ratio for calibration (0.0 to 1.0). Overrides config value.",
+    )
+
     args = parser.parse_args()
     main(args)
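
A hypothetical invocation exercising the new flag (only --sparse_attn and --target_sparse_ratio appear in this diff; any model or output flags the script takes are omitted):

    python examples/llm_sparsity/attention_sparsity/hf_sa.py \
        --sparse_attn <choice> \
        --target_sparse_ratio 0.5

Note the single CLI value is applied to both the prefill and decode phases.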

modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py

Lines changed: 103 additions & 22 deletions
@@ -15,8 +15,11 @@

 """Calibration functions for sparse attention."""

+import hashlib
+import json
 import warnings
 from collections.abc import Callable
+from pathlib import Path
 from typing import Any

 import torch
@@ -30,6 +33,54 @@
 from .dataset import RulerDatasetBuilder


+def _get_cache_path(
+    tokenizer_path: str, samples: int, max_seqlen: int, cache_dir: str | None = None
+) -> Path:
+    """Generate cache file path based on calibration parameters.
+
+    Args:
+        tokenizer_path: Path to tokenizer (used in hash)
+        samples: Number of calibration samples
+        max_seqlen: Maximum sequence length
+        cache_dir: Optional cache directory. If None, uses ~/.cache/modelopt/sparse_attention/
+    """
+    # Create a hash of the parameters for the cache filename
+    key = f"{tokenizer_path}_{samples}_{max_seqlen}"
+    hash_str = hashlib.md5(key.encode(), usedforsecurity=False).hexdigest()[:12]
+    filename = f"ruler_cache_{samples}s_{max_seqlen}l_{hash_str}.json"
+
+    if cache_dir:
+        base_dir = Path(cache_dir)
+    else:
+        base_dir = Path.home() / ".cache" / "modelopt" / "sparse_attention"
+
+    return base_dir / filename
+
+
+def _load_cached_data(cache_path: Path) -> list[dict[str, Any]] | None:
+    """Load calibration data from cache if it exists."""
+    if cache_path.exists():
+        try:
+            with open(cache_path) as f:
+                data = json.load(f)
+            print(f"Loaded {len(data)} cached calibration samples from {cache_path}")
+            return data
+        except Exception as e:
+            print(f"Warning: Failed to load cache: {e}")
+    return None
+
+
+def _save_cached_data(cache_path: Path, data: list[dict[str, Any]]) -> None:
+    """Save calibration data to cache."""
+    try:
+        cache_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(cache_path, "w") as f:
+            json.dump(data, f)
+        print(f"Saved calibration samples to cache: {cache_path}")
+    except Exception as e:
+        print(f"Warning: Failed to save cache: {e}")
+
+
 def _extract_tokenizer_from_model(model: nn.Module) -> str:
     """Extract tokenizer name/path from model config.

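As a worked example of where the cache lands, mirroring _get_cache_path above (the tokenizer path is hypothetical; the 12-character hash depends on all three parameters):

    import hashlib
    from pathlib import Path

    tokenizer_path, samples, max_seqlen = "meta-llama/Llama-3.1-8B", 32, 8192
    key = f"{tokenizer_path}_{samples}_{max_seqlen}"
    hash_str = hashlib.md5(key.encode(), usedforsecurity=False).hexdigest()[:12]
    print(Path.home() / ".cache" / "modelopt" / "sparse_attention"
          / f"ruler_cache_{samples}s_{max_seqlen}l_{hash_str}.json")
    # -> ~/.cache/modelopt/sparse_attention/ruler_cache_32s_8192l_<hash>.json

Because the key omits num_length_bins, changing only that setting reuses a stale cache entry; changing samples, max_seqlen, or the tokenizer path generates a fresh one (max_length_filter is derived from max_seqlen, so it is covered).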
@@ -255,18 +306,31 @@ def calibrate_sparse_attention(
     calibration_data = None

     if calibrate_prefill or calibrate_decode:
-        builder = RulerDatasetBuilder(
-            samples=calib_config.samples,
-            max_seqlen=calib_config.max_seqlen,
-            tokenizer_name_or_path=tokenizer,
-            num_length_bins=calib_config.num_length_bins,
-            max_length_filter=int(calib_config.max_seqlen * 1.5),
+        # Try to load from cache first
+        cache_path = _get_cache_path(
+            tokenizer,
+            calib_config.samples,
+            calib_config.max_seqlen,
+            cache_dir=calib_config.cache_dir,
         )
-        calibration_data = builder.build_calibration_dataset()
-        print(f"Generated {len(calibration_data)} calibration samples")
+        calibration_data = _load_cached_data(cache_path)
+
+        # Generate if not cached
+        if calibration_data is None:
+            builder = RulerDatasetBuilder(
+                samples=calib_config.samples,
+                max_seqlen=calib_config.max_seqlen,
+                tokenizer_name_or_path=tokenizer,
+                num_length_bins=calib_config.num_length_bins,
+                max_length_filter=int(calib_config.max_seqlen * 1.5),
+            )
+            calibration_data = builder.build_calibration_dataset()
+            print(f"Generated {len(calibration_data)} calibration samples")
+
+            # Save to cache for future runs
+            _save_cached_data(cache_path, calibration_data)

     # Initialize results
-    threshold_scale_factor: dict[str, float] = {}
     calibration_results: dict[str, Any] = {}

     # Run prefill calibration if enabled
@@ -282,13 +346,11 @@ def calibrate_sparse_attention(
         )

         prefill_calibrator = DynamicThresholdCalibrator(
-            target_sparse_ratio=target_dict,
             threshold_trials=calib_config.threshold_trials,
         )
         prefill_result = prefill_calibrator.calibrate(model, prefill_forward_loop, phase="prefill")

-        if "scale_factor" in prefill_result:
-            threshold_scale_factor["prefill"] = prefill_result["scale_factor"]
+        if "k" in prefill_result and "p" in prefill_result:
             calibration_results["prefill"] = prefill_result
         else:
             warnings.warn("Prefill calibration did not produce valid results")
@@ -306,38 +368,57 @@ def calibrate_sparse_attention(
         )

         decode_calibrator = DynamicThresholdCalibrator(
-            target_sparse_ratio=target_dict,
             threshold_trials=calib_config.threshold_trials,
         )
         decode_result = decode_calibrator.calibrate(model, decode_forward_loop, phase="decode")

-        if "scale_factor" in decode_result:
-            threshold_scale_factor["decode"] = decode_result["scale_factor"]
+        if "k" in decode_result and "p" in decode_result:
             calibration_results["decode"] = decode_result
         else:
             warnings.warn("Decode calibration did not produce valid results")

     # Check if any calibration succeeded
-    if not threshold_scale_factor:
+    if not calibration_results:
         warnings.warn("No calibration produced valid results")
         return {}

-    # Apply combined threshold_scale_factor dict to all modules
+    # Extract k and p for each phase
+    calibration_params: dict[str, dict[str, float]] = {}
+    for phase in ["prefill", "decode"]:
+        if phase in calibration_results:
+            result = calibration_results[phase]
+            calibration_params[phase] = {
+                "k": result["k"],
+                "p": result["p"],
+            }
+
+    # Apply calibration params to all modules
     print("\n" + "=" * 60)
     print("APPLYING CALIBRATION RESULTS")
     print("=" * 60)
-    print(f"Applying threshold_scale_factor to {len(sparse_modules)} modules:")
-    for phase, scale_factor in threshold_scale_factor.items():
-        print(f"  {phase}: {scale_factor:.6f}")
+    print(f"Applying calibration to {len(sparse_modules)} modules:")
+    for phase, params in calibration_params.items():
+        result = calibration_results[phase]
+        print(f"  {phase}:")
+        print(f"    Model: scale_factor = {params['k']:.4f} / (1 - sparsity)^{params['p']:.4f}")
+        print(f"    R-squared: {result['r_squared']:.6f}")

     for module_name, module in sparse_modules:
-        module._sparse_method_instance.threshold_scale_factor = threshold_scale_factor
+        module._sparse_method_instance.calibration_params = calibration_params
+        module._sparse_method_instance.target_sparse_ratio = target_dict

     # Print final summary
     print("\nCalibration complete!")
+    print(
+        f"Target sparsity: prefill={target_dict.get('prefill', 0):.0%}, "
+        f"decode={target_dict.get('decode', 0):.0%}"
+    )
+    print("\nTo change target sparsity at inference time, update:")
+    print("  module._sparse_method_instance.target_sparse_ratio = {'prefill': X, 'decode': Y}")
     print_sparse_attention_summary(model)

     return {
-        "threshold_scale_factor": threshold_scale_factor,
+        "calibration_params": calibration_params,
+        "target_sparse_ratio": target_dict,
         "calibration_results": calibration_results,
     }