Commit 4c9e1bd

Switch to exponential model for fitting from inverse power
Signed-off-by: Kai Xu <kaix@nvidia.com>
1 parent: d8e5d44

9 files changed: 156 additions & 94 deletions

modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py

Lines changed: 6 additions & 6 deletions

```diff
@@ -350,7 +350,7 @@ def calibrate_sparse_attention(
     )
     prefill_result = prefill_calibrator.calibrate(model, prefill_forward_loop, phase="prefill")
 
-    if "k" in prefill_result and "p" in prefill_result:
+    if "a" in prefill_result and "b" in prefill_result:
         calibration_results["prefill"] = prefill_result
     else:
         warnings.warn("Prefill calibration did not produce valid results")
@@ -372,7 +372,7 @@ def calibrate_sparse_attention(
     )
     decode_result = decode_calibrator.calibrate(model, decode_forward_loop, phase="decode")
 
-    if "k" in decode_result and "p" in decode_result:
+    if "a" in decode_result and "b" in decode_result:
         calibration_results["decode"] = decode_result
     else:
         warnings.warn("Decode calibration did not produce valid results")
@@ -382,14 +382,14 @@ def calibrate_sparse_attention(
         warnings.warn("No calibration produced valid results")
         return {}
 
-    # Extract k and p for each phase
+    # Extract a and b for each phase
     calibration_params: dict[str, dict[str, float]] = {}
     for phase in ["prefill", "decode"]:
         if phase in calibration_results:
             result = calibration_results[phase]
             calibration_params[phase] = {
-                "k": result["k"],
-                "p": result["p"],
+                "a": result["a"],
+                "b": result["b"],
             }
 
     # Apply calibration params to all modules
@@ -400,7 +400,7 @@ def calibrate_sparse_attention(
     for phase, params in calibration_params.items():
         result = calibration_results[phase]
         print(f" {phase}:")
-        print(f" Model: scale_factor = {params['k']:.4f} / (1 - sparsity)^{params['p']:.4f}")
+        print(f" Model: scale_factor = {params['a']:.6f} * exp({params['b']:.4f} * sparsity)")
        print(f" R-squared: {result['r_squared']:.6f}")
 
     for module_name, module in sparse_modules:
```

modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py

Lines changed: 39 additions & 35 deletions

```diff
@@ -31,21 +31,21 @@
 
 
 class DynamicThresholdCalibrator:
-    """Dynamic threshold calibrator using Inverse Power model.
+    """Dynamic threshold calibrator using Exponential model.
 
     Calibration Algorithm:
     1. For each threshold λ_j in threshold_trials:
        - Run ALL samples through forward_loop
        - For each sample i with length L_i, collect sparsity S_ij
        - Compute scale_factor_ij = λ_j × L_i
 
-    2. Fit Inverse Power model to ALL individual (sf_ij, S_ij) pairs:
-       scale_factor = k / (1 - sparsity)^p
+    2. Fit Exponential model to ALL individual (sf_ij, S_ij) pairs:
+       scale_factor = a * exp(b * sparsity)
 
-    3. Return fitted k and p parameters (model-specific)
+    3. Return fitted a and b parameters
 
     At inference time (user specifies target_sparsity S*):
-        scale_factor = k / (1 - S*)^p
+        scale_factor = a * exp(b * S*)
         threshold = scale_factor / seqlen
 
     Key insight: Using all individual data points (N_thresholds × N_samples)
@@ -88,20 +88,20 @@ def __init__(
         ]
 
     def calibrate(self, model: nn.Module, forward_loop: Callable, phase: str) -> dict[str, Any]:
-        """Calibrate k and p parameters for Inverse Power model.
+        """Calibrate a and b parameters for Exponential model.
 
         Algorithm:
         1. For each threshold λ_j in threshold_trials:
            - Run ALL samples, collect sparsities S_ij for each sample i
            - Compute scale_factor_ij = λ_j × L_i (where L_i is sample length)
 
-        2. Fit Inverse Power model to ALL (sf_ij, S_ij) pairs:
-           scale_factor = k / (1 - sparsity)^p
+        2. Fit Exponential model to ALL (sf_ij, S_ij) pairs:
+           scale_factor = a * exp(b * sparsity)
 
-        3. Return fitted k and p parameters
+        3. Return fitted a and b parameters
 
         At inference time (user specifies target_sparsity S*):
-            scale_factor = k / (1 - S*)^p
+            scale_factor = a * exp(b * S*)
             threshold = scale_factor / seqlen
 
         Args:
@@ -110,15 +110,15 @@ def calibrate(self, model: nn.Module, forward_loop: Callable, phase: str) -> dic
             phase: Phase to calibrate ('prefill' or 'decode')
 
         Returns:
-            Dict with calibration results including k, p, r_squared, and num_data_points
+            Dict with calibration results including a, b, r_squared, and num_data_points
         """
         # Extract attention modules
         attention_modules = [m for m in model.modules() if isinstance(m, SparseAttentionModule)]
 
         if not attention_modules:
             raise ValueError("No sparse attention modules found for calibration")
 
-        print(f"Starting Inverse Power model calibration ({phase} phase)")
+        print(f"Starting Exponential model calibration ({phase} phase)")
         print(f"Threshold trials: {len(self.threshold_trials)}")
 
         # Stage 1: Collect ALL (scale_factor, sparsity) pairs for all thresholds and samples
@@ -162,15 +162,16 @@ def calibrate(self, model: nn.Module, forward_loop: Callable, phase: str) -> dic
 
         print(f"Collected {len(all_data_points)} individual (scale_factor, sparsity) pairs")
 
-        # Stage 2: Fit Inverse Power model: scale_factor = k / (1 - sparsity)^p
-        print("\nStage 2: Fitting Inverse Power model to all data points...")
+        # Stage 2: Fit Exponential model: scale_factor = a * exp(b * sparsity)
+        print("\nStage 2: Fitting Exponential model to all data points...")
 
         # Extract data for fitting
-        scale_factors = np.array([p["scale_factor"] for p in all_data_points])
-        sparsities = np.array([p["sparsity"] for p in all_data_points])
+        scale_factors = np.array([pt["scale_factor"] for pt in all_data_points])
+        sparsities = np.array([pt["sparsity"] for pt in all_data_points])
 
-        # Filter out invalid sparsities (must be in (0, 1))
-        valid_mask = (sparsities > 0.01) & (sparsities < 0.99)
+        # Filter out extreme sparsities (must be in (10%, 90%))
+        # Extreme values are unreliable for fitting
+        valid_mask = (sparsities >= 0.10) & (sparsities <= 0.90)
         scale_factors = scale_factors[valid_mask]
         sparsities = sparsities[valid_mask]
 
@@ -180,44 +181,46 @@ def calibrate(self, model: nn.Module, forward_loop: Callable, phase: str) -> dic
             )
             return {}
 
-        # Define Inverse Power model: sf = k / (1 - S)^p
-        def inverse_power(sparsity, k, p):
-            return k / np.power(1 - sparsity, p)
+        # Define Exponential model: sf = a * exp(b * S)
+        def exponential(sparsity, a, b):
+            return a * np.exp(b * sparsity)
 
         # Fit the model
         try:
             popt, pcov = curve_fit(
-                inverse_power,
+                exponential,
                 sparsities,
                 scale_factors,
-                p0=[100, 1.5],  # Initial guess
-                bounds=([0.1, 0.1], [1e7, 10]),  # Bounds for k and p
+                p0=[1.0, 5.0],  # Initial guess
+                bounds=([0.0, 0.0], [np.inf, 20.0]),  # Bounds for a and b
                 maxfev=10000,
             )
-            k, p = popt
+            a, b = popt
         except Exception as e:
             warnings.warn(f"Curve fitting failed: {e}")
             return {}
 
-        # Calculate R-squared
-        pred_scale_factors = inverse_power(sparsities, k, p)
+        # Calculate R-squared and RMSE
+        pred_scale_factors = exponential(sparsities, a, b)
         ss_res = np.sum((scale_factors - pred_scale_factors) ** 2)
         ss_tot = np.sum((scale_factors - np.mean(scale_factors)) ** 2)
         r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0
+        rmse = np.sqrt(np.mean((scale_factors - pred_scale_factors) ** 2))
 
-        print(f"\n{phase.capitalize()} Calibration Results (Inverse Power Model):")
-        print(" Model: scale_factor = k / (1 - sparsity)^p")
-        print(f" Fitted k: {k:.4f}")
-        print(f" Fitted p: {p:.4f}")
+        print(f"\n{phase.capitalize()} Calibration Results (Exponential Model):")
+        print(" Model: scale_factor = a * exp(b * sparsity)")
+        print(f" Fitted a: {a:.6f}")
+        print(f" Fitted b: {b:.4f}")
         print(f" R-squared: {r_squared:.6f}")
+        print(f" RMSE: {rmse:.2f}")
         print(f" Data points used: {int(np.sum(valid_mask))} / {len(all_data_points)}")
 
         # Show scale_factor for various target sparsities
         print("\nScale factors for different target sparsities:")
         print(f" {'Target':<10} {'Scale Factor':<15}")
         print(f" {'-' * 10} {'-' * 15}")
         for target in [0.5, 0.7, 0.8, 0.9, 0.95]:
-            sf = k / (1 - target) ** p
+            sf = a * np.exp(b * target)
             print(f" {target:<10.0%} {sf:<15.2f}")
 
         # Print calibration data summary by threshold
@@ -238,12 +241,13 @@ def inverse_power(sparsity, k, p):
 
         return {
             "phase": phase,
-            "k": float(k),
-            "p": float(p),
+            "a": float(a),
+            "b": float(b),
             "r_squared": float(r_squared),
+            "rmse": float(rmse),
             "num_data_points": int(np.sum(valid_mask)),
             "total_samples": len(all_data_points),
-            "calibration_type": "inverse_power",
+            "calibration_type": "exponential",
         }
 
     def _enable_calibration_mode(self, modules: list[nn.Module]):
```
modelopt/torch/sparsity/attention_sparsity/calibration/ruler_utils.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -461,8 +461,8 @@ def find_optimal_haystack_size(
     upper_bound = max(estimated_max, incremental * 2)
     optimal_num_haystack = None
 
-    logger.info(f"Estimated {tokens_per_haystack:.1f} tokens per haystack")
-    logger.info(f"Binary search bounds: {lower_bound} to {upper_bound}")
+    logger.debug(f"Estimated {tokens_per_haystack:.1f} tokens per haystack")
+    logger.debug(f"Binary search bounds: {lower_bound} to {upper_bound}")
 
     while lower_bound <= upper_bound:
         mid = (lower_bound + upper_bound) // 2
@@ -486,6 +486,6 @@ def find_optimal_haystack_size(
             upper_bound = mid - 1
 
     final_size = optimal_num_haystack if optimal_num_haystack is not None else incremental
-    logger.info(f"Optimal haystack size: {final_size}")
+    logger.debug(f"Optimal haystack size: {final_size}")
 
     return final_size
```
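
The surrounding function (unchanged by this commit apart from log levels) binary-searches for the largest haystack count whose rendered prompt still fits a token budget. A hedged sketch of that search pattern follows; `token_count_for` and the lower-bound initialization are assumptions standing in for the repository's actual prompt-building and tokenization logic:

```python
def token_count_for(num_haystack: int, tokens_per_haystack: float = 512.0) -> int:
    # Hypothetical stand-in: in the real code this would build the RULER
    # prompt for num_haystack haystacks and count tokens with the tokenizer.
    return int(num_haystack * tokens_per_haystack)


def find_haystack_size(max_tokens: int, tokens_per_haystack: float, incremental: int) -> int:
    estimated_max = int(max_tokens / tokens_per_haystack)
    lower_bound, upper_bound = incremental, max(estimated_max, incremental * 2)
    optimal_num_haystack = None
    while lower_bound <= upper_bound:
        mid = (lower_bound + upper_bound) // 2
        if token_count_for(mid, tokens_per_haystack) <= max_tokens:
            optimal_num_haystack = mid  # fits the budget: remember, try larger
            lower_bound = mid + 1
        else:
            upper_bound = mid - 1  # overflows the budget: try smaller
    # Fall back to the incremental step size if nothing fit
    return optimal_num_haystack if optimal_num_haystack is not None else incremental


print(find_haystack_size(max_tokens=131072, tokens_per_haystack=512.0, incremental=8))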

modelopt/torch/sparsity/attention_sparsity/config.py

Lines changed: 4 additions & 3 deletions

```diff
@@ -147,10 +147,10 @@ def validate_threshold(cls, v):
 class CalibrationConfig(ModeloptBaseConfig):
     """Configuration for automatic threshold calibration using RULER dataset.
 
-    Calibration fits an Inverse Power model to determine dynamic thresholds that
-    achieve target sparsity. The model learns parameters k and p per phase:
+    Calibration fits an Exponential model to determine dynamic thresholds that
+    achieve target sparsity. The model learns parameters a and b per phase:
 
-        scale_factor = k / (1 - target_sparsity)^p
+        scale_factor = a * exp(b * target_sparsity)
 
     At inference time, the threshold is computed as:
 
@@ -160,6 +160,7 @@ class CalibrationConfig(ModeloptBaseConfig):
     - Target sparsity can be changed at runtime without recalibration
     - Threshold automatically adapts to sequence length
     - Supports independent prefill and decode phase calibration
+    - Exponential model provides better fit (lower RMSE)
     """
 
     target_sparse_ratio: dict[str, float] = ModeloptField(
```
modelopt/torch/sparsity/attention_sparsity/conversion.py

Lines changed: 74 additions & 17 deletions

```diff
@@ -244,28 +244,85 @@ def update_sparse_attention_metadata(
 def export_sparse_attention_config(model: nn.Module) -> dict[str, Any] | None:
     """Extract sparse attention config for export to config.json.
 
-    Extracts the calibration parameters (k, p) and target_sparse_ratio from the first
-    sparse attention module that has calibrated thresholds.
+    Extracts the calibration parameters (a, b) for the exponential threshold model
+    from the first sparse attention module that has calibrated thresholds.
+
+    The exported config allows computing threshold at runtime:
+        scale_factor = a * exp(b * target_sparsity)
+        threshold = scale_factor / seqlen
 
     Args:
         model: Model with sparse attention applied
 
     Returns:
-        Dictionary with sparse attention config, or None if no calibrated config found.
-        Contains "calibration_params" with k and p per phase, and "target_sparse_ratio".
+        Dictionary with sparse attention config for HuggingFace config.json export.
+        Returns None if no calibrated sparse attention modules found.
+
+        Example output::
+
+            {
+                "config_groups": {
+                    "group_0": {"sparse_algo": "softmax_skip", "targets": ["LlamaAttention"]}
+                },
+                "threshold_scale_factor": {
+                    "formula": "a * exp(b * target_sparsity)",
+                    "prefill": {"a": 7.93, "b": 8.61},
+                    "decode": {"a": 0.12, "b": 9.85},
+                },
+                "producer": {"name": "modelopt", "version": "0.37.0"},
+            }
     """
+    import modelopt
+
+    # Collect sparse attention module info
+    calibration_params = None
+    target_classes: set[str] = set()
+
     for module in model.modules():
         if isinstance(module, SparseAttentionModule):
-            calibration_params = getattr(module._sparse_method_instance, "calibration_params", None)
-            target_sparse_ratio = getattr(
-                module._sparse_method_instance, "target_sparse_ratio", None
-            )
-            if calibration_params is not None:
-                return {
-                    "calibration_params": calibration_params,
-                    "target_sparse_ratio": target_sparse_ratio,
-                }
-    return None
+            # Get the original wrapped module's class name
+            if hasattr(module, "get_original_cls_by_level"):
+                original_cls = module.get_original_cls_by_level(level=0)
+                if original_cls is not None:
+                    target_classes.add(original_cls.__name__)
+
+            # Get calibration params from first module that has them
+            if calibration_params is None:
+                calibration_params = getattr(
+                    module._sparse_method_instance, "calibration_params", None
+                )
+
+    # Return None if no calibration params found
+    if calibration_params is None:
+        return None
+
+    # Build threshold_scale_factor with model parameters
+    threshold_scale_factor: dict[str, Any] = {
+        "formula": "a * exp(b * target_sparsity)",
+    }
+    for phase in ["prefill", "decode"]:
+        if phase in calibration_params:
+            threshold_scale_factor[phase] = {
+                "a": calibration_params[phase]["a"],
+                "b": calibration_params[phase]["b"],
+            }
+
+    # Build the export config
+    export_config: dict[str, Any] = {
+        "config_groups": {
+            "group_0": {
+                "sparse_algo": "softmax_skip",
+                "targets": sorted(target_classes) if target_classes else ["Attention"],
+            }
+        },
+        "threshold_scale_factor": threshold_scale_factor,
+        "producer": {
+            "name": "modelopt",
+            "version": modelopt.__version__,
+        },
+    }
+
+    return export_config
 
 
 def disable_sparse_attention(model: nn.Module, wildcard_or_filter_func: str | Callable):
@@ -332,15 +389,15 @@ def _format_threshold(info: dict) -> str:
     """Format threshold info for display."""
     t = info.get("type")
     if t == "dynamic_calibrated":
-        # Inverse Power model: threshold = k / (1 - sparsity)^p / seqlen
+        # Exponential model: threshold = a * exp(b * sparsity) / seqlen
         params = info.get("calibration_params", {})
         target = info.get("target_sparse_ratio", {})
         parts = []
         for phase in ["prefill", "decode"]:
             if phase in params:
-                k, p = params[phase]["k"], params[phase]["p"]
+                a, b = params[phase]["a"], params[phase]["b"]
                 s = target.get(phase, 0.5)
-                parts.append(f"{phase}: k={k:.1f}, p={p:.2f}, target={s:.0%}")
+                parts.append(f"{phase}: a={a:.4f}, b={b:.2f}, target={s:.0%}")
         return f"calibrated({', '.join(parts)})"
     if t == "static":
         v = info.get("value")
```