
Commit ae965a9

Authored by ChenhanYu and claude
add: HF PTQ support and modelopt_recipes mount in launcher (#1089)
- Add common/hf_ptq/hf_ptq.sh script for running hf_ptq.py directly
- Add Qwen3-8B hf_ptq_local.yaml example config
- Mount modelopt_recipes alongside modelopt in both Slurm and Docker executors so modelopt.recipe imports work with the overlay
- Update default container to tensorrt-llm/release:1.3.0rc2

## Summary by CodeRabbit

* **New Features**
  * Added a Hugging Face PTQ workflow to run post-training quantization for models.
  * Added a local single-GPU pipeline for Qwen3-8B with fp8 quantization and export support.
  * Added a small command-line launcher to invoke the PTQ workflow with configurable model, quantization, calibration, and export options.
* **Chores**
  * Updated the default runtime container image to a newer release.
  * Included model-optimization recipe files in container mounts.

Signed-off-by: Chenhan Yu <chenhany@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 174f3a2 commit ae965a9

4 files changed: 82 additions & 1 deletion

File tree

- common/hf_ptq/hf_ptq.sh
- examples/Qwen/Qwen3-8B/hf_ptq_local.yaml
- tools/launcher/core.py
- tools/launcher/slurm_config.py

common/hf_ptq/hf_ptq.sh

Lines changed: 39 additions & 0 deletions
```bash
#!/bin/bash

# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "${SCRIPT_DIR}/../service_utils.sh"

trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER
trap 'exit_handler' EXIT
###################################################################################################

HF_PTQ_DIR=modules/Model-Optimizer/examples/llm_ptq

HF_MODEL=${HF_MODEL:-"Qwen/Qwen3-8B"}
QFORMAT=${QFORMAT:-"fp8"}
CALIB_SIZE=${CALIB_SIZE:-"512"}
EXPORT_PATH=${EXPORT_PATH:-"/scratchspace/exported_model"}

PYTHONPATH="${HF_PTQ_DIR}:${PYTHONPATH}" python ${HF_PTQ_DIR}/hf_ptq.py \
    --pyt_ckpt_path ${HF_MODEL} \
    --qformat ${QFORMAT} \
    --calib_size ${CALIB_SIZE} \
    --export_path ${EXPORT_PATH} \
    "$@"

report_result "PASS: hf_ptq ${HF_MODEL} ${QFORMAT}"
```
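Every setting here is an environment variable with a `${VAR:-default}` fallback, and trailing arguments pass through to hf_ptq.py via `"$@"`. A minimal sketch of a direct invocation, assuming it is run from the launcher repo root with `modules/Model-Optimizer` checked out (`--dataset cnn_dailymail` is the flag the example config below forwards):

```bash
# Override two defaults and forward a dataset flag to hf_ptq.py via "$@".
# Assumes the Model-Optimizer checkout exists under ./modules.
QFORMAT=fp8 CALIB_SIZE=256 \
  bash common/hf_ptq/hf_ptq.sh --dataset cnn_dailymail
```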

tools/launcher/core.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -235,9 +235,18 @@ def build_slurm_executor(
         f"{job_dir}/{experiment_title}/{experiment_id}"
         f"/{task_name}/code/modules/Model-Optimizer/modelopt"
     )
+    modelopt_recipes_dst = os.path.join(
+        os.path.dirname(os.path.normpath(slurm_config.modelopt_install_path)),
+        "modelopt_recipes",
+    )
+    modelopt_recipes_src = (
+        f"{job_dir}/{experiment_title}/{experiment_id}"
+        f"/{task_name}/code/modules/Model-Optimizer/modelopt_recipes"
+    )
     container_mounts += [
         f"{scratch_src}:{scratch_dst}",
         f"{modelopt_src}:{modelopt_dst}",
+        f"{modelopt_recipes_src}:{modelopt_recipes_dst}",
         f"{job_dir}/{experiment_title}:/{experiment_title}",
     ]

@@ -291,11 +300,17 @@ def build_docker_executor(
     modelopt_dst = slurm_config.modelopt_install_path
     if modelopt_src_path is None:
         modelopt_src_path = os.path.join(os.getcwd(), "modules/Model-Optimizer/modelopt")
+    modelopt_recipes_dst = os.path.join(
+        os.path.dirname(os.path.normpath(slurm_config.modelopt_install_path)),
+        "modelopt_recipes",
+    )
+    modelopt_recipes_src_path = os.path.join(os.path.dirname(modelopt_src_path), "modelopt_recipes")
     exp_title_src = os.path.join(job_dir, experiment_title)
     os.makedirs(exp_title_src, exist_ok=True)
     container_mounts += [
         f"{scratch_src}:{scratch_dst}",
         f"{modelopt_src_path}:{modelopt_dst}",
+        f"{modelopt_recipes_src_path}:{modelopt_recipes_dst}",
         f"{exp_title_src}:/{experiment_title}",
     ]
```
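Both executors derive the mount destination from `slurm_config.modelopt_install_path` rather than hardcoding it: `os.path.normpath` drops any trailing slash, `os.path.dirname` walks up to the enclosing dist-packages directory, and `modelopt_recipes` is joined back on. A shell sketch of the same derivation, using the default install path from slurm_config.py:

```bash
# Mirror of the os.path logic above: strip a trailing slash, take the
# parent directory, and append modelopt_recipes.
modelopt_install_path="/usr/local/lib/python3.12/dist-packages/modelopt"
modelopt_recipes_dst="$(dirname "${modelopt_install_path%/}")/modelopt_recipes"
echo "${modelopt_recipes_dst}"
# -> /usr/local/lib/python3.12/dist-packages/modelopt_recipes
```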

examples/Qwen/Qwen3-8B/hf_ptq_local.yaml

Lines changed: 27 additions & 0 deletions

```yaml
# Local single-GPU HF PTQ for Qwen3-8B using hf_ptq.py from Model-Optimizer.
#
# Runs hf_ptq.py directly (Hugging Face path, no Megatron-LM conversion).
#
# Usage:
#   uv run launch.py --yaml examples/Qwen/Qwen3-8B/hf_ptq_local.yaml hf_local=/mnt/hf-local --yes

job_name: Qwen3-8B_fp8_hf_ptq_local
pipeline:
  skip: false
  allow_to_fail: false
  note:

task_0:
  script: common/hf_ptq/hf_ptq.sh
  args:
    - --dataset cnn_dailymail
  environment:
    - HF_MODEL: /hf-local/Qwen/Qwen3-8B
    - QFORMAT: fp8
    - CALIB_SIZE: "512"
    - EXPORT_PATH: /scratchspace/exported_model
  slurm_config:
    _factory_: "slurm_factory"
    nodes: 1
    ntasks_per_node: 1
    gpus_per_node: 1
```
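The `HF_MODEL` path in this config only resolves because the launcher mounts a host directory at `/hf-local` inside the container (see the `SLURM_HF_LOCAL` default in slurm_config.py below). A sketch of the host-side check before launching, assuming the local snapshot layout mirrors the Hub model id:

```bash
# Host side: the directory passed as hf_local (or SLURM_HF_LOCAL) is
# mounted at /hf-local in the container, so this path must hold the
# Qwen3-8B checkpoint for HF_MODEL=/hf-local/Qwen/Qwen3-8B to resolve.
export SLURM_HF_LOCAL=/mnt/hf-local
ls "${SLURM_HF_LOCAL}/Qwen/Qwen3-8B"
```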

tools/launcher/slurm_config.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -53,7 +53,7 @@ def slurm_factory(
     nodes: int = 1,
     ntasks_per_node: int = 1,
     gpus_per_node: int = 1,
-    container: str = "nvcr.io/nvidia/tensorrt-llm/release:1.2.0",
+    container: str = "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc8",
     modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt",
     container_mounts: list[str] = [
         "{}:/hf-local".format(os.environ.get("SLURM_HF_LOCAL", "/hf-local")),
```
