openvinotoolkit · daniil-lyakhov · Apr 28, 2026 · Apr 29, 2026 · May 11, 2026
@@ -10,6 +10,10 @@
 # limitations under the License.
 
 
+import contextvars
+from collections.abc import Generator
+from contextlib import contextmanager
+
 import numpy as np
 import openvino as ov
 from openvino import Type
@@ -19,6 +23,17 @@
 from nncf.definitions import NNCF_DATASET_RESET_STATE_KEY
 from nncf.openvino.graph.model_utils import model_has_state
 
+# Device name read when an engine compiles a model; None means no device was requested by the caller.
+_calibration_device: contextvars.ContextVar[str | None] = contextvars.ContextVar("_calibration_device", default=None)
+
+
+@contextmanager
+def calibration_device_context(device: str | None) -> Generator[None, None, None]:
+    """
+    Stores the given device name in the `_calibration_device` context variable
+    for the duration of the `with` block.
+
+    :param device: Device name to store while the block is active, or None to store no device.
+    """
+    previous_state = _calibration_device.set(device)
+    try:
+        yield
+    finally:
+        # Always undo the assignment, even if the body raised, so that nested contexts unwind in order.
+        _calibration_device.reset(previous_state)
+
 
 class OVCompiledModelEngine(Engine):
     """
@@ -79,12 +94,13 @@ def __init__(self, model: ov.Model, use_fp32_precision: bool = True):
         :param use_fp32_precision: A flag that determines whether to force the engine to use FP32
             precision during inference.
         """
+        device_name = _calibration_device.get() or "CPU"
         config = None
-        if use_fp32_precision:
+        if use_fp32_precision and device_name == "CPU":
             config = {inference_precision: Type.f32}
         ie = ov.Core()
         stateful = model_has_state(model)
-        compiled_model = ie.compile_model(model, device_name="CPU", config=config)
+        compiled_model = ie.compile_model(model, device_name=device_name, config=config)
         self.engine = OVCompiledModelEngine(compiled_model, stateful)
 
     def infer(

@@ -21,6 +21,7 @@
 from nncf.common.logging import nncf_logger
 from nncf.common.quantization.structs import QuantizationPreset
 from nncf.data import Dataset
+from nncf.openvino.engine import calibration_device_context
 from nncf.openvino.graph.metatypes.groups import OPERATIONS_OUTPUT_HAS_NO_BATCH_AXIS
 from nncf.openvino.graph.metatypes.openvino_metatypes import OVIfMetatype
 from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype
@@ -119,9 +120,11 @@ def _extract_all_subgraphs(model: ov.Model, current_id: str) -> None:
         f"The model consists of {if_ops_number} If node(-s) with then and else bodies. \
             Main model and all If bodies will be quantized recursively."
     )
-    quantized_model, _ = apply_algorithm_if_bodies(
-        quantization_algorithm, model, graphs, main_model_graph_id, calibration_dataset, subset_size, 1
-    )
+    calibration_device = advanced_parameters.calibration_device if advanced_parameters else None
+    with calibration_device_context(calibration_device):
+        quantized_model, _ = apply_algorithm_if_bodies(
+            quantization_algorithm, model, graphs, main_model_graph_id, calibration_dataset, subset_size, 1
+        )
 
     if is_weight_compression_needed(advanced_parameters):
         compress_quantize_weights_transformation(quantized_model)
@@ -168,7 +171,9 @@ def native_quantize_impl(
     )
     graph = GraphConverter.create_nncf_graph(model)
     warning_model_no_batchwise_support(graph, advanced_parameters, model_type, OPERATIONS_OUTPUT_HAS_NO_BATCH_AXIS)
-    quantized_model = quantization_algorithm.apply(model, graph, dataset=calibration_dataset)
+    calibration_device = advanced_parameters.calibration_device if advanced_parameters else None
+    with calibration_device_context(calibration_device):
+        quantized_model = quantization_algorithm.apply(model, graph, dataset=calibration_dataset)
 
     if is_weight_compression_needed(advanced_parameters):
         compress_quantize_weights_transformation(quantized_model)
@@ -296,15 +301,19 @@ def quantize_with_accuracy_control_impl(
             advanced_accuracy_restorer_parameters.num_ranking_workers,
             advanced_accuracy_restorer_parameters.restore_mode,
         )
-        quantized_model = accuracy_restorer.apply(
-            model,
-            initial_metric_results,
-            quantized_model,
-            quantized_metric_results,
-            validation_dataset,
-            validation_dataset_size,
-            evaluator,
+        calibration_device = (
+            advanced_quantization_parameters.calibration_device if advanced_quantization_parameters else None
         )
+        with calibration_device_context(calibration_device):
+            quantized_model = accuracy_restorer.apply(
+                model,
+                initial_metric_results,
+                quantized_model,
+                quantized_metric_results,
+                validation_dataset,
+                validation_dataset_size,
+                evaluator,
+            )
 
     if compress_weights:
         compress_quantize_weights_transformation(quantized_model)
@@ -402,12 +411,15 @@ def compress_weights_impl(
         advanced_parameters,
     )
 
+    calibration_device = advanced_parameters.calibration_device if advanced_parameters else None
+
     statistics_points = None
     if advanced_parameters and advanced_parameters.statistics_path:
         # If there is no such directory, then caches statistics
         statistics_path = Path(advanced_parameters.statistics_path)
         if not statistics_path.exists():
-            cache_weight_compression_statistics(model, graph, dataset, subset_size, statistics_path)
+            with calibration_device_context(calibration_device):
+                cache_weight_compression_statistics(model, graph, dataset, subset_size, statistics_path)
         statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset)
         compression_algorithm.set_backend_entity(model)
         _, matmul_input_to_output_nodes_map = compression_algorithm.get_compression_nodes_info(graph)
@@ -421,4 +433,5 @@ def compress_weights_impl(
         statistics_aggregator.load_statistics_from_dir(statistics_path)
         statistics_points = statistics_aggregator.statistic_points
 
-    return compression_algorithm.apply(model, graph, statistics_points, dataset)
+    with calibration_device_context(calibration_device):
+        return compression_algorithm.apply(model, graph, statistics_points, dataset)
@@ -252,6 +252,10 @@ class AdvancedQuantizationParameters:
     :type smooth_quant_alpha: float
     :param backend_params: Backend-specific parameters.
     :type backend_params: dict[str, Any]
+    :param calibration_device: OpenVINO device name to use for calibration inference
+        (e.g. "CPU", "GPU", "GPU.0", "AUTO:GPU,CPU"). If None, defaults to "CPU".
+        Only applicable to the OpenVINO backend.
+    :type calibration_device: Optional[str]
     """
 
     # General parameters
@@ -282,6 +286,9 @@ class AdvancedQuantizationParameters:
     # Backend specific parameters
     backend_params: dict[str, Any] = field(default_factory=dict)
 
+    # Calibration device
+    calibration_device: str | None = None
+
 
 @api()
 @dataclass
@@ -427,6 +434,10 @@ class AdvancedCompressionParameters:
     :type lora_correction_params: AdvancedLoraCorrectionParameters
     :param backend_params: Backend-specific parameters.
     :type backend_params: dict[str, Any]
+    :param calibration_device: OpenVINO device name to use for calibration inference
+        (e.g. "CPU", "GPU", "GPU.0", "AUTO:GPU,CPU"). If None, defaults to "CPU".
+        Only applicable to the OpenVINO backend.
+    :type calibration_device: Optional[str]
     :param codebook: The codebook (LUT) for the weight compression.
         Applicable for vector quantization. Must be a numpy array or ov Tensor.
     :type codebook: TTensor
@@ -445,6 +456,7 @@ class AdvancedCompressionParameters:
     gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters)
     lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
     backend_params: dict[str, Any] = field(default_factory=dict)
+    calibration_device: str | None = None
     codebook: TTensor | None = None
     adaptive_codebook_params: AdvancedAdaptiveCodebookParameters = field(
         default_factory=AdvancedAdaptiveCodebookParameters

@@ -201,6 +201,10 @@ def quantize(
     if backend == BackendType.ONNX:
         from nncf.onnx.quantization.quantize_model import quantize_impl
 
+        if advanced_parameters and advanced_parameters.calibration_device:
+            msg = "ONNX backend does not support the `calibration_device` option."
+            raise nncf.ParameterNotSupportedError(msg)
+
         return quantize_impl(  # type: ignore[no-any-return]
             model=model,
             calibration_dataset=calibration_dataset,
@@ -217,6 +221,10 @@ def quantize(
     if backend == BackendType.TORCH:
         from nncf.torch.function_hook.quantization.quantize_model import quantize_impl
 
+        if advanced_parameters and advanced_parameters.calibration_device:
+            msg = "Torch backend does not support the `calibration_device` option."
+            raise nncf.ParameterNotSupportedError(msg)
+
         return quantize_impl(  # type: ignore[no-any-return]
             model=model,
             calibration_dataset=calibration_dataset,
@@ -233,6 +241,10 @@ def quantize(
     if backend == BackendType.TORCH_FX:
         from nncf.experimental.torch.fx.quantization.quantize_model import quantize_impl
 
+        if advanced_parameters and advanced_parameters.calibration_device:
+            msg = "TorchFX backend does not support the `calibration_device` option."
+            raise nncf.ParameterNotSupportedError(msg)
+
         return quantize_impl(  # type: ignore[no-any-return]
             model=model,
             calibration_dataset=calibration_dataset,
@@ -372,6 +384,10 @@ def quantize_with_accuracy_control(
     if backend == BackendType.ONNX:
         from nncf.onnx.quantization.quantize_model import quantize_with_accuracy_control_impl
 
+        if advanced_quantization_parameters and advanced_quantization_parameters.calibration_device:
+            msg = "ONNX backend does not support the `calibration_device` option."
+            raise nncf.ParameterNotSupportedError(msg)
+
         return quantize_with_accuracy_control_impl(  # type: ignore[no-any-return]
             model,
             calibration_dataset,
@@ -528,6 +544,10 @@ def compress_weights(
             msg = "Torch backend does not support statistics caching."
             raise nncf.ParameterNotSupportedError(msg)
 
+        if advanced_parameters and advanced_parameters.calibration_device:
+            msg = "Torch backend does not support the `calibration_device` option."
+            raise nncf.ParameterNotSupportedError(msg)
+
         if compression_format == CompressionFormat.FQ and group_size != -1:
             msg = "Torch backend does not support FQ compression format for group-wise quantization."
             raise nncf.ParameterNotSupportedError(msg)
@@ -578,6 +598,10 @@ def compress_weights(
             msg = "TorchFX does not supports statistics caching."
             raise nncf.ParameterNotSupportedError(msg)
 
+        if advanced_parameters and advanced_parameters.calibration_device:
+            msg = "TorchFX backend does not support the `calibration_device` option."
+            raise nncf.ParameterNotSupportedError(msg)
+
         if compression_format in [CompressionFormat.FQ, CompressionFormat.FQ_LORA, CompressionFormat.FQ_LORA_NLS]:
             msg = "Torch FX backend does not support FQ, FQ_LORA and FQ_LORA_NLS compression formats."
             raise nncf.ParameterNotSupportedError(msg)
@@ -649,6 +673,10 @@ def compress_weights(
         if advanced_parameters and advanced_parameters.statistics_path:
             msg = "ONNX does not supports statistics caching."
             raise nncf.ParameterNotSupportedError(msg)
+
+        if advanced_parameters and advanced_parameters.calibration_device:
+            msg = "ONNX backend does not support the `calibration_device` option."
+            raise nncf.ParameterNotSupportedError(msg)
         compression_weights_impl = onnx_compress_weights_impl
     if compression_weights_impl is None:
         msg = f"Unsupported type of backend: {backend}"

@@ -0,0 +1,37 @@
+# Copyright (c) 2026 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC
+from abc import abstractmethod
+from typing import TypeVar
+
+import pytest
+
+import nncf
+from nncf.data.dataset import Dataset
+from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters
+
+TModel = TypeVar("TModel")
+
+
+class TemplateTestQuantizeApi(ABC):
+    """Template with `nncf.quantize` API checks; subclasses provide the model via `get_simple_model`."""
+
+    @staticmethod
+    @abstractmethod
+    def get_simple_model() -> TModel:
+        """Returns a minimal model for the backend."""
+
+    def test_quantize_calibration_device(self):
+        """Expects `nncf.quantize` to raise ParameterNotSupportedError when `calibration_device` is set."""
+        simple_model = self.get_simple_model()
+        calibration_dataset = Dataset([0])
+        advanced_parameters = AdvancedQuantizationParameters(calibration_device="SOME_DEVICE")
+        with pytest.raises(nncf.ParameterNotSupportedError):
+            nncf.quantize(simple_model, calibration_dataset, advanced_parameters=advanced_parameters)
@@ -983,3 +983,17 @@ def test_compression_skipped_with_transposed_activations(self, transpose_a_suppo
                 all_layers=True,
                 **kwargs,
             )
+
+    def test_compress_weights_calibration_device(self):
+        """Expects `compress_weights` to raise ParameterNotSupportedError when `calibration_device` is set."""
+        awq_model = self.get_awq_model(non_mergable_pattern=False, is_3d_weights=False)
+        calibration_data = Dataset([self.to_tensor(np.ones([2, 8, 8]))])
+        compression_kwargs = {
+            "mode": CompressWeightsMode.INT4_SYM,
+            "ratio": 1.0,
+            "group_size": 2,
+            "dataset": calibration_data,
+            "awq": True,
+            "advanced_parameters": CompressionParams(calibration_device="SOME_DEVICE"),
+        }
+        with pytest.raises(nncf.ParameterNotSupportedError):
+            compress_weights(awq_model, **compression_kwargs)
@@ -0,0 +1,38 @@
+# Copyright (c) 2026 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pytest
+
+import nncf
+from nncf import Dataset
+from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters
+from tests.cross_fw.test_templates.template_test_quantize_api import TemplateTestQuantizeApi
+from tests.onnx.models import LinearModel
+
+INPUT_SHAPE = [1, 3, 32, 32]
+
+
+class TestONNXQuantizeApi(TemplateTestQuantizeApi):
+    """Runs the template quantize API checks on an ONNX model and adds an accuracy-control check."""
+
+    @staticmethod
+    def get_simple_model():
+        """Returns the ONNX model held by a freshly created `LinearModel`."""
+        linear_model = LinearModel()
+        return linear_model.onnx_model
+
+    def test_quantize_with_accuracy_control_calibration_device(self):
+        """
+        Expects `nncf.quantize_with_accuracy_control` to raise ParameterNotSupportedError
+        when `calibration_device` is set.
+        """
+
+        def constant_metric(model, dataset):
+            # Stub validation function: ignores its inputs and always returns (1.0, None).
+            return 1.0, None
+
+        onnx_model = self.get_simple_model()
+        ones_dataset = Dataset([np.ones(INPUT_SHAPE, dtype=np.float32)])
+        advanced_parameters = AdvancedQuantizationParameters(calibration_device="SOME_DEVICE")
+        with pytest.raises(nncf.ParameterNotSupportedError):
+            nncf.quantize_with_accuracy_control(
+                onnx_model,
+                ones_dataset,
+                ones_dataset,
+                constant_metric,
+                advanced_quantization_parameters=advanced_parameters,
+            )