Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions src/nncf/openvino/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
# limitations under the License.


import contextvars
from collections.abc import Generator
from contextlib import contextmanager

import numpy as np
import openvino as ov
from openvino import Type
Expand All @@ -19,6 +23,17 @@
from nncf.definitions import NNCF_DATASET_RESET_STATE_KEY
from nncf.openvino.graph.model_utils import model_has_state

# Context-local holder for the device name requested for calibration inference.
# It defaults to None and is set/restored through `calibration_device_context`.
_calibration_device: contextvars.ContextVar[str | None] = contextvars.ContextVar("_calibration_device", default=None)


@contextmanager
def calibration_device_context(device: str | None) -> Generator[None, None, None]:
    """
    Stores the given device name in the `_calibration_device` context variable
    for the duration of the `with` block.

    The value that was active before entering is put back on exit, including
    when the body of the `with` block raises.

    :param device: Device name to make current inside the block, or None.
    :return: A generator-based context manager that yields nothing.
    """
    restore_token = _calibration_device.set(device)
    try:
        yield
    finally:
        # Reset through the token so that nested contexts unwind to the value of the enclosing one.
        _calibration_device.reset(restore_token)


class OVCompiledModelEngine(Engine):
"""
Expand Down Expand Up @@ -79,12 +94,13 @@ def __init__(self, model: ov.Model, use_fp32_precision: bool = True):
:param use_fp32_precision: A flag that determines whether to force the engine to use FP32
precision during inference.
"""
device_name = _calibration_device.get() or "CPU"
Comment thread
daniil-lyakhov marked this conversation as resolved.
config = None
if use_fp32_precision:
if use_fp32_precision and device_name == "CPU":
config = {inference_precision: Type.f32}
ie = ov.Core()
stateful = model_has_state(model)
compiled_model = ie.compile_model(model, device_name="CPU", config=config)
compiled_model = ie.compile_model(model, device_name=device_name, config=config)
self.engine = OVCompiledModelEngine(compiled_model, stateful)

def infer(
Expand Down
41 changes: 27 additions & 14 deletions src/nncf/openvino/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from nncf.common.logging import nncf_logger
from nncf.common.quantization.structs import QuantizationPreset
from nncf.data import Dataset
from nncf.openvino.engine import calibration_device_context
from nncf.openvino.graph.metatypes.groups import OPERATIONS_OUTPUT_HAS_NO_BATCH_AXIS
from nncf.openvino.graph.metatypes.openvino_metatypes import OVIfMetatype
from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype
Expand Down Expand Up @@ -119,9 +120,11 @@ def _extract_all_subgraphs(model: ov.Model, current_id: str) -> None:
f"The model consists of {if_ops_number} If node(-s) with then and else bodies. \
Main model and all If bodies will be quantized recursively."
)
quantized_model, _ = apply_algorithm_if_bodies(
quantization_algorithm, model, graphs, main_model_graph_id, calibration_dataset, subset_size, 1
)
calibration_device = advanced_parameters.calibration_device if advanced_parameters else None
with calibration_device_context(calibration_device):
quantized_model, _ = apply_algorithm_if_bodies(
quantization_algorithm, model, graphs, main_model_graph_id, calibration_dataset, subset_size, 1
)

if is_weight_compression_needed(advanced_parameters):
compress_quantize_weights_transformation(quantized_model)
Expand Down Expand Up @@ -168,7 +171,9 @@ def native_quantize_impl(
)
graph = GraphConverter.create_nncf_graph(model)
warning_model_no_batchwise_support(graph, advanced_parameters, model_type, OPERATIONS_OUTPUT_HAS_NO_BATCH_AXIS)
quantized_model = quantization_algorithm.apply(model, graph, dataset=calibration_dataset)
calibration_device = advanced_parameters.calibration_device if advanced_parameters else None
with calibration_device_context(calibration_device):
quantized_model = quantization_algorithm.apply(model, graph, dataset=calibration_dataset)

if is_weight_compression_needed(advanced_parameters):
compress_quantize_weights_transformation(quantized_model)
Expand Down Expand Up @@ -296,15 +301,19 @@ def quantize_with_accuracy_control_impl(
advanced_accuracy_restorer_parameters.num_ranking_workers,
advanced_accuracy_restorer_parameters.restore_mode,
)
quantized_model = accuracy_restorer.apply(
model,
initial_metric_results,
quantized_model,
quantized_metric_results,
validation_dataset,
validation_dataset_size,
evaluator,
calibration_device = (
advanced_quantization_parameters.calibration_device if advanced_quantization_parameters else None
)
with calibration_device_context(calibration_device):
quantized_model = accuracy_restorer.apply(
model,
initial_metric_results,
quantized_model,
quantized_metric_results,
validation_dataset,
validation_dataset_size,
evaluator,
)

if compress_weights:
compress_quantize_weights_transformation(quantized_model)
Expand Down Expand Up @@ -402,12 +411,15 @@ def compress_weights_impl(
advanced_parameters,
)

calibration_device = advanced_parameters.calibration_device if advanced_parameters else None

statistics_points = None
if advanced_parameters and advanced_parameters.statistics_path:
# If there is no such directory, then caches statistics
statistics_path = Path(advanced_parameters.statistics_path)
if not statistics_path.exists():
cache_weight_compression_statistics(model, graph, dataset, subset_size, statistics_path)
with calibration_device_context(calibration_device):
cache_weight_compression_statistics(model, graph, dataset, subset_size, statistics_path)
statistics_aggregator = StatisticsAggregatorFactory.create(model, dataset)
compression_algorithm.set_backend_entity(model)
_, matmul_input_to_output_nodes_map = compression_algorithm.get_compression_nodes_info(graph)
Expand All @@ -421,4 +433,5 @@ def compress_weights_impl(
statistics_aggregator.load_statistics_from_dir(statistics_path)
statistics_points = statistics_aggregator.statistic_points

return compression_algorithm.apply(model, graph, statistics_points, dataset)
with calibration_device_context(calibration_device):
return compression_algorithm.apply(model, graph, statistics_points, dataset)
12 changes: 12 additions & 0 deletions src/nncf/quantization/advanced_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,10 @@ class AdvancedQuantizationParameters:
:type smooth_quant_alpha: float
:param backend_params: Backend-specific parameters.
:type backend_params: dict[str, Any]
:param calibration_device: OpenVINO device name to use for calibration inference
(e.g. "CPU", "GPU", "GPU.0", "AUTO:GPU,CPU"). If None, defaults to "CPU".
Only applicable to the OpenVINO backend.
:type calibration_device: Optional[str]
"""

# General parameters
Expand Down Expand Up @@ -282,6 +286,9 @@ class AdvancedQuantizationParameters:
# Backend specific parameters
backend_params: dict[str, Any] = field(default_factory=dict)

# Calibration device
calibration_device: str | None = None


@api()
@dataclass
Expand Down Expand Up @@ -427,6 +434,10 @@ class AdvancedCompressionParameters:
:type lora_correction_params: AdvancedLoraCorrectionParameters
:param backend_params: Backend-specific parameters.
:type backend_params: dict[str, Any]
:param calibration_device: OpenVINO device name to use for calibration inference
(e.g. "CPU", "GPU", "GPU.0", "AUTO:GPU,CPU"). If None, defaults to "CPU".
Only applicable to the OpenVINO backend.
:type calibration_device: Optional[str]
:param codebook: The codebook (LUT) for the weight compression.
Applicable for vector quantization. Must be a numpy array or ov Tensor.
:type codebook: TTensor
Expand All @@ -445,6 +456,7 @@ class AdvancedCompressionParameters:
gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters)
lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
backend_params: dict[str, Any] = field(default_factory=dict)
calibration_device: str | None = None
codebook: TTensor | None = None
adaptive_codebook_params: AdvancedAdaptiveCodebookParameters = field(
default_factory=AdvancedAdaptiveCodebookParameters
Expand Down
28 changes: 28 additions & 0 deletions src/nncf/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,10 @@ def quantize(
if backend == BackendType.ONNX:
from nncf.onnx.quantization.quantize_model import quantize_impl

if advanced_parameters and advanced_parameters.calibration_device:
msg = "ONNX backend does not support the `calibration_device` option."
raise nncf.ParameterNotSupportedError(msg)
Comment thread
daniil-lyakhov marked this conversation as resolved.

return quantize_impl( # type: ignore[no-any-return]
model=model,
calibration_dataset=calibration_dataset,
Expand All @@ -217,6 +221,10 @@ def quantize(
if backend == BackendType.TORCH:
from nncf.torch.function_hook.quantization.quantize_model import quantize_impl

if advanced_parameters and advanced_parameters.calibration_device:
msg = "Torch backend does not support the `calibration_device` option."
raise nncf.ParameterNotSupportedError(msg)

return quantize_impl( # type: ignore[no-any-return]
model=model,
calibration_dataset=calibration_dataset,
Expand All @@ -233,6 +241,10 @@ def quantize(
if backend == BackendType.TORCH_FX:
from nncf.experimental.torch.fx.quantization.quantize_model import quantize_impl

if advanced_parameters and advanced_parameters.calibration_device:
msg = "TorchFX backend does not support the `calibration_device` option."
raise nncf.ParameterNotSupportedError(msg)

return quantize_impl( # type: ignore[no-any-return]
model=model,
calibration_dataset=calibration_dataset,
Expand Down Expand Up @@ -372,6 +384,10 @@ def quantize_with_accuracy_control(
if backend == BackendType.ONNX:
from nncf.onnx.quantization.quantize_model import quantize_with_accuracy_control_impl

if advanced_quantization_parameters and advanced_quantization_parameters.calibration_device:
msg = "ONNX backend does not support the `calibration_device` option."
raise nncf.ParameterNotSupportedError(msg)

return quantize_with_accuracy_control_impl( # type: ignore[no-any-return]
model,
calibration_dataset,
Expand Down Expand Up @@ -528,6 +544,10 @@ def compress_weights(
msg = "Torch backend does not support statistics caching."
raise nncf.ParameterNotSupportedError(msg)

if advanced_parameters and advanced_parameters.calibration_device:
msg = "Torch backend does not support the `calibration_device` option."
raise nncf.ParameterNotSupportedError(msg)

if compression_format == CompressionFormat.FQ and group_size != -1:
msg = "Torch backend does not support FQ compression format for group-wise quantization."
raise nncf.ParameterNotSupportedError(msg)
Expand Down Expand Up @@ -578,6 +598,10 @@ def compress_weights(
msg = "TorchFX does not supports statistics caching."
raise nncf.ParameterNotSupportedError(msg)

if advanced_parameters and advanced_parameters.calibration_device:
msg = "TorchFX backend does not support the `calibration_device` option."
raise nncf.ParameterNotSupportedError(msg)

if compression_format in [CompressionFormat.FQ, CompressionFormat.FQ_LORA, CompressionFormat.FQ_LORA_NLS]:
msg = "Torch FX backend does not support FQ, FQ_LORA and FQ_LORA_NLS compression formats."
raise nncf.ParameterNotSupportedError(msg)
Expand Down Expand Up @@ -649,6 +673,10 @@ def compress_weights(
if advanced_parameters and advanced_parameters.statistics_path:
msg = "ONNX does not supports statistics caching."
raise nncf.ParameterNotSupportedError(msg)

if advanced_parameters and advanced_parameters.calibration_device:
msg = "ONNX backend does not support the `calibration_device` option."
raise nncf.ParameterNotSupportedError(msg)
compression_weights_impl = onnx_compress_weights_impl
if compression_weights_impl is None:
msg = f"Unsupported type of backend: {backend}"
Expand Down
37 changes: 37 additions & 0 deletions tests/cross_fw/test_templates/template_test_quantize_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) 2026 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC
from abc import abstractmethod
from typing import TypeVar

import pytest

import nncf
from nncf.data.dataset import Dataset
from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters

TModel = TypeVar("TModel")


class TemplateTestQuantizeApi(ABC):
    """
    Template with checks of the `nncf.quantize` API.

    A subclass provides the model to quantize via `get_simple_model`.
    """

    @staticmethod
    @abstractmethod
    def get_simple_model() -> TModel:
        """Returns a minimal model for the backend."""

    def test_quantize_calibration_device(self):
        """
        Checks that `nncf.quantize` raises `nncf.ParameterNotSupportedError`
        when `calibration_device` is set in the advanced parameters.
        """
        simple_model = self.get_simple_model()
        calibration_data = Dataset([0])
        params = AdvancedQuantizationParameters(calibration_device="SOME_DEVICE")
        with pytest.raises(nncf.ParameterNotSupportedError):
            nncf.quantize(simple_model, calibration_data, advanced_parameters=params)
Original file line number Diff line number Diff line change
Expand Up @@ -983,3 +983,17 @@ def test_compression_skipped_with_transposed_activations(self, transpose_a_suppo
all_layers=True,
**kwargs,
)

def test_compress_weights_calibration_device(self):
    """
    Checks that `compress_weights` raises `nncf.ParameterNotSupportedError`
    when `calibration_device` is set in the advanced parameters.
    """
    awq_model = self.get_awq_model(non_mergable_pattern=False, is_3d_weights=False)
    calibration_data = Dataset([self.to_tensor(np.ones([2, 8, 8]))])
    compression_kwargs = {
        "mode": CompressWeightsMode.INT4_SYM,
        "ratio": 1.0,
        "group_size": 2,
        "dataset": calibration_data,
        "awq": True,
        "advanced_parameters": CompressionParams(calibration_device="SOME_DEVICE"),
    }
    with pytest.raises(nncf.ParameterNotSupportedError):
        compress_weights(awq_model, **compression_kwargs)
38 changes: 38 additions & 0 deletions tests/onnx/quantization/test_quantize_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright (c) 2026 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pytest

import nncf
from nncf import Dataset
from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters
from tests.cross_fw.test_templates.template_test_quantize_api import TemplateTestQuantizeApi
from tests.onnx.models import LinearModel

INPUT_SHAPE = [1, 3, 32, 32]


class TestONNXQuantizeApi(TemplateTestQuantizeApi):
    """Runs the quantize API template checks against an ONNX model."""

    @staticmethod
    def get_simple_model():
        """Returns the `onnx_model` of a freshly created `LinearModel`."""
        linear_model = LinearModel()
        return linear_model.onnx_model

    def test_quantize_with_accuracy_control_calibration_device(self):
        """
        Checks that `nncf.quantize_with_accuracy_control` raises
        `nncf.ParameterNotSupportedError` when `calibration_device` is set.
        """

        def constant_validation_fn(model, dataset):
            # Fixed metric value and no per-sample values; the inputs are ignored.
            return (1.0, None)

        onnx_model = self.get_simple_model()
        ones_dataset = Dataset([np.ones(INPUT_SHAPE, dtype=np.float32)])
        params = AdvancedQuantizationParameters(calibration_device="SOME_DEVICE")
        with pytest.raises(nncf.ParameterNotSupportedError):
            nncf.quantize_with_accuracy_control(
                onnx_model,
                ones_dataset,
                ones_dataset,
                constant_validation_fn,
                advanced_quantization_parameters=params,
            )
Loading
Loading