Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/nncf/onnx/graph/model_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,13 +284,17 @@ def _get_scale_zero_point_tensors(
dims = scale.shape if per_channel else []
onnx_scale = [scale.tolist()] if not per_channel else scale
onnx_zero_point = [zero_point.tolist()] if not per_channel else zero_point

if tensor_type == np.uint8:
onnx_tensor_type = onnx.TensorProto.UINT8
elif tensor_type == np.int8:
onnx_tensor_type = onnx.TensorProto.INT8
elif tensor_type in (onnx.TensorProto.FLOAT8E5M2, onnx.TensorProto.FLOAT8E4M3FN):
onnx_tensor_type = tensor_type
else:
msg = f"Incorrect tensor type - {tensor_type}."
raise nncf.ValidationError(msg)

assert quantizer.input[1] == dequantizer.input[1] and quantizer.input[2] == dequantizer.input[2]
scale_tensor_name = quantizer.input[1]
zero_point_tensor_name = quantizer.input[2]
Expand Down
12 changes: 7 additions & 5 deletions src/nncf/onnx/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,14 @@ def quantize_impl(
if target_device == TargetDevice.CPU_SPR:
msg = "target_device == CPU_SPR is not supported."
raise nncf.ValidationError(msg)
if mode is not None:
msg = f"mode={mode} is not supported"
raise ValueError(msg)
if model.opset_import[0].version < 10:

# Read the opset once; every check below validates against the same value.
opset_version = model.opset_import[0].version
# A non-None `mode` requests FP8 quantization, which this backend only allows for opset >= 21.
# The message must actually be raised: otherwise the check is a no-op and the unsupported
# configuration silently continues into the quantization pipeline.
if opset_version < 21 and mode is not None:
    msg = f"FP8 quantization requires opset >= 21, got {opset_version}"
    raise nncf.ValidationError(msg)
if opset_version < 10:
    msg = "ONNX models with opset version < 10 do not support quantization."
    raise nncf.ValidationError(msg)
if model.opset_import[0].version < 13:
if opset_version < 13:
nncf_logger.warning(
"ONNX models with 10 < opset version < 13 do not support per-channel quantization."
" Per-tensor quantization will be applied."
Expand All @@ -163,6 +164,7 @@ def quantize_impl(
model = apply_preprocess_passes(model)

quantization_algorithm = PostTrainingQuantization(
mode=mode,
preset=preset,
target_device=target_device,
subset_size=subset_size,
Expand Down
37 changes: 36 additions & 1 deletion src/nncf/onnx/quantization/quantizer_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
from dataclasses import dataclass

import numpy as np
import onnx

from nncf.quantization.advanced_parameters import FP8Type
from nncf.quantization.fake_quantize import FakeConvertParameters
from nncf.quantization.fake_quantize import FakeQuantizeParameters
from nncf.quantization.fake_quantize import calculate_scale_zero_point
from nncf.tensor import functions as fns
Expand All @@ -31,10 +34,42 @@ class ONNXQuantizerLayerParameters:

scale: np.ndarray
zero_point: np.ndarray
tensor_type: np.dtype
tensor_type: onnx.TensorProto.DataType | np.dtype
axis: int | None = None


def convert_fc_params_to_onnx_params(
    parameters: FakeConvertParameters, axis: int | None
) -> ONNXQuantizerLayerParameters:
    """
    Builds ONNXQuantizerLayerParameters from backend-agnostic FakeConvertParameters.

    The ONNX scale is the reciprocal of the FakeConvert scale, and the FakeConvert shift is
    used as the zero point. Both are squeezed before being stored.

    :param parameters: FakeConvertParameters to convert.
    :param axis: Axis for per-channel quantization; stored in the result unchanged.
    :return: Parameters of the ONNX quantizer layer.
    :raises ValueError: If the destination type is neither FP8Type.E4M3 nor FP8Type.E5M2.
    """
    destination_type = parameters.destination_type
    is_e4m3 = destination_type == FP8Type.E4M3
    if not is_e4m3 and not destination_type == FP8Type.E5M2:
        msg = f"Unsupported FP8type: {destination_type}. Expected FP8Type.E4M3 or FP8Type.E5M2"
        raise ValueError(msg)
    tensor_type = onnx.TensorProto.FLOAT8E4M3FN if is_e4m3 else onnx.TensorProto.FLOAT8E5M2

    fc_scale = parameters.scale
    # NOTE: magnitudes below machine epsilon are replaced with epsilon so that
    # taking the reciprocal never divides by zero.
    eps = fns.finfo(fc_scale).eps
    safe_scale = fns.where(fns.abs(fc_scale) < eps, eps, fc_scale)

    # ONNX demands parameters to be a scalar or 1-D Tensor.
    onnx_scale = fns.squeeze(1.0 / safe_scale)
    onnx_zero_point = fns.squeeze(parameters.shift)

    return ONNXQuantizerLayerParameters(
        scale=onnx_scale.data,
        zero_point=onnx_zero_point.data,
        tensor_type=tensor_type,
        axis=axis,
    )


def convert_fq_params_to_onnx_params(
parameters: FakeQuantizeParameters, num_bits: int, tensor_type: np.dtype, axis: tuple[int]
) -> ONNXQuantizerLayerParameters:
Expand Down
6 changes: 4 additions & 2 deletions src/nncf/quantization/algorithms/min_max/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,9 @@ def filter_func(point: StatisticPoint) -> bool:
)
for quantization_target_point in unified_scale_group:
transformation_layout.register(
self._backend_entity.create_convert_insertion_command(quantization_target_point, parameters)
self._backend_entity.create_convert_insertion_command(
graph, quantization_target_point, qconfig, parameters
)
)
unified_ops_list.add(quantization_target_point)
continue
Expand Down Expand Up @@ -1069,7 +1071,7 @@ def filter_func(point: StatisticPoint) -> bool:
statistics, is_per_channel=qconfig.per_channel, destination_type=destination_type
)
command = self._backend_entity.create_convert_insertion_command(
quantization_target_point, parameters
graph, quantization_target_point, qconfig, parameters
)
else:
parameters = calculate_quantizer_parameters(statistics, qconfig, quant_group, half_range)
Expand Down
4 changes: 4 additions & 0 deletions src/nncf/quantization/algorithms/min_max/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,13 +198,17 @@ def create_unified_scales_quantizers_insertion_commands(
@staticmethod
@abstractmethod
def create_convert_insertion_command(
    nncf_graph: NNCFGraph,
    target_point: TargetPoint,
    quantizer_config: QuantizerConfig,
    parameters: FakeConvertParameters,
) -> Command:
    """
    Returns backend-specific convert insertion command.

    :param nncf_graph: NNCFGraph the target point belongs to; backends may query it for
        information about the target node.
    :param target_point: Target location at which the convert operation is inserted.
    :param quantizer_config: QuantizerConfig instance for the current layer.
    :param parameters: FakeConvertParameters describing the convert operation to insert.
    :return: Backend-specific Command for the convert insertion operation.
    """
Expand Down
17 changes: 14 additions & 3 deletions src/nncf/quantization/algorithms/min_max/onnx_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

import numpy as np

import nncf
from nncf.common.graph.graph import NNCFGraph
from nncf.common.graph.graph import NNCFNode
from nncf.common.graph.operator_metatypes import OperatorMetatype
Expand All @@ -33,6 +32,7 @@
from nncf.onnx.graph.transformations.commands import ONNXTargetPoint
from nncf.onnx.hardware.config import ONNXHWConfig
from nncf.onnx.quantization.default_quantization import DEFAULT_ONNX_QUANT_TRAIT_TO_OP_DICT
from nncf.onnx.quantization.quantizer_parameters import convert_fc_params_to_onnx_params
from nncf.onnx.quantization.quantizer_parameters import convert_fq_params_to_onnx_params
from nncf.parameters import ModelType
from nncf.parameters import TargetDevice
Expand Down Expand Up @@ -158,11 +158,22 @@ def create_unified_scales_quantizers_insertion_commands(

@staticmethod
def create_convert_insertion_command(
nncf_graph: NNCFGraph,
target_point: ONNXTargetPoint,
quantizer_config: QuantizerConfig,
parameters: FakeConvertParameters,
) -> TransformationCommand:
msg = "FakeConvert insertion not implemented in ONNX backend!"
raise nncf.InternalError(msg)
axis = None
if quantizer_config.per_channel:
node = nncf_graph.get_node_by_name(target_point.target_node_name)
axis = (
get_weight_quantization_axis(node, target_point.port_id) if target_point.is_weight_target_point() else 1
)
onnx_parameters = convert_fc_params_to_onnx_params(parameters, axis)
# TODO(andrey-churkin): Investigate why the nncf_input_node_next_nodes parameter is passed directly to the
# command rather than being created within ModelTransformer, as it is for the OpenVINO backend.
nncf_input_node_next_nodes = ONNXMinMaxAlgoBackend._get_input_edges_mapping(nncf_graph)
Comment thread
daniil-lyakhov marked this conversation as resolved.
return ONNXQuantizerInsertionCommand(target_point, nncf_input_node_next_nodes, onnx_parameters)

@staticmethod
def _get_input_edges_mapping(nncf_graph: NNCFGraph):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,9 @@ def create_unified_scales_quantizers_insertion_commands(

@staticmethod
def create_convert_insertion_command(
    nncf_graph: NNCFGraph,
    target_point: OVTargetPoint,
    quantizer_config: QuantizerConfig,
    parameters: FakeConvertParameters,
) -> OVQuantizerInsertionCommand:
    """
    Returns the convert insertion command for the given target point.

    Only `target_point` and `parameters` are forwarded to the command; `nncf_graph` and
    `quantizer_config` are not read by this implementation.

    :param nncf_graph: NNCFGraph instance (unused here).
    :param target_point: Target location at which the convert operation is inserted.
    :param quantizer_config: QuantizerConfig instance for the current layer (unused here).
    :param parameters: FakeConvertParameters passed to the command as is.
    :return: Command that inserts the convert operation.
    """
    # NOTE(review): the annotated return type is OVQuantizerInsertionCommand, but an
    # OVConvertInsertionCommand is constructed - confirm which annotation is intended.
    return OVConvertInsertionCommand(target_point, parameters)
Expand Down
2 changes: 2 additions & 0 deletions src/nncf/quantization/algorithms/min_max/torch_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,9 @@ def target_point(target_type: TargetType, target_node_name: str, port_id: int) -

@staticmethod
def create_convert_insertion_command(
nncf_graph: NNCFGraph,
target_point: PTTargetPoint,
quantizer_config: QuantizerConfig,
parameters: FakeConvertParameters,
) -> TransformationCommand:
msg = "FakeConvert insertion not implemented in PyTorch backend!"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@ def target_point(target_type: TargetType, target_node_name: str, port_id: int) -

@staticmethod
def create_convert_insertion_command(
nncf_graph: NNCFGraph,
target_point: PTTargetPoint,
quantizer_config: QuantizerConfig,
parameters: FakeConvertParameters,
) -> TransformationCommand:
msg = "FakeConvert insertion not implemented in PyTorch backend!"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class ONNXWeightCompressionAlgoBackend(WeightCompressionAlgoBackend):
CompressWeightsMode.INT8_ASYM: onnx.TensorProto.UINT8,
CompressWeightsMode.INT4_SYM: onnx.TensorProto.INT4,
CompressWeightsMode.INT4_ASYM: onnx.TensorProto.UINT4,
CompressWeightsMode.FP8_E4M3: onnx.TensorProto.FLOAT8E4M3FN,
}

def __init__(self, model: onnx.ModelProto):
Expand Down Expand Up @@ -363,8 +364,13 @@ def _add_dequantize_linear_layer(
zero_point = pack_4_bits(zero_point)

# Create initializers for the quantized weights, scale, and zero point
vals = quantized_weights
if weight_dtype == onnx.TensorProto.FLOAT8E4M3FN:
np_dtype = helper.tensor_dtype_to_np_dtype(weight_dtype)
vals = onnx.numpy_helper.saturate_cast(np.asarray(quantized_weights), np_dtype).flatten()

quantized_weights_initializer = onnx.helper.make_tensor(
quantized_weight_name, weight_dtype, orig_shape, quantized_weights.tobytes(), raw=True
quantized_weight_name, weight_dtype, orig_shape, vals.tobytes(), raw=True
)
scale_initializer = numpy_helper.from_array(
np.array(scale, dtype=helper.tensor_dtype_to_np_dtype(scale_dtype)), name=scale_name
Expand All @@ -374,8 +380,14 @@ def _add_dequantize_linear_layer(

if zero_point is not None:
deq_inputs.append(weight_name + "_zero_point")

vals = zero_point
if weight_dtype == onnx.TensorProto.FLOAT8E4M3FN:
np_dtype = helper.tensor_dtype_to_np_dtype(weight_dtype)
vals = onnx.numpy_helper.saturate_cast(np.asarray(zero_point), np_dtype).flatten()

zero_point_initializer = onnx.helper.make_tensor(
weight_name + "_zero_point", weight_dtype, orig_zero_point_shape, zero_point.tobytes(), raw=True
weight_name + "_zero_point", weight_dtype, orig_zero_point_shape, vals.tobytes(), raw=True
)
new_initializers.append(zero_point_initializer)

Expand Down
1 change: 0 additions & 1 deletion src/nncf/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,6 @@ def compress_weights(
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.FP4,
CompressWeightsMode.NVFP4,
CompressWeightsMode.CODEBOOK,
Expand Down
56 changes: 56 additions & 0 deletions tests/onnx/quantization/test_qdq_params_calculation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,12 @@

from nncf.common.quantization.structs import QuantizationPreset
from nncf.onnx.graph.onnx_helper import get_tensor_value
from nncf.onnx.quantization.quantizer_parameters import convert_fc_params_to_onnx_params
from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters
from nncf.quantization.advanced_parameters import FP8Type
from nncf.quantization.advanced_parameters import OverflowFix
from nncf.quantization.fake_quantize import FakeConvertParameters
from nncf.tensor import Tensor
from tests.cross_fw.shared.comparator import compare_stats
from tests.cross_fw.shared.json import load_json
from tests.onnx.conftest import ONNX_TEST_ROOT
Expand Down Expand Up @@ -113,3 +117,55 @@ def test_scales(model, preset):

ref_nodes_params = load_json(ref_stats_path)
compare_stats(ref_nodes_params, q_nodes_params)


@pytest.mark.parametrize(
    "scale,shift,destination_type,axis,expected",
    [
        # Per-tensor, E4M3: scale 2 -> 1 / 2.
        (
            np.array(2, dtype=np.float32),
            np.array(0, dtype=np.float32),
            FP8Type.E4M3,
            None,
            (np.array(0.5, dtype=np.float32), np.array(0, dtype=np.float32), onnx.TensorProto.FLOAT8E4M3FN),
        ),
        # Per-tensor, E5M2: scale 2 -> 1 / 2.
        (
            np.array(2, dtype=np.float32),
            np.array(0, dtype=np.float32),
            FP8Type.E5M2,
            None,
            (np.array(0.5, dtype=np.float32), np.array(0, dtype=np.float32), onnx.TensorProto.FLOAT8E5M2),
        ),
        # Zero scale is replaced with float32 machine epsilon, so the result is 1 / eps.
        (
            np.array(0, dtype=np.float32),
            np.array(0, dtype=np.float32),
            FP8Type.E5M2,
            None,
            (np.array(8388608, dtype=np.float32), np.array(0, dtype=np.float32), onnx.TensorProto.FLOAT8E5M2),
        ),
        # Per-channel: 4-D parameters are squeezed to 1-D.
        (
            np.array([2, 4, 8], dtype=np.float32).reshape(3, 1, 1, 1),
            np.array([0, 0, 0], dtype=np.float32).reshape(3, 1, 1, 1),
            FP8Type.E4M3,
            0,
            (
                np.array([0.5, 0.25, 0.125], dtype=np.float32),
                np.array([0, 0, 0], dtype=np.float32),
                onnx.TensorProto.FLOAT8E4M3FN,
            ),
        ),
    ],
)
def test_convert_fc_params_to_onnx_params(scale, shift, destination_type, axis, expected):
    """Checks scale, zero point, tensor type and axis produced from FakeConvertParameters."""
    ref_scale, ref_zero_point, ref_tensor_type = expected

    converted = convert_fc_params_to_onnx_params(
        FakeConvertParameters(Tensor(scale), Tensor(shift), destination_type), axis
    )

    # Scale first, then zero point: shape must match exactly, values within tolerance.
    for reference, actual in ((ref_scale, converted.scale), (ref_zero_point, converted.zero_point)):
        assert reference.shape == actual.shape
        assert np.allclose(reference, actual, atol=1e-5)

    assert ref_tensor_type == converted.tensor_type
    assert axis == converted.axis
1 change: 0 additions & 1 deletion tests/onnx/quantization/test_weights_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
CompressWeightsMode.NVFP4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.FP4,
)

Expand Down