Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
from nncf.tensor import functions as fns


def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int = -1) -> tuple[Tensor, Tensor]:
def process_stats(
stats: WCTensorStatistic,
subset_size: int,
act_ch_axis: int = -1,
transpose_a: bool = False,
) -> tuple[Tensor, Tensor]:
Comment thread
daniil-lyakhov marked this conversation as resolved.
"""
A function for processing activations. Shared between AWQ, Scale Estimation and LoRA Correction algorithms.

Expand All @@ -37,6 +42,11 @@ def process_stats(stats: WCTensorStatistic, subset_size: int, act_ch_axis: int =
axes = list(range(1, len(X.shape))) + [0]
X_full = fns.transpose(X, axes=axes)

if transpose_a:
axes = list(range(len(X_full.shape)))
axes[-1], axes[-2] = axes[-2], axes[-1]
X_full = fns.transpose(X_full, axes=axes)

# The sample dimension is always the last axis after transpose
sample_axis = -1

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def calculate_adapters(
layer_name = wc_params.node_with_weight.node_name
layer_statistics = self._statistics[layer_name]
is_debug = self._debug_interface is not None
transpose_a_flag = getattr(wc_params.node_with_weight, "transpose_a", False)
Comment thread
daniil-lyakhov marked this conversation as resolved.
Outdated
lora_A, lora_B, mean_noises = self.calculate_low_rank_matrices(
weight,
compressed_weight,
Expand All @@ -129,6 +130,7 @@ def calculate_adapters(
self._lora_correction_params,
layer_statistics,
is_debug,
transpose_a=transpose_a_flag,
)
if is_debug:
self._debug_interface.add_noises(layer_name, mean_noises)
Expand All @@ -143,6 +145,7 @@ def calculate_low_rank_matrices(
lora_correction_params: AdvancedLoraCorrectionParameters,
layer_statistics: WCTensorStatistic,
is_debug: Optional[bool] = False,
transpose_a: bool = False,
Comment thread
daniil-lyakhov marked this conversation as resolved.
Outdated
):
"""
Calculates low rank matrices for a given original and compressed weights.
Expand Down Expand Up @@ -170,7 +173,15 @@ def calculate_low_rank_matrices(
)
mode = compression_config.mode
assert len(reduction_axes) == 1, "Assumed a single reduction axis"
reduction_axis = reduction_axes[0] if compression_config.group_size != -1 else -1

if compression_config.group_size != -1:
reduction_axis = reduction_axes[0]
else:
reduction_axis = -1

if transpose_a and reduction_axis != -1:
reduction_axis = 1

if mode in (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM):
fq_weights = do_integer_dequantization(
compressed_weight.tensor,
Expand All @@ -192,9 +203,8 @@ def calculate_low_rank_matrices(
# reduction axes is all axes except output dimension in linear/conv layers.
if reduction_axes[0] == 1:
svd_residual = fns.transpose(svd_residual)
residual = svd_residual.clone() # [H, O]

s, X = process_stats(layer_statistics, subset_size) # [H], [H, SS]
residual = fns.transpose(svd_residual) if transpose_a else svd_residual # [H, O] or [O, H]
s, X = process_stats(layer_statistics, subset_size, act_ch_axis=-1, transpose_a=transpose_a)
X = fns.transpose(X) # [SS, H]
if compression_config.group_size > 0:
# Multiply residual of weights by maximum channel magnitude of activations normalized per quantization
Expand Down
78 changes: 78 additions & 0 deletions tests/openvino/native/quantization/test_weights_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from nncf import SensitivityMetric
from nncf.common.factory import build_graph
from nncf.common.tensor_statistics.collectors import AggregatorBase
from nncf.common.tensor_statistics.statistics import WCTensorStatistic
from nncf.common.utils.debug import nncf_debug
from nncf.common.utils.helpers import set_env_variable
from nncf.data.dataset import Dataset
Expand All @@ -42,6 +43,7 @@
from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams
from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams
from nncf.quantization.advanced_parameters import GroupSizeFallbackMode
from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
Expand Down Expand Up @@ -2574,3 +2576,79 @@ def test_awq_scale_ref() -> list[dict[str, Tensor]]:
@pytest.fixture
def transpose_a_supported(self) -> bool:
    """
    Fixture that reports whether ``transpose_a`` is supported.

    :return: Always ``True`` in this implementation.
    """
    # NOTE(review): presumably read by shared template tests to decide whether transpose_a cases
    # should run for this backend - confirm against the enclosing test class, which is not visible here.
    return True


def test_process_stats_with_transpose_a_changes_layout():
    """
    Checks that toggling ``transpose_a`` in ``process_stats`` changes the layout of the returned activations.

    The same statistics object is processed twice, first with ``transpose_a=False`` and then with
    ``transpose_a=True``. Only the shapes of the two results are compared, never their values.
    """
    raw_activations = np.random.randn(10, 3, 8)
    wc_stats = WCTensorStatistic(Tensor(raw_activations), shape_values=raw_activations.shape)
    n_samples = 10

    # Flag off first, then on; every other argument is kept fixed between the two calls.
    (s_plain, x_plain), (s_swapped, x_swapped) = (
        process_stats(wc_stats, subset_size=n_samples, act_ch_axis=-1, transpose_a=flag) for flag in (False, True)
    )

    # The rank of ``s`` must not depend on the flag.
    rank_plain, rank_swapped = len(s_plain.shape), len(s_swapped.shape)
    assert rank_plain == rank_swapped

    # The leading size of ``s`` must be 3 in both runs
    # (described in the original test as the preserved reduction / seq_len dimension).
    assert s_plain.shape[0] == s_swapped.shape[0] == 3

    # The layout of ``X`` must differ between the two runs ...
    assert x_plain.shape != x_swapped.shape

    # ... while the total number of elements stays the same.
    assert np.prod(x_plain.shape) == np.prod(x_swapped.shape)


@pytest.mark.parametrize(
    "transpose_a,transpose_b",
    [
        (False, False),
        (False, True),
    ],
)
def test_lora_transpose_a_fix(transpose_a, transpose_b):
    """
    Smoke test: INT4 symmetric weight compression with LoRA Correction enabled returns a model.

    Only ``transpose_a=False`` is parametrized because, per the original note on this test,
    transposed activations are not yet supported by LoRA.

    :param transpose_a: Value forwarded to ``LMLinearModel(transpose_a=...)``.
    :param transpose_b: Value forwarded to ``LMLinearModel(transpose_b=...)``.
    """
    # TODO(review): in the PR review thread the author agreed to cover the act_ch_axis/transpose handling
    # by updating the existing LoRA Correction tests instead of adding separate ones; the reviewer's
    # follow-up was "Please don't forget to update the tests".

    # LoRA Correction settings wrapped into the advanced compression parameters.
    lora_params = LoraParams(adapter_rank=4, use_int8_adapters=False)
    advanced_parameters = CompressionParams(lora_correction_params=lora_params)

    # Build the model with the requested transpose configuration.
    model = LMLinearModel(transpose_b=transpose_b, transpose_a=transpose_a)
    ov_model = model.ov_model

    # Dummy samples shaped like the model inputs. They are materialized in a list on purpose:
    # a generator expression can be consumed only once, so any second pass over the dataset
    # would silently see no data.
    dataset = Dataset([np.ones(inp.shape) for inp in ov_model.inputs])

    # Compress weights with LoRA Correction enabled.
    compressed_model = compress_weights(
        ov_model,
        mode=CompressWeightsMode.INT4_SYM,
        ratio=1.0,
        group_size=8,
        dataset=dataset,
        all_layers=True,
        lora_correction=True,
        advanced_parameters=advanced_parameters,
    )

    # Minimal check: compression completed and returned a model.
    assert compressed_model is not None