diff --git a/examples/llm_compression/onnx/tiny_llama/main.py b/examples/llm_compression/onnx/tiny_llama/main.py index 90f1b475ac6..6ee571882da 100644 --- a/examples/llm_compression/onnx/tiny_llama/main.py +++ b/examples/llm_compression/onnx/tiny_llama/main.py @@ -66,9 +66,10 @@ def main(): ov_model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, from_onnx=True) messages = [{"role": "user", "content": "What is PyTorch?"}] - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] start_t = time.time() output = ov_model.generate(input_ids, max_new_tokens=100) diff --git a/examples/llm_compression/onnx/tiny_llama/requirements.txt b/examples/llm_compression/onnx/tiny_llama/requirements.txt index c9eb95a07e1..6de4e3e6505 100644 --- a/examples/llm_compression/onnx/tiny_llama/requirements.txt +++ b/examples/llm_compression/onnx/tiny_llama/requirements.txt @@ -1,4 +1,4 @@ -transformers==4.53.0 +transformers==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 diff --git a/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py b/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py index 818c8610248..bfd2d5e4010 100644 --- a/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py +++ b/examples/llm_compression/onnx/tiny_llama_scale_estimation/main.py @@ -119,9 +119,10 @@ def main(): # Infer Model. 
ov_model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, from_onnx=True) messages = [{"role": "user", "content": "What is PyTorch?"}] - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] start_t = time.time() output = ov_model.generate(input_ids, max_new_tokens=100) diff --git a/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt b/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt index 7760fb114bd..c6eca1e1e7c 100644 --- a/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt +++ b/examples/llm_compression/onnx/tiny_llama_scale_estimation/requirements.txt @@ -1,5 +1,5 @@ torch==2.10.0 -transformers==4.53.0 +transformers==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 diff --git a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py index 8975609e00d..63ab7a94667 100644 --- a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/main.py @@ -151,9 +151,10 @@ def generate_answers( for question in questions: messages.append({"role": "user", "content": question}) - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] input_len = len(input_ids[0]) output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0] diff --git a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt index 
889f46fe1ab..8e00b9b6583 100644 --- a/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt +++ b/examples/llm_compression/openvino/smollm2_360m_adaptive_codebook/requirements.txt @@ -2,7 +2,7 @@ datasets==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0 onnx==1.22.0 torch==2.10.0 torchvision==0.25.0 diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py index 9711304e9dc..caaf00157c7 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/main.py +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/main.py @@ -48,9 +48,10 @@ def generate_answers( for question in questions: messages.append({"role": "user", "content": question}) - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] input_len = len(input_ids[0]) output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0] diff --git a/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt index eca318ebd8c..f821443dd29 100644 --- a/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt +++ b/examples/llm_compression/openvino/smollm2_360m_codebook/requirements.txt @@ -1,7 +1,7 @@ openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0 onnx==1.22.0 torch==2.10.0 torchvision==0.25.0 diff --git a/examples/llm_compression/openvino/smollm2_360m_fp8/main.py b/examples/llm_compression/openvino/smollm2_360m_fp8/main.py index d3558d9a78b..92c500424b1 100644 --- a/examples/llm_compression/openvino/smollm2_360m_fp8/main.py +++ 
b/examples/llm_compression/openvino/smollm2_360m_fp8/main.py @@ -47,9 +47,10 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50): for question in questions: messages.append({"role": "user", "content": question}) - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] input_len = len(input_ids[0]) output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0] diff --git a/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt index 4583a53ff53..27bdab2d902 100644 --- a/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt +++ b/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt @@ -2,6 +2,6 @@ datasets==5.0.0 openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0 onnx==1.22.0 torch==2.10.0 diff --git a/examples/llm_compression/openvino/tiny_llama/main.py b/examples/llm_compression/openvino/tiny_llama/main.py index 14224dba5d4..45d8c031ce5 100644 --- a/examples/llm_compression/openvino/tiny_llama/main.py +++ b/examples/llm_compression/openvino/tiny_llama/main.py @@ -60,9 +60,10 @@ def transform_fn(data, tokenizer): model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR) messages = [{"role": "user", "content": "What is PyTorch?"}] - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" - ).to(device=model.device) + ) + input_ids = batch_feature["input_ids"] start_t = time.time() output = model.generate(input_ids, max_new_tokens=100) diff --git a/examples/llm_compression/openvino/tiny_llama/requirements.txt b/examples/llm_compression/openvino/tiny_llama/requirements.txt 
index 60d8a4694f8..6a9f3731996 100644 --- a/examples/llm_compression/openvino/tiny_llama/requirements.txt +++ b/examples/llm_compression/openvino/tiny_llama/requirements.txt @@ -4,4 +4,4 @@ openvino==2026.2.0 optimum-intel[openvino]==2.0.0 optimum==2.2.0 torch==2.10.0 -transformers==4.53.0 +transformers==5.0.0 diff --git a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt index 89033f36120..7cf84bead90 100644 --- a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt +++ b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt @@ -1,8 +1,8 @@ -whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@light_req#subdirectory=tools/who_what_benchmark +whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@a26c9b20d07209f035f6d74aeae94d6a72a132ab#subdirectory=tools/who_what_benchmark numpy==1.26.4 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0 onnx==1.22.0 torch==2.10.0 diff --git a/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt b/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt index d988ed7b1bc..93754454bfc 100644 --- a/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt +++ b/examples/llm_compression/openvino/tiny_llama_synthetic_data/requirements.txt @@ -4,5 +4,5 @@ numpy>=1.23.5,<2 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==4.53.0 +transformers==5.0.0 onnx==1.22.0 diff --git a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt index 1fb91a4647a..90bf5bf06a2 100644 --- a/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt +++ 
b/examples/llm_compression/torch/distillation_qat_with_lora/requirements.txt @@ -5,5 +5,5 @@ numpy>=1.23.5,<2 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==4.53.0 -lm_eval==0.4.8 +transformers==5.0.0 +lm_eval[hf]==0.4.12 diff --git a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt index fc3a0ac02f6..66d01cea870 100644 --- a/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt +++ b/examples/llm_compression/torch/downstream_qat_with_nls/requirements.txt @@ -4,6 +4,6 @@ numpy>=1.23.5,<2 openvino==2026.2.0 optimum-intel==2.0.0 optimum==2.2.0 -transformers==4.53.0 -lm_eval==0.4.8 +transformers==5.0.0 +lm_eval[hf]==0.4.12 torchao==0.17.0 diff --git a/examples/llm_compression/torch_fx/tiny_llama/main.py b/examples/llm_compression/torch_fx/tiny_llama/main.py index 7b9255c200e..2c0a403e3ce 100644 --- a/examples/llm_compression/torch_fx/tiny_llama/main.py +++ b/examples/llm_compression/torch_fx/tiny_llama/main.py @@ -69,9 +69,10 @@ def main() -> str: }, {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, ] - input_ids = tokenizer.apply_chat_template( + batch_feature = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" ) + input_ids = batch_feature["input_ids"] print("Warmup...") output = compressed_model_hf.generate(input_ids) diff --git a/examples/llm_compression/torch_fx/tiny_llama/modelling.py b/examples/llm_compression/torch_fx/tiny_llama/modelling.py index 484bba44043..d5cfeeb1e76 100644 --- a/examples/llm_compression/torch_fx/tiny_llama/modelling.py +++ b/examples/llm_compression/torch_fx/tiny_llama/modelling.py @@ -17,7 +17,6 @@ from transformers import GenerationMixin from transformers import PretrainedConfig from transformers import PreTrainedModel -from transformers.cache_utils import StaticCacheConfig from 
transformers.integrations.executorch import TorchExportableModuleWithStaticCache from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.models.llama.configuration_llama import LlamaConfig @@ -97,8 +96,9 @@ def convert_and_export_with_cache(model: PreTrainedModel) -> tuple[ExportedProgr example_cache_position = torch.arange(0, 8, dtype=torch.long) model_config = None gen_config = None + model.generation_config.use_cache = True model.generation_config.cache_implementation = "static" - model.generation_config.cache_config = StaticCacheConfig(batch_size=1, max_cache_len=512) + model.generation_config.cache_config = {"batch_size": 1, "max_cache_len": 512} model.generation_config.max_new_tokens = 100 gen_config = model.generation_config model_config = model.config diff --git a/examples/llm_compression/torch_fx/tiny_llama/requirements.txt b/examples/llm_compression/torch_fx/tiny_llama/requirements.txt index 80ce17a9ebe..a68ebd90210 100644 --- a/examples/llm_compression/torch_fx/tiny_llama/requirements.txt +++ b/examples/llm_compression/torch_fx/tiny_llama/requirements.txt @@ -1,4 +1,4 @@ -transformers==4.53.0 +transformers==5.0.0 datasets==5.0.0 openvino==2026.2.0 optimum==2.2.0 diff --git a/tests/openvino/requirements.txt b/tests/openvino/requirements.txt index 47e3781a108..c5dc1472900 100644 --- a/tests/openvino/requirements.txt +++ b/tests/openvino/requirements.txt @@ -15,6 +15,6 @@ addict>=2.4.0 timm==0.9.2 efficientnet_pytorch==0.7.1 datasets -transformers==4.53.0 +transformers==5.0.0 optimum-intel==2.0.0 optimum==2.2.0 diff --git a/tests/post_training/pipelines/fx_modelling.py b/tests/post_training/pipelines/fx_modelling.py index 559cfb7ad76..61b077d6761 100644 --- a/tests/post_training/pipelines/fx_modelling.py +++ b/tests/post_training/pipelines/fx_modelling.py @@ -16,7 +16,6 @@ from transformers import GenerationMixin from transformers import PretrainedConfig from transformers import PreTrainedModel -from transformers.cache_utils 
import StaticCacheConfig from transformers.integrations.executorch import TorchExportableModuleWithStaticCache from transformers.modeling_outputs import CausalLMOutputWithPast @@ -84,8 +83,9 @@ def convert_and_export_with_cache(model: PreTrainedModel): example_input_ids = torch.ones(1, 8, dtype=torch.long) example_cache_position = torch.arange(0, 8, dtype=torch.long) + model.generation_config.use_cache = True model.generation_config.cache_implementation = "static" - model.generation_config.cache_config = StaticCacheConfig(batch_size=1, max_cache_len=512) + model.generation_config.cache_config = {"batch_size": 1, "max_cache_len": 512} model.generation_config.max_new_tokens = 100 gen_config = model.generation_config model_config = model.config diff --git a/tests/post_training/pipelines/gpt.py b/tests/post_training/pipelines/gpt.py index 57878eea479..779036f2946 100644 --- a/tests/post_training/pipelines/gpt.py +++ b/tests/post_training/pipelines/gpt.py @@ -24,14 +24,37 @@ from tests.post_training.pipelines.base import PTQTestPipeline +# TODO(AlexanderDokuchaev): Remove this wrapper when the issue with torch.jit.trace and transformers>=5.0 is fixed +class CausalLMTracingWrapper(torch.nn.Module): + """ + Wraps a Hugging Face causal language model so it can be exported via ``torch.jit.trace`` + (used internally by ``openvino.convert_model``). + + Since ``transformers>=5.0`` a causal LM forward pass returns a ``DynamicCache`` in its output + and builds the attention mask with a ``torch.diff``-based packed-sequence check when no + attention mask is provided. Neither construct is supported by ``torch.jit.trace`` / the + OpenVINO PyTorch frontend. 
+ """ + + def __init__(self, model: torch.nn.Module) -> None: + super().__init__() + self.model = model + self.model.config.use_cache = False + + def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor | None = None) -> torch.Tensor: + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + return self.model(input_ids=input_ids, attention_mask=attention_mask) + + class GPT(PTQTestPipeline): """Pipeline for causal language models from Hugging Face repository""" def prepare_model(self) -> None: if self.backend in PT_BACKENDS: self.model_hf = transformers.AutoModelForCausalLM.from_pretrained(self.model_id) - self.model = self.model_hf - self.model.config.torchscript = True # Set to export by convert_model via torch.jit.trace + self.model = CausalLMTracingWrapper(self.model_hf) + self.dummy_tensor = self.model_hf.dummy_inputs["input_ids"] elif self.backend in OV_BACKENDS + [BackendType.FP32]: diff --git a/tests/post_training/requirements.txt b/tests/post_training/requirements.txt index a409d23282c..abd23217f3b 100644 --- a/tests/post_training/requirements.txt +++ b/tests/post_training/requirements.txt @@ -22,8 +22,8 @@ optimum-onnx @ git+https://github.com/AlexanderDokuchaev/optimum-onnx.git@b57739 scikit-learn>=1.2.2,<=1.5.0 soundfile==0.12.1 tensorboard==2.20.0 -tensorflow-io==0.37.0 +tensorflow-io==0.37.1 timm==0.9.2 accelerate==1.9.0 -transformers==4.53.0 -whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@light_req#subdirectory=tools/who_what_benchmark +transformers==5.0.0 +whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@a26c9b20d07209f035f6d74aeae94d6a72a132ab#subdirectory=tools/who_what_benchmark diff --git a/tests/torch/function_hook/sparsify_activations/helpers.py b/tests/torch/function_hook/sparsify_activations/helpers.py index bb59019e5f9..3c6c20a73ce 100644 --- a/tests/torch/function_hook/sparsify_activations/helpers.py +++ 
b/tests/torch/function_hook/sparsify_activations/helpers.py @@ -15,7 +15,6 @@ import openvino as ov import torch import torch.nn as nn -import transformers.models from nncf import IgnoredScope from nncf.experimental.torch.sparsify_activations import TargetScope @@ -36,21 +35,6 @@ def forward(self, input_ids: torch.Tensor): return y0, y1 -def dummy_llama_model(): - config = transformers.models.llama.configuration_llama.LlamaConfig( - vocab_size=32, - hidden_size=8, - intermediate_size=14, - num_attention_heads=2, - num_key_value_heads=1, - num_hidden_layers=2, - use_cache=False, - return_dict=False, - ) - model = transformers.AutoModelForCausalLM.from_config(config, attn_implementation="eager") - return model - - def count_sparsifier_patterns_in_ov(model: ov.Model) -> int: """ Counts the number of activation sparsification pattern "Abs -> LessEqual -> Select" diff --git a/tests/torch/function_hook/sparsify_activations/test_algo.py b/tests/torch/function_hook/sparsify_activations/test_algo.py index 66092acbbbb..8ca14b6baf7 100644 --- a/tests/torch/function_hook/sparsify_activations/test_algo.py +++ b/tests/torch/function_hook/sparsify_activations/test_algo.py @@ -32,7 +32,6 @@ from tests.cross_fw.shared.paths import TEST_ROOT from tests.torch.function_hook.sparsify_activations.helpers import ThreeLinearModel from tests.torch.function_hook.sparsify_activations.helpers import count_sparsifier_patterns_in_ov -from tests.torch.function_hook.sparsify_activations.helpers import dummy_llama_model from tests.torch.helpers import set_torch_seed from tests.torch.utils import compare_with_reference_file from tests.torch.utils import to_comparable_nx_graph @@ -100,24 +99,6 @@ def __str__(self) -> str: ref_num_batches_tracked=3, ref_num_patterns_in_ov=2, ), - SparsifyActivationsAlgorithmTestDesc( - name="dummy_llama", - model_getter=dummy_llama_model, - dataset_getter=lambda device: nncf.Dataset(torch.randint(0, 30, (3, 2, 8)).to(device)), - target_sparsity_by_scope={ - 
TargetScope(patterns=[".*gate_proj.*"]): 0.2, - TargetScope(patterns=[".*up_proj.*"]): 0.3, - TargetScope(patterns=[".*down_proj.*"]): 0.4, - }, - ignored_scope=None, - ref_sparsifier_target_sparsity={ - (f"pre_hooks.model/mlp/{name}/linear/{layer_id}__0.0"): sparsity - for name, sparsity in [("gate_proj", 0.2), ("up_proj", 0.3), ("down_proj", 0.4)] - for layer_id in [0, 1] - }, - ref_num_batches_tracked=3, - ref_num_patterns_in_ov=6, - ), ] diff --git a/tests/torch/requirements.txt b/tests/torch/requirements.txt index f3057e556e9..73c22107643 100644 --- a/tests/torch/requirements.txt +++ b/tests/torch/requirements.txt @@ -14,9 +14,9 @@ torchvision addict>=2.4.0 efficientnet_pytorch==0.7.1 -transformers==4.53.0 +transformers==5.0.0 -sentence-transformers==4.1.0 +sentence-transformers==5.6.0 optimum-intel==2.0.0 optimum==2.2.0 accelerate==1.9.0