Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions examples/llm_compression/onnx/tiny_llama/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,10 @@ def main():
ov_model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, from_onnx=True)

messages = [{"role": "user", "content": "What is PyTorch?"}]
input_ids = tokenizer.apply_chat_template(
batch_feature = tokenizer.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(device=model.device)
)
input_ids = batch_feature["input_ids"]

start_t = time.time()
output = ov_model.generate(input_ids, max_new_tokens=100)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
transformers==4.53.0
transformers==5.0.0
openvino==2026.2.0
optimum-intel[openvino]==2.0.0
optimum==2.2.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,10 @@ def main():
# Infer Model.
ov_model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, from_onnx=True)
messages = [{"role": "user", "content": "What is PyTorch?"}]
input_ids = tokenizer.apply_chat_template(
batch_feature = tokenizer.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(device=model.device)
)
input_ids = batch_feature["input_ids"]

start_t = time.time()
output = ov_model.generate(input_ids, max_new_tokens=100)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
torch==2.10.0
transformers==4.53.0
transformers==5.0.0
openvino==2026.2.0
optimum-intel[openvino]==2.0.0
optimum==2.2.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,10 @@ def generate_answers(

for question in questions:
messages.append({"role": "user", "content": question})
input_ids = tokenizer.apply_chat_template(
batch_feature = tokenizer.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(device=model.device)
)
input_ids = batch_feature["input_ids"]
input_len = len(input_ids[0])

output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ datasets==5.0.0
openvino==2026.2.0
optimum-intel[openvino]==2.0.0
optimum==2.2.0
transformers==4.53.0
transformers==5.0.0
onnx==1.22.0
torch==2.10.0
torchvision==0.25.0
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ def generate_answers(

for question in questions:
messages.append({"role": "user", "content": question})
input_ids = tokenizer.apply_chat_template(
batch_feature = tokenizer.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(device=model.device)
)
input_ids = batch_feature["input_ids"]
input_len = len(input_ids[0])

output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
openvino==2026.2.0
optimum-intel[openvino]==2.0.0
optimum==2.2.0
transformers==4.53.0
transformers==5.0.0
onnx==1.22.0
torch==2.10.0
torchvision==0.25.0
5 changes: 3 additions & 2 deletions examples/llm_compression/openvino/smollm2_360m_fp8/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ def generate_answers(questions, model, tokenizer, max_new_tokens=50):

for question in questions:
messages.append({"role": "user", "content": question})
input_ids = tokenizer.apply_chat_template(
batch_feature = tokenizer.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(device=model.device)
)
input_ids = batch_feature["input_ids"]
input_len = len(input_ids[0])

output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ datasets==5.0.0
openvino==2026.2.0
optimum-intel[openvino]==2.0.0
optimum==2.2.0
transformers==4.53.0
transformers==5.0.0
onnx==1.22.0
torch==2.10.0
5 changes: 3 additions & 2 deletions examples/llm_compression/openvino/tiny_llama/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@ def transform_fn(data, tokenizer):
model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR)

messages = [{"role": "user", "content": "What is PyTorch?"}]
input_ids = tokenizer.apply_chat_template(
batch_feature = tokenizer.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(device=model.device)
)
input_ids = batch_feature["input_ids"]

start_t = time.time()
output = model.generate(input_ids, max_new_tokens=100)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ openvino==2026.2.0
optimum-intel[openvino]==2.0.0
optimum==2.2.0
torch==2.10.0
transformers==4.53.0
transformers==5.0.0
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@light_req#subdirectory=tools/who_what_benchmark
whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@a26c9b20d07209f035f6d74aeae94d6a72a132ab#subdirectory=tools/who_what_benchmark
numpy==1.26.4
openvino==2026.2.0
optimum-intel==2.0.0
optimum==2.2.0
transformers==4.53.0
transformers==5.0.0
onnx==1.22.0
torch==2.10.0
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ numpy>=1.23.5,<2
openvino==2026.2.0
optimum-intel==2.0.0
optimum==2.2.0
transformers==4.53.0
transformers==5.0.0
onnx==1.22.0
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ numpy>=1.23.5,<2
openvino==2026.2.0
optimum-intel==2.0.0
optimum==2.2.0
transformers==4.53.0
lm_eval==0.4.8
transformers==5.0.0
lm_eval[hf]==0.4.12
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ numpy>=1.23.5,<2
openvino==2026.2.0
optimum-intel==2.0.0
optimum==2.2.0
transformers==4.53.0
lm_eval==0.4.8
transformers==5.0.0
lm_eval[hf]==0.4.12
torchao==0.17.0
3 changes: 2 additions & 1 deletion examples/llm_compression/torch_fx/tiny_llama/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,10 @@ def main() -> str:
},
{"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
input_ids = tokenizer.apply_chat_template(
batch_feature = tokenizer.apply_chat_template(
messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
)
input_ids = batch_feature["input_ids"]

print("Warmup...")
output = compressed_model_hf.generate(input_ids)
Expand Down
4 changes: 2 additions & 2 deletions examples/llm_compression/torch_fx/tiny_llama/modelling.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from transformers import GenerationMixin
from transformers import PretrainedConfig
from transformers import PreTrainedModel
from transformers.cache_utils import StaticCacheConfig
from transformers.integrations.executorch import TorchExportableModuleWithStaticCache
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.models.llama.configuration_llama import LlamaConfig
Expand Down Expand Up @@ -97,8 +96,9 @@ def convert_and_export_with_cache(model: PreTrainedModel) -> tuple[ExportedProgr
example_cache_position = torch.arange(0, 8, dtype=torch.long)
model_config = None
gen_config = None
model.generation_config.use_cache = True
model.generation_config.cache_implementation = "static"
model.generation_config.cache_config = StaticCacheConfig(batch_size=1, max_cache_len=512)
model.generation_config.cache_config = {"batch_size": 1, "max_cache_len": 512}
model.generation_config.max_new_tokens = 100
gen_config = model.generation_config
model_config = model.config
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
transformers==4.53.0
transformers==5.0.0
datasets==5.0.0
openvino==2026.2.0
optimum==2.2.0
Expand Down
2 changes: 1 addition & 1 deletion tests/openvino/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@ addict>=2.4.0
timm==0.9.2
efficientnet_pytorch==0.7.1
datasets
transformers==4.53.0
transformers==5.0.0
optimum-intel==2.0.0
optimum==2.2.0
4 changes: 2 additions & 2 deletions tests/post_training/pipelines/fx_modelling.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from transformers import GenerationMixin
from transformers import PretrainedConfig
from transformers import PreTrainedModel
from transformers.cache_utils import StaticCacheConfig
from transformers.integrations.executorch import TorchExportableModuleWithStaticCache
from transformers.modeling_outputs import CausalLMOutputWithPast

Expand Down Expand Up @@ -84,8 +83,9 @@ def convert_and_export_with_cache(model: PreTrainedModel):

example_input_ids = torch.ones(1, 8, dtype=torch.long)
example_cache_position = torch.arange(0, 8, dtype=torch.long)
model.generation_config.use_cache = True
model.generation_config.cache_implementation = "static"
model.generation_config.cache_config = StaticCacheConfig(batch_size=1, max_cache_len=512)
model.generation_config.cache_config = {"batch_size": 1, "max_cache_len": 512}
model.generation_config.max_new_tokens = 100
gen_config = model.generation_config
model_config = model.config
Expand Down
27 changes: 25 additions & 2 deletions tests/post_training/pipelines/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,37 @@
from tests.post_training.pipelines.base import PTQTestPipeline


# TODO(AlexanderDokuchaev): Remove this wrapper when the issue with torch.jit.trace and transformers>=5.0 is fixed
class CausalLMTracingWrapper(torch.nn.Module):
"""
Wraps a Hugging Face causal language model so it can be exported via ``torch.jit.trace``
(used internally by ``openvino.convert_model``).

Since ``transformers>=5.0`` a causal LM forward pass returns a ``DynamicCache`` in its output
and builds the attention mask with a ``torch.diff``-based packed-sequence check when no
attention mask is provided. Neither construct is supported by ``torch.jit.trace`` / the
OpenVINO PyTorch frontend.
"""

def __init__(self, model: torch.nn.Module) -> None:
super().__init__()
self.model = model
self.model.config.use_cache = False

def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor | None = None) -> torch.Tensor:
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
return self.model(input_ids=input_ids, attention_mask=attention_mask)


class GPT(PTQTestPipeline):
"""Pipeline for causal language models from Hugging Face repository"""

def prepare_model(self) -> None:
if self.backend in PT_BACKENDS:
self.model_hf = transformers.AutoModelForCausalLM.from_pretrained(self.model_id)
self.model = self.model_hf
self.model.config.torchscript = True # Set to export by convert_model via torch.jit.trace
self.model = CausalLMTracingWrapper(self.model_hf)

self.dummy_tensor = self.model_hf.dummy_inputs["input_ids"]

elif self.backend in OV_BACKENDS + [BackendType.FP32]:
Expand Down
6 changes: 3 additions & 3 deletions tests/post_training/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ optimum-onnx @ git+https://github.com/AlexanderDokuchaev/optimum-onnx.git@b57739
scikit-learn>=1.2.2,<=1.5.0
soundfile==0.12.1
tensorboard==2.20.0
tensorflow-io==0.37.0
tensorflow-io==0.37.1
timm==0.9.2
accelerate==1.9.0
transformers==4.53.0
whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@light_req#subdirectory=tools/who_what_benchmark
transformers==5.0.0
whowhatbench @ git+https://github.com/AlexanderDokuchaev/openvino.genai@a26c9b20d07209f035f6d74aeae94d6a72a132ab#subdirectory=tools/who_what_benchmark
16 changes: 0 additions & 16 deletions tests/torch/function_hook/sparsify_activations/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import openvino as ov
import torch
import torch.nn as nn
import transformers.models

from nncf import IgnoredScope
from nncf.experimental.torch.sparsify_activations import TargetScope
Expand All @@ -36,21 +35,6 @@ def forward(self, input_ids: torch.Tensor):
return y0, y1


def dummy_llama_model():
    """
    Build a very small Llama causal language model for use in tests.

    The model is created from a ``LlamaConfig`` (not from pretrained weights) with the cache
    disabled, tuple outputs (``return_dict=False``) and the "eager" attention implementation.

    :return: The model instantiated by ``transformers.AutoModelForCausalLM.from_config``.
    """
    llama_config_cls = transformers.models.llama.configuration_llama.LlamaConfig
    tiny_config = llama_config_cls(
        vocab_size=32,
        hidden_size=8,
        intermediate_size=14,
        num_attention_heads=2,
        num_key_value_heads=1,
        num_hidden_layers=2,
        use_cache=False,
        return_dict=False,
    )
    return transformers.AutoModelForCausalLM.from_config(tiny_config, attn_implementation="eager")


def count_sparsifier_patterns_in_ov(model: ov.Model) -> int:
"""
Counts the number of activation sparsification pattern "Abs -> LessEqual -> Select"
Expand Down
19 changes: 0 additions & 19 deletions tests/torch/function_hook/sparsify_activations/test_algo.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
from tests.cross_fw.shared.paths import TEST_ROOT
from tests.torch.function_hook.sparsify_activations.helpers import ThreeLinearModel
from tests.torch.function_hook.sparsify_activations.helpers import count_sparsifier_patterns_in_ov
from tests.torch.function_hook.sparsify_activations.helpers import dummy_llama_model
from tests.torch.helpers import set_torch_seed
from tests.torch.utils import compare_with_reference_file
from tests.torch.utils import to_comparable_nx_graph
Expand Down Expand Up @@ -100,24 +99,6 @@ def __str__(self) -> str:
ref_num_batches_tracked=3,
ref_num_patterns_in_ov=2,
),
SparsifyActivationsAlgorithmTestDesc(
name="dummy_llama",
model_getter=dummy_llama_model,
dataset_getter=lambda device: nncf.Dataset(torch.randint(0, 30, (3, 2, 8)).to(device)),
target_sparsity_by_scope={
TargetScope(patterns=[".*gate_proj.*"]): 0.2,
TargetScope(patterns=[".*up_proj.*"]): 0.3,
TargetScope(patterns=[".*down_proj.*"]): 0.4,
},
ignored_scope=None,
ref_sparsifier_target_sparsity={
(f"pre_hooks.model/mlp/{name}/linear/{layer_id}__0.0"): sparsity
for name, sparsity in [("gate_proj", 0.2), ("up_proj", 0.3), ("down_proj", 0.4)]
for layer_id in [0, 1]
},
ref_num_batches_tracked=3,
ref_num_patterns_in_ov=6,
),
]
Comment thread
AlexanderDokuchaev marked this conversation as resolved.


Expand Down
4 changes: 2 additions & 2 deletions tests/torch/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ torchvision

addict>=2.4.0
efficientnet_pytorch==0.7.1
transformers==4.53.0
transformers==5.0.0

sentence-transformers==4.1.0
sentence-transformers==5.6.0
optimum-intel==2.0.0
optimum==2.2.0
accelerate==1.9.0
Expand Down