From 53d19b95b0d0fee1d1d7259afd920a97d62b4881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 19 Jan 2026 10:39:10 +0100 Subject: [PATCH 001/190] Transformers v5 --- .github/workflows/test_openvino.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 2d0958b2d6..cffeabc42d 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest"] + transformers-version: ["4.45.0", "latest", "5.0.0rc3"] runs-on: ubuntu-22.04 diff --git a/setup.py b/setup.py index f7be8fd778..79a2cac349 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", - "transformers>=4.45,<4.58", + "transformers>=4.45,<5.1", "setuptools", ] From 5205434f5394f98072291ededa597869b1604839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 19 Jan 2026 16:11:49 +0100 Subject: [PATCH 002/190] fix loading for llava_next_video --- tests/openvino/test_genai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index b8b9e8d6cd..cdb4f8a555 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -247,9 +247,9 @@ def _get_model_class(self, model_arch): return AutoModelForImageTextToText elif model_arch == "llava_next_video": - from transformers import AutoModelForVision2Seq + from transformers import LlavaNextVideoForConditionalGeneration - return AutoModelForVision2Seq + return LlavaNextVideoForConditionalGeneration elif model_arch == "llava": from transformers import LlavaForConditionalGeneration From e8feb0caf0d4286fa633ba2e2907681e2e9605f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 20 Jan 2026 18:09:58 +0100 Subject: [PATCH 003/190] Remove deprecated transformers.onnx --- optimum/intel/openvino/modeling_base.py | 6 ++---- optimum/intel/openvino/utils.py | 13 ------------- tests/openvino/test_modeling.py | 11 +++++------ 3 files changed, 7 insertions(+), 23 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 89fa7f5a88..6632acde68 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -851,16 +851,14 @@ def _export( **kwargs, ): """ - Export a vanilla Transformers model into an ONNX model using `transformers.onnx.export_onnx`. + Load and export a model to the OpenVINO IR. Arguments: model_id (`str` or `Path`): The directory from which to load the model. Can be either: - The model id of a pretrained model hosted inside a model repo on huggingface.co. - - The path to a directory containing the model weights. save_dir (`str` or `Path`): - The directory where the exported ONNX model should be saved, default to - `transformers.file_utils.default_cache_path`, which is the cache directory for transformers. + - The path to a directory containing the model weights. token (Optional[Union[bool, str]], defaults to `None`): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 4baa280fea..bb3ec658ed 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -32,7 +32,6 @@ from openvino import Type as OVType from packaging.version import Version from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size from optimum.intel.utils.import_utils import is_torch_version @@ -228,18 +227,6 @@ def maybe_convert_tokenizer_to_fast( return hf_tokenizer -def use_external_data_format(num_parameters: int) -> bool: - """ - Returns whether or not the model requires using external data format for the ONNX export - Args: - num_parameters: Number of parameter on the model - Returns: - True if model.num_parameters() * size_of(float32) >= 2Gb False otherwise - """ - - return compute_serialized_parameters_size(num_parameters, ParameterFormat.Float) >= EXTERNAL_DATA_FORMAT_SIZE_LIMIT - - def _is_timm_ov_dir(model_dir): config_file = None has_xml = False diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 279fcb9a8d..0c15a1b251 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -53,7 +53,6 @@ pipeline, set_seed, ) -from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import slow from transformers.utils import http_user_agent from utils_tests import F32_CONFIG, MODEL_NAMES, OPENVINO_DEVICE, SEED, TENSOR_ALIAS_TO_TYPE, TEST_IMAGE_URL @@ -236,7 +235,7 @@ def test_load_from_hub_and_save_visual_language_model(self): # anymore due to an internal bug in transformers model_ids.append("katuni4ka/phi-4-multimodal-ov") for model_id in model_ids: - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) prompt = "What is shown in this image?" image = Image.open( requests.get( @@ -491,7 +490,7 @@ def test_load_from_hub_and_save_sam_model(self): self.assertEqual( loaded_model.prompt_encoder_mask_decoder.request.get_property("PERFORMANCE_HINT"), "THROUGHPUT" ) - processor = get_preprocessor(self.OV_SAM_MODEL_ID) + processor = AutoProcessor.from_pretrained(self.OV_SAM_MODEL_ID) img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" input_points = [[[450, 600]]] raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") @@ -1846,7 +1845,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) ov_model = OVSamModel.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertIsInstance(ov_model.vision_encoder, OVSamVisionEncoder) self.assertIsInstance(ov_model.prompt_encoder_mask_decoder, OVSamPromptEncoder) @@ -1899,7 +1898,7 @@ def test_reshape(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) ov_model = OVSamModel.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertTrue(ov_model.is_dynamic) input_points = [[[450, 600]]] IMAGE = Image.open( @@ -1935,7 +1934,7 @@ def test_compare_to_transformers(self, model_arch): ov_model = OVModelForZeroShotImageClassification.from_pretrained( model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE ) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertIsInstance(ov_model.config, PretrainedConfig) From bb54f64adee17964693a238de416ebd492a728ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 23 Jan 2026 15:01:05 +0100 Subject: [PATCH 004/190] remove deprecated transformers.onnx from tests --- tests/openvino/test_seq2seq.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index dbc1031a4c..238e13a1ac 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -42,7 +42,6 @@ set_seed, ) from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES -from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import slow from transformers.utils import http_user_agent from utils_tests import F32_CONFIG, MODEL_NAMES, OPENVINO_DEVICE, SEED, TEST_IMAGE_URL, Timer @@ -336,7 +335,7 @@ def test_compare_to_transformers(self, model_arch): self._check_openvino_model_attributes(ov_model, use_cache=True, stateful=True) self._check_openvino_model_attributes(ov_model_stateless, use_cache=True, stateful=False) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) data = self._generate_random_audio_data() pt_features = processor.feature_extractor(data, return_tensors="pt") decoder_start_token_id = transformers_model.config.decoder_start_token_id @@ -395,7 +394,7 @@ def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] model = self.OVMODEL_CLASS.from_pretrained(model_id, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) pipe = pipeline( "automatic-speech-recognition", model=model, @@ -1079,7 +1078,7 @@ def test_compare_to_transformers(self, model_arch): question = "Who am I?" transformers_model = self.AUTOMODEL_CLASS.from_pretrained(model_id) - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) inputs = preprocessor(images=self.IMAGE, text=question, padding=True, return_tensors="pt") ov_outputs = ov_model(**inputs) @@ -1100,7 +1099,7 @@ def test_compare_to_transformers(self, model_arch): def test_generate_utils(self, model_arch): model_id = MODEL_NAMES[model_arch] model = self.OVMODEL_CLASS.from_pretrained(model_id, export=True, device=OPENVINO_DEVICE) - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) question = "Who am I?" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") @@ -1114,7 +1113,7 @@ def test_generate_utils(self, model_arch): def test_compare_with_and_without_past_key_values(self): model_id = MODEL_NAMES["pix2struct"] - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) question = "Who am I?" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") model_with_pkv = self.OVMODEL_CLASS.from_pretrained( From 71aa34e773537e6a191463e4c9298720fd3ff714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 23 Jan 2026 15:38:03 +0100 Subject: [PATCH 005/190] remove huggingface_hub deprecated --- optimum/intel/openvino/modeling_base.py | 8 +++++++- optimum/intel/utils/import_utils.py | 18 ++++++++++++++++++ optimum/intel/utils/modeling_utils.py | 11 ++++++++--- setup.py | 1 + 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b96f375728..569422a085 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -28,12 +28,12 @@ from transformers import GenerationConfig, PretrainedConfig from transformers.file_utils import add_start_docstrings from transformers.generation import GenerationMixin -from transformers.utils import is_offline_mode from transformers.utils.hub import cached_file from optimum.exporters.base import ExportConfig from optimum.exporters.openvino.utils import _MAX_UNCOMPRESSED_SIZE from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel +from optimum.utils.import_utils import is_huggingface_hub_version from ...exporters.openvino import export, main_export from ..utils.import_utils import is_nncf_available @@ -58,6 +58,12 @@ ) +if is_huggingface_hub_version(">=", "1.2.1"): + from huggingface_hub import is_offline_mode +else: + from transformers.utils import is_offline_mode + + core = Core() logger = logging.getLogger(__name__) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 3ad9877a82..d5e44d06d0 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -119,6 +119,15 @@ pass +_huggingface_hub_available = importlib.util.find_spec("huggingface_hub") is not None +_huggingface_hub_version = "N/A" +if _huggingface_hub_available: + try: + _huggingface_hub_version = importlib_metadata.version("huggingface_hub") + except importlib_metadata.PackageNotFoundError: + _huggingface_hub_available = False + + _safetensors_version = "N/A" _safetensors_available = importlib.util.find_spec("safetensors") is not None if _safetensors_available: @@ -486,6 +495,15 @@ def is_sentence_transformers_version(operation: str, version: str): return compare_versions(parse(_sentence_transformers_version), operation, version) +def is_huggingface_hub_version(operation: str, version: str): + """ + Compare the current huggingface_hub version to a given reference with an operation. + """ + if not _huggingface_hub_available: + return False + return compare_versions(parse(_huggingface_hub_version), operation, version) + + DIFFUSERS_IMPORT_ERROR = """ {0} requires the diffusers library but it was not found in your environment. You can install it with pip: `pip install diffusers`. Please note that you may need to restart your runtime after installation. diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cab9e5efa3..83b5ccc1ac 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -23,14 +23,19 @@ from typing import Dict, List, Optional, Type, Union import torch -from huggingface_hub import HfApi, HfFolder, hf_hub_download +from huggingface_hub import HfApi, get_token, hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.hf_api import file_exists from transformers import CLIPConfig, PretrainedConfig, PreTrainedModel from optimum.exporters.tasks import TasksManager -from .import_utils import is_diffusers_available, is_numa_available, is_open_clip_available, is_psutil_available +from .import_utils import ( + is_diffusers_available, + is_numa_available, + is_open_clip_available, + is_psutil_available, +) if is_diffusers_available(): @@ -115,7 +120,7 @@ def _find_files_matching_pattern( model_path = Path(model_name_or_path) if not isinstance(model_name_or_path, Path) else model_name_or_path if isinstance(use_auth_token, bool): - token = HfFolder().get_token() + token = get_token() else: token = use_auth_token diff --git a/setup.py b/setup.py index 79a2cac349..9937ad3ebf 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", "transformers>=4.45,<5.1", "setuptools", + "huggingface-hub>=0.23.2,<2.0", ] TESTS_REQUIRE = [ From 0954015d7953735a0c1e5f1519bbbbd7cafeb77b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 23 Jan 2026 19:19:17 +0100 Subject: [PATCH 006/190] relative to absolute import --- optimum/intel/openvino/modeling_base.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 569422a085..8a16470fe4 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -31,14 +31,9 @@ from transformers.utils.hub import cached_file from optimum.exporters.base import ExportConfig +from optimum.exporters.openvino import export, main_export from optimum.exporters.openvino.utils import _MAX_UNCOMPRESSED_SIZE -from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel -from optimum.utils.import_utils import is_huggingface_hub_version - -from ...exporters.openvino import export, main_export -from ..utils.import_utils import is_nncf_available -from ..utils.modeling_utils import _find_files_matching_pattern -from .configuration import ( +from optimum.intel.openvino.configuration import ( _DEFAULT_4BIT_WQ_CONFIG, OVConfig, OVQuantizationConfigBase, @@ -47,7 +42,7 @@ _quantization_config_from_dict, get_default_quantization_config, ) -from .utils import ( +from optimum.intel.openvino.utils import ( ONNX_WEIGHTS_NAME, OV_TO_PT_TYPE, OV_XML_FILE_NAME, @@ -56,6 +51,9 @@ classproperty, model_has_dynamic_inputs, ) +from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available +from optimum.intel.utils.modeling_utils import _find_files_matching_pattern +from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel if is_huggingface_hub_version(">=", "1.2.1"): From 1ba9789bd9d8a18cd56631bbb7d85edd8ce8144f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 15:06:56 +0100 Subject: [PATCH 007/190] update workflow to v5 --- .github/workflows/test_openvino.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index cffeabc42d..f1874d3dbd 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest", "5.0.0rc3"] + transformers-version: ["4.45.0", "latest", "5.0.0"] runs-on: ubuntu-22.04 From f1586565e90bdf05b900b9e9912089a42d0d417f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 15:43:26 +0100 Subject: [PATCH 008/190] remove redundant --- optimum/exporters/openvino/model_configs.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ca12d455be..e9c7b52d97 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -247,10 +247,6 @@ def init_model_configs(): "transformers", "Qwen2VLForConditionalGeneration", ) - TasksManager._CUSTOM_CLASSES[("pt", "qwen2_5_vl", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( "transformers", "AutoModelForVision2Seq", @@ -259,14 +255,6 @@ def init_model_configs(): "transformers", "Gemma3ForConditionalGeneration", ) - TasksManager._CUSTOM_CLASSES[("pt", "idefics3", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) - TasksManager._CUSTOM_CLASSES[("pt", "smolvlm", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "image-text-to-text")] = ("transformers", "AutoModelForCausalLM") TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "automatic-speech-recognition")] = ( "transformers", @@ -280,10 +268,6 @@ def init_model_configs(): "transformers", "AutoModelForCausalLM", ) - TasksManager._CUSTOM_CLASSES[("pt", "llama4", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" From 93451439c75f2758031a3acb573547a5a55add26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 16:18:13 +0100 Subject: [PATCH 009/190] update loading given transformers version --- optimum/exporters/openvino/model_configs.py | 53 +++++++++++++-------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index e9c7b52d97..67686b94bb 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -235,26 +235,7 @@ def init_model_configs(): if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {} - TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( - "transformers", - "LlavaForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "llava_next", "image-text-to-text")] = ( - "transformers", - "LlavaNextForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-text")] = ( - "transformers", - "Qwen2VLForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( - "transformers", - "AutoModelForVision2Seq", - ) - TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( - "transformers", - "Gemma3ForConditionalGeneration", - ) + TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "image-text-to-text")] = ("transformers", "AutoModelForCausalLM") TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "automatic-speech-recognition")] = ( "transformers", @@ -269,6 +250,38 @@ def init_model_configs(): "AutoModelForCausalLM", ) + # since transformers v4.46, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/models/auto/modeling_auto.py#L776 + if is_transformers_version("<", "4.46"): + TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( + "transformers", + "LlavaForConditionalGeneration", + ) + TasksManager._CUSTOM_CLASSES[("pt", "llava_next", "image-text-to-text")] = ( + "transformers", + "LlavaNextForConditionalGeneration", + ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-text")] = ( + "transformers", + "Qwen2VLForConditionalGeneration", + ) + + # since transformers v4.50, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.50.0/src/transformers/models/auto/modeling_auto.py#L835 + if is_transformers_version("<", "4.50"): + TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( + "transformers", + "Gemma3ForConditionalGeneration", + ) + + # since transformers v4.52, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.52.0/src/transformers/models/auto/modeling_auto.py#L899 + if is_transformers_version("<", "4.52"): + TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( + "transformers", + "AutoModelForVision2Seq", + ) + if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} From b290ae3c36ca4b6dd995b0601be2450a6aed63ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 16:37:48 +0100 Subject: [PATCH 010/190] remove deprecated AutoModelForVision2Seq --- optimum/intel/openvino/modeling_seq2seq.py | 15 +++++++++++++-- tests/openvino/test_seq2seq.py | 21 ++++++++++++++++----- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index c1b2177c59..4a7bc0394d 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -27,7 +27,6 @@ AutoConfig, AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, - AutoModelForVision2Seq, GenerationConfig, Pix2StructForConditionalGeneration, PretrainedConfig, @@ -56,6 +55,18 @@ ) +# AutoModelForVision2Seq is deprecated since v4.54 +# https://github.com/huggingface/transformers/blob/v4.54.0/src/transformers/models/auto/modeling_auto.py#L2151 +if is_transformers_version(">=", "4.54.0"): + from transformers import AutoModelForImageTextToText + + transformers_auto_class = AutoModelForImageTextToText +else: + from transformers import AutoModelForVision2Seq + + transformers_auto_class = AutoModelForVision2Seq + + core = Core() logger = logging.getLogger(__name__) @@ -1036,7 +1047,7 @@ def _reorder_cache( INPUTS_DOCSTRING, ) class OVModelForVision2Seq(OVModelForSeq2SeqLM): - auto_model_class = AutoModelForVision2Seq + auto_model_class = transformers_auto_class main_input_name = "pixel_values" export_feature = "image-to-text" diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 238e13a1ac..83a4b7c54f 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -33,7 +33,6 @@ AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, AutoModelForTextToSpectrogram, - AutoModelForVision2Seq, AutoProcessor, AutoTokenizer, GenerationConfig, @@ -69,6 +68,18 @@ from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version +# AutoModelForVision2Seq is deprecated since v4.54 +# https://github.com/huggingface/transformers/blob/v4.54.0/src/transformers/models/auto/modeling_auto.py#L2151 +if is_transformers_version(">=", "4.54.0"): + from transformers import AutoModelForImageTextToText + + transformers_auto_class = AutoModelForImageTextToText +else: + from transformers import AutoModelForVision2Seq + + transformers_auto_class = AutoModelForVision2Seq + + os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -421,7 +432,7 @@ class OVModelForVision2SeqIntegrationTest(OVSeq2SeqTestMixin): UNSUPPORTED_ARCHITECTURES = {"got_ocr2", "pix2struct"} TASK = "image-to-text" OVMODEL_CLASS = OVModelForVision2Seq - AUTOMODEL_CLASS = AutoModelForVision2Seq + AUTOMODEL_CLASS = transformers_auto_class GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -580,9 +591,9 @@ def get_transformer_model_class(self, model_arch): return AutoModelForImageTextToText if model_arch == "llava_next_video": - from transformers import AutoModelForVision2Seq + from transformers import LlavaNextVideoForConditionalGeneration - return AutoModelForVision2Seq + return LlavaNextVideoForConditionalGeneration if model_arch == "llava": from transformers import LlavaForConditionalGeneration @@ -1056,7 +1067,7 @@ class OVModelForPix2StructIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = ["pix2struct"] TASK = "image-to-text" # is it fine as well with visual-question-answering? OVMODEL_CLASS = OVModelForVision2Seq - AUTOMODEL_CLASS = AutoModelForVision2Seq + AUTOMODEL_CLASS = transformers_auto_class GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 From a4d1dc0067813762978c3252c029b140b7e53ebd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 16:44:22 +0100 Subject: [PATCH 011/190] update workflow --- .github/workflows/test_openvino.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index f1874d3dbd..1e8433087c 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -56,7 +56,7 @@ jobs: pip install --upgrade pip uv uv pip install .[openvino,diffusers,tests] - - if: ${{ matrix.transformers-version != 'latest' }} + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator From ac953baa2715e0f4665a4d6b03303cd679e7ebd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 17:30:56 +0100 Subject: [PATCH 012/190] style --- optimum/intel/utils/modeling_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 83b5ccc1ac..69de1770ce 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -29,8 +29,7 @@ from transformers import CLIPConfig, PretrainedConfig, PreTrainedModel from optimum.exporters.tasks import TasksManager - -from .import_utils import ( +from optimum.intel.utils.import_utils import ( is_diffusers_available, is_numa_available, is_open_clip_available, From 800188441707ed6c8ea1b216d742cc110911b062 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 17:42:39 +0100 Subject: [PATCH 013/190] update setup --- .github/workflows/test_openvino.yml | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 1e8433087c..81c8b4b48a 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest", "5.0.0"] + transformers-version: ["4.45", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -61,7 +61,7 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version == 'latest' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip install auto-gptq "autoawq<0.2.8" diff --git a/setup.py b/setup.py index 9937ad3ebf..1c313dbe0c 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 5f2a00716ee2755fe9924d491d30ce476c2d947b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 17:48:35 +0100 Subject: [PATCH 014/190] deprecated is_offline_mode --- optimum/intel/openvino/modeling_open_clip.py | 25 ++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_open_clip.py b/optimum/intel/openvino/modeling_open_clip.py index db6abf9cc3..11bc115843 100644 --- a/optimum/intel/openvino/modeling_open_clip.py +++ b/optimum/intel/openvino/modeling_open_clip.py @@ -31,16 +31,27 @@ from transformers.file_utils import add_start_docstrings from transformers.modeling_outputs import ModelOutput from transformers.models.clip.modeling_clip import CLIPOutput -from transformers.utils import is_offline_mode +from optimum.exporters.openvino import main_export from optimum.exporters.tasks import TasksManager +from optimum.intel.openvino.configuration import ( + OVConfig, + OVWeightQuantizationConfig, +) +from optimum.intel.openvino.modeling import MODEL_START_DOCSTRING, OVModel +from optimum.intel.openvino.modeling_base import OVModelHostMixin +from optimum.intel.openvino.utils import ( + TemporaryDirectory, + classproperty, +) +from optimum.intel.utils.import_utils import is_huggingface_hub_version +from optimum.intel.utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification + -from ...exporters.openvino import main_export -from ..utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification -from .configuration import OVConfig, OVWeightQuantizationConfig -from .modeling import MODEL_START_DOCSTRING, OVModel -from .modeling_base import OVModelHostMixin -from .utils import TemporaryDirectory, classproperty +if is_huggingface_hub_version(">=", "1.2.1"): + from huggingface_hub import is_offline_mode +else: + from transformers.utils import is_offline_mode logger = logging.getLogger(__name__) From ad477fe92395a73a67aac57349cbe25c4a82e466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 18:15:55 +0100 Subject: [PATCH 015/190] remove incompatible neural-compressor installation --- .github/workflows/build_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index bcb51d6b58..52dae651de 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,7 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install .[quality] nncf openvino neural-compressor[pt]>3.4 diffusers accelerate datasets + uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation shell: bash diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 16ac720c8f..c4a34baaa6 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,7 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install .[quality] nncf openvino neural-compressor[pt]>3.4 diffusers accelerate datasets + uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation shell: bash From 42e98b8495fc4ac8dc090cf06c5459f400faff55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 18:18:57 +0100 Subject: [PATCH 016/190] remove documentation reference --- docs/source/neural_compressor/reference.mdx | 40 --------------------- 1 file changed, 40 deletions(-) diff --git a/docs/source/neural_compressor/reference.mdx b/docs/source/neural_compressor/reference.mdx index b6e3d8f468..b83618b4bc 100644 --- a/docs/source/neural_compressor/reference.mdx +++ b/docs/source/neural_compressor/reference.mdx @@ -14,43 +14,3 @@ specific language governing permissions and limitations under the License. `optimum.intel.neural_compressor` is deprecated and will be removed in the next major release. - -## INCQuantizer - -[[autodoc]] neural_compressor.quantization.INCQuantizer - -## INCTrainer - -[[autodoc]] neural_compressor.trainer.INCTrainer - -## INCModel - -[[autodoc]] neural_compressor.modeling_base.INCModel - -## INCModelForSequenceClassification - -[[autodoc]] neural_compressor.modeling_base.INCModelForSequenceClassification - -## INCModelForQuestionAnswering - -[[autodoc]] neural_compressor.modeling_base.INCModelForQuestionAnswering - -## INCModelForTokenClassification - -[[autodoc]] neural_compressor.modeling_base.INCModelForTokenClassification - -## INCModelForMultipleChoice - -[[autodoc]] neural_compressor.modeling_base.INCModelForMultipleChoice - -## INCModelForMaskedLM - -[[autodoc]] neural_compressor.modeling_base.INCModelForMaskedLM - -## INCModelForCausalLM - -[[autodoc]] neural_compressor.modeling_base.INCModelForCausalLM - -## INCModelForSeq2SeqLM - -[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM From 4ee3f51ccdf946ae44644e9980f494d0893c2f71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 18:30:58 +0100 Subject: [PATCH 017/190] add install transformers step --- .github/workflows/test_openvino.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 81c8b4b48a..2fcd23dbcf 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45", "4.57.6", "latest"] + transformers-version: ["4.45.0", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -56,6 +56,11 @@ jobs: pip install --upgrade pip uv uv pip install .[openvino,diffusers,tests] + - if: ${{ matrix.transformers-version != 'latest' }} + name: Install transformers + run: | + uv pip install transformers==${{ matrix.transformers-version }} + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | From 8204264e1ab001d039ccdfae3a3c48418ccc23d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 27 Jan 2026 18:54:01 +0100 Subject: [PATCH 018/190] transformers v5 --- .github/workflows/test_openvino.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 2fcd23dbcf..aef4ef484b 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "4.57.6", "latest"] + transformers-version: ["4.45.0", "5.0.0", "latest"] runs-on: ubuntu-22.04 From b319d19a5e8d33761bad5de23291a2c8c87557af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 27 Jan 2026 19:10:25 +0100 Subject: [PATCH 019/190] install diffusers from source for v5 --- .github/workflows/test_openvino.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index aef4ef484b..5bcbc0e31c 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -66,6 +66,11 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator + - if: ${{ matrix.transformers-version == '5.0.0' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers + - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | From 42300e42bbea84fde261a6cf01f81ac3789081a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 27 Jan 2026 19:21:26 +0100 Subject: [PATCH 020/190] remove deprecated CLIPFeatureExtractor --- optimum/intel/openvino/modeling_diffusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 368265bc3e..22182ee96c 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -50,7 +50,7 @@ from huggingface_hub.utils import validate_hf_hub_args from openvino import Core from openvino._offline_transformations import compress_model_transformation -from transformers import CLIPFeatureExtractor, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTokenizer from transformers.modeling_outputs import ModelOutput from transformers.utils import http_user_agent @@ -170,7 +170,7 @@ def __init__( tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, tokenizer_3: Optional[CLIPTokenizer] = None, - feature_extractor: Optional[CLIPFeatureExtractor] = None, + feature_extractor: Optional[CLIPImageProcessor] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, requires_aesthetics_score: bool = False, From 2a761024506fa8536a77c603233e875e25a4dbb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 28 Jan 2026 16:17:24 +0100 Subject: [PATCH 021/190] openvino 2025.3.0 --- optimum/intel/openvino/__init__.py | 8 +++++--- setup.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8441944800..28e39f0528 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -35,8 +35,12 @@ warnings.simplefilter(action="ignore", category=FutureWarning) + +logger = logging.getLogger(__name__) + + if is_openvino_version("<", "2025.4.0"): - raise ImportError( + logger.warning( "Optimum-intel requires OpenVINO version 2025.4.0 or higher. " "Please upgrade OpenVINO to version 2025.4 or later. " f"The current version of OpenVINO is {_openvino_version}." @@ -51,8 +55,6 @@ ) -logger = logging.getLogger(__name__) - if is_nncf_available(): import nncf diff --git a/setup.py b/setup.py index 1c313dbe0c..b2c945b37b 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], + "openvino": ["nncf>=2.19.0", "openvino==2025.3.0", "openvino-tokenizers==2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From f38703a626c64495bd67233368b2b36a5d0a78af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 10:57:49 +0100 Subject: [PATCH 022/190] add ov cache classes --- optimum/exporters/openvino/model_patcher.py | 167 ++++++++++++++++---- 1 file changed, 140 insertions(+), 27 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 73b25149d9..3639ece9cf 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -18,6 +18,7 @@ import logging as log import math import types +from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch @@ -54,6 +55,118 @@ logger = logging.getLogger(__name__) +class OVDynamicCache(DynamicCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1005 + def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: + """ + Converts the `Cache` instance into the its equivalent in the legacy cache format. Used for + backward compatibility. + """ + legacy_cache = () + for layer in self.layers: + legacy_cache += ((layer.keys, layer.values),) + return legacy_cache + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1015 + @classmethod + def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "DynamicCache": + """ + Converts a cache in the legacy cache format into an equivalent `Cache`. Used for + backward compatibility. + """ + cache = cls() + if past_key_values is None: + logger.warning_once("past_key_values should not be None in from_legacy_cache()") + if past_key_values is not None: + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + +class OVEncoderDecoderCache(EncoderDecoderCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 + def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: + """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" + legacy_cache = () + if len(self.cross_attention_cache) > 0: + for self_attn, cross_attn in zip( + self.self_attention_cache.to_legacy_cache(), self.cross_attention_cache.to_legacy_cache() + ): + legacy_cache += (self_attn + cross_attn,) + else: + legacy_cache = self.self_attention_cache.to_legacy_cache() + return legacy_cache + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1279 + @classmethod + def from_legacy_cache( + cls, past_key_values: Optional[Iterable[tuple[torch.FloatTensor, ...]]] + ) -> "EncoderDecoderCache": + """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" + cache = cls(DynamicCache(), DynamicCache()) + if past_key_values is None: + logger.warning_once("past_key_values should not be None in from_legacy_cache()") + else: + for layer_idx, key_value_states in enumerate(past_key_values): + key_states, value_states = key_value_states[:2] + cache.self_attention_cache.update(key_states, value_states, layer_idx) + if len(key_value_states) > 2: + key_states, value_states = key_value_states[2:] + cache.cross_attention_cache.update(key_states, value_states, layer_idx) + cache.is_updated[layer_idx] = True + return cache + + +def preprocess_past_key_values(past_key_values): + if ( + is_transformers_version(">=", "4.48") + and isinstance(past_key_values, (list, tuple)) + and isinstance(past_key_values[0], (list, tuple)) + ): + if len(past_key_values[0]) == 2: + past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + elif len(past_key_values[0]) == 4: + past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) + else: + raise ValueError( + f"past_key_values should have either 2 or 4 elements, but it has {len(past_key_values[0])} elements." + ) + + return past_key_values + + +class OVModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + self.model_patched_forward = self.patched_forward + + @functools.wraps(self.model_patched_forward) + def patched_forward(*args, **kwargs): + signature = inspect.signature(self.model_patched_forward) + args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) + + if "past_key_values" in signature.parameters: + # Most models require past_key_values to be a cache instance instead of a tuple now + pkv_index = list(signature.parameters.keys()).index("past_key_values") + if pkv_index < len(args) and args[pkv_index] is not None: + args[pkv_index] = preprocess_past_key_values(args[pkv_index]) + elif kwargs.get("past_key_values") is not None: + kwargs["past_key_values"] = preprocess_past_key_values(kwargs["past_key_values"]) + + outputs = self.model_patched_forward(*args, **kwargs) + + return outputs + + self.patched_forward = patched_forward + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -210,7 +323,7 @@ def eager_mask_without_vmap(*args, **kwargs) -> Optional[torch.Tensor]: return mask -class OVDecoderModelPatcher(ModelPatcher): +class OVDecoderModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -3069,7 +3182,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class IBertModelPatcher(ModelPatcher): +class IBertModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3087,7 +3200,7 @@ def __init__( self._model(torch.ones([1, 1], dtype=torch.long)) -class InternVLChatImageEmbeddingModelPatcher(ModelPatcher): +class InternVLChatImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3230,7 +3343,7 @@ def maira_vision_embed_forward(self, pixel_values): return self.get_image_features(pixel_values, vision_feature_layer, vision_feature_select_strategy) -class LlavaImageEmbeddingModelPatcher(ModelPatcher): +class LlavaImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3247,7 +3360,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MairaImageEmbeddingModelPatcher(ModelPatcher): +class MairaImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3264,7 +3377,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class LlavaNextVideoImageEmbeddingModelPatcher(ModelPatcher): +class LlavaNextVideoImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3305,7 +3418,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: return emb.unsqueeze(1) -class FluxTransfromerModelPatcher(ModelPatcher): +class FluxTransfromerModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() if is_diffusers_version("<", "0.31.0"): @@ -3480,7 +3593,7 @@ def _minicpmv_siglip_transformer_forward( ) -class MiniCPMVResamplerModelPatcher(ModelPatcher): +class MiniCPMVResamplerModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3497,7 +3610,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MiniCPMVImageEmbeddingsModelPatcher(ModelPatcher): +class MiniCPMVImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3528,7 +3641,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher): +class LlavaQwen2ImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3546,7 +3659,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class InputEmbeddingPatcher(ModelPatcher): +class InputEmbeddingPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3571,7 +3684,7 @@ def phi3_vision_embeddings_forward(self, pixel_values: torch.FloatTensor): return self.get_img_features(pixel_values) -class Phi3VisionImageEmbeddingsPatcher(ModelPatcher): +class Phi3VisionImageEmbeddingsPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4180,7 +4293,7 @@ def block_forward( block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn) -class Qwen2VLVisionEmbMergerPatcher(ModelPatcher): +class Qwen2VLVisionEmbMergerPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4214,7 +4327,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.attn.forward = block.attn._orig_forward -class Qwen2_5_VLVisionEmbMergerPatcher(ModelPatcher): +class Qwen2_5_VLVisionEmbMergerPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4345,7 +4458,7 @@ def __exit__(self, exc_type, exc_value, traceback): block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward -class OVSeq2SeqModelPatcher(ModelPatcher): +class OVSeq2SeqModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4430,7 +4543,7 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) -class SanaTextEncoderModelPatcher(ModelPatcher): +class SanaTextEncoderModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -4481,7 +4594,7 @@ def __init__( super().__init__(config, model, model_kwargs) -class CommonImageEmbeddingsModelPatcher(ModelPatcher): +class CommonImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4631,7 +4744,7 @@ def __exit__(self, exc_type, exc_value, traceback): del self._model.model._orig_update_causual_mask -class Idefics3ImageEmbeddingsModelPatcher(ModelPatcher): +class Idefics3ImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5410,7 +5523,7 @@ def speecht5_decoder_layer_forward( return outputs -class OVSpeechT5ModelPatcher(ModelPatcher): +class OVSpeechT5ModelPatcher(OVModelPatcher): def __enter__(self): if self.real_config._behavior != "vocoder": super().__enter__() @@ -5586,7 +5699,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioForwardEmbeddingsPatcher(ModelPatcher): +class Phi4MMAudioForwardEmbeddingsPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5610,7 +5723,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioEncoderPatcher(ModelPatcher): +class Phi4MMAudioEncoderPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5651,7 +5764,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMVisionEmbeddingsPatcher(ModelPatcher): +class Phi4MMVisionEmbeddingsPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5960,7 +6073,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.img_processor.embeddings.forward = self._model.img_processor.embeddings._orig_forward -class Llama4ImageEmbeddingsModelPatcher(ModelPatcher): +class Llama4ImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6146,7 +6259,7 @@ def llama4_moe_forward(self, hidden_states): return out, router_scores -class Llama4TextModelPatcher(ModelPatcher): +class Llama4TextModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -6316,7 +6429,7 @@ def mamba_mixer_forward( # 1. Inject a MambaCache structure into the original model to simplify input and output handling related to SSM states # 2. Patch ConvSequenceTransform module to avoid if-else branching # 3. Vectorize the selective scan operation to ensure correct behavior during JIT tracing -class MambaPatcher(ModelPatcher): +class MambaPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6810,7 +6923,7 @@ def segment_sum(input_tensor): # for subsequent invocation of the model's `forward` method. # 2. Patches the Zamba2MambaMixer so that the traced `forward` function works correctly # during both the prefill and decoding steps. -class Zamba2ModelPatcher(ModelPatcher): +class Zamba2ModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -7236,7 +7349,7 @@ def granite_moe_hybrid_update_causal_mask( return causal_mask -class GraniteMoeHybridModelPatcher(ModelPatcher): +class GraniteMoeHybridModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", From 2d3c734c8abf3a9907cb929accd04ef61d57a5ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 13:28:58 +0100 Subject: [PATCH 023/190] openvino nightly for modeling tests --- .github/workflows/test_openvino.yml | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 5bcbc0e31c..950c2f987c 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -81,15 +81,15 @@ jobs: run: | python tests/scripts/login_with_ci_token.py - - name: Test with Pytest - run: | - pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 - - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install Nightly OpenVINO run: | uv pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - name: Test with Pytest + run: | + pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 + - if: ${{ matrix.test-pattern == '*modeling*' }} name: Test with Pytest - Nightly OpenVINO run: | diff --git a/setup.py b/setup.py index b2c945b37b..1c313dbe0c 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino==2025.3.0", "openvino-tokenizers==2025.3.0"], + "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From b6dcefd0949130e084d378e5bc6d7cc46c9e698c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 14:21:35 +0100 Subject: [PATCH 024/190] openvino 2025.3 for modeling tests --- .github/workflows/test_openvino.yml | 13 +++++++++---- setup.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 950c2f987c..37e61ea335 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -56,6 +56,11 @@ jobs: pip install --upgrade pip uv uv pip install .[openvino,diffusers,tests] + - if: ${{ matrix.test-pattern == '*modeling*' }} + name: Install OpenVINO + run: | + uv pip install openvino==2025.3.0 openvino-tokenizers==2025.3.0 + - if: ${{ matrix.transformers-version != 'latest' }} name: Install transformers run: | @@ -81,15 +86,15 @@ jobs: run: | python tests/scripts/login_with_ci_token.py + - name: Test with Pytest + run: | + pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 + - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install Nightly OpenVINO run: | uv pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - name: Test with Pytest - run: | - pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 - - if: ${{ matrix.test-pattern == '*modeling*' }} name: Test with Pytest - Nightly OpenVINO run: | diff --git a/setup.py b/setup.py index 1c313dbe0c..c072bab7f3 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], + "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From ea24727535b789e228106790f2f725c50dc8309b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 15:51:08 +0100 Subject: [PATCH 025/190] stop moving misplaced parameters from config to generation_config --- optimum/exporters/openvino/convert.py | 29 ++++++++++---------- optimum/intel/openvino/modeling_base.py | 32 +++++++++++----------- optimum/intel/openvino/modeling_seq2seq.py | 29 ++++++++++---------- 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 4b0652393d..794e38c9ed 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -688,20 +688,21 @@ def export_from_model( files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] elif library_name != "diffusers": - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = model.config._get_non_default_generation_parameters() - except (AttributeError, KeyError, TypeError): - misplaced_generation_parameters = {} - if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(model.generation_config, param_name, param_value) - setattr(model.config, param_name, None) + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + except (AttributeError, KeyError, TypeError): + misplaced_generation_parameters = {} + if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) # Saving the model config and preprocessor as this is needed sometimes. save_config(model.config, output) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b3c9a11eb0..0d95cc233d 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -47,7 +47,7 @@ classproperty, model_has_dynamic_inputs, ) -from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available +from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available, is_transformers_version from optimum.intel.utils.modeling_utils import _find_files_matching_pattern from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel @@ -265,21 +265,21 @@ def __init__( if self.can_generate(): self.generation_config = generation_config or GenerationConfig.from_model_config(config) - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except (KeyError, TypeError): - misplaced_generation_parameters = {} - if len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(self.generation_config, param_name, param_value) - setattr(self.config, param_name, None) - + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except (KeyError, TypeError): + misplaced_generation_parameters = {} + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) else: self.generation_config = None diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index c5365e8aae..cb8d6b7fa4 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -370,20 +370,21 @@ def __init__( generation_config = kwargs.get("generation_config", None) self.generation_config = generation_config or GenerationConfig.from_model_config(config) - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except (KeyError, TypeError): - misplaced_generation_parameters = {} - if len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(self.generation_config, param_name, param_value) - setattr(self.config, param_name, None) + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except (KeyError, TypeError): + misplaced_generation_parameters = {} + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) self._openvino_config = None if quantization_config: From 07ff06b936fea14798feb0ca208449bc408b3694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 15:54:59 +0100 Subject: [PATCH 026/190] fix transformers version for doc building --- .github/workflows/build_pr_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index c4a34baaa6..01a5bbe7e9 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,6 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation From 1270db0612cad34664ec7b295c55e19ea0be38fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 15:56:32 +0100 Subject: [PATCH 027/190] fix transformers version for doc building --- .github/workflows/build_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 52dae651de..332563450b 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,6 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation From eb045ce620a72080d746dd2877b12a685c9bb79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 18:57:38 +0100 Subject: [PATCH 028/190] Use model.get_image_features --- optimum/exporters/openvino/model_configs.py | 25 ++++++++++++++------- optimum/exporters/openvino/model_patcher.py | 6 ++++- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 67686b94bb..a8b293ca84 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -317,6 +317,13 @@ def init_model_configs(): register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) +def _get_language_model(model): + if is_transformers_version("<", "5"): + return model.language_model + + return model.model.language_model + + @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 @@ -1702,14 +1709,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return model.language_model if not hasattr(model, "lm_head") else model + return _get_language_model(model) if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding.config = _get_language_model(model).config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -1892,8 +1899,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = model.language_model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding = _get_language_model(model).get_input_embeddings() + text_embedding.config = _get_language_model(model).config return text_embedding return super().get_model_for_behavior(model, behavior) @@ -1969,14 +1976,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return model.language_model + return _get_language_model(model) if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = model.language_model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding = _get_language_model(model).get_input_embeddings() + text_embedding.config = _get_language_model(model).config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3477,7 +3484,9 @@ def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: text_embedding = ( - model.model.embed_tokens if hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens + model.model.embed_tokens + if hasattr(model.model, "embed_tokens") + else _get_language_model(model).embed_tokens ) text_embedding.config = model.config return text_embedding diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3639ece9cf..1c1cb3bb9f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3351,7 +3351,11 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - model.forward = types.MethodType(llava_vision_embed_forward, model) + + if is_transformers_version("<", "5"): + model.forward = types.MethodType(llava_vision_embed_forward, model) + else: + model.forward = model.get_image_features super().__init__(config, model, model_kwargs) From f2f352dd92891a0d1eba46a9e0298d4848fb494d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 18:58:22 +0100 Subject: [PATCH 029/190] Use model.get_image_features --- optimum/exporters/openvino/model_patcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1c1cb3bb9f..7aec5bbe41 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3389,7 +3389,11 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) + + if is_transformers_version("<", "5"): + model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) + else: + model.forward = model.get_image_features super().__init__(config, model, model_kwargs) From 1db8fb9820a23d8a3e1d19201823c39aff1b99a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 19:07:03 +0100 Subject: [PATCH 030/190] only add codegen remote code models when transformers < v5 --- tests/openvino/test_decoder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 183f362913..33d8383876 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -48,7 +48,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "blenderbot-small", "bloom", "codegen", - "codegen2", "gpt2", "gptj", "gpt_neo", @@ -147,6 +146,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("codegen2",) + GENERATION_LENGTH = 100 EXPECTED_NUM_SDPA = { From 0c72bc518c69ad2450217ad288b5172f2db19768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:06:32 +0100 Subject: [PATCH 031/190] Fix pipelines --- optimum/intel/pipelines/accelerator_utils.py | 30 ++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/optimum/intel/pipelines/accelerator_utils.py b/optimum/intel/pipelines/accelerator_utils.py index 7ea4102ec7..9090b8f30f 100644 --- a/optimum/intel/pipelines/accelerator_utils.py +++ b/optimum/intel/pipelines/accelerator_utils.py @@ -18,10 +18,15 @@ import transformers.pipelines from transformers import AutoConfig +from optimum.intel.utils import ( + IPEX_IMPORT_ERROR, + OPENVINO_IMPORT_ERROR, + is_ipex_available, + is_openvino_available, + is_transformers_version, +) from optimum.utils.logging import get_logger -from ..utils import IPEX_IMPORT_ERROR, OPENVINO_IMPORT_ERROR, is_ipex_available, is_openvino_available - if TYPE_CHECKING: from transformers import PretrainedConfig @@ -154,7 +159,10 @@ def openvino_infer_framework_load_model( You can also provide None as the model to use a default one.""" ) - return "pt", ov_model + if is_transformers_version("<", "5"): + return "pt", ov_model + + return ov_model def get_ipex_model_class(task: str, **model_kwargs): @@ -189,27 +197,33 @@ def ipex_infer_framework_load_model( You can also provide None as the model to use a default one.""" ) - return "pt", ipex_model + if is_transformers_version("<", "5"): + return "pt", ipex_model + + return ipex_model @contextlib.contextmanager def patch_pipelines_to_load_accelerator_model(accelerator: str): - original_infer_framework_load_model = transformers.pipelines.infer_framework_load_model + target_fn = "infer_framework_load_model" if is_transformers_version("<", "5") else "load_model" + + original_infer_framework_load_model = getattr(transformers.pipelines, target_fn) if accelerator == "openvino": if not is_openvino_available(): raise ImportError(OPENVINO_IMPORT_ERROR.format("`accelerator=openvino`")) - transformers.pipelines.infer_framework_load_model = openvino_infer_framework_load_model + setattr(transformers.pipelines, target_fn, openvino_infer_framework_load_model) + elif accelerator == "ipex": if not is_ipex_available(): raise ImportError(IPEX_IMPORT_ERROR.format("`accelerator=ipex`")) - transformers.pipelines.infer_framework_load_model = ipex_infer_framework_load_model + setattr(transformers.pipelines, target_fn, ipex_infer_framework_load_model) else: raise ValueError(f"Accelerator '{accelerator}' is not supported. Only 'openvino' and 'ipex' are supported.") try: yield finally: - transformers.pipelines.infer_framework_load_model = original_infer_framework_load_model + setattr(transformers.pipelines, target_fn, original_infer_framework_load_model) From 08ebe2b6d1778df68cb41b2297db074615a9a87b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:24:16 +0100 Subject: [PATCH 032/190] fix pipelines --- optimum/intel/pipelines/accelerator_utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/optimum/intel/pipelines/accelerator_utils.py b/optimum/intel/pipelines/accelerator_utils.py index 9090b8f30f..9ac24d06d7 100644 --- a/optimum/intel/pipelines/accelerator_utils.py +++ b/optimum/intel/pipelines/accelerator_utils.py @@ -13,7 +13,7 @@ # limitations under the License. import contextlib -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Dict, Optional, Tuple import transformers.pipelines from transformers import AutoConfig @@ -143,12 +143,15 @@ def get_openvino_model_class( # a modified transformers.pipelines.base.infer_framework_load_model that loads OpenVINO models def openvino_infer_framework_load_model( - model, config: Optional["PretrainedConfig"] = None, task: Optional[str] = None, **model_kwargs + model, + config: Optional["PretrainedConfig"] = None, + model_classes: Optional[Dict[str, Tuple[type]]] = None, + task: Optional[str] = None, + framework: Optional[str] = None, + **model_kwargs, ): if isinstance(model, str): - model_kwargs.pop("framework", None) model_kwargs.pop("_commit_hash", None) # not supported for OVModel - model_kwargs.pop("model_classes", None) ov_model_class = get_openvino_model_class(task, config, model, **model_kwargs) ov_model = ov_model_class.from_pretrained(model, **model_kwargs) elif isinstance(model, OVBaseModel): @@ -181,12 +184,15 @@ def get_ipex_model_class(task: str, **model_kwargs): # a modified transformers.pipelines.base.infer_framework_load_model that loads IPEX models def ipex_infer_framework_load_model( - model, config: Optional["PretrainedConfig"] = None, task: Optional[str] = None, **model_kwargs + model, + config: Optional["PretrainedConfig"] = None, + model_classes: Optional[Dict[str, Tuple[type]]] = None, + task: Optional[str] = None, + framework: Optional[str] = None, + **model_kwargs, ): if isinstance(model, str): - model_kwargs.pop("framework", None) model_kwargs.pop("_commit_hash", None) # not supported for IPEXModel - model_kwargs.pop("model_classes", None) ipex_model_class = get_ipex_model_class(task, **model_kwargs) ipex_model = ipex_model_class.from_pretrained(model, **model_kwargs) elif isinstance(model, IPEXModel): From 33f8c24df28e80efe49fa5beabef103d23ea89e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:34:34 +0100 Subject: [PATCH 033/190] replace with OV cache --- optimum/exporters/openvino/model_patcher.py | 33 ++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7aec5bbe41..fa8fe1bfb6 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -69,7 +69,7 @@ def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1015 @classmethod - def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "DynamicCache": + def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "OVDynamicCache": """ Converts a cache in the legacy cache format into an equivalent `Cache`. Used for backward compatibility. @@ -87,7 +87,7 @@ def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tens class OVEncoderDecoderCache(EncoderDecoderCache): # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: - """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" + """Converts the `OVEncoderDecoderCache` instance into its equivalent in the legacy cache format.""" legacy_cache = () if len(self.cross_attention_cache) > 0: for self_attn, cross_attn in zip( @@ -102,9 +102,9 @@ def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: @classmethod def from_legacy_cache( cls, past_key_values: Optional[Iterable[tuple[torch.FloatTensor, ...]]] - ) -> "EncoderDecoderCache": - """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" - cache = cls(DynamicCache(), DynamicCache()) + ) -> "OVEncoderDecoderCache": + """Converts a cache in the legacy cache format into an equivalent `OVEncoderDecoderCache`.""" + cache = cls(OVDynamicCache(), OVDynamicCache()) if past_key_values is None: logger.warning_once("past_key_values should not be None in from_legacy_cache()") else: @@ -1451,7 +1451,7 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -3023,7 +3023,7 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = DynamicCache.from_legacy_cache(legacy_pkv) + pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -4162,7 +4162,7 @@ def forward_wrap( input_ids=None, use_cache=True, ): - new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4498,11 +4498,11 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, EncoderDecoderCache): + if isinstance(pkv, OVEncoderDecoderCache): pkv = pkv.self_attention_cache.to_legacy_cache() else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = EncoderDecoderCache.from_legacy_cache(pkv) + pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4512,7 +4512,7 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): + if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() # we still need to filter out cross attention in the case of non-stateful decoder @@ -4687,7 +4687,7 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = DynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -5052,7 +5052,6 @@ def _blenderbot_attn_forward_new( output_attentions: bool = False, cache_position: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - from transformers.cache_utils import EncoderDecoderCache """Input shape: Batch x Time x Channel""" @@ -5076,7 +5075,7 @@ def _blenderbot_attn_forward_new( query_states = query_states if past_key_value is not None: - if isinstance(past_key_value, EncoderDecoderCache): + if isinstance(past_key_value, OVEncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5601,7 +5600,7 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5633,7 +5632,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, EncoderDecoderCache): + if isinstance(past_key_values, OVEncoderDecoderCache): past_key_values = past_key_values.self_attention_cache.to_legacy_cache() else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5685,7 +5684,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = DynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, From 9809e7ede6f973ea3ef625f5baabc06365b5f0ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:35:23 +0100 Subject: [PATCH 034/190] style --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index fa8fe1bfb6..a85bd6f75c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5052,7 +5052,6 @@ def _blenderbot_attn_forward_new( output_attentions: bool = False, cache_position: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" # if key_value_states are provided this layer is used as a cross-attention layer From 621e2bf710b44df53fc15435760bdc17c34886d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 17:51:21 +0100 Subject: [PATCH 035/190] Use AutoProcessor instead of AutoFeatureExtractor --- tests/openvino/test_modeling.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 4ffa7ab06b..777b276859 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1184,19 +1184,18 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) set_seed(SEED) transformers_model = AutoModelForImageClassification.from_pretrained(model_id) - preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) url = TEST_IMAGE_URL image = Image.open(requests.get(url, stream=True).raw) inputs = preprocessor(images=image, return_tensors="pt") with torch.no_grad(): transformers_outputs = transformers_model(**inputs) - for input_type in ["pt", "np"]: - inputs = preprocessor(images=image, return_tensors=input_type) - ov_outputs = ov_model(**inputs) - self.assertIn("logits", ov_outputs) - self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) - # Compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + inputs = preprocessor(images=image, return_tensors="pt") + ov_outputs = ov_model(**inputs) + self.assertIn("logits", ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + # Compare tensor outputs + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) del transformers_model del ov_model gc.collect() @@ -1209,7 +1208,7 @@ def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, device=OPENVINO_DEVICE) model.eval() - preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor) inputs = TEST_IMAGE_URL outputs = pipe(inputs) From 30f628592391ba40c843c5077afbbc842eb586e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 18:02:27 +0100 Subject: [PATCH 036/190] remove afmoe from models to be tested list --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a8b293ca84..594d876812 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4646,7 +4646,7 @@ class ASTOpenVINOConfig(ASTOnnxConfig): ) class AfmoeOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.55.0" - MAX_TRANSFORMERS_VERSION = "4.57.99" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = AfmoeModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 33d8383876..19eb7dfb99 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -21,7 +21,12 @@ patch_awq_for_inference, ) -from optimum.exporters.openvino.model_configs import BitnetOpenVINOConfig, DeepseekOpenVINOConfig, LFM2OpenVINOConfig +from optimum.exporters.openvino.model_configs import ( + AfmoeOpenVINOConfig, + BitnetOpenVINOConfig, + DeepseekOpenVINOConfig, + LFM2OpenVINOConfig, +) from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.exporters.openvino.utils import ONNX_SUPPORTED_ARCHITECTURES from optimum.exporters.tasks import TasksManager @@ -274,11 +279,13 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + if is_transformers_version(">", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): if "deepseek_v2" in supported_architectures: supported_architectures.remove("deepseek_v2") if "deepseek_v3" in supported_architectures: supported_architectures.remove("deepseek_v3") + if is_transformers_version(">", str(AfmoeOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + supported_architectures -= {"afmoe"} if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): From 6cd7b1c263ef119008f21382be8f0a6dd32a5a29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 18:20:04 +0100 Subject: [PATCH 037/190] fix pipeline saving tests --- tests/openvino/test_modeling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 777b276859..d9a61e8b44 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -708,6 +708,8 @@ def test_load_model_from_hub(self): # verify could load both pytorch and openvino model (export argument should automatically infered) ov_exported_pipe = optimum_pipeline("text-generation", model_id, revision="pt", accelerator="openvino") + ov_exported_pipe.modelcard = None + ov_pipe = optimum_pipeline("text-generation", model_id, revision="ov", accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) From 85a0418e0a64b6e9de3e802912d55eaca9c7a056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 15:46:04 +0100 Subject: [PATCH 038/190] fix seq2seq pipeline tests loading --- tests/openvino/test_modeling.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index d9a61e8b44..785c4e2782 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -727,20 +727,21 @@ def test_load_model_from_hub(self): gc.collect() def test_seq2seq_load_from_hub(self): - model_id = "echarlaix/tiny-random-t5" + model_id = MODEL_NAMES["whisper"] + task = "automatic-speech-recognition" # verify could load both pytorch and openvino model (export argument should automatically infered) - ov_exported_pipe = optimum_pipeline("text2text-generation", model_id, accelerator="openvino") - ov_pipe = optimum_pipeline("text2text-generation", model_id, revision="ov", accelerator="openvino") + ov_exported_pipe = optimum_pipeline(task, model_id, accelerator="openvino") + ov_exported_pipe.modelcard = None + ov_pipe = optimum_pipeline(task, model_id, revision="ov", accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) with TemporaryDirectory() as tmpdirname: ov_exported_pipe.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) - if not ov_exported_pipe.model.decoder.stateful: - self.assertTrue(OV_DECODER_WITH_PAST_NAME in folder_contents) - self.assertTrue(OV_DECODER_WITH_PAST_NAME.replace(".xml", ".bin") in folder_contents) - ov_exported_pipe = optimum_pipeline("text2text-generation", tmpdirname, accelerator="openvino") + self.assertTrue(ov_exported_pipe.model._ov_model_paths["encoder"] in folder_contents) + self.assertTrue(ov_exported_pipe.model._ov_model_paths["decoder"] in folder_contents) + ov_exported_pipe = optimum_pipeline(task, tmpdirname, accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) del ov_exported_pipe From 08d148014e292ca9118cb1fdbf502369f3f44d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 16:06:52 +0100 Subject: [PATCH 039/190] disable pipelines tests when transformers >= v5 since summarization/translation/text2text-generation pipelines are deprecated --- tests/openvino/test_seq2seq.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 83a4b7c54f..daf81cf747 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -224,6 +224,10 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5"), + reason="requires transformers < v5 since summarization/translation/text2text-generation pipelines are deprecated", + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From 7bc714cad9b526e8634b562a349a7ecdbc54abdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 18:52:48 +0100 Subject: [PATCH 040/190] fix MixtralModelPatcher --- optimum/exporters/openvino/model_patcher.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a9d67eba7d..af2a2f546b 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -396,18 +396,19 @@ def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torc class MixtralModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - - for layer in self._model.model.layers: - layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _mixtral_sparse_moe_block_forward, layer.block_sparse_moe - ) + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _mixtral_sparse_moe_block_forward, layer.block_sparse_moe + ) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward class ArcticModelPatcher(MixtralModelPatcher): From 8b374c7e067325c2ff7b3f0774aff33010f7d1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 19:03:49 +0100 Subject: [PATCH 041/190] fix moe patching --- optimum/exporters/openvino/model_patcher.py | 81 +++++++++++++-------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index af2a2f546b..18783eb770 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -402,6 +402,8 @@ def __enter__(self): layer.block_sparse_moe.forward = types.MethodType( _mixtral_sparse_moe_block_forward, layer.block_sparse_moe ) + else: + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -1709,16 +1711,22 @@ def _phi_moe_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torc class PhiMoEModelPatcher(Phi3ModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.model.layers: - layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe - ) + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe + ) + else: + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - layer.block_sparse_moe.forward = layer.block_sparse_moe._orig_forward + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._orig_forward def _aquila_self_attn_sdpa_forward( @@ -4443,28 +4451,35 @@ def _granite_moe_parallel_experts_forward(self, inputs, expert_size): class GraniteMoEModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward - block_sparse_moe.router.forward = types.MethodType( - _granite_moe_topk_gating_forward, block_sparse_moe.router - ) - block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward - block_sparse_moe.input_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear - ) - block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward - block_sparse_moe.output_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear - ) + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward + block_sparse_moe.router.forward = types.MethodType( + _granite_moe_topk_gating_forward, block_sparse_moe.router + ) + block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward + block_sparse_moe.input_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear + ) + block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward + block_sparse_moe.output_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear + ) + + else: + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward - block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward - block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward + block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward + block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward class OVSeq2SeqModelPatcher(OVModelPatcher): @@ -5270,14 +5285,18 @@ def _qwen2moe_sparse_block_forward(self, hidden_states: torch.Tensor) -> torch.T class Qwen2MoEPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.52.0"): + + if is_transformers_version(">=", "4.52.0") and is_transformers_version("<", "5"): from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock modulewise_patch(self._model, Qwen2MoeSparseMoeBlock, _qwen2moe_sparse_block_forward) + if is_transformers_version(">=", "5"): + self._model.config._experts_implementation = "batched_mm" + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.52.0"): + if is_transformers_version(">=", "4.52.0") and is_transformers_version("<", "5"): from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock modulewise_unpatch(self._model, Qwen2MoeSparseMoeBlock) @@ -6626,14 +6645,16 @@ class Qwen3MoeModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.53"): + if is_transformers_version(">=", "4.53") and is_transformers_version("<", "5"): self.original_moe_forward = Qwen3MoeSparseMoeBlock.forward Qwen3MoeSparseMoeBlock.forward = qwen3_moe_forward_patched + if is_transformers_version(">=", "5"): + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.53"): + if is_transformers_version(">=", "4.53") and is_transformers_version("<", "5"): Qwen3MoeSparseMoeBlock.forward = self.original_moe_forward From a4cfc55f57bbe575576db4db6b0a0edbef72b452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 15:17:13 +0100 Subject: [PATCH 042/190] gptj fix --- optimum/exporters/openvino/model_patcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 18783eb770..68b306e318 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2793,15 +2793,19 @@ def gptj_attn_forward( if output_attentions: self._attn = self._orig_attn + kwargs = {} + if is_transformers_version("<", "5"): + kwrags["head_mask"] = head_mask + return self._orig_forward( hidden_states, layer_past, attention_mask, position_ids, - head_mask, use_cache=use_cache, output_attentions=output_attentions, cache_position=cache_position, + **kwargs ) From 5bab4588af14819368c7b5bbe555abad18df20b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 15:40:58 +0100 Subject: [PATCH 043/190] fix granitemoehybrid patcher --- optimum/exporters/openvino/model_patcher.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 68b306e318..08710d6601 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2805,7 +2805,7 @@ def gptj_attn_forward( use_cache=use_cache, output_attentions=output_attentions, cache_position=cache_position, - **kwargs + **kwargs, ) @@ -7522,10 +7522,12 @@ def patch_sparse_moe(sparse_moe_layer): super().__enter__() setattr(self._model, self.orig_forward_name, self.patched_forward) - self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask - self._model.model._update_causal_mask = types.MethodType( - granite_moe_hybrid_update_causal_mask, self._model.model - ) + if is_transformers_version("<", "5"): + self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask + self._model.model._update_causal_mask = types.MethodType( + granite_moe_hybrid_update_causal_mask, self._model.model + ) + for idx, layer in enumerate(self._model.model.layers): if hasattr(layer, "block_sparse_moe"): patch_sparse_moe(layer.block_sparse_moe) @@ -7545,7 +7547,9 @@ def unpatch_sparse_moe(sparse_moe_layer): super().__exit__(exc_type, exc_value, traceback) setattr(self._model, self.orig_forward_name, self.model_orig_forward) - self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + if is_transformers_version("<", "5"): + self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + for idx, layer in enumerate(self._model.model.layers): if hasattr(layer, "block_sparse_moe"): unpatch_sparse_moe(layer.block_sparse_moe) From daf7ec83e2fe8da562575fb7db52de930980002a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 18:20:54 +0100 Subject: [PATCH 044/190] typo --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 08710d6601..a496ec7d8e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2795,7 +2795,7 @@ def gptj_attn_forward( kwargs = {} if is_transformers_version("<", "5"): - kwrags["head_mask"] = head_mask + kwargs["head_mask"] = head_mask return self._orig_forward( hidden_states, From a45f5ab8f32c7fef54381587fb9e5368b816610e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 19:10:59 +0100 Subject: [PATCH 045/190] add exaone max_transformers_version as incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 594d876812..0f8afff724 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -701,7 +701,7 @@ class BitnetOpenVINOConfig(LlamaOnnxConfig): library_name="transformers", ) class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): - pass + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 19eb7dfb99..1f7ae31827 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -90,7 +90,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "qwen2_moe", "phi3", "gemma2", - "exaone", "granite", "granitemoe", ) @@ -152,7 +151,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2",) + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone") GENERATION_LENGTH = 100 From 342dc59c3742230d4661c351d8dba272382040cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 14:35:09 +0100 Subject: [PATCH 046/190] add decilm max_transformers_version as incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0f8afff724..fb6be5eb52 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1351,6 +1351,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int @register_in_tasks_manager("deci", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 1f7ae31827..2d7652e3bf 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -79,7 +79,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "xverse", "internlm", "jais", - "decilm", "gemma", "olmo", "stablelm", @@ -151,7 +150,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone") + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm") GENERATION_LENGTH = 100 From 2a28fe7211bc39190303da7cfea980007b992e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 16:16:59 +0100 Subject: [PATCH 047/190] fix llama4 patcher --- optimum/exporters/openvino/model_patcher.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a496ec7d8e..8aa6b94a18 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -19,7 +19,7 @@ import math import types from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -6289,6 +6289,15 @@ def llama4_moe_forward(self, hidden_states): return out, router_scores +# Copied from https://github.com/huggingface/transformers/blob/v4.56.0/src/transformers/masking_utils.py#L105 +# transformers.masking_utils._legacy_chunked_overlay deprecated since transformers v5 +def _legacy_chunked_overlay(chunk_size: int) -> Callable: + def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: + return kv_idx // chunk_size == q_idx // chunk_size + + return inner_mask + + class Llama4TextModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -6305,8 +6314,8 @@ def __enter__(self): if is_transformers_version(">=", "4.56"): # openvino is not able to trace through the new chunked_overlay with left_padding self.original_chunked_overlay = transformers.masking_utils.chunked_overlay - transformers.masking_utils.chunked_overlay = ( - lambda chunk_size, left_padding: transformers.masking_utils._legacy_chunked_overlay(chunk_size) + transformers.masking_utils.chunked_overlay = lambda chunk_size, left_padding: _legacy_chunked_overlay( + chunk_size ) def __exit__(self, exc_type, exc_value, traceback): From b9a3cbe90f1ecd802f008f3cfc2cb75d1934fc25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 16:54:48 +0100 Subject: [PATCH 048/190] make OV DynamicCache backward compatible --- optimum/exporters/openvino/model_patcher.py | 52 +++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 8aa6b94a18..8d1c7a93e9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -56,6 +56,28 @@ class OVDynamicCache(DynamicCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L881 + def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self.layers): + return self.layers[layer_idx].keys, self.layers[layer_idx].values + else: + raise KeyError( + f"Cache only has {len(self.layers)} layers, attempted to access layer with index {layer_idx}" + ) + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L893 + def __iter__(self): + """ + Support for backwards-compatible `past_key_values` iteration, e.g. `for x in past_key_values:` to iterate over + keys and values + """ + for layer_idx in range(len(self)): + yield (self.layers[layer_idx].keys, self.layers[layer_idx].values) + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1005 def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: """ @@ -85,6 +107,36 @@ def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tens class OVEncoderDecoderCache(EncoderDecoderCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1244 + def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self): + return ( + self.self_attention_cache.layers[layer_idx].keys, + self.self_attention_cache.layers[layer_idx].values, + self.cross_attention_cache.layers[layer_idx].keys, + self.cross_attention_cache.layers[layer_idx].values, + ) + else: + raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1231 + def __iter__(self): + """ + Support for backwards-compatible `past_key_values` iteration, e.g. `for x in past_key_values:` to iterate over + keys and values + """ + for layer_idx in range(len(self)): + yield ( + self.self_attention_cache.layers[layer_idx].keys, + self.self_attention_cache.layers[layer_idx].values, + self.cross_attention_cache.layers[layer_idx].keys, + self.cross_attention_cache.layers[layer_idx].values, + ) + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: """Converts the `OVEncoderDecoderCache` instance into its equivalent in the legacy cache format.""" From 1687e3515df16c6f19cb7c76e089a4d3523f4255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 17:48:16 +0100 Subject: [PATCH 049/190] remove incompatible models aquila2 orion internlm2 --- optimum/exporters/openvino/model_configs.py | 3 ++- tests/openvino/test_decoder.py | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fb6be5eb52..5e2e643ef4 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -858,6 +858,7 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -867,7 +868,7 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 - + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2d7652e3bf..f40e402ecb 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -66,8 +66,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "opt", "pegasus", "phi", - "internlm2", - "orion", "falcon", "falcon-40b", "persimmon", @@ -75,7 +73,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neox_japanese", "xglm", "aquila", - "aquila2", "xverse", "internlm", "jais", @@ -150,7 +147,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm") + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2") GENERATION_LENGTH = 100 From 961c1d3f3aa91193ff7ed09ef88c22c6c8a23514 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 18:34:26 +0100 Subject: [PATCH 050/190] add jais max_transformers_version as incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 5e2e643ef4..2e060684e7 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1184,6 +1184,7 @@ class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): ) class JaisOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index f40e402ecb..26e8010f76 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -75,7 +75,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "aquila", "xverse", "internlm", - "jais", "gemma", "olmo", "stablelm", @@ -147,7 +146,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2") + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais") GENERATION_LENGTH = 100 From e8e6c18a284e8cf036ca12a420e458724a42d7ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 10:29:30 +0100 Subject: [PATCH 051/190] dbrx --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 2e060684e7..3113fdf136 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1165,6 +1165,7 @@ class CodeGenOpenVINOConfig(CodeGenOnnxConfig): ) class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="n_heads", hidden_size="d_model", diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 26e8010f76..5b8916c5cb 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -79,7 +79,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "olmo", "stablelm", "starcoder2", - "dbrx", "cohere", "qwen2", "qwen2_moe", @@ -146,7 +145,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais") + # TODO: add dbrx back once fixed in transformers + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais", "dbrx") GENERATION_LENGTH = 100 From c6640d6364257db8da6207999ed00384a9e16358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 19:18:38 +0100 Subject: [PATCH 052/190] set float32 dtype --- tests/openvino/test_decoder.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 5b8916c5cb..76901f5db4 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -360,7 +360,17 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) - if model_arch in ["qwen", "arctic", "chatglm4", "gpt_oss_mxfp4"]: + if model_arch in [ + "qwen", + "arctic", + "chatglm4", + "gpt_oss_mxfp4", + "llama", + "lfm2", + "gemma3_text", + "llama4", + "exaone4", + ]: transformers_model.to(torch.float32) with torch.no_grad(): @@ -808,7 +818,7 @@ def test_beam_search(self, model_arch): def test_load_with_different_dtype(self): set_seed(SEED) - model_id = MODEL_NAMES["llama"] + model_id = MODEL_NAMES["mistral"] pt_model = AutoModelForCausalLM.from_pretrained( model_id, ) From 1e0c06ffa9a3bb82009399f98dab0efd70219287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 19:30:18 +0100 Subject: [PATCH 053/190] baichuan2 not compatible with v5 --- tests/openvino/test_decoder.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 76901f5db4..86105e8112 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -45,7 +45,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", - "baichuan2", "baichuan2-13b", "gpt_bigcode", "bigbird_pegasus", @@ -146,7 +145,17 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "5"): # TODO: add dbrx back once fixed in transformers - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais", "dbrx") + SUPPORTED_ARCHITECTURES += ( + "codegen2", + "exaone", + "decilm", + "internlm2", + "orion", + "aquila2", + "jais", + "dbrx", + "baichuan2", + ) GENERATION_LENGTH = 100 From b4910fc0a30bbaf391cae1057dccb60f4f8d5225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 22:40:38 +0100 Subject: [PATCH 054/190] gpt oss set experts_implementation batched mm --- optimum/exporters/openvino/model_patcher.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 8d1c7a93e9..7490c44d55 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7385,16 +7385,19 @@ class GptOssModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5"): from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts self.original_gpt_oss_forward = GptOssExperts.forward GptOssExperts.forward = gpt_oss_forward + if is_transformers_version(">=", "5"): + self._model.config._experts_implementation = "batched_mm" + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5"): from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts GptOssExperts.forward = self.original_gpt_oss_forward From e19da565ff0cb4d85921c99c606b3a67df7af259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 23:15:09 +0100 Subject: [PATCH 055/190] bitnet --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 3113fdf136..e25c154f4a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -686,6 +686,7 @@ class GptOssOpenVINOConfig(LlamaOpenVINOConfig): ) class BitnetOpenVINOConfig(LlamaOnnxConfig): MIN_TRANSFORMERS_VERSION = "4.52.1" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = OVDecoderModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 86105e8112..6efa3629fe 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -125,7 +125,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) - if is_transformers_version(">=", "4.52.1"): + if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) if is_transformers_version(">=", "4.54.0"): From fde5ac98af9612f51bd015efe6f26c4ca693c268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 12:23:38 +0100 Subject: [PATCH 056/190] qwenvl --- optimum/exporters/openvino/model_configs.py | 26 ++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index e25c154f4a..4d5a0d4f48 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -317,11 +317,11 @@ def init_model_configs(): register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) -def _get_language_model(model): +def _get_subcomponent_model(model, name): if is_transformers_version("<", "5"): - return model.language_model + return getattr(model, name) - return model.model.language_model + return getattr(model.model, name) @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") @@ -1714,14 +1714,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_language_model(model) if not hasattr(model, "lm_head") else model + return _get_subcomponent_model(model, "language_model") if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = _get_language_model(model).config + text_embedding.config = _get_subcomponent_model(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -1904,8 +1904,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_language_model(model).get_input_embeddings() - text_embedding.config = _get_language_model(model).config + text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() + text_embedding.config = _get_subcomponent_model(model, "language_model").config return text_embedding return super().get_model_for_behavior(model, behavior) @@ -1981,14 +1981,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_language_model(model) + return _get_subcomponent_model(model, "language_model") if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_language_model(model).get_input_embeddings() - text_embedding.config = _get_language_model(model).config + text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() + text_embedding.config = _get_subcomponent_model(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3478,12 +3478,12 @@ def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): return model if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: - vision_embeddings = model.visual.patch_embed + vision_embeddings = _get_subcomponent_model(model, "visual").patch_embed vision_embeddings.config = model.config.vision_config return vision_embeddings if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: - vision_emb_merger = model.visual + vision_emb_merger = _get_subcomponent_model(model, "visual") vision_emb_merger.config = model.config.vision_config return vision_emb_merger @@ -3491,7 +3491,7 @@ def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): text_embedding = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") - else _get_language_model(model).embed_tokens + else _get_subcomponent_model(model, "language_model").embed_tokens ) text_embedding.config = model.config return text_embedding From 0d3b656a2c110b2a84f440d428dff13e86461b8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 12:43:12 +0100 Subject: [PATCH 057/190] maira2 remote code --- optimum/exporters/openvino/model_configs.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 4d5a0d4f48..b20e80ae27 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1904,10 +1904,13 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() - text_embedding.config = _get_subcomponent_model(model, "language_model").config + text_embedding = model.language_model.get_input_embeddings() + text_embedding.config = model.language_model.config return text_embedding + if behavior == VLMConfigBehavior.LANGUAGE: + return model.language_model + return super().get_model_for_behavior(model, behavior) From b8797e32dcb0dd49fd42a5cb403df051f4f7d6e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 15:00:31 +0100 Subject: [PATCH 058/190] gemma3 and got_ocr2 --- optimum/exporters/openvino/model_patcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7490c44d55..f780ce0cd9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4684,7 +4684,11 @@ def __init__( model.__orig_forward = model.forward # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 - if hasattr(model, "model") and hasattr(model.model, "get_image_features"): + if ( + hasattr(model, "model") + and hasattr(model.model, "get_image_features") + and is_transformers_version("<", "5") + ): model.forward = model.model.get_image_features else: model.forward = model.get_image_features From 9dfb66617c19508dade18184d84bd20a6f5d9cf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 16:36:14 +0100 Subject: [PATCH 059/190] llava next --- optimum/exporters/openvino/model_configs.py | 15 ++++++----- optimum/exporters/openvino/model_patcher.py | 29 +++++++++++++++++++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index b20e80ae27..a2a54fb152 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -166,6 +166,7 @@ Llama4ImageEmbeddingsModelPatcher, Llama4TextModelPatcher, LlavaImageEmbeddingModelPatcher, + LlavaNextImageEmbeddingModelPatcher, LlavaNextVideoImageEmbeddingModelPatcher, LlavaQwen2ImageEmbeddingsModelPatcher, MairaImageEmbeddingModelPatcher, @@ -199,6 +200,7 @@ SanaTextEncoderModelPatcher, XverseModelPatcher, Zamba2ModelPatcher, + _get_subcomponent_model, ) @@ -317,13 +319,6 @@ def init_model_configs(): register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) -def _get_subcomponent_model(model, name): - if is_transformers_version("<", "5"): - return getattr(model, name) - - return getattr(model.model, name) - - @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 @@ -1773,6 +1768,12 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict: class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.40.0" + def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): + model_kwargs = model_kwargs or {} + if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: + return super().patch_model_for_export(model, model_kwargs) + return LlavaNextImageEmbeddingModelPatcher(self, model, model_kwargs) + class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator): SUPPORTED_INPUT_NAMES = ["image_features"] diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f780ce0cd9..5b93fc5347 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -55,6 +55,13 @@ logger = logging.getLogger(__name__) +def _get_subcomponent_model(model, name): + if is_transformers_version("<", "5"): + return getattr(model, name) + + return getattr(model.model, name) + + class OVDynamicCache(DynamicCache): # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L881 def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: @@ -3365,7 +3372,7 @@ def llava_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + image_outputs = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer] @@ -3376,7 +3383,7 @@ def llava_vision_embed_forward(self, pixel_values): else: raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - image_features = self.multi_modal_projector(selected_image_feature) + image_features = _get_subcomponent_model(self, "multi_modal_projector")(selected_image_feature) return image_features @@ -3429,6 +3436,24 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward +class LlavaNextImageEmbeddingModelPatcher(OVModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + # TODO: use get_image_features instead and add image_sizes as input when exorting + # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next/modeling_llava_next.py#L716 + model.forward = types.MethodType(llava_vision_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + class MairaImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, From 3386c647fb1d5ad051c866e0b071ce77b4e9760a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 16:51:13 +0100 Subject: [PATCH 060/190] llava next video --- optimum/exporters/openvino/model_patcher.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 5b93fc5347..b3412a0a41 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3391,7 +3391,7 @@ def llava_next_video_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L519 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_features = self.vision_tower(pixel_values, output_hidden_states=True) + image_features = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) vision_feature_layer = self.config.vision_feature_layer if isinstance(vision_feature_layer, int): selected_image_feature = image_features.hidden_states[vision_feature_layer] @@ -3444,7 +3444,7 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - # TODO: use get_image_features instead and add image_sizes as input when exorting + # TODO: use get_image_features instead and add image_sizes as input when exporting # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next/modeling_llava_next.py#L716 model.forward = types.MethodType(llava_vision_embed_forward, model) super().__init__(config, model, model_kwargs) @@ -3479,12 +3479,9 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - - if is_transformers_version("<", "5"): - model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) - else: - model.forward = model.get_image_features - + # TODO: use get_image_features instead and add image_sizes as input when exporting + # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L746 + model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) super().__init__(config, model, model_kwargs) def __exit__(self, exc_type, exc_value, traceback): From bc4a84d163dd4b8d6166272006b9b7c6c105e804 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 15:12:12 +0100 Subject: [PATCH 061/190] use ONNXCache --- optimum/exporters/openvino/model_patcher.py | 245 ++++---------------- 1 file changed, 42 insertions(+), 203 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1ab1a386b9..b779170d8e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -36,6 +36,9 @@ override_arguments, sdpa_mask_without_vmap, ) +from optimum.exporters.onnx.utils import ONNXDynamicCache, ONNXEncoderDecoderCache + + from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -62,170 +65,6 @@ def _get_subcomponent_model(model, name): return getattr(model.model, name) -class OVDynamicCache(DynamicCache): - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L881 - def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: - """ - Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the - sequence length. - """ - if layer_idx < len(self.layers): - return self.layers[layer_idx].keys, self.layers[layer_idx].values - else: - raise KeyError( - f"Cache only has {len(self.layers)} layers, attempted to access layer with index {layer_idx}" - ) - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L893 - def __iter__(self): - """ - Support for backwards-compatible `past_key_values` iteration, e.g. `for x in past_key_values:` to iterate over - keys and values - """ - for layer_idx in range(len(self)): - yield (self.layers[layer_idx].keys, self.layers[layer_idx].values) - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1005 - def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: - """ - Converts the `Cache` instance into the its equivalent in the legacy cache format. Used for - backward compatibility. - """ - legacy_cache = () - for layer in self.layers: - legacy_cache += ((layer.keys, layer.values),) - return legacy_cache - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1015 - @classmethod - def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "OVDynamicCache": - """ - Converts a cache in the legacy cache format into an equivalent `Cache`. Used for - backward compatibility. - """ - cache = cls() - if past_key_values is None: - logger.warning_once("past_key_values should not be None in from_legacy_cache()") - if past_key_values is not None: - for layer_idx in range(len(past_key_values)): - key_states, value_states = past_key_values[layer_idx] - cache.update(key_states, value_states, layer_idx) - return cache - - -class OVEncoderDecoderCache(EncoderDecoderCache): - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1244 - def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the - sequence length. - """ - if layer_idx < len(self): - return ( - self.self_attention_cache.layers[layer_idx].keys, - self.self_attention_cache.layers[layer_idx].values, - self.cross_attention_cache.layers[layer_idx].keys, - self.cross_attention_cache.layers[layer_idx].values, - ) - else: - raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1231 - def __iter__(self): - """ - Support for backwards-compatible `past_key_values` iteration, e.g. `for x in past_key_values:` to iterate over - keys and values - """ - for layer_idx in range(len(self)): - yield ( - self.self_attention_cache.layers[layer_idx].keys, - self.self_attention_cache.layers[layer_idx].values, - self.cross_attention_cache.layers[layer_idx].keys, - self.cross_attention_cache.layers[layer_idx].values, - ) - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 - def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: - """Converts the `OVEncoderDecoderCache` instance into its equivalent in the legacy cache format.""" - legacy_cache = () - if len(self.cross_attention_cache) > 0: - for self_attn, cross_attn in zip( - self.self_attention_cache.to_legacy_cache(), self.cross_attention_cache.to_legacy_cache() - ): - legacy_cache += (self_attn + cross_attn,) - else: - legacy_cache = self.self_attention_cache.to_legacy_cache() - return legacy_cache - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1279 - @classmethod - def from_legacy_cache( - cls, past_key_values: Optional[Iterable[tuple[torch.FloatTensor, ...]]] - ) -> "OVEncoderDecoderCache": - """Converts a cache in the legacy cache format into an equivalent `OVEncoderDecoderCache`.""" - cache = cls(OVDynamicCache(), OVDynamicCache()) - if past_key_values is None: - logger.warning_once("past_key_values should not be None in from_legacy_cache()") - else: - for layer_idx, key_value_states in enumerate(past_key_values): - key_states, value_states = key_value_states[:2] - cache.self_attention_cache.update(key_states, value_states, layer_idx) - if len(key_value_states) > 2: - key_states, value_states = key_value_states[2:] - cache.cross_attention_cache.update(key_states, value_states, layer_idx) - cache.is_updated[layer_idx] = True - return cache - - -def preprocess_past_key_values(past_key_values): - if ( - is_transformers_version(">=", "4.48") - and isinstance(past_key_values, (list, tuple)) - and isinstance(past_key_values[0], (list, tuple)) - ): - if len(past_key_values[0]) == 2: - past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) - elif len(past_key_values[0]) == 4: - past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) - else: - raise ValueError( - f"past_key_values should have either 2 or 4 elements, but it has {len(past_key_values[0])} elements." - ) - - return past_key_values - - -class OVModelPatcher(ModelPatcher): - def __init__( - self, - config: "OnnxConfig", - model: "PreTrainedModel", - model_kwargs: Optional[Dict[str, Any]] = None, - ): - super().__init__(config, model, model_kwargs) - - self.model_patched_forward = self.patched_forward - - @functools.wraps(self.model_patched_forward) - def patched_forward(*args, **kwargs): - signature = inspect.signature(self.model_patched_forward) - args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) - - if "past_key_values" in signature.parameters: - # Most models require past_key_values to be a cache instance instead of a tuple now - pkv_index = list(signature.parameters.keys()).index("past_key_values") - if pkv_index < len(args) and args[pkv_index] is not None: - args[pkv_index] = preprocess_past_key_values(args[pkv_index]) - elif kwargs.get("past_key_values") is not None: - kwargs["past_key_values"] = preprocess_past_key_values(kwargs["past_key_values"]) - - outputs = self.model_patched_forward(*args, **kwargs) - - return outputs - - self.patched_forward = patched_forward - - for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -382,7 +221,7 @@ def eager_mask_without_vmap(*args, **kwargs) -> Optional[torch.Tensor]: return mask -class OVDecoderModelPatcher(OVModelPatcher): +class OVDecoderModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -1513,7 +1352,7 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -3095,7 +2934,7 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) + pkv = ONNXDynamicCache.from_legacy_cache(legacy_pkv) return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -3254,7 +3093,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class IBertModelPatcher(OVModelPatcher): +class IBertModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3272,7 +3111,7 @@ def __init__( self._model(torch.ones([1, 1], dtype=torch.long)) -class InternVLChatImageEmbeddingModelPatcher(OVModelPatcher): +class InternVLChatImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3415,7 +3254,7 @@ def maira_vision_embed_forward(self, pixel_values): return self.get_image_features(pixel_values, vision_feature_layer, vision_feature_select_strategy) -class LlavaImageEmbeddingModelPatcher(OVModelPatcher): +class LlavaImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3436,7 +3275,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class LlavaNextImageEmbeddingModelPatcher(OVModelPatcher): +class LlavaNextImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3454,7 +3293,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MairaImageEmbeddingModelPatcher(OVModelPatcher): +class MairaImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3471,7 +3310,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class LlavaNextVideoImageEmbeddingModelPatcher(OVModelPatcher): +class LlavaNextVideoImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3513,7 +3352,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: return emb.unsqueeze(1) -class FluxTransfromerModelPatcher(OVModelPatcher): +class FluxTransfromerModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() if is_diffusers_version("<", "0.31.0"): @@ -3688,7 +3527,7 @@ def _minicpmv_siglip_transformer_forward( ) -class MiniCPMVResamplerModelPatcher(OVModelPatcher): +class MiniCPMVResamplerModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3705,7 +3544,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MiniCPMVImageEmbeddingsModelPatcher(OVModelPatcher): +class MiniCPMVImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3736,7 +3575,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class LlavaQwen2ImageEmbeddingsModelPatcher(OVModelPatcher): +class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3754,7 +3593,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class InputEmbeddingPatcher(OVModelPatcher): +class InputEmbeddingPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3779,7 +3618,7 @@ def phi3_vision_embeddings_forward(self, pixel_values: torch.FloatTensor): return self.get_img_features(pixel_values) -class Phi3VisionImageEmbeddingsPatcher(OVModelPatcher): +class Phi3VisionImageEmbeddingsPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4249,7 +4088,7 @@ def forward_wrap( input_ids=None, use_cache=True, ): - new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + new_past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4434,7 +4273,7 @@ def block_forward( block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn) -class Qwen2VLVisionEmbMergerPatcher(OVModelPatcher): +class Qwen2VLVisionEmbMergerPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4468,7 +4307,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.attn.forward = block.attn._orig_forward -class Qwen2_5_VLVisionEmbMergerPatcher(OVModelPatcher): +class Qwen2_5_VLVisionEmbMergerPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4648,7 +4487,7 @@ def __exit__(self, exc_type, exc_value, traceback): block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward -class OVSeq2SeqModelPatcher(OVModelPatcher): +class OVSeq2SeqModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4680,11 +4519,11 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, OVEncoderDecoderCache): + if isinstance(pkv, ONNXEncoderDecoderCache): pkv = pkv.self_attention_cache.to_legacy_cache() else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) + pkv = ONNXEncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4694,7 +4533,7 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): + if isinstance(outputs.get("past_key_values"), (ONNXDynamicCache, ONNXEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() # we still need to filter out cross attention in the case of non-stateful decoder @@ -4733,7 +4572,7 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) -class SanaTextEncoderModelPatcher(OVModelPatcher): +class SanaTextEncoderModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -4784,7 +4623,7 @@ def __init__( super().__init__(config, model, model_kwargs) -class CommonImageEmbeddingsModelPatcher(OVModelPatcher): +class CommonImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4873,7 +4712,7 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -4938,7 +4777,7 @@ def __exit__(self, exc_type, exc_value, traceback): del self._model.model._orig_update_causual_mask -class Idefics3ImageEmbeddingsModelPatcher(OVModelPatcher): +class Idefics3ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5260,7 +5099,7 @@ def _blenderbot_attn_forward_new( query_states = query_states if past_key_value is not None: - if isinstance(past_key_value, OVEncoderDecoderCache): + if isinstance(past_key_value, ONNXEncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5719,7 +5558,7 @@ def speecht5_decoder_layer_forward( return outputs -class OVSpeechT5ModelPatcher(OVModelPatcher): +class OVSpeechT5ModelPatcher(ModelPatcher): def __enter__(self): if self.real_config._behavior != "vocoder": super().__enter__() @@ -5789,7 +5628,7 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values = ONNXEncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5821,7 +5660,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, OVEncoderDecoderCache): + if isinstance(past_key_values, ONNXEncoderDecoderCache): past_key_values = past_key_values.self_attention_cache.to_legacy_cache() else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5873,7 +5712,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -5895,7 +5734,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioForwardEmbeddingsPatcher(OVModelPatcher): +class Phi4MMAudioForwardEmbeddingsPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5919,7 +5758,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioEncoderPatcher(OVModelPatcher): +class Phi4MMAudioEncoderPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5960,7 +5799,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMVisionEmbeddingsPatcher(OVModelPatcher): +class Phi4MMVisionEmbeddingsPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6269,7 +6108,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.img_processor.embeddings.forward = self._model.img_processor.embeddings._orig_forward -class Llama4ImageEmbeddingsModelPatcher(OVModelPatcher): +class Llama4ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6464,7 +6303,7 @@ def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: return inner_mask -class Llama4TextModelPatcher(OVModelPatcher): +class Llama4TextModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -6634,7 +6473,7 @@ def mamba_mixer_forward( # 1. Inject a MambaCache structure into the original model to simplify input and output handling related to SSM states # 2. Patch ConvSequenceTransform module to avoid if-else branching # 3. Vectorize the selective scan operation to ensure correct behavior during JIT tracing -class MambaPatcher(OVModelPatcher): +class MambaPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -7130,7 +6969,7 @@ def segment_sum(input_tensor): # for subsequent invocation of the model's `forward` method. # 2. Patches the Zamba2MambaMixer so that the traced `forward` function works correctly # during both the prefill and decoding steps. -class Zamba2ModelPatcher(OVModelPatcher): +class Zamba2ModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -7559,7 +7398,7 @@ def granite_moe_hybrid_update_causal_mask( return causal_mask -class GraniteMoeHybridModelPatcher(OVModelPatcher): +class GraniteMoeHybridModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", From 0e41943847818a8ab8d660bde7f73a0ec3b2ba7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 16:13:44 +0100 Subject: [PATCH 062/190] style --- optimum/exporters/openvino/model_patcher.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b779170d8e..c2e878f0c8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -18,12 +18,11 @@ import logging as log import math import types -from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F -from transformers.cache_utils import DynamicCache, EncoderDecoderCache +from transformers.cache_utils import DynamicCache from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling from transformers.models.phi3.modeling_phi3 import apply_rotary_pos_emb, repeat_kv from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet @@ -37,8 +36,6 @@ sdpa_mask_without_vmap, ) from optimum.exporters.onnx.utils import ONNXDynamicCache, ONNXEncoderDecoderCache - - from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -4129,7 +4126,6 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - from transformers.cache_utils import DynamicCache pkv = DynamicCache.from_legacy_cache(past_key_values) outputs = self.model.language_model( From bc9665d0335d6d24be3a0e94c566c53424ed4088 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 17:35:08 +0100 Subject: [PATCH 063/190] fix seq2seq stateless export --- optimum/exporters/openvino/model_patcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index c2e878f0c8..c8cba66a2a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4531,6 +4531,8 @@ def patched_forward(*args, **kwargs): # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 if isinstance(outputs.get("past_key_values"), (ONNXDynamicCache, ONNXEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() + elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): + outputs.pop("past_key_values") # we still need to filter out cross attention in the case of non-stateful decoder filtered_outputs = {} From fceb15186746ce08deccb26e66ccbdc958826b65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 18:07:51 +0100 Subject: [PATCH 064/190] cache depending on transformers version --- optimum/exporters/openvino/model_patcher.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index e8d56d555e..6dd442130a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -54,7 +54,6 @@ override_arguments, sdpa_mask_without_vmap, ) -from optimum.exporters.onnx.utils import ONNXDynamicCache, ONNXEncoderDecoderCache from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -78,6 +77,14 @@ TransformersKwargs = object +if is_transformers_version("<", "5"): + from transformers import DynamicCache as ONNXDynamicCache + from transformers import EncoderDecoderCache as ONNXEncoderDecoderCache +else: + from optimum.exporters.onnx.utils import LegacyDynamicCache as ONNXDynamicCache + from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as ONNXEncoderDecoderCache + + logger = logging.getLogger(__name__) From 5133a4a9f7f6e36b84caac249fcf8a66e20adef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 12:05:29 +0100 Subject: [PATCH 065/190] pix2struct patcher --- optimum/exporters/openvino/model_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 4ad4cb079b..506459987d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5005,7 +5005,7 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - pass + _MODEL_PATCHER = OVSeq2SeqModelPatcher @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) From 0c4a89c4877065c6f2b721e94f7abdb6975e7815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 16:17:59 +0100 Subject: [PATCH 066/190] fix --- optimum/exporters/openvino/model_patcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 6dd442130a..3f87244111 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -89,10 +89,10 @@ def _get_subcomponent_model(model, name): - if is_transformers_version("<", "5"): - return getattr(model, name) + if is_transformers_version(">=", "5") and hasattr(model, "model"): + return getattr(model.model, name) - return getattr(model.model, name) + return getattr(model, name) for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): From d8a482935c9cfede8a214a2555416e5cf89a7c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 18:05:15 +0100 Subject: [PATCH 067/190] remove internvl_chat, minicpmv in tests --- tests/openvino/test_seq2seq.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 0ad560213d..153f57be8e 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -535,14 +535,10 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ - "internvl_chat", "llava", "llava_next", "llava_next_mistral", "llava_next_video", - "llava-qwen2", - "minicpmv", - "phi3_v", "qwen2_vl", ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] @@ -554,9 +550,14 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["maira2", "idefics3"] if is_transformers_version(">=", "4.49.0"): - SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2", "phi4mm"] + SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2"] SUPPORT_VIDEO.append("qwen2_5_vl") - SUPPORT_AUDIO.append("phi4mm") + + if is_transformers_version("<", "4.54.0"): + # remote code models differs after transformers v4.54 + SUPPORTED_ARCHITECTURES += ["phi4mm"] + SUPPORT_AUDIO.append("phi4mm") + if is_transformers_version(">", "4.49"): SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] if is_transformers_version(">=", "4.51"): @@ -569,9 +570,13 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["qwen3_vl"] SUPPORT_VIDEO += ["qwen3_vl"] - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version("<", "4.54.0"): # remote code models differs after transformers v4.54 - SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"} + SUPPORTED_ARCHITECTURES += ["llava-qwen2", "phi3_v"] + + if is_transformers_version("<", "5"): + # remote code models incompatible after transformers v5 + SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( From e477044ed4c9906091a9ec4f07e91535c86834be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 18:12:38 +0100 Subject: [PATCH 068/190] set max transformers version for internvl_chat minicpmv --- optimum/exporters/openvino/model_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 506459987d..0fb8663202 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2078,6 +2078,7 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("internvl_chat", *["image-text-to-text"], library_name="transformers") class InternVLChatOpenVINOConfig(BaseVLMOpenVINOConfig): + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, config: "PretrainedConfig", @@ -2862,6 +2863,7 @@ class MiniCPMVConfigBehavior(str, enum.Enum): @register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers") class MiniCPMVOpenVINOConfig(BaseVLMOpenVINOConfig): + MAX_TRANSFORMERS_VERSION = "4.57.6" SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = () From e0b2b46849c44d25ca4e6e0975179bd4e57e7306 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 18:20:18 +0100 Subject: [PATCH 069/190] style --- optimum/exporters/openvino/model_configs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0fb8663202..c3348ed285 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2079,6 +2079,7 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("internvl_chat", *["image-text-to-text"], library_name="transformers") class InternVLChatOpenVINOConfig(BaseVLMOpenVINOConfig): MAX_TRANSFORMERS_VERSION = "4.57.6" + def __init__( self, config: "PretrainedConfig", From 50fe59046294a0aff8dab9d527c7d1027666922c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 15:47:37 +0100 Subject: [PATCH 070/190] fix textual inversion --- optimum/intel/openvino/loaders.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/loaders.py b/optimum/intel/openvino/loaders.py index 214a4a7e8c..bd62e047bb 100644 --- a/optimum/intel/openvino/loaders.py +++ b/optimum/intel/openvino/loaders.py @@ -22,7 +22,7 @@ from openvino import Type from openvino import opset11 as ops from openvino.passes import Manager, Matcher, MatcherPass, WrapType -from transformers import PreTrainedTokenizer +from transformers import PreTrainedTokenizerBase from .utils import TEXTUAL_INVERSION_EMBEDDING_KEYS @@ -80,7 +80,7 @@ def load_textual_inversion( self, pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]], token: Optional[Union[str, List[str]]] = None, - tokenizer: Optional["PreTrainedTokenizer"] = None, # noqa: F821 + tokenizer: Optional["PreTrainedTokenizerBase"] = None, # noqa: F821 text_encoder: Optional["openvino.Model"] = None, # noqa: F821 **kwargs, ): @@ -88,9 +88,9 @@ def load_textual_inversion( raise ValueError( f"{self.__class__.__name__} requires `self.tokenizer` for calling `{self.load_textual_inversion.__name__}`" ) - elif not isinstance(self.tokenizer, PreTrainedTokenizer): + elif not isinstance(self.tokenizer, PreTrainedTokenizerBase): raise ValueError( - f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling `{self.load_textual_inversion.__name__}`" + f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizerBase` for calling `{self.load_textual_inversion.__name__}`" ) if not hasattr(self, "text_encoder"): From e9ff083d929c3c132f4b1d17b05b59bc50873cf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:15:25 +0100 Subject: [PATCH 071/190] add back inc --- .github/workflows/build_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- docs/source/neural_compressor/reference.mdx | 40 ++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 15852df3eb..ce3eb464ce 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -52,7 +52,7 @@ jobs: pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder uv pip install transformers==4.57.6 - uv pip install .[quality] diffusers accelerate datasets + uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation shell: bash diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 311f29b0dd..6b0b89f3f1 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -39,7 +39,7 @@ jobs: pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder uv pip install transformers==4.57.6 - uv pip install .[quality] diffusers accelerate datasets + uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation shell: bash diff --git a/docs/source/neural_compressor/reference.mdx b/docs/source/neural_compressor/reference.mdx index b83618b4bc..c631aed883 100644 --- a/docs/source/neural_compressor/reference.mdx +++ b/docs/source/neural_compressor/reference.mdx @@ -14,3 +14,43 @@ specific language governing permissions and limitations under the License. `optimum.intel.neural_compressor` is deprecated and will be removed in the next major release. + +## INCQuantizer + +[[autodoc]] neural_compressor.quantization.INCQuantizer + +## INCTrainer + +[[autodoc]] neural_compressor.trainer.INCTrainer + +## INCModel + +[[autodoc]] neural_compressor.modeling_base.INCModel + +## INCModelForSequenceClassification + +[[autodoc]] neural_compressor.modeling_base.INCModelForSequenceClassification + +## INCModelForQuestionAnswering + +[[autodoc]] neural_compressor.modeling_base.INCModelForQuestionAnswering + +## INCModelForTokenClassification + +[[autodoc]] neural_compressor.modeling_base.INCModelForTokenClassification + +## INCModelForMultipleChoice + +[[autodoc]] neural_compressor.modeling_base.INCModelForMultipleChoice + +## INCModelForMaskedLM + +[[autodoc]] neural_compressor.modeling_base.INCModelForMaskedLM + +## INCModelForCausalLM + +[[autodoc]] neural_compressor.modeling_base.INCModelForCausalLM + +## INCModelForSeq2SeqLM + +[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM \ No newline at end of file From 49f020f4c3d6c09b45330ff47bf3c05b36e89208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:17:22 +0100 Subject: [PATCH 072/190] style --- docs/source/neural_compressor/reference.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/neural_compressor/reference.mdx b/docs/source/neural_compressor/reference.mdx index c631aed883..b6e3d8f468 100644 --- a/docs/source/neural_compressor/reference.mdx +++ b/docs/source/neural_compressor/reference.mdx @@ -53,4 +53,4 @@ specific language governing permissions and limitations under the License. ## INCModelForSeq2SeqLM -[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM \ No newline at end of file +[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM From 31e8c4462f9146c7d66faeb013a40d80ece08f0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:35:35 +0100 Subject: [PATCH 073/190] skip text2text generation pipeline when >= v5 --- tests/openvino/test_modeling_basic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 3dac24c69a..549411f344 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -28,7 +28,7 @@ OVModelForTokenClassification, OVStableDiffusionPipeline, ) - +from optimum.intel.utils.import_utils import is_transformers_version # Make sure that common architectures are used in combination with common tasks MODEL_NAMES = { @@ -58,6 +58,9 @@ def test_pipeline(self, model_id): """ tokenizer = AutoTokenizer.from_pretrained(model_id) model_class_str = MODEL_NAMES[model_id] + if model_class_str == "OVModelForSeq2SeqLM" and is_transformers_version(">=", "5"): + self.skipTest("text2text-generation pipeline was deprecated in transformers v5") + model_class = eval(model_class_str) model = model_class.from_pretrained(model_id, device=OPENVINO_DEVICE) model.save_pretrained(f"{model_id}_ov") From 4e43429bfd283e0bb1ffe8630e440833844aa5c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:59:51 +0100 Subject: [PATCH 074/190] fix perceiver vision preprocessor loading --- tests/openvino/test_modeling.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 785c4e2782..f53c9fdce6 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -34,6 +34,7 @@ from sentence_transformers import SentenceTransformer from transformers import ( AutoFeatureExtractor, + AutoImageProcessor, AutoModel, AutoModelForAudioClassification, AutoModelForAudioFrameClassification, @@ -1187,7 +1188,7 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) set_seed(SEED) transformers_model = AutoModelForImageClassification.from_pretrained(model_id) - preprocessor = AutoProcessor.from_pretrained(model_id) + preprocessor = AutoImageProcessor.from_pretrained(model_id) url = TEST_IMAGE_URL image = Image.open(requests.get(url, stream=True).raw) inputs = preprocessor(images=image, return_tensors="pt") @@ -1211,7 +1212,7 @@ def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, device=OPENVINO_DEVICE) model.eval() - preprocessor = AutoProcessor.from_pretrained(model_id) + preprocessor = AutoImageProcessor.from_pretrained(model_id) pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor) inputs = TEST_IMAGE_URL outputs = pipe(inputs) From 3565637f2ced588d5c5dfc271fdffb015bc91c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:15:11 +0100 Subject: [PATCH 075/190] fix question answering pipeline --- tests/openvino/test_modeling_basic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 549411f344..5d5665beeb 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -72,9 +72,13 @@ def test_pipeline(self, model_id): elif model_class_str == "OVModelForMaskedLM": input_text[0] = f"{input_text[0]} {tokenizer.mask_token}" - if model_class_str in TASKS: - task = TASKS[model_class_str] - pipe = pipeline(task, model=model, tokenizer=tokenizer) + task = TASKS[model_class_str] + pipe = pipeline(task, model=model, tokenizer=tokenizer) + + if task == "question-answering": + # positional arguments deprecated for question-answering pipeline since v5 + pipe(question=input_text[0], context=input_text[1]) + else: pipe(*input_text) gc.collect() From 2d1929d9b81931e3fc43fa2040c671df1a23f93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:37:05 +0100 Subject: [PATCH 076/190] only install diffusers when compatible --- .github/workflows/test_openvino.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 835204f423..6791a8962f 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install OpenVINO @@ -71,10 +71,10 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version == '5.0.0' }} + - if: ${{ matrix.transformers-version != '5.0.0' }} name: Install diffusers run: | - uv pip install git+https://github.com/huggingface/diffusers + uv pip install diffusers - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq From a8b08a4c51fa9f097e579141d1e37ff9edc1f4d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:38:11 +0100 Subject: [PATCH 077/190] fix diffusers mapping --- optimum/exporters/openvino/model_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index c3348ed285..42a58ee523 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -290,6 +290,8 @@ def init_model_configs(): TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["text-to-image"] = ("AutoPipelineForText2Image", "SanaPipeline") + if "text-to-image" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS: + TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"] = {} TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana"] = "SanaPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana-sprint"] = "SanaSprintPipeline" if is_diffusers_available() and "text-to-video" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS: From 83ae84653c69c19e93be9de63e30f46e35386e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:38:43 +0100 Subject: [PATCH 078/190] style --- tests/openvino/test_modeling_basic.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 5d5665beeb..c2576db98b 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -17,16 +17,11 @@ from transformers import AutoTokenizer, pipeline from utils_tests import OPENVINO_DEVICE from optimum.intel import ( - OVModelForAudioClassification, OVModelForCausalLM, - OVModelForFeatureExtraction, - OVModelForImageClassification, OVModelForMaskedLM, OVModelForQuestionAnswering, OVModelForSeq2SeqLM, OVModelForSequenceClassification, - OVModelForTokenClassification, - OVStableDiffusionPipeline, ) from optimum.intel.utils.import_utils import is_transformers_version From cad085b66ec8cba022a4117b4c657bc519b20903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:39:06 +0100 Subject: [PATCH 079/190] update diffusers extra --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e010f5c0ef..617e12d24a 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], - "diffusers": ["diffusers"], + "diffusers": ["diffusers", "transformers<5"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, } From 5dbe3c894447bb8759454ff2d273fffd69de73fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 14:18:56 +0100 Subject: [PATCH 080/190] add transformers version workflow --- .github/workflows/test_openvino.yml | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 6791a8962f..b42bca1548 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "5.0.0", "latest"] + transformers-version: ["4.45", "4.57", "latest"] runs-on: ubuntu-22.04 @@ -66,17 +66,17 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} - - if: ${{ matrix.transformers-version == '4.45.0' }} + - if: ${{ matrix.transformers-version == '4.45' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version != '5.0.0' }} + - if: ${{ matrix.transformers-version != 'latest' }} name: Install diffusers run: | uv pip install diffusers - - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip install auto-gptq "autoawq<0.2.8" diff --git a/setup.py b/setup.py index 617e12d24a..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ "sentence-transformers", "open_clip_torch>=2.26.1", "peft", - "datasets[audio]>=1.4.0,<4.0.0", + "datasets>=1.4.0,<4.0.0", "tbb", "langchain-huggingface", "hf_xet", From b7ce98b6639488f65cba525bfeabff6d502841b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 16:08:10 +0100 Subject: [PATCH 081/190] set transformers 4.57.6 for tests --- .github/workflows/test_openvino.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index b42bca1548..59bd4673b3 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45", "4.57", "latest"] + transformers-version: ["4.45.0", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -66,7 +66,7 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} - - if: ${{ matrix.transformers-version == '4.45' }} + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator @@ -76,7 +76,7 @@ jobs: run: | uv pip install diffusers - - if: ${{ matrix.transformers-version != '4.45' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip install auto-gptq "autoawq<0.2.8" From d692d44785edd13a424f33843d004734a3fc564a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 16:08:54 +0100 Subject: [PATCH 082/190] batch_encode_plus was deprecated in v5 --- tests/openvino/test_modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index f53c9fdce6..03e099f77e 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1605,7 +1605,7 @@ def test_load_from_hub_and_save_model(self): tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID_IR) all_text = ["a dog", "a cat", "a frog"] - tokens = tokenizer.batch_encode_plus( + tokens = tokenizer( all_text, return_tensors="pt", max_length=loaded_model.config.text_config.context_length, @@ -1683,7 +1683,7 @@ def test_functions(self): tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID_IR) all_text = ["a dog", "a cat", "a frog"] - tokens = tokenizer.batch_encode_plus( + tokens = tokenizer( all_text, return_tensors="pt", max_length=model.config.text_config.context_length, From 93679e9b8e3afca7dc7446fd9773f0425d3990c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 18:10:15 +0100 Subject: [PATCH 083/190] fix sam --- optimum/intel/openvino/modeling_sam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_sam.py b/optimum/intel/openvino/modeling_sam.py index 4722437b72..57b33be14e 100644 --- a/optimum/intel/openvino/modeling_sam.py +++ b/optimum/intel/openvino/modeling_sam.py @@ -403,7 +403,7 @@ def get_image_wide_positional_embeddings(self): x_embed = x_embed / size positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1)) - return positional_embedding.permute(2, 0, 1).unsqueeze(0) + return positional_embedding.permute(2, 0, 1).unsqueeze(0).detach() def get_image_features(self, pixel_values, *args, **kwargs): return torch.from_numpy(self.vision_encoder(pixel_values).image_embeddings) From b2ef4184f92d626d3f9db4263e1d6b33044b75a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 18:26:46 +0100 Subject: [PATCH 084/190] install librosa for tests --- .github/workflows/test_openvino.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 59bd4673b3..38a10c22a7 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -64,17 +64,12 @@ jobs: - if: ${{ matrix.transformers-version != 'latest' }} name: Install transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} + uv pip install transformers==${{ matrix.transformers-version }} diffusers - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - - if: ${{ matrix.transformers-version != 'latest' }} - name: Install diffusers - run: | - uv pip install diffusers + uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator librosa - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq From 3fb01723225341f11dc850e228855f16352d1e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 19:45:29 +0100 Subject: [PATCH 085/190] rename OVDynamicCache --- optimum/exporters/openvino/model_patcher.py | 42 ++++++++++----------- tests/openvino/test_modeling.py | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3f87244111..009d226b34 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -78,11 +78,11 @@ if is_transformers_version("<", "5"): - from transformers import DynamicCache as ONNXDynamicCache - from transformers import EncoderDecoderCache as ONNXEncoderDecoderCache + from transformers import DynamicCache as OVDynamicCache + from transformers import EncoderDecoderCache as OVEncoderDecoderCache else: - from optimum.exporters.onnx.utils import LegacyDynamicCache as ONNXDynamicCache - from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as ONNXEncoderDecoderCache + from optimum.exporters.onnx.utils import LegacyDynamicCache as OVDynamicCache + from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as OVEncoderDecoderCache logger = logging.getLogger(__name__) @@ -331,7 +331,7 @@ def __enter__(self): _mixtral_sparse_moe_block_forward, layer.block_sparse_moe ) else: - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -1382,7 +1382,7 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) + past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -1647,7 +1647,7 @@ def __enter__(self): _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe ) else: - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -2964,7 +2964,7 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = ONNXDynamicCache.from_legacy_cache(legacy_pkv) + pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -4118,7 +4118,7 @@ def forward_wrap( input_ids=None, use_cache=True, ): - new_past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) + new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4502,7 +4502,7 @@ def __enter__(self): ) else: - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -4547,11 +4547,11 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, ONNXEncoderDecoderCache): + if isinstance(pkv, OVEncoderDecoderCache): pkv = pkv.self_attention_cache.to_legacy_cache() else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = ONNXEncoderDecoderCache.from_legacy_cache(pkv) + pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4561,7 +4561,7 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (ONNXDynamicCache, ONNXEncoderDecoderCache)): + if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs.pop("past_key_values") @@ -4742,7 +4742,7 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -5129,7 +5129,7 @@ def _blenderbot_attn_forward_new( query_states = query_states if past_key_value is not None: - if isinstance(past_key_value, ONNXEncoderDecoderCache): + if isinstance(past_key_value, OVEncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5331,7 +5331,7 @@ def __enter__(self): modulewise_patch(self._model, Qwen2MoeSparseMoeBlock, _qwen2moe_sparse_block_forward) if is_transformers_version(">=", "5"): - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -5658,7 +5658,7 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = ONNXEncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5690,7 +5690,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, ONNXEncoderDecoderCache): + if isinstance(past_key_values, OVEncoderDecoderCache): past_key_values = past_key_values.self_attention_cache.to_legacy_cache() else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5742,7 +5742,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -6696,7 +6696,7 @@ def __enter__(self): self.original_moe_forward = Qwen3MoeSparseMoeBlock.forward Qwen3MoeSparseMoeBlock.forward = qwen3_moe_forward_patched if is_transformers_version(">=", "5"): - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -7374,7 +7374,7 @@ def __enter__(self): GptOssExperts.forward = gpt_oss_forward if is_transformers_version(">=", "5"): - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 03e099f77e..4eccde4c87 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1723,7 +1723,7 @@ def test_functions(self): self.assertTrue(torch.allclose(model_outputs.logits_per_image, res.logits_per_image, atol=1e-2)) model.reshape(1, -1) - reshaped_tokens = tokenizer.batch_encode_plus( + reshaped_tokens = tokenizer( ["a dog"], return_tensors="pt", max_length=model.config.text_config.context_length, From 3d2286c4bc2aefc7d1c89d4c1554032440738ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 20:56:13 +0100 Subject: [PATCH 086/190] qwenvl3 fix --- optimum/exporters/openvino/model_configs.py | 2 +- optimum/exporters/openvino/model_patcher.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 42a58ee523..ef54b8f78d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3832,7 +3832,7 @@ def __init__( @staticmethod def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: - vision_emb_pos = model.visual.pos_embed + vision_emb_pos = _get_subcomponent_model(model, "visual").pos_embed vision_emb_pos.config = model.config.vision_config return vision_emb_pos diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 009d226b34..b7084b7a34 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4159,7 +4159,7 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - pkv = DynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) outputs = self.model.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -7858,7 +7858,7 @@ def forward( inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) if use_cache and past_key_values is None: - past_key_values = DynamicCache(config=self.config) + past_key_values = OVDynamicCache(config=self.config) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 From 46fa8a70d1910cd985a8411a6a8650bcaf7f784a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 23:06:16 +0100 Subject: [PATCH 087/190] fix qwen2vl --- optimum/exporters/openvino/model_configs.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ef54b8f78d..cea7528529 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3716,12 +3716,17 @@ def with_behavior( behavior = QwenVLConfigBehavior(behavior) if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: - return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + return get_vlm_text_embeddings_config( + "qwen2", + self._orig_config if is_transformers_version("<", "5") else self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + ) if behavior == QwenVLConfigBehavior.LANGUAGE: return get_vlm_text_generation_config( "qwen2", - self._orig_config, + self._orig_config if is_transformers_version("<", "5") else self._orig_config.text_config, self.int_dtype, self.float_dtype, model_patcher=Qwen2VLLanguageModelPatcher, From 20bb596bcd3d3c08c93e0da51e778d3be0060f1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 10:43:13 +0100 Subject: [PATCH 088/190] github workflow librosa --- .github/workflows/test_openvino.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 38a10c22a7..085619c5fa 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] + uv pip install .[tests] librosa - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install OpenVINO @@ -69,7 +69,7 @@ jobs: - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator librosa + uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq From a091dadd1262971955598c460ea700fde6232f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 10:46:04 +0100 Subject: [PATCH 089/190] Update MAX_TRANSFORMERS_VERSION for incompatible models --- optimum/exporters/openvino/model_configs.py | 43 ++++++++++++++++++--- tests/openvino/test_decoder.py | 3 +- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index cea7528529..a25c3e7b8e 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1080,6 +1080,8 @@ class Phi3OpenVINOConfig(PhiOnnxConfig): ) class PhiMoEOpenVINOConfig(Phi3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = PhiMoEModelPatcher @@ -1284,6 +1286,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int @register_in_tasks_manager("aquila", *["text-generation", "text-generation-with-past"], library_name="transformers") class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, AquilaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = AquilaDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) @@ -1293,6 +1296,7 @@ class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("xverse", *["text-generation", "text-generation-with-past"], library_name="transformers") class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1302,6 +1306,7 @@ class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("internlm", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1892,6 +1897,8 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ @register_in_tasks_manager("llava", *["image-text-to-text"], library_name="transformers") class LlavaOpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.37.2" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -1930,6 +1937,7 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict: @register_in_tasks_manager("llava_next", *["image-text-to-text"], library_name="transformers") class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.40.0" + MAX_TRANSFORMERS_VERSION = "5.99" def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): model_kwargs = model_kwargs or {} @@ -1991,6 +1999,8 @@ class LlavaNextVideoConfigBehavior(str, enum.Enum): @register_in_tasks_manager("llava_next_video", *["image-text-to-text"], library_name="transformers") class LlavaNextVideoOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.42.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaNextVideoConfigBehavior] def with_behavior( @@ -2055,6 +2065,7 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ ) class MairaOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" + MAX_TRANSFORMERS_VERSION = "5.99" SUPPORTS_PAST = True def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3976,6 +3987,8 @@ class GraniteOpenVINOConfig(LlamaOpenVINOConfig): ) class GraniteMoEOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.45.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = GraniteMoEModelPatcher @@ -4009,7 +4022,8 @@ class T5OpenVINOConfig(T5OnnxConfig): library_name="transformers", ) class MT5OpenVINOConfig(T5OpenVINOConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -4098,6 +4112,8 @@ class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): @register_in_tasks_manager("got_ocr2", *["image-to-text", "image-text-to-text"], library_name="transformers") class GotOCR2OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.49.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4130,6 +4146,8 @@ def __init__( @register_in_tasks_manager("gemma3", *["image-text-to-text"], library_name="transformers") class Gemma3OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4213,6 +4231,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class Idefics3OpenVINOConfig(BaseVLMOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionPositionIdsInputGenerator) MIN_TRANSFORMERS_VERSION = "4.46.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4271,6 +4291,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("smolvlm", *["image-text-to-text"], library_name="transformers") class SmolVLMOpenVINOConfig(Idefics3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -4335,6 +4357,8 @@ class PegasusOpenVINOConfig(PegasusOnnxConfig): ) class MarianOpenVINOConfig(MarianOnnxConfig): _MODEL_PATCHER = MarianModelPatcher + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" class DummySpeechT5OpenVINOInputGenerator(DummyInputGenerator): @@ -4548,6 +4572,8 @@ class Llama4TextOpenVINOConfig(LlamaOpenVINOConfig): ) class Llama4OpenVINOConfig(GotOCR2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): model_kwargs = model_kwargs or {} @@ -4789,6 +4815,8 @@ class Zamba2OpenVINOConfig(MambaOpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Zamba2DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.49.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = Zamba2ModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): @@ -5015,7 +5043,9 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - _MODEL_PATCHER = OVSeq2SeqModelPatcher + # _MODEL_PATCHER = OVSeq2SeqModelPatcher + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) @@ -5060,7 +5090,8 @@ class MobileBertOpenVINOConfig(MobileBertOnnxConfig): @register_in_tasks_manager("xlm", *COMMON_TEXT_TASKS) class XLMOpenVINOConfig(XLMOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("xlm-roberta", *COMMON_TEXT_TASKS) @@ -5085,7 +5116,8 @@ class CamembertOpenVINOConfig(CamembertOnnxConfig): @register_in_tasks_manager("flaubert", *COMMON_TEXT_TASKS) class FlaubertOpenVINOConfig(FlaubertOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -5117,7 +5149,8 @@ class Data2VecAudioOpenVINOConfig(Data2VecAudioOnnxConfig): @register_in_tasks_manager("data2vec-text", *COMMON_TEXT_TASKS) class Data2VecTextOpenVINOConfig(Data2VecTextOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("data2vec-vision", *["feature-extraction", "image-classification"]) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 1a55242d5c..235eb8406d 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -856,7 +856,8 @@ def test_load_with_different_dtype(self): ) @parameterized.expand(EAGLE3_MODELS.items()) - @pytest.mark.skipif(is_transformers_version("<", "4.54"), reason="Eagle3 requires transformers >= 4.54") + # TODO (@echarlaix) transformers v5 support + @pytest.mark.skipif(is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), reason="Eagle3 requires transformers >= 4.54") def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair From 847c98d8235c931fa546057fcc815b98806eafaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 11:05:03 +0100 Subject: [PATCH 090/190] style --- tests/openvino/test_decoder.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 235eb8406d..07da27807b 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -857,7 +857,10 @@ def test_load_with_different_dtype(self): @parameterized.expand(EAGLE3_MODELS.items()) # TODO (@echarlaix) transformers v5 support - @pytest.mark.skipif(is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), reason="Eagle3 requires transformers >= 4.54") + @pytest.mark.skipif( + is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), + reason="Eagle3 requires transformers >= 4.54", + ) def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair From 4bc2768eae5d2d18cc88aa0ecd6b2481835f7352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 16:44:52 +0100 Subject: [PATCH 091/190] pkv fix --- optimum/exporters/openvino/model_patcher.py | 106 ++++++++++++++------ setup.py | 2 +- 2 files changed, 77 insertions(+), 31 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b7084b7a34..a2a9d18fbc 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -24,6 +24,7 @@ import torch import torch.nn.functional as F from torch import nn +from transformers import DynamicCache, EncoderDecoderCache from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache from transformers.configuration_utils import PretrainedConfig from transformers.generation import GenerationMixin @@ -77,14 +78,6 @@ TransformersKwargs = object -if is_transformers_version("<", "5"): - from transformers import DynamicCache as OVDynamicCache - from transformers import EncoderDecoderCache as OVEncoderDecoderCache -else: - from optimum.exporters.onnx.utils import LegacyDynamicCache as OVDynamicCache - from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as OVEncoderDecoderCache - - logger = logging.getLogger(__name__) @@ -95,6 +88,23 @@ def _get_subcomponent_model(model, name): return getattr(model, name) +def postprocess_past_key_values(past_key_values): + if isinstance(past_key_values, (EncoderDecoderCache, DynamicCache)): + if hasattr(past_key_values, "to_legacy_cache"): + past_key_values = past_key_values.to_legacy_cache() + elif isinstance(past_key_values, DynamicCache): + past_key_values = [(lay.keys, lay.values) for lay in past_key_values.layers] + elif isinstance(past_key_values, EncoderDecoderCache): + past_key_values = [ + (self_lay.keys, self_lay.values, cross_lay.keys, cross_lay.values) + for self_lay, cross_lay in zip( + past_key_values.self_attention_cache.layers, + past_key_values.cross_attention_cache.layers, + ) + ] + return past_key_values + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -1382,7 +1392,11 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -1455,7 +1469,7 @@ def phi3_442_forward( next_cache = None if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = postprocess_past_key_values(next_decoder_cache) if use_legacy_cache else next_decoder_cache if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( @@ -2964,7 +2978,11 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(legacy_pkv) + else: + pkv = DynamicCache(legacy_pkv) + return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -2985,7 +3003,7 @@ def patched_forward(*args, **kwargs): outputs = self.orig_forward(*args, **kwargs) if return_legacy_cache: - outputs.past_key_values = outputs.past_key_values.to_legacy_cache() + outputs.past_key_values = postprocess_past_key_values(outputs.past_key_values) return outputs @@ -4118,7 +4136,11 @@ def forward_wrap( input_ids=None, use_cache=True, ): - new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + else: + new_past_key_values = DynamicCache(past_key_values) + result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4128,7 +4150,7 @@ def forward_wrap( use_cache=use_cache, ) if past_key_values is not None: - result["past_key_values"] = result["past_key_values"].to_legacy_cache() + result["past_key_values"] = postprocess_past_key_values(result["past_key_values"]) return result model.forward = types.MethodType(forward_wrap, model) @@ -4159,7 +4181,11 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) + outputs = self.model.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -4172,7 +4198,7 @@ def lm_forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states) - return (logits, outputs.past_key_values.to_legacy_cache()) + return (logits, postprocess_past_key_values(outputs.past_key_values)) model.__orig_forward = model.forward model.forward = types.MethodType(lm_forward, model) @@ -4547,11 +4573,18 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, OVEncoderDecoderCache): - pkv = pkv.self_attention_cache.to_legacy_cache() + if isinstance(pkv, EncoderDecoderCache): + pkv = postprocess_past_key_values(pkv.self_attention_cache) else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) + + if is_transformers_version("<", "5"): + pkv = EncoderDecoderCache.from_legacy_cache(pkv) + else: + pkv = EncoderDecoderCache( + DynamicCache([layer[:2] for layer in pkv]), + DynamicCache([layer[2:] for layer in pkv]), + ) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4561,8 +4594,8 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): - outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() + if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): + outputs["past_key_values"] = postprocess_past_key_values(outputs["past_key_values"]) elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs.pop("past_key_values") @@ -4742,7 +4775,10 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -4768,7 +4804,7 @@ def forward( **forward_kwargs, ) upd_pkv = result["past_key_values"] - result["past_key_values"] = upd_pkv.to_legacy_cache() + result["past_key_values"] = postprocess_past_key_values(upd_pkv) return result if is_transformers_version("<", "4.53.0"): @@ -5129,7 +5165,7 @@ def _blenderbot_attn_forward_new( query_states = query_states if past_key_value is not None: - if isinstance(past_key_value, OVEncoderDecoderCache): + if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5658,7 +5694,13 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + else: + past_key_values = EncoderDecoderCache( + DynamicCache([layer[:2] for layer in past_key_values]), + DynamicCache([layer[2:] for layer in past_key_values]), + ) output_sequence = inputs_embeds output_cross_attentions = False @@ -5690,8 +5732,8 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, OVEncoderDecoderCache): - past_key_values = past_key_values.self_attention_cache.to_legacy_cache() + if isinstance(past_key_values, EncoderDecoderCache): + past_key_values = postprocess_past_key_values(past_key_values.self_attention_cache) else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5742,7 +5784,11 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) + outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -5753,7 +5799,7 @@ def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_value hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states) - return (logits, outputs.past_key_values.to_legacy_cache()) + return (logits, postprocess_past_key_values(outputs.past_key_values)) model.__orig_forward = model.forward model.forward = types.MethodType(lm_forward, model) @@ -7858,7 +7904,7 @@ def forward( inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) if use_cache and past_key_values is None: - past_key_values = OVDynamicCache(config=self.config) + past_key_values = DynamicCache(config=self.config) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/setup.py b/setup.py index 16e2a82fed..25a5a01a97 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/investigate", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 6799e939ede93ae3205753b80b9fc42ee31587f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:21:57 +0100 Subject: [PATCH 092/190] transformers-v5 branch --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 25a5a01a97..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/investigate", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From a2cd48ec1a7c87549bfe86d4db0309c3d670d8c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:28:41 +0100 Subject: [PATCH 093/190] use_model_defaults arg was deprecated in v5 --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 07da27807b..9bcef5f2f0 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -478,7 +478,7 @@ def test_pipeline(self, model_arch): tokenizer._convert_tokens_to_ids = lambda x: 0 additional_args = {} - if is_transformers_version(">=", "4.51"): + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): additional_args["use_model_defaults"] = False set_seed(SEED) @@ -784,7 +784,7 @@ def test_beam_search(self, model_arch): ov_model_stateless.config.eos_token_id = None transformers_model.config.eos_token_id = None - if is_transformers_version(">=", "4.51"): + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): additional_inputs["use_model_defaults"] = False for gen_config in gen_configs: From 850c1cee66fbd5fde919d6e8b2a163bd372ba2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:33:18 +0100 Subject: [PATCH 094/190] style --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a2a9d18fbc..89d295c0e8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -24,7 +24,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import DynamicCache, EncoderDecoderCache from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache from transformers.configuration_utils import PretrainedConfig from transformers.generation import GenerationMixin From af4a6059d0aa27f7fa091401bc3be89a0cc56e14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:42:20 +0100 Subject: [PATCH 095/190] baichuan remote code models incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a25c3e7b8e..069286d2f1 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -330,6 +330,7 @@ class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" ) _MODEL_PATCHER = BaichuanModelPatcher + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 9bcef5f2f0..d079e04539 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -47,7 +47,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", - "baichuan2-13b", "gpt_bigcode", "bigbird_pegasus", "blenderbot", @@ -157,6 +156,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "jais", "dbrx", "baichuan2", + "baichuan2-13b", ) GENERATION_LENGTH = 100 From 4da53e8c4037434d472f2c8ef11e628cfc50eb81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 19:17:47 +0100 Subject: [PATCH 096/190] remove tests for modelsf for models that needs fixes --- tests/openvino/test_decoder.py | 28 ++++++++++++++++++---------- tests/openvino/test_modeling.py | 13 ++++++++----- tests/openvino/test_seq2seq.py | 32 ++++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 23 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index d079e04539..6782574c01 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -58,7 +58,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neo", "gpt_neox", "llama", - "marian", "mistral", "mixtral", "mpt", @@ -72,9 +71,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "biogpt", "gpt_neox_japanese", "xglm", - "aquila", - "xverse", - "internlm", "gemma", "olmo", "stablelm", @@ -85,12 +81,12 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "phi3", "gemma2", "granite", - "granitemoe", ) SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") - if is_transformers_version(">=", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("zamba2",) if is_transformers_version(">=", "4.53.0"): @@ -102,11 +98,15 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += SUPPORTED_SSM_ARCHITECTURES if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("phimoe",) + # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") @@ -145,8 +145,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - # TODO: add dbrx back once fixed in transformers SUPPORTED_ARCHITECTURES += ( + # remote modeling incompatible with v5 "codegen2", "exaone", "decilm", @@ -154,11 +154,19 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "orion", "aquila2", "jais", - "dbrx", "baichuan2", "baichuan2-13b", + # remote modeling code failing with v5 + "aquila", + "xverse", + "internlm", + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + "dbrx", + # "phimoe", + "marian", + "granitemoe", + # "zamba2", ) - GENERATION_LENGTH = 100 EXPECTED_NUM_SDPA = { diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 4eccde4c87..8d8ab01147 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -757,14 +757,16 @@ class OVModelForSequenceClassificationIntegrationTest(unittest.TestCase): "convbert", "distilbert", "electra", - "flaubert", "ibert", "roberta", "roformer", "squeezebert", - "xlm", ) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("flaubert", "xlm") + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -1087,13 +1089,11 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): "bert", "camembert", "convbert", - "data2vec-text", "deberta", "deberta-v2", "distilbert", "electra", "esm", - "flaubert", "ibert", "mobilebert", "mpnet", @@ -1102,7 +1102,6 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): "roberta", "roformer", "squeezebert", - "xlm", "xlm-roberta", ) @@ -1110,6 +1109,10 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "4.51.0"): SUPPORTED_ARCHITECTURES += ("nystromformer",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("data2vec-text", "flaubert", "xlm") + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 153f57be8e..d0e5f88b71 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -145,7 +145,6 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): "longt5", "m2m_100", "mbart", - "mt5", "pegasus", "t5", ) @@ -159,6 +158,10 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("mt5",) + SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): SUPPORT_STATEFUL += ("bart", "blenderbot", "blenderbot-small", "m2m_100", "marian", "mbart") @@ -535,10 +538,8 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ - "llava", "llava_next", "llava_next_mistral", - "llava_next_video", "qwen2_vl", ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] @@ -547,20 +548,31 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): TASK = "image-text-to-text" if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ["maira2", "idefics3"] + SUPPORTED_ARCHITECTURES += ["maira2"] + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["idefics3"] if is_transformers_version(">=", "4.49.0"): - SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2"] + SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"] SUPPORT_VIDEO.append("qwen2_5_vl") + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["got_ocr2"] + if is_transformers_version("<", "4.54.0"): # remote code models differs after transformers v4.54 SUPPORTED_ARCHITECTURES += ["phi4mm"] SUPPORT_AUDIO.append("phi4mm") - if is_transformers_version(">", "4.49"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] - if is_transformers_version(">=", "4.51"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["gemma3", "smolvl"] + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): # SUPPORTED_ARCHITECTURES += ["llama4", "phi4_multimodal"] SUPPORTED_ARCHITECTURES += ["llama4"] @@ -578,6 +590,10 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # remote code models incompatible after transformers v5 SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") + REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( From 5a74781777df1600644601c52e5854b3d9bfa113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 19:45:14 +0100 Subject: [PATCH 097/190] fix decoder tests untested_architectures --- tests/openvino/test_decoder.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 6782574c01..9a6acf1cb7 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -126,6 +126,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) @@ -305,9 +306,13 @@ def test_find_untested_architectures(self): supported_architectures -= {"lfm2"} # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group - if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + if is_transformers_version(">", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"qwen3_vl_text"} + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "5"): + supported_architectures -= {"phimoe", "granitemoe", "bitnet", "dbrx", "zamba2", "marian"} + supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures From e634d777eb815f50f12366a796b85554056b059d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 19:51:46 +0100 Subject: [PATCH 098/190] fix untested architecture --- tests/openvino/test_seq2seq.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index d0e5f88b71..70e43293e0 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -161,6 +161,8 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("mt5",) + else: + UNSUPPORTED_ARCHITECTURES = {"marian", "mt5"} SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): @@ -593,7 +595,17 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") - + else: + UNSUPPORTED_ARCHITECTURES = { + "got_ocr2", + "idefics3", + "llama4", + "llava_next_video", + "phi4_multimodal", + "gemma3", + "smolvlm", + "llava", + } REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( From f89d0de33d40d71cfcd07b885f6a85bdbf700de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 21:57:28 +0100 Subject: [PATCH 099/190] fix pkv patching --- optimum/exporters/openvino/model_patcher.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 89d295c0e8..82a25f3098 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4580,10 +4580,7 @@ def patched_forward(*args, **kwargs): if is_transformers_version("<", "5"): pkv = EncoderDecoderCache.from_legacy_cache(pkv) else: - pkv = EncoderDecoderCache( - DynamicCache([layer[:2] for layer in pkv]), - DynamicCache([layer[2:] for layer in pkv]), - ) + pkv = EncoderDecoderCache(DynamicCache(pkv), DynamicCache()) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -5696,10 +5693,7 @@ def patched_decoder_forward( if is_transformers_version("<", "5"): past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) else: - past_key_values = EncoderDecoderCache( - DynamicCache([layer[:2] for layer in past_key_values]), - DynamicCache([layer[2:] for layer in past_key_values]), - ) + past_key_values = EncoderDecoderCache(DynamicCache(past_key_values), DynamicCache()) output_sequence = inputs_embeds output_cross_attentions = False From 6070155e197b19f8553a62978003160d37bf724a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 22:27:15 +0100 Subject: [PATCH 100/190] fix test --- tests/openvino/test_seq2seq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 70e43293e0..2737059e50 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -380,7 +380,7 @@ def test_compare_to_transformers(self, model_arch): ) generate_kwrgs = {} - if is_transformers_version(">=", "4.50"): + if is_transformers_version(">=", "4.50") and is_transformers_version("<", "5"): generate_kwrgs = {"use_model_defaults": False} gen_config = GenerationConfig( @@ -571,7 +571,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvl"] + SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): From 26d5c4413cdb0d37fd99aff736013390c541ac09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 22:36:52 +0100 Subject: [PATCH 101/190] fix expcted int8 tests --- tests/openvino/test_quantization.py | 4 ++-- tests/openvino/utils_tests.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b39ee223ae..f9bde752b3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -269,7 +269,7 @@ class OVQuantizerTest(unittest.TestCase): "model": 33, }, { - "model": {"int8": 35}, + "model": {"int8": 35 if is_transformers_version("<", "5") else 36}, }, ), ( @@ -299,7 +299,7 @@ class OVQuantizerTest(unittest.TestCase): "model": 32, }, { - "model": {"int8": 34}, + "model": {"int8": 34 if is_transformers_version("<", "5") else 35}, }, ), ( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 85f79801cd..06314ef394 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -231,19 +231,19 @@ _ARCHITECTURES_TO_EXPECTED_INT8 = { "afmoe": {"model": 16}, - "bert": {"model": 68}, + "bert": {"model": 68 if is_transformers_version("<", "5") else 70}, "roberta": {"model": 68}, "albert": {"model": 84}, "vit": {"model": 64}, - "blenderbot": {"model": 70}, + "blenderbot": {"model": 70 if is_transformers_version("<", "5") else 72}, "gpt2": {"model": 44}, "granitemoehybrid": {"model": 118}, "wav2vec2": {"model": 34}, "distilbert": {"model": 66}, "t5": { "encoder": 64, - "decoder": 104, - "decoder_with_past": 84, + "decoder": 104 if is_transformers_version("<", "5") else 106, + "decoder_with_past": 84 if is_transformers_version("<", "5") else 86, }, "stable-diffusion": { "unet": 242, From 9d84f3a4870a501fd4591a8b8473e5e1879c6217 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:27:45 +0100 Subject: [PATCH 102/190] tests transformers v5 --- tests/openvino/test_seq2seq.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 2737059e50..da68d6e8b9 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -862,7 +862,11 @@ def test_compare_to_transformers(self, model_arch): gc.collect() - @parameterized.expand(["llava", "llava_next", "llava_next_video", "llava_next_mistral"]) + @parameterized.expand( + ["llava", "llava_next", "llava_next_video", "llava_next_mistral"] + if is_transformers_version("<", "5") + else ["llava_next", "llava_next_mistral"] + ) def test_llava_with_new_preprocessing(self, model_arch): prompt = "\n What is shown in this image?" model_id = MODEL_NAMES[model_arch] From 4b5f83d4f5169513c467dd3b3a9dfdf9fc43006e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:29:41 +0100 Subject: [PATCH 103/190] pix2struct --- optimum/exporters/openvino/model_configs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 069286d2f1..ce617dc3ea 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5044,9 +5044,7 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - # _MODEL_PATCHER = OVSeq2SeqModelPatcher - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" + _MODEL_PATCHER = OVSeq2SeqModelPatcher @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) From 14e1b524547bc44e08294013325abbea2e63c481 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:42:44 +0100 Subject: [PATCH 104/190] fix num expected int8 --- tests/openvino/test_quantization.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f9bde752b3..a249624023 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -135,8 +135,8 @@ class OVQuantizerTest(unittest.TestCase): (OVModelForSequenceClassification, "bert", 32, 35), (OVModelForCausalLM, "gpt2", 31, 22), (OVSentenceTransformer, "sentence-transformers-bert", 12, 15), - (OVModelForFeatureExtraction, "blenderbot", 33, 35), - (OVModelForMaskedLM, "roberta", 32, 34), + (OVModelForFeatureExtraction, "blenderbot", 33, 35 if is_transformers_version("<", "5") else 36), + (OVModelForMaskedLM, "roberta", 32, 34 if is_transformers_version("<", "5") else 35), (OVModelForZeroShotImageClassification, "clip", 65, 65), ) SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET = [ @@ -344,12 +344,12 @@ class OVQuantizerTest(unittest.TestCase): if is_transformers_version("<=", "4.45") else { "encoder": 30, - "decoder": 52, + "decoder": 52 if is_transformers_version("<", "5") else 53, }, ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") - else {"encoder": {"int8": 32}, "decoder": {"int8": 52}} + else {"encoder": {"int8": 32}, "decoder": {"int8": 52 if is_transformers_version("<", "5") else 53}} ), ), ( @@ -596,7 +596,9 @@ class OVWeightCompressionTest(unittest.TestCase): (OVModelForCausalLM, "gpt2", 44, 44), ) - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 43),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ( + (OVModelForCausalLM, "opt125m", 62 if is_transformers_version("<", "5") else 64, 43), + ) SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 74),) SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "gpt2", 44, 44),) From 0dbe96c293c68dc66e6fdf9a0213d312d004c943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:44:15 +0100 Subject: [PATCH 105/190] use_model_defaults deprecated in v5 --- tests/openvino/test_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index a249624023..fc4f9ea102 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -2497,7 +2497,7 @@ def check_model_inference(ov_model, model_id, trust_remote_code): if isinstance(ov_model, OVModelForSpeechSeq2Seq): input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32) generate_kwrgs = {} - if is_transformers_version(">=", "4.50"): + if is_transformers_version(">=", "4.50") and is_transformers_version("<", "5"): generate_kwrgs = {"use_model_defaults": False} ov_model.generate(input_features, generation_config=gen_config, **generate_kwrgs) else: From af3fba3d7800adb4ab6dfd0f118cfac1c33bd962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 11:01:50 +0100 Subject: [PATCH 106/190] rename --- optimum/exporters/openvino/model_configs.py | 20 ++++++++++---------- optimum/exporters/openvino/model_patcher.py | 12 +++++------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ce617dc3ea..fb7acb865d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -202,7 +202,7 @@ SanaTextEncoderModelPatcher, XverseModelPatcher, Zamba2ModelPatcher, - _get_subcomponent_model, + _get_model_attribute, ) @@ -1878,14 +1878,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_subcomponent_model(model, "language_model") if not hasattr(model, "lm_head") else model + return _get_model_attribute(model, "language_model") if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = _get_subcomponent_model(model, "language_model").config + text_embedding.config = _get_model_attribute(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -2162,14 +2162,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_subcomponent_model(model, "language_model") + return _get_model_attribute(model, "language_model") if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() - text_embedding.config = _get_subcomponent_model(model, "language_model").config + text_embedding = _get_model_attribute(model, "language_model").get_input_embeddings() + text_embedding.config = _get_model_attribute(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3696,12 +3696,12 @@ def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): return model if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: - vision_embeddings = _get_subcomponent_model(model, "visual").patch_embed + vision_embeddings = _get_model_attribute(model, "visual").patch_embed vision_embeddings.config = model.config.vision_config return vision_embeddings if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: - vision_emb_merger = _get_subcomponent_model(model, "visual") + vision_emb_merger = _get_model_attribute(model, "visual") vision_emb_merger.config = model.config.vision_config return vision_emb_merger @@ -3709,7 +3709,7 @@ def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): text_embedding = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") - else _get_subcomponent_model(model, "language_model").embed_tokens + else _get_model_attribute(model, "language_model").embed_tokens ) text_embedding.config = model.config return text_embedding @@ -3849,7 +3849,7 @@ def __init__( @staticmethod def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: - vision_emb_pos = _get_subcomponent_model(model, "visual").pos_embed + vision_emb_pos = _get_model_attribute(model, "visual").pos_embed vision_emb_pos.config = model.config.vision_config return vision_emb_pos diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 82a25f3098..a617289c8e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -80,13 +80,6 @@ logger = logging.getLogger(__name__) -def _get_subcomponent_model(model, name): - if is_transformers_version(">=", "5") and hasattr(model, "model"): - return getattr(model.model, name) - - return getattr(model, name) - - def postprocess_past_key_values(past_key_values): if isinstance(past_key_values, (EncoderDecoderCache, DynamicCache)): if hasattr(past_key_values, "to_legacy_cache"): @@ -104,6 +97,11 @@ def postprocess_past_key_values(past_key_values): return past_key_values +def _get_model_attribute(model, name): + target = getattr(model, "model", model) + return getattr(target, name) + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes From 546127bdee7899e99fa505fadb8bf85b6a2a7a79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 11:04:12 +0100 Subject: [PATCH 107/190] style --- optimum/exporters/openvino/model_patcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a617289c8e..0910f4de3f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3256,7 +3256,7 @@ def llava_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_outputs = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) + image_outputs = _get_model_attribute(self, "vision_tower")(pixel_values, output_hidden_states=True) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer] @@ -3267,7 +3267,7 @@ def llava_vision_embed_forward(self, pixel_values): else: raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - image_features = _get_subcomponent_model(self, "multi_modal_projector")(selected_image_feature) + image_features = _get_model_attribute(self, "multi_modal_projector")(selected_image_feature) return image_features @@ -3275,7 +3275,7 @@ def llava_next_video_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L519 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_features = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) + image_features = _get_model_attribute(self, "vision_tower")(pixel_values, output_hidden_states=True) vision_feature_layer = self.config.vision_feature_layer if isinstance(vision_feature_layer, int): selected_image_feature = image_features.hidden_states[vision_feature_layer] From 3eeeb4dc64d6eeeadd1b9cdac309f36838e9b36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 11:10:09 +0100 Subject: [PATCH 108/190] install diffusers from source for v5 --- .github/workflows/test_openvino.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 085619c5fa..48e3a7409b 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] librosa + uv pip install .[tests] librosa diffusers - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install OpenVINO @@ -64,7 +64,12 @@ jobs: - if: ${{ matrix.transformers-version != 'latest' }} name: Install transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} diffusers + uv pip install transformers==${{ matrix.transformers-version }} + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers From 2b61bd38e7b375f18c018e83072e2c00d258db4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 15:44:38 +0100 Subject: [PATCH 109/190] qwen2vl --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 0910f4de3f..53b7340962 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -98,7 +98,7 @@ def postprocess_past_key_values(past_key_values): def _get_model_attribute(model, name): - target = getattr(model, "model", model) + target = getattr(model, "model", model) if is_transformers_version(">=", "5") else model return getattr(target, name) From bbe65bbff2e8628e8e694535c3ca74f0c216e65b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 15:57:19 +0100 Subject: [PATCH 110/190] remove tests for v5 --- tests/openvino/test_quantization.py | 114 ++++++++++++++++------------ 1 file changed, 64 insertions(+), 50 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index fc4f9ea102..f66ae8834d 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -342,10 +342,7 @@ class OVQuantizerTest(unittest.TestCase): ), {"encoder": 30, "decoder": 52, "decoder_with_past": 61} if is_transformers_version("<=", "4.45") - else { - "encoder": 30, - "decoder": 52 if is_transformers_version("<", "5") else 53, - }, + else {"encoder": 30, "decoder": 52}, ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") @@ -1064,9 +1061,6 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionPipeline, "stable-diffusion", False), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", False), (OVModelOpenCLIPForZeroShotImageClassification, "open-clip", False), - (OVModelForVisualCausalLM, "llava", False), - (OVModelForVisualCausalLM, "llava_next_video", False), - (OVModelForVisualCausalLM, "minicpmv", True), (OVModelForVisualCausalLM, "qwen2_vl", False), ] @@ -1082,6 +1076,15 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen3_vl", False)) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.extend( + [ + (OVModelForVisualCausalLM, "llava", False), + (OVModelForVisualCausalLM, "llava_next_video", False), + (OVModelForVisualCausalLM, "minicpmv", True), + ] + ) + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331), @@ -1119,17 +1122,6 @@ class OVWeightCompressionTest(unittest.TestCase): "text_encoder": {}, }, ), - ( - OVModelForVisualCausalLM, - "llava", - 4, - {"bits": 4, "group_size": 8, "ratio": 0.5}, - { - "lm_model": {"int8": 22, "int4": 8}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"int8": 9}, - }, - ), ( OVSamModel, "sam", @@ -1183,15 +1175,6 @@ class OVWeightCompressionTest(unittest.TestCase): }, }, ), - ( - OVModelForVisualCausalLM, - "llava", - { - "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "text_embeddings_model": {"patterns": ["."]}, - }, - ), ( OVSamModel, "sam", @@ -1212,6 +1195,33 @@ class OVWeightCompressionTest(unittest.TestCase): ), ] + if is_transformers_version("<", "5"): + DEFAULT_COMPRESSION_CONFIGURATIONS.append( + ( + OVModelForVisualCausalLM, + "llava", + 4, + {"bits": 4, "group_size": 8, "ratio": 0.5}, + { + "lm_model": {"int8": 22, "int4": 8}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 9}, + }, + ), + ) + + DEFAULT_IGNORED_SCOPE_CONFIGURATIONS.append( + ( + OVModelForVisualCausalLM, + "llava", + { + "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "text_embeddings_model": {"patterns": ["."]}, + }, + ), + ) + def test_filtered_architectures(cls): expected = set() if is_transformers_version("<", "4.49"): @@ -1800,31 +1810,35 @@ class OVPipelineQuantizationTest(unittest.TestCase): {"encoder": 14, "decoder": 22}, {"encoder": {"int8": 14}, "decoder": {"int8": 22}}, ), - ( - OVModelForVisualCausalLM, - "internvl_chat", - True, - dict( - quantization_configs={ - "lm_model": dict(bits=8, weight_only=True), - "vision_embeddings_model": dict(bits=8, weight_only=False), + ] + + if is_transformers_version("<", "5"): + PIPELINE_QUANTIZATION_SCOPE.append( + ( + OVModelForVisualCausalLM, + "internvl_chat", + True, + dict( + quantization_configs={ + "lm_model": dict(bits=8, weight_only=True), + "vision_embeddings_model": dict(bits=8, weight_only=False), + }, + dataset="contextual", + num_samples=1, + default_config=dict(bits=8, sym=True, weight_only=True), + ), + { + "lm_model": 0, + "text_embeddings_model": 0, + "vision_embeddings_model": 15, + }, + { + "lm_model": {"int8": 30}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 11}, }, - dataset="contextual", - num_samples=1, - default_config=dict(bits=8, sym=True, weight_only=True), ), - { - "lm_model": 0, - "text_embeddings_model": 0, - "vision_embeddings_model": 15, - }, - { - "lm_model": {"int8": 30}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"int8": 11}, - }, - ), - ] + ) if is_transformers_version(">=", "4.49.0") and is_transformers_version("<", "4.54.0"): PIPELINE_QUANTIZATION_SCOPE.extend( From 7ba6fd1289612fa92476194bc9aa7f16316a2a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 16:37:42 +0100 Subject: [PATCH 111/190] disable tests for transformers v5 --- tests/openvino/test_genai.py | 41 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index b31ca1569e..74f6bab1ec 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -45,7 +45,6 @@ class LLMPipelineTestCase(unittest.TestCase): "gpt_bigcode", "bloom", "codegen", - "codegen2", "gpt2", "gptj", "gpt_neox", @@ -53,37 +52,29 @@ class LLMPipelineTestCase(unittest.TestCase): "mistral", "mixtral", "phi", - "internlm2", - "orion", "falcon", "persimmon", "xglm", - "aquila", - "aquila2", - "internlm", - "jais", - "decilm", "gemma", "olmo", "stablelm", "starcoder2", - "dbrx", "cohere", "qwen2", "qwen2_moe", "phi3", "gemma2", - "exaone", "granite", - "granitemoe", ) if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe", "opt") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "opt") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("qwen",) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("phimoe",) if is_transformers_version(">=", "4.49"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): @@ -101,6 +92,25 @@ class LLMPipelineTestCase(unittest.TestCase): if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("chatglm", "chatglm4") + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ( + # remote modeling incompatible with v5 + "codegen2", + "exaone", + "decilm", + "internlm2", + "orion", + "aquila2", + "jais", + # remote modeling code failing with v5 + "aquila", + "internlm", + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + "dbrx", + # "phimoe", + "granitemoe", + ) + REMOTE_CODE_MODELS = ( "chatglm", "minicpm", @@ -200,9 +210,7 @@ def test_compare_outputs(self, model_arch): class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( - "llava", "llava_next", - "llava_next_video", # "minicpmv", # output is truncated for some reason "qwen2_vl", ) @@ -216,8 +224,11 @@ class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen2_5_vl",) if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("phi4mm",) - if is_transformers_version(">=", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("gemma3",) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") REMOTE_CODE_MODELS = ( "minicpmv", From 928fb5009f60ac2478b818bb39aa17bd72eadf93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 16:42:24 +0100 Subject: [PATCH 112/190] remove non needed --- optimum/exporters/openvino/model_patcher.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 53b7340962..1be2bfe437 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4590,8 +4590,6 @@ def patched_forward(*args, **kwargs): # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs["past_key_values"] = postprocess_past_key_values(outputs["past_key_values"]) - elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): - outputs.pop("past_key_values") # we still need to filter out cross attention in the case of non-stateful decoder filtered_outputs = {} From ef320b3be74f090a861e30ac4c45cc76ffafa071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 19:11:20 +0100 Subject: [PATCH 113/190] disable tests --- tests/openvino/test_export.py | 9 ++- tests/openvino/test_exporters_cli.py | 82 +++++++++++++++++----------- tests/openvino/utils_tests.py | 6 +- 3 files changed, 58 insertions(+), 39 deletions(-) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 18811bd121..6cc28c8597 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -84,7 +84,6 @@ class ExportModelTest(unittest.TestCase): "stable-diffusion-xl": OVStableDiffusionXLPipeline, "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline, "latent-consistency": OVLatentConsistencyModelPipeline, - "llava": OVModelForVisualCausalLM, "sam": OVSamModel, "speecht5": OVModelForTextToSpeechSeq2Seq, "clip": OVModelForZeroShotImageClassification, @@ -95,7 +94,7 @@ class ExportModelTest(unittest.TestCase): "ltx-video": OVLTXPipeline, } - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM}) if is_transformers_version(">=", "4.53.0"): @@ -118,7 +117,11 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES.update({"qwen3": OVModelForFeatureExtraction}) - GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "llava", "speecht5") + GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "speecht5") + + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.update({"llava": OVModelForVisualCausalLM}) + GENERATIVE_MODELS.append("llava") def _openvino_export( self, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 4be27f43e5..a684c90ca8 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -121,11 +121,17 @@ class OVCLIExportTestCase(unittest.TestCase): [ ("text-generation", "lfm2"), ("text-generation-with-past", "lfm2"), + ] + ) + + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.extend( + [ ("text-generation-with-past", "qwen3_eagle3"), ] ) - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "zamba2"), @@ -138,7 +144,7 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-generation-with-past", "exaone4"), ] ) - if is_transformers_version(">=", "4.52.1"): + if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "bitnet"), @@ -202,14 +208,6 @@ class OVCLIExportTestCase(unittest.TestCase): "expected_chat_template": False, "simplified_chat_template": False, }, - "llava": { # transformers, chat template in processor, simplified chat template - "num_tokenizers": 2, - "task": "image-text-to-text", - "processor_chat_template": True, - "remote_code": False, - "expected_chat_template": True, - "simplified_chat_template": True, - }, "llava_next": { # transformers, chat template in processor overrides tokinizer chat template, simplified chat template "num_tokenizers": 2, "task": "image-text-to-text", @@ -256,6 +254,20 @@ class OVCLIExportTestCase(unittest.TestCase): } ) + if is_transformers_version("<", "5"): + TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS.update( + { + "llava": { # transformers, chat template in processor, simplified chat template + "num_tokenizers": 2, + "task": "image-text-to-text", + "processor_chat_template": True, + "remote_code": False, + "expected_chat_template": True, + "simplified_chat_template": True, + }, + } + ) + SUPPORTED_SD_HYBRID_ARCHITECTURES = [ ("flux", 7, 56), ("latent-consistency", 50, 135), @@ -407,7 +419,7 @@ class OVCLIExportTestCase(unittest.TestCase): "model": 33, }, { - "model": {"int8": 35}, + "model": {"int8": 35 if is_transformers_version("<", "5") else 36}, }, ), ( @@ -431,7 +443,7 @@ class OVCLIExportTestCase(unittest.TestCase): "model": 32, }, { - "model": {"int8": 34}, + "model": {"int8": 34 if is_transformers_version("<", "5") else 35}, }, ), ( @@ -472,7 +484,7 @@ class OVCLIExportTestCase(unittest.TestCase): ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") - else {"encoder": {"int8": 32}, "decoder": {"int8": 52}} + else {"encoder": {"int8": 32}, "decoder": {"int8": 52 if is_transformers_version("<", "5") else 53}} ), ), ( @@ -489,48 +501,52 @@ class OVCLIExportTestCase(unittest.TestCase): "prompt_encoder_mask_decoder": {"int8": 49}, }, ), - ( - "image-text-to-text", - "internvl_chat", - "f8e4m3", - "--dataset contextual --num-samples 1 --trust-remote-code", - { - "lm_model": 15, - "text_embeddings_model": 0, - "vision_embeddings_model": 17, - }, - { - "lm_model": {"f8e4m3": 15}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"f8e4m3": 11}, - }, - ), ] + if is_transformers_version("<", "5"): + SUPPORTED_QUANTIZATION_ARCHITECTURES.append( + ( + "image-text-to-text", + "internvl_chat", + "f8e4m3", + "--dataset contextual --num-samples 1 --trust-remote-code", + { + "lm_model": 15, + "text_embeddings_model": 0, + "vision_embeddings_model": 17, + }, + { + "lm_model": {"f8e4m3": 15}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"f8e4m3": 11}, + }, + ), + ) + TRANSFORMERS_4BIT_CONFIGURATIONS = [ ( "text-generation-with-past", "opt125m", "int4 --sym --group-size 128", - {"model": {"int8": 4, "int4": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "int4": 72}}, ), ( "text-generation-with-past", "opt125m", "int4 --group-size 64", - {"model": {"int8": 4, "int4": 144}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "int4": 144}}, ), ( "text-generation-with-past", "opt125m", "mxfp4", - {"model": {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "f4e2m1": 72, "f8e8m0": 72}}, ), ( "text-generation-with-past", "opt125m", "nf4", - {"model": {"int8": 4, "nf4": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "nf4": 72}}, ), ( "text-generation-with-past", diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 06314ef394..c6737bff1e 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -354,8 +354,8 @@ "vocoder": 80, }, "clip": {"model": 130}, - "mamba": {"model": 322}, - "falcon_mamba": {"model": 162}, + "mamba": {"model": 322 if is_transformers_version("<", "5") else 324}, + "falcon_mamba": {"model": 162 if is_transformers_version("<", "5") else 164}, "minicpmo": { "lm_model": 16, "text_embeddings_model": 1, @@ -364,7 +364,7 @@ }, "zamba2": {"model": 44}, "exaone4": {"model": 16}, - "lfm2": {"model": 52}, + "lfm2": {"model": 52 if is_transformers_version("<", "5") else 54}, "qwen3_eagle3": {"model": 20}, } From 8beb8d8bfaed7d778adfc152212d3b4912613745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 19:35:01 +0100 Subject: [PATCH 114/190] fix --- tests/openvino/test_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 6cc28c8597..eae3727de6 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -121,7 +121,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"llava": OVModelForVisualCausalLM}) - GENERATIVE_MODELS.append("llava") + GENERATIVE_MODELS += ("llava",) def _openvino_export( self, From e4eba9296ec619c219029f005d5fcc8913eb6871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 10:28:15 +0100 Subject: [PATCH 115/190] add stable diffusion 3 tests when diffusers compatible with v5 --- tests/openvino/test_diffusion.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 8efc69f8ec..e4f558efb7 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -80,7 +80,6 @@ class OVPipelineForText2ImageTest(unittest.TestCase): "stable-diffusion", "stable-diffusion-xl", "latent-consistency", - "stable-diffusion-3", "flux", "sana", ] @@ -93,6 +92,10 @@ class OVPipelineForText2ImageTest(unittest.TestCase): if is_diffusers_version(">=", "0.33.0"): SUPPORTED_ARCHITECTURES.extend(["sana-sprint"]) + + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -499,9 +502,11 @@ class OVPipelineForImage2ImageTest(unittest.TestCase): "stable-diffusion", "stable-diffusion-xl", "latent-consistency", - "stable-diffusion-3", "flux", ] + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForImage2Image OVMODEL_CLASS = OVPipelineForImage2Image TASK = "image-to-image" @@ -754,7 +759,11 @@ def test_textual_inversion(self): class OVPipelineForInpaintingTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "stable-diffusion-3", "flux", "flux-fill"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "flux", "flux-fill"] + + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForInpainting OVMODEL_CLASS = OVPipelineForInpainting TASK = "inpainting" From dc2823d35bef2fe24d15022c624fe210a589ac8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 14:58:17 +0100 Subject: [PATCH 116/190] use xlm-roberta with max_position_embeddings 514 --- tests/openvino/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c6737bff1e..2cdbdcf8b7 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -211,7 +211,7 @@ "wav2vec2-conformer": "optimum-intel-internal-testing/tiny-random-wav2vec2-conformer", "whisper": "optimum-intel-internal-testing/tiny-random-whisper", "xlm": "optimum-intel-internal-testing/tiny-random-xlm", - "xlm-roberta": "optimum-intel-internal-testing/tiny-xlm-roberta", + "xlm-roberta": "optimum-intel-internal-testing/tiny-random-xlm-roberta", "xglm": "optimum-intel-internal-testing/tiny-random-XGLMForCausalLM", "xverse": "optimum-intel-internal-testing/tiny-random-xverse", "glm4": "optimum-intel-internal-testing/tiny-random-glm4", From 5967be3cf2a42122d05546c8b04f449970dcef3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 15:00:31 +0100 Subject: [PATCH 117/190] add missing import --- tests/openvino/test_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index e4f558efb7..bc58c91796 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -38,7 +38,7 @@ OVPipelineForText2Video, ) from optimum.intel.openvino.utils import TemporaryDirectory -from optimum.intel.utils.import_utils import is_diffusers_version +from optimum.intel.utils.import_utils import is_diffusers_version, is_transformers_version from optimum.utils.testing_utils import require_diffusers From 699b0b797679c3242a861521f53b8394e98ca8aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 15:51:28 +0100 Subject: [PATCH 118/190] granitemoe fix --- optimum/exporters/openvino/model_configs.py | 2 - optimum/exporters/openvino/model_patcher.py | 43 +++++++++------------ tests/openvino/test_decoder.py | 2 +- 3 files changed, 20 insertions(+), 27 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fb7acb865d..fe846efcf4 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3988,8 +3988,6 @@ class GraniteOpenVINOConfig(LlamaOpenVINOConfig): ) class GraniteMoEOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.45.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = GraniteMoEModelPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1be2bfe437..4bd9024bc8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4508,34 +4508,29 @@ class GraniteMoEModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version("<", "5"): - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward - block_sparse_moe.router.forward = types.MethodType( - _granite_moe_topk_gating_forward, block_sparse_moe.router - ) - block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward - block_sparse_moe.input_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear - ) - block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward - block_sparse_moe.output_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear - ) - - else: - self._model.set_experts_implementation("batched_mm") + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward + block_sparse_moe.router.forward = types.MethodType( + _granite_moe_topk_gating_forward, block_sparse_moe.router + ) + block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward + block_sparse_moe.input_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear + ) + block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward + block_sparse_moe.output_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear + ) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version("<", "5"): - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward - block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward - block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward + block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward + block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward class OVSeq2SeqModelPatcher(ModelPatcher): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 9a6acf1cb7..92b87ddfe3 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -81,6 +81,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "phi3", "gemma2", "granite", + "granitemoe", ) SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") @@ -165,7 +166,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "dbrx", # "phimoe", "marian", - "granitemoe", # "zamba2", ) GENERATION_LENGTH = 100 From 389f818868ecdbff53249bbb0067e769f304ebe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 15:59:54 +0100 Subject: [PATCH 119/190] filtered test --- tests/openvino/test_exporters_cli.py | 2 ++ tests/openvino/test_quantization.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index a684c90ca8..5f45f00031 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -834,6 +834,8 @@ def test_filtered_architectures(cls): expected = {"qwen3_vl"} else: expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"} + if is_transformers_version(">=", "5"): + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS} diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f66ae8834d..dd69f926f5 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1234,6 +1234,8 @@ def test_filtered_architectures(cls): expected.add("qwen3_vl") if is_transformers_version(">=", "4.54"): expected.update({"llava-qwen2", "phi3_v", "minicpmo"}) + if is_transformers_version(">=", "5"): + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE} From ffe2d27e445e05e4eef70e07aed8a27038db0d1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 16:46:52 +0100 Subject: [PATCH 120/190] add back granitemoe model support --- tests/openvino/test_decoder.py | 2 +- tests/openvino/test_genai.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 92b87ddfe3..fac01b5960 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -311,7 +311,7 @@ def test_find_untested_architectures(self): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "5"): - supported_architectures -= {"phimoe", "granitemoe", "bitnet", "dbrx", "zamba2", "marian"} + supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian"} supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 74f6bab1ec..388c3ce127 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -65,6 +65,7 @@ class LLMPipelineTestCase(unittest.TestCase): "phi3", "gemma2", "granite", + "granitemoe", ) if is_transformers_version(">=", "4.46.0"): @@ -108,7 +109,6 @@ class LLMPipelineTestCase(unittest.TestCase): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly "dbrx", # "phimoe", - "granitemoe", ) REMOTE_CODE_MODELS = ( From c649bdf8325b9d132051b57bdaf2b8effc7a0568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 17:59:49 +0100 Subject: [PATCH 121/190] udpate setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 16e2a82fed..3a1995891d 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 4c74aebf280bd1e68625f8b20620651cdcbb5210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:26:29 +0100 Subject: [PATCH 122/190] update workflows --- .github/workflows/test_offline.yaml | 2 +- .github/workflows/test_openvino_nightly.yml | 7 ++++++- .github/workflows/test_openvino_slow.yml | 7 ++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index c75ba43bef..5b6b019e83 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] diffusers - name: Test run: | diff --git a/.github/workflows/test_openvino_nightly.yml b/.github/workflows/test_openvino_nightly.yml index 90df6a2af3..ace0246329 100644 --- a/.github/workflows/test_openvino_nightly.yml +++ b/.github/workflows/test_openvino_nightly.yml @@ -97,7 +97,12 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] librosa diffusers + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers - if: ${{ matrix.openvino-version == 'openvino-nightly' }} name: Install OpenVINO Nightly diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 4b271d898b..3868e44141 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -59,7 +59,12 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip uv - uv pip install .[tests,diffusers] transformers[testing] + uv pip install .[tests] transformers[testing] diffusers + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers - if: ${{ matrix.transformers-version != 'latest' && matrix.transformers-version != 'main' }} name: Install specific dependencies and versions required for older transformers From c7184e114939e777886cb1f2acb4b3abcca6f148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:38:37 +0100 Subject: [PATCH 123/190] update setup --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3a1995891d..267d0b83f2 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.1", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers5", + "transformers>=4.45,<5.3", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 7b0806e3f8571558a055a457ac3958589edecc2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:39:54 +0100 Subject: [PATCH 124/190] fix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 267d0b83f2..e99736e5a4 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers5", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", "transformers>=4.45,<5.3", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 50e30b789ffc182fac3ba943cdcafdbf1a27c11b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:52:22 +0100 Subject: [PATCH 125/190] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e99736e5a4..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", - "transformers>=4.45,<5.3", + "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From e7878e1de4f04e61eafc97657db69b37c9e79f30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 19:00:41 +0100 Subject: [PATCH 126/190] remove diffusers --- .github/workflows/test_offline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 5b6b019e83..7c4458a306 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] diffusers + uv pip install .[tests] - name: Test run: | From 467dcad06b77db153fc8419fdfb6981c1005640a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 19:06:51 +0100 Subject: [PATCH 127/190] fix offline workflow --- .github/workflows/test_offline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 7c4458a306..d079c6c8b7 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -38,10 +38,10 @@ jobs: - name: Test run: | - HF_HOME=/tmp/ huggingface-cli download hf-internal-testing/tiny-random-gpt2 + HF_HOME=/tmp/ hf download hf-internal-testing/tiny-random-gpt2 HF_HOME=/tmp/ HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation - huggingface-cli download hf-internal-testing/tiny-random-gpt2 + hf download hf-internal-testing/tiny-random-gpt2 HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv From c14f2e53737afa21d7cb20fa9ea40c9e32139f80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 19:42:33 +0100 Subject: [PATCH 128/190] exclude openclip from offline tests --- .github/workflows/test_offline.yaml | 4 ++-- tests/openvino/test_modeling.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index d079c6c8b7..48f07b9396 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -44,5 +44,5 @@ jobs: hf download hf-internal-testing/tiny-random-gpt2 HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation - pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv - HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv + pytest tests/openvino/test_modeling.py -k "test_load_from_hub and not openclip" -s -vvvvv + HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub and not openclip" -s -vvvvv diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 8d8ab01147..db369a478c 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1601,7 +1601,7 @@ def _get_sample_image(self): image = Image.open(requests.get(url, stream=True).raw) return image - def test_load_from_hub_and_save_model(self): + def test_load_from_hub_and_save_model_openclip(self): loaded_model = OVModelOpenCLIPForZeroShotImageClassification.from_pretrained( self.OV_MODEL_ID_IR, device=OPENVINO_DEVICE ) From 69c16bfd00cb485ef7e72a013b720575bf84c28d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 09:29:22 +0100 Subject: [PATCH 129/190] workflow slow --- .github/workflows/test_openvino_slow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 3868e44141..580253a36a 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -59,7 +59,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip uv - uv pip install .[tests] transformers[testing] diffusers + uv pip install .[tests] librosa diffusers - if: ${{ matrix.transformers-version == 'latest' }} name: Install diffusers From 3f8dfb4be84364485b37aaa366b7472afde47286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:15:10 +0100 Subject: [PATCH 130/190] fix question answering pipeline --- tests/openvino/test_modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index db369a478c..0c5011a908 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -896,12 +896,12 @@ def test_pipeline(self, model_arch): pipe = pipeline("question-answering", model=model, tokenizer=tokenizer) question = "What's my name?" context = "My Name is Arthur and I live in Lyon." - outputs = pipe(question, context) + outputs = pipe(question=question, context=context) self.assertEqual(pipe.device, model.device) self.assertGreaterEqual(outputs["score"], 0.0) self.assertIsInstance(outputs["answer"], str) ov_pipe = optimum_pipeline("question-answering", model_id, accelerator="openvino") - ov_outputs = ov_pipe(question, context) + ov_outputs = ov_pipe(question=question, context=context) self.assertEqual(outputs["score"], ov_outputs["score"]) del model del ov_pipe From 975da724b9c9cd25e768d4d4f928d534113f85fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:24:43 +0100 Subject: [PATCH 131/190] encode_plus deprecated --- tests/openvino/test_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index dd69f926f5..ed1577d1cd 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -2038,7 +2038,7 @@ def preprocess_function(examples, tokenizer): # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir, device=OPENVINO_DEVICE) - tokens = tokenizer.encode_plus( + tokens = tokenizer( "This is a sample question", "This is a sample context", add_special_tokens=True, return_tensors="pt" ) model(**tokens, return_dict=True) From 31989eb0b262634373bfa2799ed634b96c0b3fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:54:31 +0100 Subject: [PATCH 132/190] automatic-speech pipeline for whisper incompatible with v5 --- tests/openvino/test_seq2seq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index da68d6e8b9..d8c10f39dd 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -410,6 +410,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif(is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames") def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From 7adb81012983043c4e12632693848c87a1f92746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:54:57 +0100 Subject: [PATCH 133/190] style --- tests/openvino/test_seq2seq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index d8c10f39dd..0fc3821c9b 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -410,7 +410,9 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow - @pytest.mark.skipif(is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames") + @pytest.mark.skipif( + is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames" + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From 6a93224b1ca747e32f8ce68cdc14be725a60bb9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 14:21:22 +0100 Subject: [PATCH 134/190] image-to-text pipeline deprecated --- tests/openvino/test_seq2seq.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 0fc3821c9b..bbc3d9260d 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -515,6 +515,10 @@ def test_compare_to_transformers(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5"), + reason="requires transformers < v5 since image-to-text pipelines is deprecated", + ) def test_pipeline(self, model_arch: str): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From c8e9488fad965f496b0bc4dac3aae36b554fb82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 14:30:49 +0100 Subject: [PATCH 135/190] update MAX_TRANSFORMERS_VERSION for gemma3 exaone4 and llama4 --- optimum/exporters/openvino/model_configs.py | 6 ++++++ tests/openvino/test_decoder.py | 11 ++++++++--- tests/openvino/test_export.py | 2 +- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/test_genai.py | 2 +- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fe846efcf4..53610803da 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -874,6 +874,8 @@ class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): ) class Exaone4OpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.54.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -1474,6 +1476,8 @@ class Gemma2OpenVINOConfig(GemmaOpenVINOConfig): ) class Gemma3TextOpenVINOConfig(Gemma2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): @@ -4561,6 +4565,8 @@ def with_behavior( ) class Llama4TextOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator _MODEL_PATCHER = Llama4TextModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index fac01b5960..2e6a938c81 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -115,11 +115,16 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - if is_transformers_version(">", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): - SUPPORTED_ARCHITECTURES += ("llama4", "qwen3", "qwen3_moe") + SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.51.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llama4",) if is_transformers_version(">=", "4.51.3"): SUPPORTED_ARCHITECTURES += ("glm4",) @@ -131,7 +136,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("exaone4",) if is_transformers_version("<", "4.54.0"): diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index eae3727de6..b73de1aaf9 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -100,7 +100,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES.update({"granitemoehybrid": OVModelForCausalLM}) - if is_transformers_version(">=", "4.54"): + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"exaone4": OVModelForCausalLM, "lfm2": OVModelForCausalLM}) if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 5f45f00031..326f42d9bd 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -138,7 +138,7 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) - if is_transformers_version(">=", "4.54"): + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "exaone4"), diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 388c3ce127..906216c567 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -84,7 +84,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("glm4",) if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("exaone4",) if is_transformers_version(">=", "4.55.0"): SUPPORTED_ARCHITECTURES += ("gpt_oss",) From 28e2e24ff38d99ee269631652fe913fa27063552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 15:28:36 +0100 Subject: [PATCH 136/190] remove from test when not supported --- tests/openvino/test_genai.py | 2 +- tests/openvino/test_quantization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 906216c567..0ddc6db210 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -76,7 +76,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen",) if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("phimoe",) - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index ed1577d1cd..9c60468dd3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1070,7 +1070,7 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version("<", "4.52.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmo", True)) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "exaone4", True)) if is_transformers_version(">=", "4.57.0"): From f061f2ce4d7643e5bcc43ca30dab48438821f628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 17:41:16 +0100 Subject: [PATCH 137/190] decoder tests --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2e6a938c81..01e4481c8d 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -316,7 +316,7 @@ def test_find_untested_architectures(self): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "5"): - supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian"} + supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian", "llama4", "gemma3_text", "exaone4"} supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures @@ -420,7 +420,7 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens = tokenizer(["Today is a nice day and", "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None From 8820fb3964e245c41351cb2cb866dfe8da228897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 17:43:05 +0100 Subject: [PATCH 138/190] test filtered architectures update with exaone4 --- tests/openvino/test_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 9c60468dd3..b5c01b90d9 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1235,7 +1235,7 @@ def test_filtered_architectures(cls): if is_transformers_version(">=", "4.54"): expected.update({"llava-qwen2", "phi3_v", "minicpmo"}) if is_transformers_version(">=", "5"): - expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat", "exaone4"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE} From 290b7b328cf64a9dfd9d2881996dbb1114d76369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 18:04:06 +0100 Subject: [PATCH 139/190] change gptoss model --- tests/openvino/test_exporters_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 326f42d9bd..7f396e3a85 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -1225,13 +1225,13 @@ def test_exporters_cli_full_quantization( {"model": 65}, ), ( - "gpt_oss_mxfp4", + "gpt_oss", "openai/gpt-oss-20b", AutoModelForCausalLM, OVModelForCausalLM, "--task text-generation-with-past --weight-format int4", _DEFAULT_4BIT_WQ_CONFIGS, - {"model": {"int8": 22, "int4": 4}}, + {"model": {"int8": 40, "int4": 0}}, {"model": 0}, ), ( From 64223a8d6d331816d507a353aeb248189cfc8bf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 23 Feb 2026 15:49:57 +0100 Subject: [PATCH 140/190] style --- tests/openvino/test_decoder.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 01e4481c8d..e111b0ec06 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -316,7 +316,16 @@ def test_find_untested_architectures(self): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "5"): - supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian", "llama4", "gemma3_text", "exaone4"} + supported_architectures -= { + "phimoe", + "bitnet", + "dbrx", + "zamba2", + "marian", + "llama4", + "gemma3_text", + "exaone4", + } supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures From f40bcb35016907a36f2b509cd2cfeb4dbe669c18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 23 Feb 2026 16:15:10 +0100 Subject: [PATCH 141/190] set num beam to 5 --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e111b0ec06..75a3a49f36 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -429,7 +429,7 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and", "This is me"], return_tensors="pt", padding=True) + tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -440,7 +440,7 @@ def test_compare_to_transformers(self, model_arch): # LFM2 fails with beam search, issue link: https://github.com/huggingface/transformers/issues/42257 # CVS-177964 GraniteMoeHybrid fails due to lack support of Beam search for hybrid models in OpenVINO # For this support, we expect changes in IRs to have connected beam_idx with Mamba/Linear attention states - num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 2, + num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 5, do_sample=False, ) From 86767c7a5297026d0d88797dfa8c925d92f6998b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:30:27 +0100 Subject: [PATCH 142/190] add llava support for v5 --- optimum/exporters/openvino/model_configs.py | 10 --------- optimum/exporters/openvino/model_patcher.py | 23 --------------------- tests/openvino/test_decoder.py | 5 +++-- tests/openvino/test_export.py | 7 ++----- tests/openvino/test_exporters_cli.py | 22 +++++++------------- tests/openvino/test_seq2seq.py | 6 +++--- 6 files changed, 16 insertions(+), 57 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 53610803da..09ab8d72a0 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -166,7 +166,6 @@ Llama4ImageEmbeddingsModelPatcher, Llama4TextModelPatcher, LlavaImageEmbeddingModelPatcher, - LlavaNextImageEmbeddingModelPatcher, LlavaNextVideoImageEmbeddingModelPatcher, LlavaQwen2ImageEmbeddingsModelPatcher, MairaImageEmbeddingModelPatcher, @@ -1902,8 +1901,6 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ @register_in_tasks_manager("llava", *["image-text-to-text"], library_name="transformers") class LlavaOpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.37.2" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -1942,13 +1939,6 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict: @register_in_tasks_manager("llava_next", *["image-text-to-text"], library_name="transformers") class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.40.0" - MAX_TRANSFORMERS_VERSION = "5.99" - - def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): - model_kwargs = model_kwargs or {} - if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: - return super().patch_model_for_export(model, model_kwargs) - return LlavaNextImageEmbeddingModelPatcher(self, model, model_kwargs) class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator): diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4bd9024bc8..9624401569 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3307,29 +3307,6 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - - if is_transformers_version("<", "5"): - model.forward = types.MethodType(llava_vision_embed_forward, model) - else: - model.forward = model.get_image_features - - super().__init__(config, model, model_kwargs) - - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - self._model.forward = self._model.__orig_forward - - -class LlavaNextImageEmbeddingModelPatcher(ModelPatcher): - def __init__( - self, - config: "OnnxConfig", - model: "PreTrainedModel", - model_kwargs: Dict[str, Any], - ): - model.__orig_forward = model.forward - # TODO: use get_image_features instead and add image_sizes as input when exporting - # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next/modeling_llava_next.py#L716 model.forward = types.MethodType(llava_vision_embed_forward, model) super().__init__(config, model, model_kwargs) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 75a3a49f36..a06ee5fa12 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -429,7 +429,8 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + inputs = "Today is a nice day and" if model_arch == "decilm" else "The quick brown fox jumps over the" + tokens = tokenizer([inputs, "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -440,7 +441,7 @@ def test_compare_to_transformers(self, model_arch): # LFM2 fails with beam search, issue link: https://github.com/huggingface/transformers/issues/42257 # CVS-177964 GraniteMoeHybrid fails due to lack support of Beam search for hybrid models in OpenVINO # For this support, we expect changes in IRs to have connected beam_idx with Mamba/Linear attention states - num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 5, + num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 2, do_sample=False, ) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index b73de1aaf9..ca16598103 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -84,6 +84,7 @@ class ExportModelTest(unittest.TestCase): "stable-diffusion-xl": OVStableDiffusionXLPipeline, "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline, "latent-consistency": OVLatentConsistencyModelPipeline, + "llava": OVModelForVisualCausalLM, "sam": OVSamModel, "speecht5": OVModelForTextToSpeechSeq2Seq, "clip": OVModelForZeroShotImageClassification, @@ -117,11 +118,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES.update({"qwen3": OVModelForFeatureExtraction}) - GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "speecht5") - - if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES.update({"llava": OVModelForVisualCausalLM}) - GENERATIVE_MODELS += ("llava",) + GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "llava", "speecht5") def _openvino_export( self, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 7f396e3a85..96c8cd64f6 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -208,6 +208,14 @@ class OVCLIExportTestCase(unittest.TestCase): "expected_chat_template": False, "simplified_chat_template": False, }, + "llava": { # transformers, chat template in processor, simplified chat template + "num_tokenizers": 2, + "task": "image-text-to-text", + "processor_chat_template": True, + "remote_code": False, + "expected_chat_template": True, + "simplified_chat_template": True, + }, "llava_next": { # transformers, chat template in processor overrides tokinizer chat template, simplified chat template "num_tokenizers": 2, "task": "image-text-to-text", @@ -254,20 +262,6 @@ class OVCLIExportTestCase(unittest.TestCase): } ) - if is_transformers_version("<", "5"): - TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS.update( - { - "llava": { # transformers, chat template in processor, simplified chat template - "num_tokenizers": 2, - "task": "image-text-to-text", - "processor_chat_template": True, - "remote_code": False, - "expected_chat_template": True, - "simplified_chat_template": True, - }, - } - ) - SUPPORTED_SD_HYBRID_ARCHITECTURES = [ ("flux", 7, 56), ("latent-consistency", 50, 135), diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index bbc3d9260d..4e2df41407 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -547,6 +547,7 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ + "llava", "llava_next", "llava_next_mistral", "qwen2_vl", @@ -601,7 +602,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") + SUPPORTED_ARCHITECTURES += ("llava_next_video",) else: UNSUPPORTED_ARCHITECTURES = { "got_ocr2", @@ -611,7 +612,6 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): "phi4_multimodal", "gemma3", "smolvlm", - "llava", } REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( @@ -872,7 +872,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand( ["llava", "llava_next", "llava_next_video", "llava_next_mistral"] if is_transformers_version("<", "5") - else ["llava_next", "llava_next_mistral"] + else ["llava", "llava_next", "llava_next_mistral"] ) def test_llava_with_new_preprocessing(self, model_arch): prompt = "\n What is shown in this image?" From c523617123a1f45d6664335490446b05a82f576d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:33:15 +0100 Subject: [PATCH 143/190] maira --- optimum/exporters/openvino/model_configs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 09ab8d72a0..a52cb0ca87 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2060,7 +2060,6 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ ) class MairaOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" - MAX_TRANSFORMERS_VERSION = "5.99" SUPPORTS_PAST = True def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): From d49a895bda3bf9b94752ed04bee1821735e2d8f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:50:37 +0100 Subject: [PATCH 144/190] extend tests disabled for marian for openvino v2026 --- tests/openvino/test_genai.py | 4 ++-- tests/openvino/test_seq2seq.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 0ddc6db210..5d6b3c4b72 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -466,8 +466,8 @@ class LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54"): - self.skipTest("Eagle3 requires transformers >= 4.54") + if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): + self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 4e2df41407..e34a256060 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,7 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2025.5.0")): + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version("<", "5"): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) From 6f608fd27e7c25c8a5cb438804c272eb26b61fed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:51:02 +0100 Subject: [PATCH 145/190] style --- tests/openvino/test_seq2seq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index e34a256060..c15c0ca269 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,9 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version("<", "5"): + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version( + "<", "5" + ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) From 02a2ccd8688c520ca039d126b1dbe413c51dc82d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:53:12 +0100 Subject: [PATCH 146/190] style --- tests/openvino/test_genai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 5d6b3c4b72..f3c1bed1e9 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -466,7 +466,7 @@ class LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): + if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") From 710c5bc8679c23e71061385babc8f28994f2c67c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 18:12:08 +0100 Subject: [PATCH 147/190] include openvino 2026 --- tests/openvino/test_seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index c15c0ca269..af047f0313 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,7 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version( + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( "<", "5" ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x From 64dc198c8c80d997e80ddb6f5a57d589aba733ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 18:44:57 +0100 Subject: [PATCH 148/190] add gemma3 text --- optimum/exporters/openvino/model_configs.py | 2 -- tests/openvino/test_decoder.py | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a52cb0ca87..8af57604fa 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1475,8 +1475,6 @@ class Gemma2OpenVINOConfig(GemmaOpenVINOConfig): ) class Gemma3TextOpenVINOConfig(Gemma2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index a06ee5fa12..de3d3df121 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -115,8 +115,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): + if is_transformers_version(">", "4.49"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): @@ -323,7 +322,6 @@ def test_find_untested_architectures(self): "zamba2", "marian", "llama4", - "gemma3_text", "exaone4", } From d3bdb292d52a46fa9f29c721b36a69e56b9ebc02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 18:53:01 +0100 Subject: [PATCH 149/190] llava tests --- tests/openvino/test_quantization.py | 49 +++++++++++++---------------- tests/openvino/test_seq2seq.py | 8 +++-- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b5c01b90d9..753b1e387a 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1061,6 +1061,7 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionPipeline, "stable-diffusion", False), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", False), (OVModelOpenCLIPForZeroShotImageClassification, "open-clip", False), + (OVModelForVisualCausalLM, "llava", False), (OVModelForVisualCausalLM, "qwen2_vl", False), ] @@ -1079,7 +1080,6 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.extend( [ - (OVModelForVisualCausalLM, "llava", False), (OVModelForVisualCausalLM, "llava_next_video", False), (OVModelForVisualCausalLM, "minicpmv", True), ] @@ -1122,6 +1122,17 @@ class OVWeightCompressionTest(unittest.TestCase): "text_encoder": {}, }, ), + ( + OVModelForVisualCausalLM, + "llava", + 4, + {"bits": 4, "group_size": 8, "ratio": 0.5}, + { + "lm_model": {"int8": 22, "int4": 8}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 9}, + }, + ), ( OVSamModel, "sam", @@ -1175,6 +1186,15 @@ class OVWeightCompressionTest(unittest.TestCase): }, }, ), + ( + OVModelForVisualCausalLM, + "llava", + { + "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "text_embeddings_model": {"patterns": ["."]}, + }, + ), ( OVSamModel, "sam", @@ -1195,33 +1215,6 @@ class OVWeightCompressionTest(unittest.TestCase): ), ] - if is_transformers_version("<", "5"): - DEFAULT_COMPRESSION_CONFIGURATIONS.append( - ( - OVModelForVisualCausalLM, - "llava", - 4, - {"bits": 4, "group_size": 8, "ratio": 0.5}, - { - "lm_model": {"int8": 22, "int4": 8}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"int8": 9}, - }, - ), - ) - - DEFAULT_IGNORED_SCOPE_CONFIGURATIONS.append( - ( - OVModelForVisualCausalLM, - "llava", - { - "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "text_embeddings_model": {"patterns": ["."]}, - }, - ), - ) - def test_filtered_architectures(cls): expected = set() if is_transformers_version("<", "4.49"): diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index af047f0313..9e2246582f 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -153,18 +153,20 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): TASK = "text2text-generation" GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( + UNSUPPORTED_ARCHITECTURES = set() + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) or is_transformers_version( "<", "5" ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) + else: + UNSUPPORTED_ARCHITECTURES.add("marian") # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("mt5",) else: - UNSUPPORTED_ARCHITECTURES = {"marian", "mt5"} + UNSUPPORTED_ARCHITECTURES.add("mt5") SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): From ea761a75bd0657e0514d3a025f57676d956056ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Feb 2026 14:32:57 +0100 Subject: [PATCH 150/190] exclude marian for transformers v5 or higher --- tests/openvino/test_seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 9e2246582f..26b5b7d391 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,7 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 UNSUPPORTED_ARCHITECTURES = set() - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) or is_transformers_version( + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( "<", "5" ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x From a33065ee1c4745f1461764d6321fc6abb1bbe5d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Feb 2026 18:26:50 +0100 Subject: [PATCH 151/190] fix gemma3 --- optimum/exporters/openvino/model_configs.py | 2 -- optimum/exporters/openvino/model_patcher.py | 36 +++++++++++++-------- tests/openvino/test_decoder.py | 2 +- tests/openvino/test_genai.py | 5 ++- tests/openvino/test_seq2seq.py | 13 ++++---- 5 files changed, 32 insertions(+), 26 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ec330c59e8..d5ce89bd46 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4136,8 +4136,6 @@ def __init__( @register_in_tasks_manager("gemma3", *["image-text-to-text"], library_name="transformers") class Gemma3OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4ce8d17ded..860ce212e9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4657,22 +4657,30 @@ def __init__( model: "PreTrainedModel", model_kwargs: Dict[str, Any], ): - model.__orig_forward = model.forward - # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 - # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 - if ( - hasattr(model, "model") - and hasattr(model.model, "get_image_features") - and is_transformers_version("<", "5") - ): - model.forward = model.model.get_image_features - else: - model.forward = model.get_image_features super().__init__(config, model, model_kwargs) - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - self._model.forward = self._model.__orig_forward + @functools.wraps(self.orig_forward) + def patched_forward(*args, **kwargs): + # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 + # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 + if ( + hasattr(self._model, "model") + and hasattr(self._model.model, "get_image_features") + and is_transformers_version("<", "5") + ): + get_image_features = self._model.model.get_image_features + else: + get_image_features = self._model.get_image_features + + outputs = get_image_features(*args, **kwargs) + + if is_transformers_version(">=", "5") and hasattr(outputs, "pooler_output"): + outputs = outputs.pooler_output + + output_names = list(config.outputs.keys()) + return {output_names[0]: outputs} + + self.patched_forward = patched_forward # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1147 diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index d3cfba3ba3..8f0a8f12c2 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -115,7 +115,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - if is_transformers_version(">", "4.49"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index f3c1bed1e9..5375cf9b67 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -76,7 +76,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen",) if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("phimoe",) - if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") @@ -224,8 +224,7 @@ class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen2_5_vl",) if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("phi4mm",) - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3",) if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 2b005849fa..e7c59476b6 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -581,9 +581,11 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["phi4mm"] SUPPORT_AUDIO.append("phi4mm") - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] + if is_transformers_version(">=", "4.50"): + SUPPORTED_ARCHITECTURES += ["gemma3"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["smolvlm"] # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): @@ -614,7 +616,6 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): "llama4", "llava_next_video", "phi4_multimodal", - "gemma3", "smolvlm", } REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] @@ -783,9 +784,9 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) additional_inputs = {} - # gemma3 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, + # gemma3 does not support dynamic cache until v4.53, we cannot compare dynamic cache result vs hybrid cache, # align cache representation in torch model - if model_arch == "gemma3": + if model_arch == "gemma3" and is_transformers_version("<", "4.53.0"): patch_update_causal_mask( transformers_model if is_transformers_version("<", "4.52.0") else transformers_model.language_model, "4.43.0", From bf51329a5519e3c964ed5119043a3619594666a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Mar 2026 18:36:48 +0100 Subject: [PATCH 152/190] add comment --- optimum/exporters/openvino/model_patcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 92fe0b4063..634f015872 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4674,6 +4674,7 @@ def patched_forward(*args, **kwargs): outputs = get_image_features(*args, **kwargs) + # we should be able to specify pooler_output as output_name, not supported here as pooler_output key does not exist if is_transformers_version(">=", "5") and hasattr(outputs, "pooler_output"): outputs = outputs.pooler_output From 4a2786218d053164cfc01412505dca5c1174820a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Mar 2026 15:35:24 +0100 Subject: [PATCH 153/190] replace gpt_oss_mxfp4 test to gpt_oss for v5 --- tests/openvino/test_exporters_cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index e8766c737d..9690496089 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -1233,13 +1233,14 @@ def test_exporters_cli_full_quantization( {"model": 65}, ), ( - "gpt_oss", + # mxfp4 fixing saving broken since v5, fixed in https://github.com/huggingface/transformers/pull/43148, test can be added back for v5.3 + "gpt_oss_mxfp4" if is_transformers_version("<", "5") else "gpt_oss", "openai/gpt-oss-20b", AutoModelForCausalLM, OVModelForCausalLM, "--task text-generation-with-past --weight-format int4", _DEFAULT_4BIT_WQ_CONFIGS, - {"model": {"int8": 40, "int4": 0}}, + {"model": {"int8": 22, "int4": 4} if is_transformers_version("<", "5") else {"int8": 40, "int4": 0}}, {"model": 0}, ), ( From 4a8644d937989fa88031b2c46992a5e48f4b8ac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Mar 2026 15:51:24 +0100 Subject: [PATCH 154/190] include Qwen3VLOpenVINOConfig min version --- tests/openvino/test_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index dc4073c063..e5267b5224 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -318,7 +318,7 @@ def test_find_untested_architectures(self): supported_architectures -= {"lfm2"} # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group - if is_transformers_version(">", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"qwen3_vl_text"} # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly From 988147517ceb2216275ee0c720221281fd18151a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Mar 2026 16:40:14 +0100 Subject: [PATCH 155/190] add phi4_multimodal for transformers < v5 --- tests/openvino/test_seq2seq.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index e7c59476b6..9ceab2d227 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -558,6 +558,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] SUPPORT_AUDIO = [] + UNSUPPORTED_ARCHITECTURES = {"phi4_multimodal"} OVMODEL_CLASS = OVModelForVisualCausalLM TASK = "image-text-to-text" @@ -610,14 +611,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava_next_video",) else: - UNSUPPORTED_ARCHITECTURES = { - "got_ocr2", - "idefics3", - "llama4", - "llava_next_video", - "phi4_multimodal", - "smolvlm", - } + UNSUPPORTED_ARCHITECTURES.update({"got_ocr2", "idefics3", "llama4", "llava_next_video", "smolvlm"}) REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( From 2d764ef9cf916ea168282b8acc000d018da55177 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Mar 2026 11:29:44 +0100 Subject: [PATCH 156/190] set dtype for beam_search tests for gemma3 text model --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e5267b5224..0ddb251b22 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -783,11 +783,11 @@ def test_beam_search(self, model_arch): set_seed(SEED) with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) - if model_arch == "arctic" or "mxfp4" in model_arch: + if model_arch in ["arctic", "gemma3_text"] or "mxfp4" in model_arch: transformers_model.to(torch.float32) additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, align cache representation in torch model - if model_arch in ["gemma2", "gemma3_text"]: + if model_arch in ["gemma2", "gemma3_text"] and is_transformers_version("<", "4.53.0"): patch_update_causal_mask(transformers_model, "4.43.0") transformers_model._supports_cache_class = True transformers_model.generation_config.cache_implementation = None From 8574954e7009520ea4f618c52b49f2bf3692a425 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 18:03:59 +0000 Subject: [PATCH 157/190] Add _ov_ops.py with RecurrentAttentionCellOp conversion rule Add conversion rule for the RecurrentAttentionCellOp operation used for GatedDeltaNet patching in OpenVINO PyTorch frontend. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- optimum/exporters/openvino/_ov_ops.py | 113 ++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 optimum/exporters/openvino/_ov_ops.py diff --git a/optimum/exporters/openvino/_ov_ops.py b/optimum/exporters/openvino/_ov_ops.py new file mode 100644 index 0000000000..78e5b6d23b --- /dev/null +++ b/optimum/exporters/openvino/_ov_ops.py @@ -0,0 +1,113 @@ +# Copyright 2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Conversion rule for the `RecurrentAttentionCellOp` operation in a Torch graph. +# The `RecurrentAttentionCellOp` appears in the Torch graph as a result of replacing +# the `torch.nn.Module` block `RecurrentAttentionCell` via a registered +# `ModuleExtension` for `RecurrentAttentionCell` in the OpenVINO PyTorch frontend. +import numpy as np + +import openvino as ov +import openvino.opset14 as ops + + +def convert_recurrent_attention_cell(context): + query = context.get_input(0) + key = context.get_input(1) + value = context.get_input(2) + g = context.get_input(3) + beta = context.get_input(4) + last_recurrent_state_old = context.get_input(5) + + value_shape = ops.shape_of(value) + const_zero = ops.constant(0, dtype=np.float32) + core_attn_out = ops.broadcast(const_zero, value_shape) + const_two_out = ops.constant(2, dtype=np.int32) + const_zero_out = ops.constant(0, dtype=np.int32) + seq_len = ops.gather(value_shape, const_two_out, const_zero_out) + + timestep_param = ops.parameter([], np.int32, "timestep") + q_t_param = ops.parameter([-1, -1, 1, -1], np.float32, "q_t") + k_t_param = ops.parameter([-1, -1, 1, -1], np.float32, "k_t") + v_t_param = ops.parameter([-1, -1, 1, -1], np.float32, "v_t") + g_t_param = ops.parameter([-1, -1, 1], np.float32, "g_t") + beta_t_param = ops.parameter([-1, -1, 1], np.float32, "beta_t") + last_recurrent_state_t = ops.parameter([-1, -1, -1, -1], np.float32, "last_recurrent_state_t") + core_attn_out_t = ops.parameter([-1, -1, -1, -1], np.float32, "core_attn_out_t") + + const_two = ops.constant(2, dtype=np.int32) + q_t = ops.squeeze(q_t_param, const_two) + k_t = ops.squeeze(k_t_param, const_two) + v_t = ops.squeeze(v_t_param, const_two) + const_minus_one = ops.constant(-1, dtype=np.int32) + g_t = ops.unsqueeze(ops.exp(g_t_param), const_minus_one) + beta_t = beta_t_param + + last_recurrent_state_in = ops.multiply(last_recurrent_state_t, g_t) + const_minus_two = ops.constant(-2, dtype=np.int32) + kv_mem = ops.multiply(last_recurrent_state_in, ops.unsqueeze(k_t, const_minus_one)) + kv_mem = ops.reduce_sum(kv_mem, const_minus_two, False) + delta = ops.multiply(ops.subtract(v_t, kv_mem), beta_t) + last_recurrent_state_delta = ops.multiply( + ops.unsqueeze(k_t, const_minus_one), ops.unsqueeze(delta, const_minus_two) + ) + last_recurrent_state_in = ops.add(last_recurrent_state_in, last_recurrent_state_delta) + core_attn_update = ops.multiply(last_recurrent_state_in, ops.unsqueeze(q_t, const_minus_one)) + core_attn_update = ops.reduce_sum(core_attn_update, const_minus_two, True) + const_zero = ops.constant(0, dtype=np.int32) + timestep = ops.unsqueeze(timestep_param, const_zero) + + core_attn_out_res = ops.scatter_update(core_attn_out_t, timestep, core_attn_update, const_two) + last_recurrent_state_res = last_recurrent_state_in + + body_cond = ops.constant([True], dtype=bool) + + body_model = ov.Model( + [body_cond, last_recurrent_state_res, core_attn_out_res], + [ + timestep_param, + q_t_param, + k_t_param, + v_t_param, + g_t_param, + beta_t_param, + last_recurrent_state_t, + core_attn_out_t, + ], + "body_model", + ) + + seq_len = ops.convert(seq_len, "i32") + loop = ops.loop(seq_len, ops.constant(True, dtype="bool")) + loop.set_function(body_model) + + loop.set_sliced_input(q_t_param, query, 0, 1, 1, -1, 2) + loop.set_sliced_input(k_t_param, key, 0, 1, 1, -1, 2) + loop.set_sliced_input(v_t_param, value, 0, 1, 1, -1, 2) + loop.set_sliced_input(g_t_param, g, 0, 1, 1, -1, 2) + loop.set_sliced_input(beta_t_param, beta, 0, 1, 1, -1, 2) + loop.set_merged_input(last_recurrent_state_t, last_recurrent_state_old, last_recurrent_state_res.output(0)) + loop.set_merged_input(core_attn_out_t, core_attn_out.output(0), core_attn_out_res.output(0)) + loop.set_special_body_ports([0, 0]) + + core_attn_out_new = loop.get_iter_value(core_attn_out_res.output(0), -1) + last_recurrent_state_new = loop.get_iter_value(last_recurrent_state_res.output(0), -1) + + flatten_shape = ops.constant([-1], dtype=np.int32) + core_attn_out_new = ops.reshape(core_attn_out_new, flatten_shape, False) + last_recurrent_state_new = ops.reshape(last_recurrent_state_new, flatten_shape, False) + + final_output = ops.concat([core_attn_out_new, last_recurrent_state_new], 0) + + return [final_output.output(0)] From 050d14f61d5eec382c9dfcce7b33053e2096930f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 18:16:05 +0000 Subject: [PATCH 158/190] Add initial Qwen3.5 model support with VLM and hybrid text model Co-authored-by: rkazants <35459624+rkazants@users.noreply.github.com> --- docs/source/openvino/models.mdx | 1 + optimum/exporters/openvino/model_configs.py | 259 +++++++++++ optimum/exporters/openvino/model_patcher.py | 414 ++++++++++++++++++ optimum/exporters/openvino/utils.py | 3 +- optimum/intel/openvino/modeling_decoder.py | 4 +- .../openvino/modeling_visual_language.py | 365 ++++++++++++++- tests/openvino/test_decoder.py | 5 + tests/openvino/utils_tests.py | 8 + 8 files changed, 1055 insertions(+), 4 deletions(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 51200060e8..4ab826378b 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -131,6 +131,7 @@ Here is the list of the supported architectures : - Qwen2VL - Qwen2.5VL - Qwen3VL +- Qwen3.5 - ResNet - Roberta - Roformer diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 5c2023f2c9..527c6321ab 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -194,6 +194,8 @@ Qwen2MoEPatcher, Qwen2VLLanguageModelPatcher, Qwen2VLVisionEmbMergerPatcher, + Qwen3_5ModelPatcher, + Qwen3_5VisionEmbMergerPatcher, Qwen3MoeModelPatcher, Qwen3VLLanguageModelPatcher, Qwen3VLVisionEmbMergerPatcher, @@ -252,6 +254,10 @@ def init_model_configs(): "transformers", "AutoModelForCausalLM", ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen3_5", "image-text-to-text")] = ( + "transformers", + "AutoModelForImageTextToText", + ) # since transformers v4.46, model can be loaded using default AutoModelForImageTextToText # https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/models/auto/modeling_auto.py#L776 @@ -5360,3 +5366,256 @@ class HunyuanV1DenseOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.57.0" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator + + +class Qwen3_5DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + """ + Generates dummy cache_params inputs for Qwen3.5 architectures. + """ + + SUPPORTED_INPUT_NAMES = ("cache_params",) + + def __init__( + self, + task: str, + normalized_config, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + **kwargs, + ) + + config = normalized_config.config + self.num_full_attn_layers = config.layer_types.count("full_attention") + self.num_linear_attn_layers = config.layer_types.count("linear_attention") + self.conv_kernel_size = config.linear_conv_kernel_dim + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.head_k_dim = config.linear_key_head_dim + self.head_v_dim = config.linear_value_head_dim + self.num_v_heads = config.linear_num_value_heads + self.num_k_heads = config.linear_num_key_heads + self.num_key_value_heads = config.num_key_value_heads + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + cache_params = [] + + for idx in range(self.num_linear_attn_layers): + d_inner = self.num_k_heads * (2 * self.head_k_dim + self.head_v_dim * self.num_v_heads // self.num_k_heads) + conv_state_shape = ( + self.batch_size, + d_inner, + self.conv_kernel_size, + ) + conv_state = self.random_float_tensor(conv_state_shape, framework=framework, dtype=float_dtype) + cache_params.append(conv_state) + num_heads = self.num_v_heads + recurrent_state_shape = (self.batch_size, num_heads, self.head_k_dim, self.head_v_dim) + recurrent_state = self.random_float_tensor(recurrent_state_shape, framework=framework, dtype=float_dtype) + cache_params.append(recurrent_state) + + for idx in range(self.num_full_attn_layers): + kv_shape = (self.batch_size, self.num_key_value_heads, self.sequence_length, self.head_dim) + k = self.random_float_tensor(kv_shape, framework=framework, dtype=float_dtype) + v = self.random_float_tensor(kv_shape, framework=framework, dtype=float_dtype) + cache_params.append(k) + cache_params.append(v) + + return cache_params + + +@register_in_tasks_manager( + "qwen3_5_text", + *["text-generation", "text-generation-with-past"], + library_name="transformers", +) +class Qwen3_5TextOpenVINOConfig(Qwen3OpenVINOConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Qwen3_5DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = Qwen3_5DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + MIN_TRANSFORMERS_VERSION = "4.57.0" + _MODEL_PATCHER = Qwen3_5ModelPatcher + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + cache_name_prefix = "cache_params.past" + else: + decoder_sequence_name = "past_sequence_length + sequence_length" + cache_name_prefix = "cache_params.present" + + self.num_full_attn_layers = self._normalized_config.layer_types.count("full_attention") + self.num_linear_attn_layers = self._normalized_config.layer_types.count("linear_attention") + + for i in range(self.num_linear_attn_layers): + inputs_or_outputs[f"{cache_name_prefix}.conv.{i}"] = {0: "batch_size"} + inputs_or_outputs[f"{cache_name_prefix}.ssm.{i}"] = {0: "batch_size"} + + for i in range(self.num_full_attn_layers): + inputs_or_outputs[f"{cache_name_prefix}.key.{i}"] = {0: "batch_size", 2: decoder_sequence_name} + inputs_or_outputs[f"{cache_name_prefix}.value.{i}"] = {0: "batch_size", 2: decoder_sequence_name} + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"}, + } + if self.use_past_in_inputs: + self.add_past_key_values(common_inputs, direction="inputs") + return common_inputs + + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): + dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) + + dummy_inputs = {} + input_names = [key for key in self.inputs.keys() if not key.startswith("cache_params")] + if self.use_past_in_inputs: + input_names.extend(["cache_params"]) + + for input_name in input_names: + input_was_inserted = False + for dummy_input_gen in dummy_inputs_generators: + if dummy_input_gen.supports_input(input_name): + dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( + dummy_input_gen, + input_name, + framework, + input_shapes=kwargs, + ) + input_was_inserted = True + break + if not input_was_inserted: + raise RuntimeError( + f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' + ) + + return dummy_inputs + + +@register_in_tasks_manager( + "qwen3_5", + *["image-text-to-text"], + library_name="transformers", +) +class Qwen3_5OpenVINOConfig(Qwen2VLOpenVINOConfig): + SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior] + DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,) + MIN_TRANSFORMERS_VERSION = "4.57.0" + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: QwenVLConfigBehavior = QwenVLConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + behavior=behavior, + ) + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self._normalized_config.use_embed_dim = True + + @staticmethod + def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): + if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + vision_emb_pos = model.visual.pos_embed + vision_emb_pos.config = model.config.vision_config + return vision_emb_pos + + return Qwen2VLOpenVINOConfig.get_model_for_behavior(model, behavior) + + def with_behavior( + self, + behavior: Union[str, QwenVLConfigBehavior], + ): + """ + Creates a config for different behaviour. + Args: + behavior ([`ConfigBehavior`]): + The behavior to use for the new instance. + """ + if isinstance(behavior, str) and not isinstance(behavior, QwenVLConfigBehavior): + behavior = QwenVLConfigBehavior(behavior) + + if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config( + "qwen3_5_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ) + + if behavior == QwenVLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen3_5_text", + self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + model_patcher=Qwen3_5ModelPatcher, + dummy_input_generator=DummyQwen2VLLMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + ) + + if behavior in ( + QwenVLConfigBehavior.VISION_EMBEDDINGS, + QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER, + QwenVLConfigBehavior.VISION_EMBEDDINGS_POS, + ): + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + def patch_model_for_export(self, model: Union["PreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None): + model_kwargs = model_kwargs or {} + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return Qwen3_5VisionEmbMergerPatcher(self, model, model_kwargs) + if ( + self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS + or self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS + ): + return ModelPatcher(self, model, model_kwargs=model_kwargs) + return super().patch_model_for_export(model, model_kwargs) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + return { + "input": {1: "sequence_length"}, + } + return super().inputs + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: + return super().outputs + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return {"last_hidden_state": {0: "seq_len"}} + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + return {"last_hidden_state": {0: "seq_len", 1: "seq_len"}} + if self._behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + return {"inputs_embeds": {0: "batch_size", 1: "sequence_length"}} + if self._behavior == QwenVLConfigBehavior.LANGUAGE: + return get_vlm_internal_text_generation_config( + "qwen3_5_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ).outputs + raise Exception("Unknown Qwen3.5 behavior type.") diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 634f015872..28110c1040 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -56,6 +56,8 @@ ) from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version +from ._ov_ops import convert_recurrent_attention_cell + if is_transformers_version(">=", "4.53"): from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask @@ -7997,3 +7999,415 @@ def forward( hidden_states=outputs.hidden_states, d2t=d2t_out, ) + + +# Patched implementation of the gated delta rule in recurrent form. +# Adapted from: +# https://github.com/huggingface/transformers/blob/v4.57-release/src/transformers/models/qwen3_next/modeling_qwen3_next.py#L522 +# +# To represent the for-loop that generates output embeddings, we use a module +# and the conversion extension mechanism. This is necessary because there is +# no known vectorized form of this loop that would allow it to be correctly +# traced with torch.jit.trace +def patched_recurrent_gated_delta_rule( + self, query, key, value, g, beta, initial_state, output_final_state, use_qk_l2norm_in_kernel=False +): + def l2norm(x: torch.FloatTensor, dim: int = -1, eps: float = 1e-6): + """This function is intended to align with the l2norm implementation in the FLA library.""" + inv_norm = torch.rsqrt((x * x).sum(dim=dim, keepdim=True) + eps) + return x * inv_norm + + initial_dtype = query.dtype + if use_qk_l2norm_in_kernel: + query = l2norm(query, dim=-1, eps=1e-6) + key = l2norm(key, dim=-1, eps=1e-6) + query, key, value, beta, g = [ + x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g) + ] + + batch_size, num_heads, sequence_length, k_head_dim = key.shape + v_head_dim = value.shape[-1] + scale = 1 / (query.shape[-1] ** 0.5) + query = query * scale + + last_recurrent_state = ( + torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value) + if initial_state is None + else initial_state.to(value) + ) + + output_cell = self.recurrent_attention_cell( + query, # (B, H, T, D1) + key, # (B, H, T, D1) + value, # (B, H, T, D2) + g, # (B, H, T) + beta, # (B, H, T) + last_recurrent_state, # (B, H, D1, D2) + ) + + num_elems = value.numel() + core_attn_out = output_cell[:num_elems].reshape(value.shape) + last_recurrent_state = output_cell[num_elems:].reshape(last_recurrent_state.shape) + + if not output_final_state: + last_recurrent_state = None + core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype) + return core_attn_out, last_recurrent_state + + +# The CausalConv1D block is overridden with a generic patch provided by `ov_causal_conv1d()`. +# The GatedDeltaNet block is overridden with a recurrent version of its implementation. +# +# To replace GatedDeltaNet with its recurrent form, patching uses the ModuleExtension +# approach, which replaces the GatedDeltaNet block with a single operation, +# `GatedDeltaNetOp`. OpenVINO then applies the `convert_recurrent_attention_cell()` +# conversion rule to this operation. +def qwen3_5_gated_delta_net_forward( + self, + hidden_states: torch.Tensor, + cache_params=None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, +): + def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + # NOTE: attention mask is a 2D boolean tensor + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + + # getting projected states from cache if it exists + layer_idx = None + recurrent_state = None + if cache_params is not None: + layer_idx = cache_params.linear_attn_mapping[self.layer_idx] + conv_state = cache_params.conv_states[layer_idx] + recurrent_state = cache_params.recurrent_states[layer_idx] + + mixed_qkv = self.in_proj_qkv(hidden_states) + mixed_qkv = mixed_qkv.transpose(1, 2) + + z = self.in_proj_z(hidden_states) + z = z.reshape(batch_size, seq_len, -1, self.head_v_dim) + + b = self.in_proj_b(hidden_states) + a = self.in_proj_a(hidden_states) + + if cache_params is not None: + new_mixed_qkv, new_conv_state = ov_causal_conv1d(conv_state, mixed_qkv, self.conv1d.weight, self.conv1d.bias) + mixed_qkv = F.silu(new_mixed_qkv) + cache_params.conv_states[layer_idx] = new_conv_state + else: + mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len]) + + mixed_qkv = mixed_qkv.transpose(1, 2) + query, key, value = torch.split( + mixed_qkv, + [ + self.key_dim, + self.key_dim, + self.value_dim, + ], + dim=-1, + ) + query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim) + key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim) + value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim) + + beta = b.sigmoid() + # If the model is loaded in fp16, without the .float() here, A might be -inf + g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias) + if self.num_v_heads // self.num_k_heads > 1: + query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + + core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule( + self, + query, + key, + value, + g=g, + beta=beta, + initial_state=recurrent_state, + output_final_state=cache_params is not None, + use_qk_l2norm_in_kernel=True, + ) + + # Update cache + if cache_params is not None: + cache_params.recurrent_states[layer_idx] = last_recurrent_state + + # reshape input data into 2D tensor + core_attn_out = core_attn_out.reshape(-1, self.head_v_dim) + z = z.reshape(-1, self.head_v_dim) + core_attn_out = self.norm(core_attn_out, z) + core_attn_out = core_attn_out.reshape(batch_size, seq_len, -1) + + output = self.out_proj(core_attn_out) + return output + + +# This torch.nn.Module represents the GatedDeltaNet layer in its recurrent form. +# It is required for converting the GatedDeltaNet layer with OpenVINO using the ModuleExtension mechanism. +class RecurrentAttentionCell(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward( + self, + query, # (B, H, T, D1) + key, # (B, H, T, D1) + value, # (B, H, T, D2) + g, # (B, H, T) + beta, # (B, H, T) + last_recurrent_state, # (B, H, D1, D2) + ): + _, _, sequence_length, _ = key.shape + core_attn_out = torch.zeros_like(value) + + for i in range(sequence_length): + q_t = query[:, :, i] + k_t = key[:, :, i] + v_t = value[:, :, i] + g_t = g[:, :, i].exp().unsqueeze(-1).unsqueeze(-1) + beta_t = beta[:, :, i].unsqueeze(-1) + + last_recurrent_state = last_recurrent_state * g_t + kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) + delta = (v_t - kv_mem) * beta_t + last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2) + core_attn_out[:, :, i] = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) + + # This is a workaround to ensure a single output from the torch.nn.Module. + # The OpenVINO ModuleExtension mechanism has a limitation and expects + # the module to produce only one output. + output_cell = torch.cat([core_attn_out.flatten(), last_recurrent_state.flatten()], dim=0) + return output_cell + + +class Qwen3_5ModelPatcher(OVDecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Optional[Dict[str, Any]] = None, + ): + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5DynamicCache + + from openvino.frontend.pytorch import ConversionExtension, ModuleExtension + + super().__init__(config, model, model_kwargs) + + # Detect VLM vs text-only model + self._is_vlm = hasattr(self._model.model, "language_model") + if self._is_vlm: + self._text_model = self._model.model.language_model + self._text_config = self._model.config.text_config + else: + self._text_model = self._model.model + self._text_config = self._model.model.config + + class Qwen3_5DynamicCacheWrap(Qwen3_5DynamicCache): + def __init__(self, config, conv_states, recurrent_states, key_cache, value_cache): + # Call parent constructor with all required arguments + super().__init__(config=config) + + self.conv_states = conv_states + self.recurrent_states = recurrent_states + self.key_cache = key_cache + self.value_cache = value_cache + self.full_attn_mapping = {} + self.linear_attn_mapping = {} + full_attn_layer_idx = 0 + linear_attn_layer_idx = 0 + for i in range(len(config.layer_types)): + if self.layer_types[i] == "full_attention": + self.full_attn_mapping[i] = full_attn_layer_idx + full_attn_layer_idx += 1 + elif self.layer_types[i] == "linear_attention": + self.linear_attn_mapping[i] = linear_attn_layer_idx + linear_attn_layer_idx += 1 + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[dict[str, Any]] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # map layer_idx to key_cache (value_cache) idx + layer_idx = self.full_attn_mapping[layer_idx] + if self.key_cache[layer_idx] is None: + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # take any layer that contains cache and not empty tensor + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + layer_idx = self.full_attn_mapping[layer_idx] + if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx] is None: + return 0 + return self.key_cache[layer_idx].shape[-2] + + @property + def has_previous_state(self): + """We have a previous state if the last linear (conv) layer was already updated.""" + layer_idx = self.linear_attn_mapping[self.last_linear_layer] + return self.conv_states[layer_idx] is not None + + # the patch is needed to include KV-cache, Conv, and SSM states in the inputs and outputs. + def patched_forward( + input_ids=None, + attention_mask=None, + cache_params=None, + inputs_embeds=None, + position_ids=None, + ): + text_config = self._text_config + num_full_attn_layers = text_config.layer_types.count("full_attention") + num_linear_attn_layers = text_config.layer_types.count("linear_attention") + + use_cache = False + wrapped_cache_params = None + if cache_params is not None: + use_cache = True + conv_states = [] + recurrent_states = [] + key_cache = [] + value_cache = [] + + # decouple ssm_states, conv_states, keys and values from cache_params + for idx in range(num_linear_attn_layers): + conv_states.append(cache_params[2 * idx]) + recurrent_states.append(cache_params[2 * idx + 1]) + + for idx in range(num_full_attn_layers): + key_cache.append(cache_params[2 * num_linear_attn_layers + 2 * idx]) + value_cache.append(cache_params[2 * num_linear_attn_layers + 2 * idx + 1]) + + wrapped_cache_params = Qwen3_5DynamicCacheWrap( + text_config, conv_states, recurrent_states, key_cache, value_cache + ) + + if self._is_vlm: + # VLM case: call language model through the composite model + outputs_lm = self._text_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=wrapped_cache_params, + use_cache=use_cache, + ) + hidden_states = outputs_lm[0] + logits = self._model.lm_head(hidden_states) + past_kv = outputs_lm.past_key_values + else: + causal_lm_output = self.model_orig_forward( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=wrapped_cache_params, + use_cache=use_cache, + ) + logits = causal_lm_output.logits + past_kv = causal_lm_output.past_key_values + outputs = { + "logits": logits, + } + + if use_cache: + present_key_values = [] + for idx in range(num_linear_attn_layers): + present_key_values.append(past_kv.conv_states[idx]) + present_key_values.append(past_kv.recurrent_states[idx]) + + for idx in range(num_full_attn_layers): + present_key_values.append(past_kv.key_cache[idx]) + present_key_values.append(past_kv.value_cache[idx]) + + outputs["present_key_values"] = present_key_values + + return outputs + + self.patched_forward = patched_forward + self.model_orig_forward = self.orig_forward + self.orig_forward = patched_forward + + self.module_extensions = { + RecurrentAttentionCell: ModuleExtension(RecurrentAttentionCell, "RecurrentAttentionCellOp"), + } + self.conversion_extensions = [ + ConversionExtension("RecurrentAttentionCellOp", convert_recurrent_attention_cell), + ] + + def __enter__(self): + super().__enter__() + setattr(self._model, self.orig_forward_name, self.patched_forward) + + for idx, decoder_layer in enumerate(self._text_model.layers): + layer_type = self._text_config.layer_types[idx] + if layer_type == "linear_attention": + linear_attn_layer = decoder_layer.linear_attn + linear_attn_layer._orig_forward = linear_attn_layer.forward + linear_attn_layer.forward = types.MethodType(qwen3_5_gated_delta_net_forward, linear_attn_layer) + linear_attn_layer.recurrent_gated_delta_rule = patched_recurrent_gated_delta_rule + linear_attn_layer.recurrent_attention_cell = RecurrentAttentionCell() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + setattr(self._model, self.orig_forward_name, self.model_orig_forward) + for idx, decoder_layer in enumerate(self._text_model.layers): + layer_type = self._text_config.layer_types[idx] + if layer_type == "linear_attention": + linear_attn_layer = decoder_layer.linear_attn + linear_attn_layer.forward = linear_attn_layer._orig_forward + + +class Qwen3_5VisionEmbMergerPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any] = None, + ): + model.__orig_forward = model.forward + + # Adapted from Qwen3.5 VisionModel forward + # added attention_mask input instead of cu_seqlens for its internal calculation + # separated patch_embed and rot_pos_emb calls for performing as part of another model + def image_embed_forward( + self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor + ) -> torch.Tensor: + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + position_embeddings = (emb.cos(), emb.sin()) + for blk in self.blocks: + hidden_states = blk(hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings) + return self.merger(hidden_states) + + model.forward = types.MethodType(image_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __enter__(self): + patch_qwen2vl_vision_blocks(self._model) + super().__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + for block in self._model.blocks: + block.forward = block._orig_forward + block.attn.forward = block.attn._orig_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 3d9a854e39..08011e44b2 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -295,6 +295,7 @@ def get_submodels(model): "qwen2_vl", "qwen2_5_vl", "qwen3_vl", + "qwen3_5", "got_ocr2", "gemma3", "idefics3", @@ -305,7 +306,7 @@ def get_submodels(model): "minicpmo", ] -SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid"] +SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_5_text"] # All transformers, diffusers, timm and sentence transformers models that are supported via optimum-onnx OnnxConfigs but that have currently no test # TODO: add tests for all models that are compatible and remove support for all others diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 3b95b5f276..66e036bc37 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -1449,8 +1449,8 @@ def prepare_inputs_for_generation( # decoding stage so it takes the last token input_ids = input_ids[:, -1].unsqueeze(-1) - if self.config.model_type not in ["lfm2", "granitemoehybrid"]: - # LFM2 and GraniteMoeHybrid (Granite-4.0) require the attention mask + if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_5_text"]: + # LFM2, GraniteMoeHybrid (Granite-4.0), and Qwen3.5 require the attention mask # to be the length of the full context, so default mask from OVModelForCausalLM needs to be used. # Other models like Mamba typically do not require an attention_mask # for the decoding step after the first token so use attention mask of ones. diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 2fe8cb0ea0..ba002befde 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -190,7 +190,7 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] - if (self.config.model_type in ["qwen2_vl", "qwen3_vl"]) and position_ids.ndim != 3: + if (self.config.model_type in ["qwen2_vl", "qwen3_vl", "qwen3_5"]) and position_ids.ndim != 3: position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids @@ -3431,6 +3431,12 @@ def preprocess_inputs( Qwen3VLVisionModel, Qwen3VLVisionRotaryEmbedding, ) + + from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5Model, + Qwen3_5VisionModel, + Qwen3_5VisionRotaryEmbedding, + ) else: class Qwen3VLModel: @@ -3439,6 +3445,12 @@ class Qwen3VLModel: class Qwen3VLVisionModel: pass + class Qwen3_5Model: + pass + + class Qwen3_5VisionModel: + pass + # The inheritance from Qwen3VLModel is needed to get access to methods: # get_placeholder_mask(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L1066 @@ -4802,6 +4814,355 @@ def preprocess_inputs( return inputs +# The inheritance from Qwen3_5Model is needed to get access to methods: +# get_placeholder_mask(), get_rope_index(), get_image_features(), get_video_features(), compute_3d_position_ids() +# +# and inheritance from Qwen3_5VisionModel is needed for accessing the following method: +# rot_pos_emb() +class _OVQwen3_5ForCausalLM(OVModelForVisualCausalLM, Qwen3_5Model, Qwen3_5VisionModel): + additional_parts = ["vision_embeddings_merger", "vision_embeddings_pos"] + + def __init__( + self, + language_model: ov.Model, + text_embeddings: ov.Model, + vision_embeddings: ov.Model, + config: PretrainedConfig = None, + device: str = "CPU", + dynamic_shapes: bool = None, + ov_config: Optional[Dict[str, str]] = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + **kwargs, + ): + if is_transformers_version("<", "4.57.0"): + raise Exception("Qwen3.5 is not supported in transformers versions earlier than 4.57.0.") + + super().__init__( + language_model=language_model, + text_embeddings=text_embeddings, + vision_embeddings=vision_embeddings, + config=config, + device=device, + dynamic_shapes=dynamic_shapes, + ov_config=ov_config, + model_save_dir=model_save_dir, + quantization_config=quantization_config, + **kwargs, + ) + self.rope_deltas = None # cache rope_deltas here + + self.num_grid_per_side = int(config.vision_config.num_position_embeddings**0.5) + self.spatial_merge_size = config.vision_config.spatial_merge_size + head_dim = config.vision_config.hidden_size // config.vision_config.num_heads + self.rotary_pos_emb = Qwen3_5VisionRotaryEmbedding(head_dim // 2) + + def __setattr__(self, name, value): + OVModelForVisualCausalLM.__setattr__(self, name, value) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if past_key_values is not None: + if inputs_embeds is not None and input_ids.shape[1] == 0: # Exception 4 + inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :] + elif inputs_embeds is not None: + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if cache_position[0] != 0: + pixel_values = None + pixel_values_videos = None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + model_inputs = {"input_ids": input_ids, "inputs_embeds": None} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "pixel_values_videos": pixel_values_videos, + "image_grid_thw": image_grid_thw, + "video_grid_thw": video_grid_thw, + "cache_position": cache_position, + } + ) + return model_inputs + + # Adapted from Qwen3_5VisionModel.fast_pos_embed_interpolate + # This method needs to be changed, as instead of running self.pos_embed of type nn.Embedding, openvino model needs to be inferred (self.vision_embeddings_pos) + def fast_pos_embed_interpolate(self, grid_thw): + grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2] + + idx_list = [[] for _ in range(4)] + weight_list = [[] for _ in range(4)] + + for t, h, w in zip(grid_ts, grid_hs, grid_ws): + h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h) + w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w) + + h_idxs_floor = h_idxs.int() + w_idxs_floor = w_idxs.int() + h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + + dh = h_idxs - h_idxs_floor + dw = w_idxs - w_idxs_floor + + base_h = h_idxs_floor * self.num_grid_per_side + base_h_ceil = h_idxs_ceil * self.num_grid_per_side + + indices = [ + (base_h[None].T + w_idxs_floor[None]).flatten(), + (base_h[None].T + w_idxs_ceil[None]).flatten(), + (base_h_ceil[None].T + w_idxs_floor[None]).flatten(), + (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(), + ] + + weights = [ + ((1 - dh)[None].T * (1 - dw)[None]).flatten(), + ((1 - dh)[None].T * dw[None]).flatten(), + (dh[None].T * (1 - dw)[None]).flatten(), + (dh[None].T * dw[None]).flatten(), + ] + + for i in range(4): + idx_list[i].extend(indices[i].tolist()) + weight_list[i].extend(weights[i].tolist()) + + idx_tensor = torch.tensor(idx_list) + weight_tensor = torch.tensor(weight_list) + pos_embeds = torch.from_numpy(self.vision_embeddings_pos(idx_tensor)) * weight_tensor[:, :, None] + patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3] + + patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)]) + + patch_pos_embeds_permute = [] + merge_size = self.config.vision_config.spatial_merge_size + for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws): + pos_embed = pos_embed.repeat(t, 1) + pos_embed = ( + pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1) + .permute(0, 1, 3, 2, 4, 5) + .flatten(0, 4) + ) + patch_pos_embeds_permute.append(pos_embed) + patch_pos_embeds = torch.cat(patch_pos_embeds_permute) + return patch_pos_embeds + + def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs): + hidden_states = torch.from_numpy(self.vision_embeddings(pixel_values)[0]) + + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + hidden_states = hidden_states + pos_embeds + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32 + ) + cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0) + attention_mask = torch.zeros((1, hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True + + causal_mask.masked_fill_(torch.logical_not(attention_mask), float("-inf")) + + res = self.vision_embeddings_merger( + pixel_values=hidden_states, attention_mask=causal_mask, rotary_pos_emb=rotary_pos_emb + ) + return res[0] + + def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None): + """ + Encodes images into continuous embeddings that can be forwarded to the language model. + """ + image_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values, image_grid_thw)) + split_sizes = (image_grid_thw.prod(-1) // self.spatial_merge_size**2).tolist() + image_embeds = torch.split(image_embeds, split_sizes) + return image_embeds + + def get_video_features( + self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None + ): + """ + Encodes videos into continuous embeddings that can be forwarded to the language model. + """ + return self.get_image_features(pixel_values_videos, video_grid_thw) + + def get_multimodal_embeddings( + self, + input_ids, + pixel_values=None, + attention_mask=None, + position_ids=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + cache_position=None, + **kwargs, + ): + inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) + if pixel_values is not None and input_ids.shape[1] != 1: + image_embeds = self.get_image_features(pixel_values, image_grid_thw) + image_embeds = torch.cat(image_embeds, dim=0) + n_image_tokens = (input_ids == self.config.image_token_id).sum().item() + n_image_features = image_embeds.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + mask = input_ids == self.config.image_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + image_mask = mask_expanded.to(inputs_embeds.device) + + image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + if pixel_values_videos is not None and input_ids.shape[1] != 1: + video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw) + video_embeds = torch.cat(video_embeds, dim=0) + n_video_tokens = (input_ids == self.config.video_token_id).sum().item() + n_video_features = video_embeds.shape[0] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) + + mask = input_ids == self.config.video_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + video_mask = mask_expanded.to(inputs_embeds.device) + + video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) + + if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2): + # calculate RoPE index once per generation in the pre-fill stage only + if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None: + # Construct mm_token_type_ids from input_ids + mm_token_type_ids = torch.zeros_like(input_ids, dtype=torch.int) + mm_token_type_ids[input_ids == self.config.image_token_id] = 1 + mm_token_type_ids[input_ids == self.config.video_token_id] = 2 + position_ids, rope_deltas = self.get_rope_index( + input_ids, mm_token_type_ids, image_grid_thw, video_grid_thw, attention_mask + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0 + position_ids = torch.arange(seq_length, device=inputs_embeds.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + if cache_position is not None: # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + + return inputs_embeds, attention_mask, position_ids + + @staticmethod + def preprocess_inputs( + text: str, + image: Optional["Image"] = None, + processor: Optional[AutoImageProcessor] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + config: Optional[PretrainedConfig] = None, + video: Optional["VideoInput"] = None, + audio: Optional[np.ndarray] = None, + ): + if processor is None: + raise ValueError("Processor is required.") + if audio is not None: + raise ValueError("Audio input is not supported") + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + ], + } + ] + if image is not None: + conversation[0]["content"].insert(0, {"type": "image"}) + if video is not None: + conversation[0]["content"].insert(0, {"type": "video"}) + + text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt") + return inputs + + def forward( + self, + input_ids, + pixel_values=None, + past_key_values=None, + inputs_embeds=None, + image_sizes=None, + attention_mask=None, + position_ids=None, + image_bound=None, + tgt_sizes=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + rope_deltas=None, + **kwargs, + ): + result = super().forward( + input_ids, + pixel_values, + past_key_values, + inputs_embeds, + image_sizes, + attention_mask, + position_ids, + image_bound, + tgt_sizes, + pixel_values_videos, + image_grid_thw, + video_grid_thw, + rope_deltas, + **kwargs, + ) + final_result = QWen2VLModelOutputWithPast( + logits=result.logits, past_key_values=result.past_key_values, rope_deltas=rope_deltas + ) + return final_result + + def generate(self, *args, **kwargs): + # Clear cached rope delta from previous generations + self.rope_deltas = None + + return super().generate(*args, **kwargs) + + MODEL_TYPE_TO_CLS_MAPPING = { "llava": _OVLlavaForCausalLM, "llava_next": _OVLlavaNextForCausalLM, @@ -4823,5 +5184,7 @@ def preprocess_inputs( "phi4_multimodal": _OVPhi4MMForCausalLM, "llama4": _OVLlama4ForCausalLM, "qwen3_vl": _OVQwen3VLForCausalLM, + "qwen3_5": _OVQwen3_5ForCausalLM, + "qwen3_5_text": _OVQwen3_5ForCausalLM, "minicpmo": _OVMiniCPMOForCausalLM, } diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 0ddb251b22..c3a89fe3aa 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -28,6 +28,7 @@ DeepseekOpenVINOConfig, LFM2OpenVINOConfig, Qwen3VLOpenVINOConfig, + Qwen3_5TextOpenVINOConfig, ) from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.exporters.openvino.utils import ONNX_SUPPORTED_ARCHITECTURES @@ -333,6 +334,10 @@ def test_find_untested_architectures(self): "exaone4", } + # qwen3_5_text a part of qwen3_5 architecture and is tested in seq2seq group + if is_transformers_version(">=", str(Qwen3_5TextOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + supported_architectures -= {"qwen3_5_text"} + supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 61659fa1ed..a17750c2cc 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -172,6 +172,7 @@ "qwen3": "optimum-intel-internal-testing/tiny-random-qwen3", "qwen3_moe": "optimum-intel-internal-testing/tiny-random-qwen3moe", "qwen3_vl": "optimum-intel-internal-testing/tiny-random-qwen3-vl", + "qwen3_5": "optimum-intel-internal-testing/tiny-random-qwen3.5", "rembert": "optimum-intel-internal-testing/tiny-random-rembert", "resnet": "optimum-intel-internal-testing/tiny-random-resnet", "roberta": "optimum-intel-internal-testing/tiny-random-roberta", @@ -334,6 +335,13 @@ "vision_embeddings_merger_model": 32, "vision_embeddings_pos_model": 1, }, + "qwen3_5": { + "lm_model": 100, + "text_embeddings_model": 1, + "vision_embeddings_model": 1, + "vision_embeddings_merger_model": 32, + "vision_embeddings_pos_model": 1, + }, "sana": { "transformer": 58, "vae_decoder": 28, From 4cbb25e8d713d003a60a981013321ecf3bdadc38 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 18:28:40 +0000 Subject: [PATCH 159/190] Fix Qwen3.5 model patcher and config for VLM text embeddings access Co-authored-by: rkazants <35459624+rkazants@users.noreply.github.com> --- optimum/exporters/openvino/model_configs.py | 5 +++++ optimum/exporters/openvino/model_patcher.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 527c6321ab..a1e77272ab 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5540,6 +5540,11 @@ def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): vision_emb_pos.config = model.config.vision_config return vision_emb_pos + if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + text_embedding = model.model.language_model.embed_tokens + text_embedding.config = model.config + return text_embedding + return Qwen2VLOpenVINOConfig.get_model_for_behavior(model, behavior) def with_behavior( diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 28110c1040..7321b2371d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -56,8 +56,6 @@ ) from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version -from ._ov_ops import convert_recurrent_attention_cell - if is_transformers_version(">=", "4.53"): from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask @@ -8205,6 +8203,8 @@ def __init__( from openvino.frontend.pytorch import ConversionExtension, ModuleExtension + from ._ov_ops import convert_recurrent_attention_cell + super().__init__(config, model, model_kwargs) # Detect VLM vs text-only model From b660200347700b3d4f6b5fee3c4e9172d03efb75 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 18:30:23 +0000 Subject: [PATCH 160/190] Fix comment grammar in test_decoder.py Co-authored-by: rkazants <35459624+rkazants@users.noreply.github.com> --- docs/source/openvino/models.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 4ab826378b..4365b1ff84 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -131,7 +131,7 @@ Here is the list of the supported architectures : - Qwen2VL - Qwen2.5VL - Qwen3VL -- Qwen3.5 +- Qwen3.5 - ResNet - Roberta - Roformer From f901a66f2e42360405402a765c80342e9fefc513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 15:57:24 +0100 Subject: [PATCH 161/190] diffusers latest release now compatible with transformers v5 --- .github/workflows/build_documentation.yml | 1 - .github/workflows/build_pr_documentation.yml | 1 - .github/workflows/test_openvino.yml | 7 +------ .github/workflows/test_openvino_nightly.yml | 5 ----- .github/workflows/test_openvino_slow.yml | 5 ----- setup.py | 2 +- 6 files changed, 2 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index ce3eb464ce..896c5f8b43 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,7 +51,6 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 6b0b89f3f1..ac3291acfd 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,7 +38,6 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 48e3a7409b..ba60fc597a 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -56,7 +56,7 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - if: ${{ matrix.test-pattern == '*modeling*' }} + - if: ${{ matrix.test-pattern == '*modeling*' || matrix.test-pattern == '*quantization*' }} name: Install OpenVINO run: | uv pip install openvino==2025.3.0 openvino-tokenizers==2025.3.0 @@ -66,11 +66,6 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} - - if: ${{ matrix.transformers-version == 'latest' }} - name: Install diffusers - run: | - uv pip install git+https://github.com/huggingface/diffusers - - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | diff --git a/.github/workflows/test_openvino_nightly.yml b/.github/workflows/test_openvino_nightly.yml index ace0246329..886d22c2b3 100644 --- a/.github/workflows/test_openvino_nightly.yml +++ b/.github/workflows/test_openvino_nightly.yml @@ -99,11 +99,6 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - if: ${{ matrix.transformers-version == 'latest' }} - name: Install diffusers - run: | - uv pip install git+https://github.com/huggingface/diffusers - - if: ${{ matrix.openvino-version == 'openvino-nightly' }} name: Install OpenVINO Nightly run: | diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 580253a36a..8a6460ca1b 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -61,11 +61,6 @@ jobs: python -m pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - if: ${{ matrix.transformers-version == 'latest' }} - name: Install diffusers - run: | - uv pip install git+https://github.com/huggingface/diffusers - - if: ${{ matrix.transformers-version != 'latest' && matrix.transformers-version != 'main' }} name: Install specific dependencies and versions required for older transformers run: | diff --git a/setup.py b/setup.py index 16e2a82fed..ca26a42a1f 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], - "diffusers": ["diffusers", "transformers<5"], + "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, } From 7879da8bfea6fc26f8e179c74a76fb7e51b10c9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 16:47:20 +0100 Subject: [PATCH 162/190] set qwen3_next max transformers version --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 3 ++- tests/openvino/test_export.py | 5 ++++- tests/openvino/test_exporters_cli.py | 6 ++++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index b8c2eefcf1..a386842b7d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5435,6 +5435,7 @@ class Qwen3NextOpenVINOConfig(Qwen3OpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Qwen3NextDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.57.0" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = Qwen3NextModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2fdacf5ce5..bedb5ee8e9 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -96,7 +96,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.54.0"): SUPPORTED_SSM_ARCHITECTURES += ("lfm2",) - if is_transformers_version(">=", "4.57.0"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("qwen3_next",) SUPPORTED_ARCHITECTURES += SUPPORTED_SSM_ARCHITECTURES diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 0c51a6f8da..e9c7696c2d 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -111,7 +111,10 @@ class ExportModelTest(unittest.TestCase): SUPPORTED_ARCHITECTURES.update({"afmoe": OVModelForCausalLM}) if is_transformers_version(">=", "4.57.0"): - SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM, "qwen3_next": OVModelForCausalLM}) + SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM}) + + if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.update({"qwen3_next": OVModelForCausalLM}) EXPECTED_DIFFUSERS_SCALE_FACTORS = { "stable-diffusion-xl": {"vae_encoder": "128.0", "vae_decoder": "128.0"}, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index b26569ed00..940ab5b3ac 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -169,6 +169,12 @@ class OVCLIExportTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "hunyuan_v1_dense"), + ] + ) + + if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.extend( + [ ("text-generation-with-past", "qwen3_next"), ] ) From d5f22440f58f3a6231fa603bd37cb727f5074b60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 16:51:55 +0100 Subject: [PATCH 163/190] Fix doc building --- .github/workflows/build_documentation.yml | 1 + .github/workflows/build_pr_documentation.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 896c5f8b43..ce3eb464ce 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,6 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index ac3291acfd..6b0b89f3f1 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,6 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation From 43ed6175824e06d0ae0226ae4276e3d22a95c364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 17:35:18 +0100 Subject: [PATCH 164/190] add qwen3_next to list of untested architectures --- tests/openvino/test_decoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index bedb5ee8e9..e259c38e68 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -336,6 +336,7 @@ def test_find_untested_architectures(self): "marian", "llama4", "exaone4", + "qwen3_next", } supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES From 87cc3f93a7712f60211a14431da170f8b1909314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Mar 2026 17:29:14 +0100 Subject: [PATCH 165/190] comment for zamba2 --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a386842b7d..62464e81f9 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4821,8 +4821,8 @@ class Zamba2OpenVINOConfig(MambaOpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Zamba2DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.49.0" - # TODO (@echarlaix): add v5 support MAX_TRANSFORMERS_VERSION = "4.57.6" + # MIN_TRANSFORMERS_VERSION = "5.2.0" _MODEL_PATCHER = Zamba2ModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e259c38e68..00bb85bd30 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -86,7 +86,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("zamba2",) @@ -180,7 +179,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "dbrx", # "phimoe", "marian", - # "zamba2", ) GENERATION_LENGTH = 100 From 07d943dfd90acc944331ba71e009fe4008c4ae3e Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Wed, 11 Mar 2026 10:28:18 +0400 Subject: [PATCH 166/190] Use Qwen3VLOpenVINOConfig Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_configs.py | 24 +++++---------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a1e77272ab..7306a6a9f8 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5434,7 +5434,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int *["text-generation", "text-generation-with-past"], library_name="transformers", ) -class Qwen3_5TextOpenVINOConfig(Qwen3OpenVINOConfig): +class Qwen3_5TextOpenVINOConfig(Qwen3VLTextOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Qwen3_5DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = Qwen3_5DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -5506,9 +5506,9 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): *["image-text-to-text"], library_name="transformers", ) -class Qwen3_5OpenVINOConfig(Qwen2VLOpenVINOConfig): +class Qwen3_5OpenVINOConfig(Qwen3VLOpenVINOConfig): SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior] - DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,) + DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLVisionEmbedInputGenerator,) MIN_TRANSFORMERS_VERSION = "4.57.0" def __init__( @@ -5533,20 +5533,6 @@ def __init__( self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) self._normalized_config.use_embed_dim = True - @staticmethod - def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): - if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: - vision_emb_pos = model.visual.pos_embed - vision_emb_pos.config = model.config.vision_config - return vision_emb_pos - - if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = model.model.language_model.embed_tokens - text_embedding.config = model.config - return text_embedding - - return Qwen2VLOpenVINOConfig.get_model_for_behavior(model, behavior) - def with_behavior( self, behavior: Union[str, QwenVLConfigBehavior], @@ -5572,8 +5558,8 @@ def with_behavior( self.int_dtype, self.float_dtype, model_patcher=Qwen3_5ModelPatcher, - dummy_input_generator=DummyQwen2VLLMInputGenerator, - inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + #dummy_input_generator=DummyQwen2VLLMInputGenerator, + #inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, ) if behavior in ( From d8864c45db718f09ac312120502c1340229dc505 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Wed, 11 Mar 2026 14:03:58 +0400 Subject: [PATCH 167/190] Remove redundant functions Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_patcher.py | 94 --------------------- 1 file changed, 94 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 816204b0b6..46cac7047d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -8419,62 +8419,6 @@ def __exit__(self, exc_type, exc_value, traceback): del sparse_moe_block.down_projs, sparse_moe_block.gate_projs, sparse_moe_block.up_projs - - -# Patched implementation of the gated delta rule in recurrent form. -# Adapted from: -# https://github.com/huggingface/transformers/blob/v4.57-release/src/transformers/models/qwen3_next/modeling_qwen3_next.py#L522 -# -# To represent the for-loop that generates output embeddings, we use a module -# and the conversion extension mechanism. This is necessary because there is -# no known vectorized form of this loop that would allow it to be correctly -# traced with torch.jit.trace -def patched_recurrent_gated_delta_rule( - self, query, key, value, g, beta, initial_state, output_final_state, use_qk_l2norm_in_kernel=False -): - def l2norm(x: torch.FloatTensor, dim: int = -1, eps: float = 1e-6): - """This function is intended to align with the l2norm implementation in the FLA library.""" - inv_norm = torch.rsqrt((x * x).sum(dim=dim, keepdim=True) + eps) - return x * inv_norm - - initial_dtype = query.dtype - if use_qk_l2norm_in_kernel: - query = l2norm(query, dim=-1, eps=1e-6) - key = l2norm(key, dim=-1, eps=1e-6) - query, key, value, beta, g = [ - x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g) - ] - - batch_size, num_heads, sequence_length, k_head_dim = key.shape - v_head_dim = value.shape[-1] - scale = 1 / (query.shape[-1] ** 0.5) - query = query * scale - - last_recurrent_state = ( - torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value) - if initial_state is None - else initial_state.to(value) - ) - - output_cell = self.recurrent_attention_cell( - query, # (B, H, T, D1) - key, # (B, H, T, D1) - value, # (B, H, T, D2) - g, # (B, H, T) - beta, # (B, H, T) - last_recurrent_state, # (B, H, D1, D2) - ) - - num_elems = value.numel() - core_attn_out = output_cell[:num_elems].reshape(value.shape) - last_recurrent_state = output_cell[num_elems:].reshape(last_recurrent_state.shape) - - if not output_final_state: - last_recurrent_state = None - core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype) - return core_attn_out, last_recurrent_state - - # The CausalConv1D block is overridden with a generic patch provided by `ov_causal_conv1d()`. # The GatedDeltaNet block is overridden with a recurrent version of its implementation. # @@ -8576,44 +8520,6 @@ def apply_mask_to_padding_states(hidden_states, attention_mask): return output -# This torch.nn.Module represents the GatedDeltaNet layer in its recurrent form. -# It is required for converting the GatedDeltaNet layer with OpenVINO using the ModuleExtension mechanism. -class RecurrentAttentionCell(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward( - self, - query, # (B, H, T, D1) - key, # (B, H, T, D1) - value, # (B, H, T, D2) - g, # (B, H, T) - beta, # (B, H, T) - last_recurrent_state, # (B, H, D1, D2) - ): - _, _, sequence_length, _ = key.shape - core_attn_out = torch.zeros_like(value) - - for i in range(sequence_length): - q_t = query[:, :, i] - k_t = key[:, :, i] - v_t = value[:, :, i] - g_t = g[:, :, i].exp().unsqueeze(-1).unsqueeze(-1) - beta_t = beta[:, :, i].unsqueeze(-1) - - last_recurrent_state = last_recurrent_state * g_t - kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) - delta = (v_t - kv_mem) * beta_t - last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2) - core_attn_out[:, :, i] = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) - - # This is a workaround to ensure a single output from the torch.nn.Module. - # The OpenVINO ModuleExtension mechanism has a limitation and expects - # the module to produce only one output. - output_cell = torch.cat([core_attn_out.flatten(), last_recurrent_state.flatten()], dim=0) - return output_cell - - class Qwen3_5ModelPatcher(OVDecoderModelPatcher): def __init__( self, From 96d47b0172bb6720d43aef9b15d372eae3be0f4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Mar 2026 17:36:47 +0100 Subject: [PATCH 168/190] Fix eagle3 compatibility with v5 --- optimum/exporters/openvino/model_patcher.py | 2 +- tests/openvino/test_decoder.py | 6 +----- tests/openvino/test_genai.py | 4 ++-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f21399c836..a0c8b4601a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7804,7 +7804,7 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, - past_key_value=past_key_values, + **{"past_key_values" if is_transformers_version(">=", "5.0") else "past_key_value": past_key_values}, output_attentions=output_attentions, position_embeddings=position_embeddings, use_cache=use_cache, diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 00bb85bd30..3067f1c5c4 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -894,11 +894,7 @@ def test_load_with_different_dtype(self): ) @parameterized.expand(EAGLE3_MODELS.items()) - # TODO (@echarlaix) transformers v5 support - @pytest.mark.skipif( - is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), - reason="Eagle3 requires transformers >= 4.54", - ) + @pytest.mark.skipif(is_transformers_version("<", "4.54"), reason="Eagle3 requires transformers >= 4.54") def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 584d798e88..9d217e7373 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -467,8 +467,8 @@ class LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): - self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") + if is_transformers_version("<", "4.54"): + self.skipTest("Eagle3 requires transformers >= 4.54") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") From db805612f9bac4179c27960f4c6f6d9eeb0b4ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Mar 2026 18:59:25 +0100 Subject: [PATCH 169/190] set dtype in tests when loading sd3 model --- tests/openvino/test_diffusion.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index bc58c91796..08c5180a48 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -157,7 +157,12 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], device=OPENVINO_DEVICE) auto_cls = self.AUTOMODEL_CLASS if "sana" not in model_arch else DiffusionPipeline - diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) + model_kwargs = ( + {"torch_dtype": torch.float32} + if is_transformers_version(">=", "5") and model_arch == "stable-diffusion-3" + else {} + ) + diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type @@ -632,7 +637,12 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + model_kwargs = ( + {"torch_dtype": torch.float32} + if is_transformers_version(">=", "5") and model_arch == "stable-diffusion-3" + else {} + ) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], device=OPENVINO_DEVICE) for output_type in ["latent", "np", "pt"]: @@ -898,12 +908,18 @@ def test_shape(self, model_arch: str): @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], device=OPENVINO_DEVICE) + model_kwargs = ( + {"torch_dtype": torch.float32} + if is_transformers_version(">=", "5") and model_arch == "stable-diffusion-3" + else {} + ) + if model_arch != "flux-fill": - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) else: from diffusers import FluxFillPipeline - diffusers_pipeline = FluxFillPipeline.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = FluxFillPipeline.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_arch=model_arch) From 3e5a2b23f2a0d658b81acb40b9559e20c6f7d3c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Mar 2026 19:13:08 +0100 Subject: [PATCH 170/190] trigger tests for transformers v5.3 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ca26a42a1f..3ca73ac3d9 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", - "transformers>=4.45,<5.1", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", + "transformers>=4.45,<5.4", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 10add8c7df53753ab42ae8e224cb76dc35ef5eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 12 Mar 2026 09:32:48 +0100 Subject: [PATCH 171/190] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3ca73ac3d9..fe70f63757 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.4", + "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 501b5233a25e0753591cf4557f8edc91a075cf0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 12 Mar 2026 09:43:33 +0100 Subject: [PATCH 172/190] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fe70f63757..baccbf1d68 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.1", + "transformers>=4.45,<5.3", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 15548fcb6986259ddaaa1af4ced90701a6ba1acc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 12 Mar 2026 10:01:37 +0100 Subject: [PATCH 173/190] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index baccbf1d68..fe70f63757 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.3", + "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 79078787955d097221068bfb94399b4a8b6850d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Mar 2026 19:09:31 +0100 Subject: [PATCH 174/190] fix bf16 model export --- optimum/exporters/openvino/__main__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index eb763b45d4..5e59f0cb19 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -496,6 +496,9 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs): **loading_kwargs, ) + if getattr(model, "dtype", None) in [torch.float16, torch.bfloat16]: + patch_16bit = True + needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None if needs_pad_token_id: From c026dd99262725b2000457e0c77f682ff2082c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 10:03:59 +0100 Subject: [PATCH 175/190] question answering pipeline deprecated in v5.3 --- tests/openvino/test_modeling.py | 8 ++++++++ tests/openvino/test_modeling_basic.py | 6 +++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 0c5011a908..53223e692a 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -887,6 +887,10 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5.3"), + reason="requires transformers < v5.3 since question-answering pipeline is deprecated in v5.3", + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] @@ -909,6 +913,10 @@ def test_pipeline(self, model_arch): @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5.3"), + reason="requires transformers < v5.3 since question-answering pipeline is deprecated in v5.3", + ) def test_metric(self): model_id = "distilbert-base-cased-distilled-squad" set_seed(SEED) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index c2576db98b..eb72175032 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -30,12 +30,16 @@ "hf-internal-testing/tiny-random-bert": "OVModelForMaskedLM", "hf-internal-testing/tiny-random-distilbert": "OVModelForSequenceClassification", "hf-internal-testing/tiny-random-mbart": "OVModelForSeq2SeqLM", - "hf-internal-testing/tiny-random-roberta": "OVModelForQuestionAnswering", "hf-internal-testing/tiny-random-gpt2": "OVModelForCausalLM", "hf-internal-testing/tiny-random-t5": "OVModelForSeq2SeqLM", "hf-internal-testing/tiny-random-bart": "OVModelForSeq2SeqLM", } +# question-answering pipeline is deprecated in transformers v5.3 +if is_transformers_version("<", "5.3"): + MODEL_NAMES["hf-internal-testing/tiny-random-roberta"] = "OVModelForQuestionAnswering" + + TASKS = { "OVModelForMaskedLM": "fill-mask", "OVModelForSequenceClassification": "text-classification", From 61d85b371415e16d5a3757a49219ef604ee5e337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 10:04:16 +0100 Subject: [PATCH 176/190] ix mamba expected int8 --- tests/openvino/utils_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index cc084565fe..1117604b7b 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -358,8 +358,8 @@ "vocoder": 80, }, "clip": {"model": 130}, - "mamba": {"model": 322 if is_transformers_version("<", "5") else 324}, - "falcon_mamba": {"model": 162 if is_transformers_version("<", "5") else 164}, + "mamba": {"model": 324 if is_transformers_version("==", "5.0") else 322}, + "falcon_mamba": {"model": 164 if is_transformers_version("==", "5.0") else 162}, "minicpmo": { "lm_model": 16, "text_embeddings_model": 1, From 55c0d469b9c853e5aa2a285f7f9f0f37dd0b5c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 17:39:46 +0100 Subject: [PATCH 177/190] Fix _DEFAULT_IGNORED_SCOPE_CONFIGS for __make_16bit_traceable patched models --- optimum/intel/openvino/configuration.py | 1 + tests/openvino/test_quantization.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 2002e268ac..2d8608fadb 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -546,6 +546,7 @@ class OVQuantizationMethod(str, Enum): "__module.layers.27.mlp.up_proj/aten::linear/MatMul", "__module.layers.27.mlp.gate_proj/aten::linear/MatMul", ], + "validate": False, }, }, "microsoft/speecht5_tts": { diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index ec9d7b84f7..ff90b208e2 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1175,8 +1175,12 @@ class OVWeightCompressionTest(unittest.TestCase): "llama", { "model": { - "names": ["__module.layers.1.self_attn.v_proj/aten::linear/MatMul"], - "patterns": ["__module.layers.\\d.self_attn.o_proj/aten::linear/MatMul"], + "names": [ + f"__module.layers.1.self_attn.v_proj/{'aten' if is_transformers_version('<', '5') else 'ov_ext'}::linear/MatMul" + ], + "patterns": [ + f"__module.layers.\\d.self_attn.o_proj/{'aten' if is_transformers_version('<', '5') else 'ov_ext'}::linear/MatMul" + ], } }, ), From 2f38fd868b67d63add87e7f083325cd3c82968e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 18:52:35 +0100 Subject: [PATCH 178/190] add test to ensure dtype --- tests/openvino/test_modeling.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 53223e692a..7a3110b182 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -702,6 +702,14 @@ def test_load_from_hub_onnx_model_and_save(self): del model gc.collect() + def test_export_dtype(self): + model_id = "optimum-intel-internal-testing/tiny-random-GemmaForCausalLM" + for dtype in [torch.float32, torch.bfloat16, torch.float16]: + with TemporaryDirectory() as tmpdirname: + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) + self.assertEqual(model.dtype, dtype) + model.save_pretrained(tmpdirname) + ov_model = OVModelForCausalLM.from_pretrained(tmpdirname, export=True) class PipelineTest(unittest.TestCase): def test_load_model_from_hub(self): From c925a79cf3f04bf1ae78d24bc1ec2ea64aefa94e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 18:52:55 +0100 Subject: [PATCH 179/190] style --- tests/openvino/test_modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 7a3110b182..e8f68d62dd 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -711,6 +711,7 @@ def test_export_dtype(self): model.save_pretrained(tmpdirname) ov_model = OVModelForCausalLM.from_pretrained(tmpdirname, export=True) + class PipelineTest(unittest.TestCase): def test_load_model_from_hub(self): model_id = "echarlaix/tiny-random-PhiForCausalLM" From 934b32eb9da4bb1b39769562e430f1998f53ec00 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Wed, 18 Mar 2026 22:09:29 +0400 Subject: [PATCH 180/190] Correct patching for vlm Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/stateful.py | 4 ++++ .../openvino/modeling_visual_language.py | 22 ++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py index 3b8642d65a..38ffef5d05 100644 --- a/optimum/exporters/openvino/stateful.py +++ b/optimum/exporters/openvino/stateful.py @@ -310,6 +310,10 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model): return patch_stateful_encoder_decoder(config, ov_model) if config.model_type in SSM_MODELS: return patch_stateful_hybrid_ssm(ov_model) + # For VLM models, the text sub-model may be SSM-based (e.g. qwen3_5 VLM with qwen3_5_text language model) + text_config = getattr(config, "text_config", None) + if text_config is not None and getattr(text_config, "model_type", None) in SSM_MODELS: + return patch_stateful_hybrid_ssm(ov_model) return patch_stateful_decoder(config, ov_model) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index bc4d5a76d0..8e7a1c8e21 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -3431,12 +3431,6 @@ def preprocess_inputs( Qwen3VLVisionModel, Qwen3VLVisionRotaryEmbedding, ) - - from transformers.models.qwen3_5.modeling_qwen3_5 import ( - Qwen3_5Model, - Qwen3_5VisionModel, - Qwen3_5VisionRotaryEmbedding, - ) else: class Qwen3VLModel: @@ -3451,6 +3445,22 @@ class Qwen3_5Model: class Qwen3_5VisionModel: pass +if is_transformers_version(">=", "5.2.0"): + from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5Model, + Qwen3_5VisionModel, + Qwen3_5VisionRotaryEmbedding, + ) +else: + + class Qwen3_5Model: + pass + + class Qwen3_5VisionModel: + pass + + class Qwen3_5VisionRotaryEmbedding: + pass # The inheritance from Qwen3VLModel is needed to get access to methods: # get_placeholder_mask(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L1066 From bf1f377c540120ee33641ac02536d41e681bd6bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Mar 2026 15:28:13 +0100 Subject: [PATCH 181/190] check openvino model expected dtype in test_export_dtype --- optimum/intel/openvino/utils.py | 1 + tests/openvino/test_modeling.py | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index be6ac41d31..9549da9773 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -95,6 +95,7 @@ "f16": torch.float16, "f32": torch.float32, "f64": torch.float64, + "bf16": torch.bfloat16, } if is_torch_version(">=", "2.4.0"): diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index e8f68d62dd..372cd28943 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -90,8 +90,10 @@ OV_LANGUAGE_MODEL_NAME, OV_PROMPT_ENCODER_MASK_DECODER_MODEL_NAME, OV_TEXT_EMBEDDINGS_MODEL_NAME, + OV_TO_PT_TYPE, OV_VISION_EMBEDDINGS_MODEL_NAME, OV_VISION_ENCODER_MODEL_NAME, + STR_TO_OV_TYPE, TemporaryDirectory, ) from optimum.intel.pipelines import pipeline as optimum_pipeline @@ -704,12 +706,18 @@ def test_load_from_hub_onnx_model_and_save(self): def test_export_dtype(self): model_id = "optimum-intel-internal-testing/tiny-random-GemmaForCausalLM" - for dtype in [torch.float32, torch.bfloat16, torch.float16]: - with TemporaryDirectory() as tmpdirname: - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) - self.assertEqual(model.dtype, dtype) - model.save_pretrained(tmpdirname) - ov_model = OVModelForCausalLM.from_pretrained(tmpdirname, export=True) + for dtype in ["f32", "f16", "bf16"]: + torch_dtype = OV_TO_PT_TYPE[dtype] + ov_dtype = STR_TO_OV_TYPE[dtype] + with TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype) + self.assertEqual(model.dtype, torch_dtype) + model.save_pretrained(tmp_dir) + del model + ov_model = OVModelForCausalLM.from_pretrained(tmp_dir, export=True) + dtypes = {op.get_element_type() for op in ov_model.model.get_ops() if op.get_type_name() == "Constant"} + self.assertIn(ov_dtype, dtypes, f"Expected {ov_dtype}, found {dtypes}") + del ov_model class PipelineTest(unittest.TestCase): From e1f8c28b8575fd63b731e8ab83d8f2f4faa328bc Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Fri, 20 Mar 2026 17:19:53 +0400 Subject: [PATCH 182/190] Fix bf16 patching Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/convert.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index d0efa2259f..23e309efdb 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -447,6 +447,16 @@ def ts_patched_forward(*args, **kwargs): extension=conversion_extensions, ) + if patch_16bit_model: + # Undo __make_16bit_traceable patching on sub-modules to avoid corrupting + # forward methods of modules shared across export behaviors (e.g. pos_embed + # Embedding in VLMs that is also exported separately as vision_embeddings_pos). + _orig_forward_attr = "_openvino_module_extension_patch_orig_forward" + for module in model.modules(): + if hasattr(module, _orig_forward_attr): + module.forward = getattr(module, _orig_forward_attr) + delattr(module, _orig_forward_attr) + ov_model.validate_nodes_and_infer_types() # TODO: remove as unnecessary validation? output_names = list(config.outputs.keys()) From 5033df204cdcee729dec7ff8556e579422784d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Mar 2026 19:47:17 +0100 Subject: [PATCH 183/190] fix qwen3vl vision embeddings pos --- optimum/exporters/openvino/model_configs.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 62464e81f9..cc1cac2714 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3913,11 +3913,10 @@ def patch_model_for_export(self, model: Union["PreTrainedModel"], model_kwargs: model_kwargs = model_kwargs or {} if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: return Qwen3VLVisionEmbMergerPatcher(self, model, model_kwargs) - if ( - self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS - or self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS - ): + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: return ModelPatcher(self, model, model_kwargs=model_kwargs) + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + return InputEmbeddingPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) @property From 4602e000f4ca2c0e04a03c3633d30703b6bb0b05 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Sun, 22 Mar 2026 20:03:10 +0400 Subject: [PATCH 184/190] Support Qwen3.5-MoE Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_configs.py | 66 +++++++++++++++++- optimum/exporters/openvino/model_patcher.py | 68 +++++++++++++++++++ optimum/exporters/openvino/utils.py | 3 +- optimum/intel/openvino/modeling_decoder.py | 2 +- .../openvino/modeling_visual_language.py | 18 ++++- 5 files changed, 153 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ac48a991ee..17f6074743 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -195,6 +195,7 @@ Qwen2VLLanguageModelPatcher, Qwen2VLVisionEmbMergerPatcher, Qwen3_5ModelPatcher, + Qwen3_5MoeModelPatcher, Qwen3_5VisionEmbMergerPatcher, Qwen3MoeModelPatcher, Qwen3NextModelPatcher, @@ -259,6 +260,10 @@ def init_model_configs(): "transformers", "AutoModelForImageTextToText", ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen3_5_moe", "image-text-to-text")] = ( + "transformers", + "AutoModelForImageTextToText", + ) # since transformers v4.46, model can be loaded using default AutoModelForImageTextToText # https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/models/auto/modeling_auto.py#L776 @@ -5746,4 +5751,63 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return get_vlm_internal_text_generation_config( "qwen3_5_text", self._orig_config.text_config, self.int_dtype, self.float_dtype ).outputs - raise Exception("Unknown Qwen3.5 behavior type.") \ No newline at end of file + raise Exception("Unknown Qwen3.5 behavior type.") + + +@register_in_tasks_manager( + "qwen3_5_moe_text", + *["text-generation", "text-generation-with-past"], + library_name="transformers", +) +class Qwen3_5MoeTextOpenVINOConfig(Qwen3_5TextOpenVINOConfig): + _MODEL_PATCHER = Qwen3_5MoeModelPatcher + + +@register_in_tasks_manager( + "qwen3_5_moe", + *["image-text-to-text"], + library_name="transformers", +) +class Qwen3_5MoeOpenVINOConfig(Qwen3_5OpenVINOConfig): + def with_behavior( + self, + behavior: Union[str, QwenVLConfigBehavior], + ): + if isinstance(behavior, str) and not isinstance(behavior, QwenVLConfigBehavior): + behavior = QwenVLConfigBehavior(behavior) + + if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config( + "qwen3_5_moe_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ) + + if behavior == QwenVLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen3_5_moe_text", + self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + model_patcher=Qwen3_5MoeModelPatcher, + ) + + if behavior in ( + QwenVLConfigBehavior.VISION_EMBEDDINGS, + QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER, + QwenVLConfigBehavior.VISION_EMBEDDINGS_POS, + ): + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == QwenVLConfigBehavior.LANGUAGE: + return get_vlm_internal_text_generation_config( + "qwen3_5_moe_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ).outputs + return super().outputs \ No newline at end of file diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 5b66c8c6fb..7fbf431c3c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -8740,3 +8740,71 @@ def __exit__(self, exc_type, exc_value, traceback): block.forward = block._orig_forward block.attn.forward = block.attn._orig_forward + +def patched_qwen3_5_moe_sparse_moe_block(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_experts = self.experts.num_experts + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + # router returns (logits, scores, indices) + _, routing_weights, selected_experts = self.gate(hidden_states) + + new_routing_weights = torch.zeros(batch_size * sequence_length, num_experts, dtype=routing_weights.dtype) + new_routing_weights.scatter_(dim=1, index=selected_experts, src=routing_weights) + + shared_expert_output = self.shared_expert(hidden_states) + shared_expert_output = torch.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output + + hidden_states = hidden_states.repeat(num_experts, 1) + hidden_states = hidden_states.view(num_experts, -1, hidden_dim) + act_fn = self.experts.act_fn + + # compute experts outputs in a vectorized form using torch.bmm + gate = torch.bmm(hidden_states, self.gate_projs.transpose(1, 2)) + up = torch.bmm(hidden_states, self.up_projs.transpose(1, 2)) + gate_up = act_fn(gate) * up + next_states = torch.bmm(gate_up, self.down_projs.transpose(1, 2)) + next_states = next_states.view(num_experts, batch_size, -1, hidden_dim) + next_states = next_states * new_routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None] + next_states = next_states.sum(dim=0) + + shared_expert_output = shared_expert_output.view(batch_size, -1, hidden_dim) + output = shared_expert_output + next_states + return output.view(batch_size, sequence_length, hidden_dim) + + +class Qwen3_5MoeModelPatcher(Qwen3_5ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + def __enter__(self): + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeSparseMoeBlock + + super().__enter__() + for decoder_layer in self._text_model.layers: + if isinstance(decoder_layer.mlp, Qwen3_5MoeSparseMoeBlock): + sparse_moe_block = decoder_layer.mlp + intermediate_dim = sparse_moe_block.experts.intermediate_dim + sparse_moe_block._orig_forward = sparse_moe_block.forward + sparse_moe_block.forward = types.MethodType(patched_qwen3_5_moe_sparse_moe_block, sparse_moe_block) + # TODO: remove `float()` casting when CVS-181449 is fixed + # now it is needed to have MoE optimizations to be applied + sparse_moe_block.gate_projs = sparse_moe_block.experts.gate_up_proj[:, :intermediate_dim, :].float() + sparse_moe_block.up_projs = sparse_moe_block.experts.gate_up_proj[:, intermediate_dim:, :].float() + sparse_moe_block.down_projs = sparse_moe_block.experts.down_proj.data.float() + + def __exit__(self, exc_type, exc_value, traceback): + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeSparseMoeBlock + + super().__exit__(exc_type, exc_value, traceback) + for decoder_layer in self._text_model.layers: + if isinstance(decoder_layer.mlp, Qwen3_5MoeSparseMoeBlock): + sparse_moe_block = decoder_layer.mlp + sparse_moe_block.forward = sparse_moe_block._orig_forward + del sparse_moe_block.gate_projs, sparse_moe_block.up_projs, sparse_moe_block.down_projs + diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 5486629fac..061334754c 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -296,6 +296,7 @@ def get_submodels(model): "qwen2_5_vl", "qwen3_vl", "qwen3_5", + "qwen3_5_moe", "got_ocr2", "gemma3", "idefics3", @@ -306,7 +307,7 @@ def get_submodels(model): "minicpmo", ] -SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text"] +SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text", "qwen3_5_moe_text"] # All transformers, diffusers, timm and sentence transformers models that are supported via optimum-onnx OnnxConfigs but that have currently no test # TODO: add tests for all models that are compatible and remove support for all others diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index ed75ef1150..8da7ea3738 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -1449,7 +1449,7 @@ def prepare_inputs_for_generation( # decoding stage so it takes the last token input_ids = input_ids[:, -1].unsqueeze(-1) - if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text"]: + if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text", "qwen3_5_moe_text"]: # LFM2, GraniteMoeHybrid (Granite-4.0), Qwen3-Next, and Qwen3.5 require the attention mask # to be the length of the full context, so default mask from OVModelForCausalLM needs to be used. # Other models like Mamba typically do not require an attention_mask diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 8e7a1c8e21..82ea10467d 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -190,7 +190,7 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] - if (self.config.model_type in ["qwen2_vl", "qwen3_vl", "qwen3_5"]) and position_ids.ndim != 3: + if (self.config.model_type in ["qwen2_vl", "qwen3_vl", "qwen3_5", "qwen3_5_moe"]) and position_ids.ndim != 3: position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids @@ -3462,6 +3462,20 @@ class Qwen3_5VisionModel: class Qwen3_5VisionRotaryEmbedding: pass + +if is_transformers_version(">=", "5.2.0"): + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import ( + Qwen3_5MoeModel, + Qwen3_5MoeVisionModel, + ) +else: + + class Qwen3_5MoeModel: + pass + + class Qwen3_5MoeVisionModel: + pass + # The inheritance from Qwen3VLModel is needed to get access to methods: # get_placeholder_mask(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L1066 # get_rope_index(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L916 @@ -5196,5 +5210,7 @@ def generate(self, *args, **kwargs): "qwen3_vl": _OVQwen3VLForCausalLM, "qwen3_5": _OVQwen3_5ForCausalLM, "qwen3_5_text": _OVQwen3_5ForCausalLM, + "qwen3_5_moe": _OVQwen3_5ForCausalLM, + "qwen3_5_moe_text": _OVQwen3_5ForCausalLM, "minicpmo": _OVMiniCPMOForCausalLM, } From cbe127ee06e7f8136d0c8d8c80193e41e1fe2ed9 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Wed, 15 Apr 2026 01:51:46 +0400 Subject: [PATCH 185/190] Add position_ids input and its preparation for inference Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_configs.py | 17 +++++- optimum/exporters/openvino/model_patcher.py | 5 +- optimum/exporters/openvino/utils.py | 11 +++- optimum/intel/openvino/modeling_decoder.py | 8 ++- .../openvino/modeling_visual_language.py | 61 ++++++++++++++++--- tests/openvino/test_decoder.py | 2 +- 6 files changed, 88 insertions(+), 16 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 17f6074743..5467e641c3 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3562,6 +3562,14 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return generated_input +class DummyQwen3_5LMInputGenerator(DummyTextInputGenerator): + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + generated_input = super().generate(input_name, framework, int_dtype, float_dtype) + if input_name == "position_ids": + return generated_input.unsqueeze(0).expand(4, -1, -1) + return generated_input + + class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( "hidden_states", @@ -5610,6 +5618,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = { "input_ids": {0: "batch_size", 1: "sequence_length"}, "attention_mask": {0: "batch_size", 1: "sequence_length"}, + "position_ids": {0: "batch_size", 1: "sequence_length"}, } if self.use_past_in_inputs: self.add_past_key_values(common_inputs, direction="inputs") @@ -5700,8 +5709,8 @@ def with_behavior( self.int_dtype, self.float_dtype, model_patcher=Qwen3_5ModelPatcher, - #dummy_input_generator=DummyQwen2VLLMInputGenerator, - #inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + dummy_input_generator=DummyQwen3_5LMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, ) if behavior in ( @@ -5788,6 +5797,8 @@ def with_behavior( self.int_dtype, self.float_dtype, model_patcher=Qwen3_5MoeModelPatcher, + dummy_input_generator=DummyQwen3_5LMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, ) if behavior in ( @@ -5810,4 +5821,4 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return get_vlm_internal_text_generation_config( "qwen3_5_moe_text", self._orig_config.text_config, self.int_dtype, self.float_dtype ).outputs - return super().outputs \ No newline at end of file + return super().outputs diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7fbf431c3c..f906fa52d0 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -8723,7 +8723,9 @@ def image_embed_forward( emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) position_embeddings = (emb.cos(), emb.sin()) for blk in self.blocks: - hidden_states = blk(hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings) + hidden_states = blk( + hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings + ) return self.merger(hidden_states) model.forward = types.MethodType(image_embed_forward, model) @@ -8807,4 +8809,3 @@ def __exit__(self, exc_type, exc_value, traceback): sparse_moe_block = decoder_layer.mlp sparse_moe_block.forward = sparse_moe_block._orig_forward del sparse_moe_block.gate_projs, sparse_moe_block.up_projs, sparse_moe_block.down_projs - diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 061334754c..6314803bbc 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -307,7 +307,16 @@ def get_submodels(model): "minicpmo", ] -SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text", "qwen3_5_moe_text"] +SSM_MODELS = [ + "mamba", + "falcon_mamba", + "zamba2", + "lfm2", + "granitemoehybrid", + "qwen3_next", + "qwen3_5_text", + "qwen3_5_moe_text", +] # All transformers, diffusers, timm and sentence transformers models that are supported via optimum-onnx OnnxConfigs but that have currently no test # TODO: add tests for all models that are compatible and remove support for all others diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 8da7ea3738..7044953664 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -1449,7 +1449,13 @@ def prepare_inputs_for_generation( # decoding stage so it takes the last token input_ids = input_ids[:, -1].unsqueeze(-1) - if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text", "qwen3_5_moe_text"]: + if self.config.model_type not in [ + "lfm2", + "granitemoehybrid", + "qwen3_next", + "qwen3_5_text", + "qwen3_5_moe_text", + ]: # LFM2, GraniteMoeHybrid (Granite-4.0), Qwen3-Next, and Qwen3.5 require the attention mask # to be the length of the full context, so default mask from OVModelForCausalLM needs to be used. # Other models like Mamba typically do not require an attention_mask diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 82ea10467d..427860775e 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -188,9 +188,11 @@ def prepare_inputs( position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 if past_len: - position_ids = position_ids[:, -inputs_embeds.shape[1] :] + position_ids = position_ids[..., -inputs_embeds.shape[1] :] - if (self.config.model_type in ["qwen2_vl", "qwen3_vl", "qwen3_5", "qwen3_5_moe"]) and position_ids.ndim != 3: + if self.config.model_type in ["qwen3_5", "qwen3_5_moe"] and position_ids.ndim != 3: + position_ids = np.repeat(np.expand_dims(position_ids, 0), 4, axis=0) + elif self.config.model_type in ["qwen2_vl", "qwen3_vl"] and position_ids.ndim != 3: position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids @@ -3445,6 +3447,7 @@ class Qwen3_5Model: class Qwen3_5VisionModel: pass + if is_transformers_version(">=", "5.2.0"): from transformers.models.qwen3_5.modeling_qwen3_5 import ( Qwen3_5Model, @@ -3476,6 +3479,7 @@ class Qwen3_5MoeModel: class Qwen3_5MoeVisionModel: pass + # The inheritance from Qwen3VLModel is needed to get access to methods: # get_placeholder_mask(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L1066 # get_rope_index(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L916 @@ -5089,14 +5093,25 @@ def get_multimodal_embeddings( if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2): # calculate RoPE index once per generation in the pre-fill stage only if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None: - # Construct mm_token_type_ids from input_ids - mm_token_type_ids = torch.zeros_like(input_ids, dtype=torch.int) - mm_token_type_ids[input_ids == self.config.image_token_id] = 1 - mm_token_type_ids[input_ids == self.config.video_token_id] = 2 - position_ids, rope_deltas = self.get_rope_index( - input_ids, mm_token_type_ids, image_grid_thw, video_grid_thw, attention_mask + vision_positions, rope_deltas = self.get_rope_index( + input_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + attention_mask=attention_mask, ) self.rope_deltas = rope_deltas + # Compute text positions (simple cumsum) and concatenate as dim 0 + # to create shape (4, batch, seq_len): [text_pos, temporal, height, width] + if attention_mask is not None: + text_positions = attention_mask.long().cumsum(-1) - 1 + text_positions = text_positions.masked_fill(attention_mask == 0, 1) + else: + text_positions = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .unsqueeze(0) + .expand(input_ids.shape[0], -1) + ) + position_ids = torch.cat([text_positions.unsqueeze(0), vision_positions], dim=0) # then use the prev pre-calculated rope-deltas to get the correct position ids else: batch_size, seq_length, _ = inputs_embeds.shape @@ -5107,6 +5122,12 @@ def get_multimodal_embeddings( delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) position_ids = position_ids.add(delta) position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + # Prepend text positions for shape (4, batch, seq_len) + text_positions = torch.arange(seq_length, device=inputs_embeds.device) + text_positions = text_positions.view(1, -1).expand(batch_size, -1) + if cache_position is not None: + text_positions = text_positions + cache_position[0] + position_ids = torch.cat([text_positions.unsqueeze(0), position_ids], dim=0) return inputs_embeds, attention_mask, position_ids @@ -5180,6 +5201,30 @@ def forward( ) return final_result + def _prepare_position_ids_for_generation(self, inputs_tensor, model_kwargs): + # Mirrors Qwen3_5ForConditionalGeneration._prepare_position_ids_for_generation + # Creates proper 4D position_ids: [text_positions, temporal, height, width] + text_positions = GenerationMixin._prepare_position_ids_for_generation(self, inputs_tensor, model_kwargs) + + if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0: + inputs_tensor = model_kwargs["input_ids"] + + is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long] + if is_input_ids and ( + model_kwargs.get("image_grid_thw") is not None or model_kwargs.get("video_grid_thw") is not None + ): + filtered_kwargs = {k: v for k, v in model_kwargs.items() if k != "input_ids"} + vision_positions, rope_deltas = self.get_rope_index(inputs_tensor, **filtered_kwargs) + self.rope_deltas = rope_deltas + else: + vision_positions = text_positions.unsqueeze(0).expand(3, -1, -1) + self.rope_deltas = torch.zeros(inputs_tensor.shape[0], 1, dtype=torch.long, device=inputs_tensor.device) + + # Concatenate "text + vision" positions into [4, bs, seq-len] + text_positions = text_positions[None, ...] + position_ids = torch.cat([text_positions, vision_positions], dim=0) + return position_ids + def generate(self, *args, **kwargs): # Clear cached rope delta from previous generations self.rope_deltas = None diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 72b5b1077d..a1e47c7451 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -27,8 +27,8 @@ BitnetOpenVINOConfig, DeepseekOpenVINOConfig, LFM2OpenVINOConfig, - Qwen3VLOpenVINOConfig, Qwen3_5TextOpenVINOConfig, + Qwen3VLOpenVINOConfig, ) from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.exporters.openvino.utils import ONNX_SUPPORTED_ARCHITECTURES From c7a19db224e5dd7b7d3d8425033e26b284b4f7d5 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Wed, 15 Apr 2026 23:45:24 +0400 Subject: [PATCH 186/190] Apply suggestion from @rkazants --- .github/workflows/test_offline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 48f07b9396..830d77e1c3 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] + uv pip install .[diffusers,tests] - name: Test run: | From c036d9ca469fa0c48956a4ace44a53b04cf4dbff Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Wed, 15 Apr 2026 23:46:23 +0400 Subject: [PATCH 187/190] Apply suggestion from @rkazants --- optimum/exporters/openvino/model_configs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index aca2a45411..a576042678 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1101,8 +1101,6 @@ class Phi3OpenVINOConfig(PhiOnnxConfig): ) class PhiMoEOpenVINOConfig(Phi3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = PhiMoEModelPatcher From 7b63cd3fda2a8aac858a76622f637b3f80e53c09 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Wed, 15 Apr 2026 23:47:53 +0400 Subject: [PATCH 188/190] Apply suggestion from @rkazants --- optimum/exporters/openvino/model_patcher.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1ac098966e..d22bae9ef1 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -1744,15 +1744,6 @@ def __enter__(self): "long_mscale", None ) - if is_transformers_version("<", "5"): - for layer in self._model.model.layers: - layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe - ) - else: - self._model.set_experts_implementation("batched_mm") - def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) From f7276ceb75759c965060863b8077ea11ae254103 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Wed, 15 Apr 2026 23:52:33 +0400 Subject: [PATCH 189/190] Apply suggestions from code review Co-authored-by: Roman Kazantsev --- tests/openvino/test_decoder.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 6e9010c134..e423b52867 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -23,7 +23,6 @@ ) from optimum.exporters.openvino.model_configs import ( - AfmoeOpenVINOConfig, BitnetOpenVINOConfig, DeepseekOpenVINOConfig, LFM2OpenVINOConfig, @@ -106,15 +105,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("cohere2",) if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("phimoe",) - # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") @@ -310,8 +305,6 @@ def test_find_untested_architectures(self): supported_architectures.remove("deepseek_v2") if "deepseek_v3" in supported_architectures: supported_architectures.remove("deepseek_v3") - if is_transformers_version(">", str(AfmoeOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): - supported_architectures -= {"afmoe"} if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): From c656558f8288d44fe7ce0dd7c98753af0c740b15 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Thu, 16 Apr 2026 00:10:43 +0400 Subject: [PATCH 190/190] Apply code formatting Signed-off-by: Kazantsev, Roman --- tests/openvino/test_decoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e423b52867..4a9f3b23fb 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -26,7 +26,6 @@ BitnetOpenVINOConfig, DeepseekOpenVINOConfig, LFM2OpenVINOConfig, - Qwen3_5TextOpenVINOConfig, Qwen3VLOpenVINOConfig, ) from optimum.exporters.openvino.model_patcher import patch_update_causal_mask