From 8574954e7009520ea4f618c52b49f2bf3692a425 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 18:03:59 +0000 Subject: [PATCH 01/10] Add _ov_ops.py with RecurrentAttentionCellOp conversion rule Add conversion rule for the RecurrentAttentionCellOp operation used for GatedDeltaNet patching in OpenVINO PyTorch frontend. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- optimum/exporters/openvino/_ov_ops.py | 113 ++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 optimum/exporters/openvino/_ov_ops.py diff --git a/optimum/exporters/openvino/_ov_ops.py b/optimum/exporters/openvino/_ov_ops.py new file mode 100644 index 0000000000..78e5b6d23b --- /dev/null +++ b/optimum/exporters/openvino/_ov_ops.py @@ -0,0 +1,113 @@ +# Copyright 2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Conversion rule for the `RecurrentAttentionCellOp` operation in a Torch graph. +# The `RecurrentAttentionCellOp` appears in the Torch graph as a result of replacing +# the `torch.nn.Module` block `RecurrentAttentionCell` via a registered +# `ModuleExtension` for `RecurrentAttentionCell` in the OpenVINO PyTorch frontend. 
+import numpy as np + +import openvino as ov +import openvino.opset14 as ops + + +def convert_recurrent_attention_cell(context): + query = context.get_input(0) + key = context.get_input(1) + value = context.get_input(2) + g = context.get_input(3) + beta = context.get_input(4) + last_recurrent_state_old = context.get_input(5) + + value_shape = ops.shape_of(value) + const_zero = ops.constant(0, dtype=np.float32) + core_attn_out = ops.broadcast(const_zero, value_shape) + const_two_out = ops.constant(2, dtype=np.int32) + const_zero_out = ops.constant(0, dtype=np.int32) + seq_len = ops.gather(value_shape, const_two_out, const_zero_out) + + timestep_param = ops.parameter([], np.int32, "timestep") + q_t_param = ops.parameter([-1, -1, 1, -1], np.float32, "q_t") + k_t_param = ops.parameter([-1, -1, 1, -1], np.float32, "k_t") + v_t_param = ops.parameter([-1, -1, 1, -1], np.float32, "v_t") + g_t_param = ops.parameter([-1, -1, 1], np.float32, "g_t") + beta_t_param = ops.parameter([-1, -1, 1], np.float32, "beta_t") + last_recurrent_state_t = ops.parameter([-1, -1, -1, -1], np.float32, "last_recurrent_state_t") + core_attn_out_t = ops.parameter([-1, -1, -1, -1], np.float32, "core_attn_out_t") + + const_two = ops.constant(2, dtype=np.int32) + q_t = ops.squeeze(q_t_param, const_two) + k_t = ops.squeeze(k_t_param, const_two) + v_t = ops.squeeze(v_t_param, const_two) + const_minus_one = ops.constant(-1, dtype=np.int32) + g_t = ops.unsqueeze(ops.exp(g_t_param), const_minus_one) + beta_t = beta_t_param + + last_recurrent_state_in = ops.multiply(last_recurrent_state_t, g_t) + const_minus_two = ops.constant(-2, dtype=np.int32) + kv_mem = ops.multiply(last_recurrent_state_in, ops.unsqueeze(k_t, const_minus_one)) + kv_mem = ops.reduce_sum(kv_mem, const_minus_two, False) + delta = ops.multiply(ops.subtract(v_t, kv_mem), beta_t) + last_recurrent_state_delta = ops.multiply( + ops.unsqueeze(k_t, const_minus_one), ops.unsqueeze(delta, const_minus_two) + ) + last_recurrent_state_in = 
ops.add(last_recurrent_state_in, last_recurrent_state_delta) + core_attn_update = ops.multiply(last_recurrent_state_in, ops.unsqueeze(q_t, const_minus_one)) + core_attn_update = ops.reduce_sum(core_attn_update, const_minus_two, True) + const_zero = ops.constant(0, dtype=np.int32) + timestep = ops.unsqueeze(timestep_param, const_zero) + + core_attn_out_res = ops.scatter_update(core_attn_out_t, timestep, core_attn_update, const_two) + last_recurrent_state_res = last_recurrent_state_in + + body_cond = ops.constant([True], dtype=bool) + + body_model = ov.Model( + [body_cond, last_recurrent_state_res, core_attn_out_res], + [ + timestep_param, + q_t_param, + k_t_param, + v_t_param, + g_t_param, + beta_t_param, + last_recurrent_state_t, + core_attn_out_t, + ], + "body_model", + ) + + seq_len = ops.convert(seq_len, "i32") + loop = ops.loop(seq_len, ops.constant(True, dtype="bool")) + loop.set_function(body_model) + + loop.set_sliced_input(q_t_param, query, 0, 1, 1, -1, 2) + loop.set_sliced_input(k_t_param, key, 0, 1, 1, -1, 2) + loop.set_sliced_input(v_t_param, value, 0, 1, 1, -1, 2) + loop.set_sliced_input(g_t_param, g, 0, 1, 1, -1, 2) + loop.set_sliced_input(beta_t_param, beta, 0, 1, 1, -1, 2) + loop.set_merged_input(last_recurrent_state_t, last_recurrent_state_old, last_recurrent_state_res.output(0)) + loop.set_merged_input(core_attn_out_t, core_attn_out.output(0), core_attn_out_res.output(0)) + loop.set_special_body_ports([0, 0]) + + core_attn_out_new = loop.get_iter_value(core_attn_out_res.output(0), -1) + last_recurrent_state_new = loop.get_iter_value(last_recurrent_state_res.output(0), -1) + + flatten_shape = ops.constant([-1], dtype=np.int32) + core_attn_out_new = ops.reshape(core_attn_out_new, flatten_shape, False) + last_recurrent_state_new = ops.reshape(last_recurrent_state_new, flatten_shape, False) + + final_output = ops.concat([core_attn_out_new, last_recurrent_state_new], 0) + + return [final_output.output(0)] From 050d14f61d5eec382c9dfcce7b33053e2096930f 
Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 18:16:05 +0000 Subject: [PATCH 02/10] Add initial Qwen3.5 model support with VLM and hybrid text model Co-authored-by: rkazants <35459624+rkazants@users.noreply.github.com> --- docs/source/openvino/models.mdx | 1 + optimum/exporters/openvino/model_configs.py | 259 +++++++++++ optimum/exporters/openvino/model_patcher.py | 414 ++++++++++++++++++ optimum/exporters/openvino/utils.py | 3 +- optimum/intel/openvino/modeling_decoder.py | 4 +- .../openvino/modeling_visual_language.py | 365 ++++++++++++++- tests/openvino/test_decoder.py | 5 + tests/openvino/utils_tests.py | 8 + 8 files changed, 1055 insertions(+), 4 deletions(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 51200060e8..4ab826378b 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -131,6 +131,7 @@ Here is the list of the supported architectures : - Qwen2VL - Qwen2.5VL - Qwen3VL +- Qwen3.5 - ResNet - Roberta - Roformer diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 5c2023f2c9..527c6321ab 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -194,6 +194,8 @@ Qwen2MoEPatcher, Qwen2VLLanguageModelPatcher, Qwen2VLVisionEmbMergerPatcher, + Qwen3_5ModelPatcher, + Qwen3_5VisionEmbMergerPatcher, Qwen3MoeModelPatcher, Qwen3VLLanguageModelPatcher, Qwen3VLVisionEmbMergerPatcher, @@ -252,6 +254,10 @@ def init_model_configs(): "transformers", "AutoModelForCausalLM", ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen3_5", "image-text-to-text")] = ( + "transformers", + "AutoModelForImageTextToText", + ) # since transformers v4.46, model can be loaded using default AutoModelForImageTextToText # https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/models/auto/modeling_auto.py#L776 @@ -5360,3 +5366,256 
@@ class HunyuanV1DenseOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.57.0" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator + + +class Qwen3_5DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + """ + Generates dummy cache_params inputs for Qwen3.5 architectures. + """ + + SUPPORTED_INPUT_NAMES = ("cache_params",) + + def __init__( + self, + task: str, + normalized_config, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + **kwargs, + ) + + config = normalized_config.config + self.num_full_attn_layers = config.layer_types.count("full_attention") + self.num_linear_attn_layers = config.layer_types.count("linear_attention") + self.conv_kernel_size = config.linear_conv_kernel_dim + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.head_k_dim = config.linear_key_head_dim + self.head_v_dim = config.linear_value_head_dim + self.num_v_heads = config.linear_num_value_heads + self.num_k_heads = config.linear_num_key_heads + self.num_key_value_heads = config.num_key_value_heads + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + cache_params = [] + + for idx in range(self.num_linear_attn_layers): + d_inner = self.num_k_heads * (2 * self.head_k_dim + self.head_v_dim * self.num_v_heads // self.num_k_heads) + conv_state_shape = ( + self.batch_size, + d_inner, + self.conv_kernel_size, + ) + conv_state = self.random_float_tensor(conv_state_shape, framework=framework, dtype=float_dtype) + cache_params.append(conv_state) + num_heads = self.num_v_heads + recurrent_state_shape = (self.batch_size, num_heads, 
self.head_k_dim, self.head_v_dim) + recurrent_state = self.random_float_tensor(recurrent_state_shape, framework=framework, dtype=float_dtype) + cache_params.append(recurrent_state) + + for idx in range(self.num_full_attn_layers): + kv_shape = (self.batch_size, self.num_key_value_heads, self.sequence_length, self.head_dim) + k = self.random_float_tensor(kv_shape, framework=framework, dtype=float_dtype) + v = self.random_float_tensor(kv_shape, framework=framework, dtype=float_dtype) + cache_params.append(k) + cache_params.append(v) + + return cache_params + + +@register_in_tasks_manager( + "qwen3_5_text", + *["text-generation", "text-generation-with-past"], + library_name="transformers", +) +class Qwen3_5TextOpenVINOConfig(Qwen3OpenVINOConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Qwen3_5DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = Qwen3_5DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + MIN_TRANSFORMERS_VERSION = "4.57.0" + _MODEL_PATCHER = Qwen3_5ModelPatcher + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + cache_name_prefix = "cache_params.past" + else: + decoder_sequence_name = "past_sequence_length + sequence_length" + cache_name_prefix = "cache_params.present" + + self.num_full_attn_layers = self._normalized_config.layer_types.count("full_attention") + self.num_linear_attn_layers = self._normalized_config.layer_types.count("linear_attention") + + for i in range(self.num_linear_attn_layers): + inputs_or_outputs[f"{cache_name_prefix}.conv.{i}"] = {0: "batch_size"} + inputs_or_outputs[f"{cache_name_prefix}.ssm.{i}"] = {0: "batch_size"} + + for i in range(self.num_full_attn_layers): + inputs_or_outputs[f"{cache_name_prefix}.key.{i}"] 
= {0: "batch_size", 2: decoder_sequence_name} + inputs_or_outputs[f"{cache_name_prefix}.value.{i}"] = {0: "batch_size", 2: decoder_sequence_name} + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"}, + } + if self.use_past_in_inputs: + self.add_past_key_values(common_inputs, direction="inputs") + return common_inputs + + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): + dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) + + dummy_inputs = {} + input_names = [key for key in self.inputs.keys() if not key.startswith("cache_params")] + if self.use_past_in_inputs: + input_names.extend(["cache_params"]) + + for input_name in input_names: + input_was_inserted = False + for dummy_input_gen in dummy_inputs_generators: + if dummy_input_gen.supports_input(input_name): + dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( + dummy_input_gen, + input_name, + framework, + input_shapes=kwargs, + ) + input_was_inserted = True + break + if not input_was_inserted: + raise RuntimeError( + f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' 
+ ) + + return dummy_inputs + + +@register_in_tasks_manager( + "qwen3_5", + *["image-text-to-text"], + library_name="transformers", +) +class Qwen3_5OpenVINOConfig(Qwen2VLOpenVINOConfig): + SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior] + DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,) + MIN_TRANSFORMERS_VERSION = "4.57.0" + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: QwenVLConfigBehavior = QwenVLConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + behavior=behavior, + ) + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self._normalized_config.use_embed_dim = True + + @staticmethod + def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): + if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + vision_emb_pos = model.visual.pos_embed + vision_emb_pos.config = model.config.vision_config + return vision_emb_pos + + return Qwen2VLOpenVINOConfig.get_model_for_behavior(model, behavior) + + def with_behavior( + self, + behavior: Union[str, QwenVLConfigBehavior], + ): + """ + Creates a config for different behaviour. + Args: + behavior ([`ConfigBehavior`]): + The behavior to use for the new instance. 
+ """ + if isinstance(behavior, str) and not isinstance(behavior, QwenVLConfigBehavior): + behavior = QwenVLConfigBehavior(behavior) + + if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config( + "qwen3_5_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ) + + if behavior == QwenVLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen3_5_text", + self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + model_patcher=Qwen3_5ModelPatcher, + dummy_input_generator=DummyQwen2VLLMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + ) + + if behavior in ( + QwenVLConfigBehavior.VISION_EMBEDDINGS, + QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER, + QwenVLConfigBehavior.VISION_EMBEDDINGS_POS, + ): + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + def patch_model_for_export(self, model: Union["PreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None): + model_kwargs = model_kwargs or {} + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return Qwen3_5VisionEmbMergerPatcher(self, model, model_kwargs) + if ( + self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS + or self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS + ): + return ModelPatcher(self, model, model_kwargs=model_kwargs) + return super().patch_model_for_export(model, model_kwargs) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + return { + "input": {1: "sequence_length"}, + } + return super().inputs + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: + return super().outputs + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: + 
return {"last_hidden_state": {0: "seq_len"}} + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + return {"last_hidden_state": {0: "seq_len", 1: "seq_len"}} + if self._behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + return {"inputs_embeds": {0: "batch_size", 1: "sequence_length"}} + if self._behavior == QwenVLConfigBehavior.LANGUAGE: + return get_vlm_internal_text_generation_config( + "qwen3_5_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ).outputs + raise Exception("Unknown Qwen3.5 behavior type.") diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 634f015872..28110c1040 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -56,6 +56,8 @@ ) from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version +from ._ov_ops import convert_recurrent_attention_cell + if is_transformers_version(">=", "4.53"): from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask @@ -7997,3 +7999,415 @@ def forward( hidden_states=outputs.hidden_states, d2t=d2t_out, ) + + +# Patched implementation of the gated delta rule in recurrent form. +# Adapted from: +# https://github.com/huggingface/transformers/blob/v4.57-release/src/transformers/models/qwen3_next/modeling_qwen3_next.py#L522 +# +# To represent the for-loop that generates output embeddings, we use a module +# and the conversion extension mechanism. 
This is necessary because there is +# no known vectorized form of this loop that would allow it to be correctly +# traced with torch.jit.trace +def patched_recurrent_gated_delta_rule( + self, query, key, value, g, beta, initial_state, output_final_state, use_qk_l2norm_in_kernel=False +): + def l2norm(x: torch.FloatTensor, dim: int = -1, eps: float = 1e-6): + """This function is intended to align with the l2norm implementation in the FLA library.""" + inv_norm = torch.rsqrt((x * x).sum(dim=dim, keepdim=True) + eps) + return x * inv_norm + + initial_dtype = query.dtype + if use_qk_l2norm_in_kernel: + query = l2norm(query, dim=-1, eps=1e-6) + key = l2norm(key, dim=-1, eps=1e-6) + query, key, value, beta, g = [ + x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g) + ] + + batch_size, num_heads, sequence_length, k_head_dim = key.shape + v_head_dim = value.shape[-1] + scale = 1 / (query.shape[-1] ** 0.5) + query = query * scale + + last_recurrent_state = ( + torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value) + if initial_state is None + else initial_state.to(value) + ) + + output_cell = self.recurrent_attention_cell( + query, # (B, H, T, D1) + key, # (B, H, T, D1) + value, # (B, H, T, D2) + g, # (B, H, T) + beta, # (B, H, T) + last_recurrent_state, # (B, H, D1, D2) + ) + + num_elems = value.numel() + core_attn_out = output_cell[:num_elems].reshape(value.shape) + last_recurrent_state = output_cell[num_elems:].reshape(last_recurrent_state.shape) + + if not output_final_state: + last_recurrent_state = None + core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype) + return core_attn_out, last_recurrent_state + + +# The CausalConv1D block is overridden with a generic patch provided by `ov_causal_conv1d()`. +# The GatedDeltaNet block is overridden with a recurrent version of its implementation. 
+# +# To replace GatedDeltaNet with its recurrent form, patching uses the ModuleExtension +# approach, which replaces the GatedDeltaNet block with a single operation, +# `GatedDeltaNetOp`. OpenVINO then applies the `convert_recurrent_attention_cell()` +# conversion rule to this operation. +def qwen3_5_gated_delta_net_forward( + self, + hidden_states: torch.Tensor, + cache_params=None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, +): + def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + # NOTE: attention mask is a 2D boolean tensor + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + + # getting projected states from cache if it exists + layer_idx = None + recurrent_state = None + if cache_params is not None: + layer_idx = cache_params.linear_attn_mapping[self.layer_idx] + conv_state = cache_params.conv_states[layer_idx] + recurrent_state = cache_params.recurrent_states[layer_idx] + + mixed_qkv = self.in_proj_qkv(hidden_states) + mixed_qkv = mixed_qkv.transpose(1, 2) + + z = self.in_proj_z(hidden_states) + z = z.reshape(batch_size, seq_len, -1, self.head_v_dim) + + b = self.in_proj_b(hidden_states) + a = self.in_proj_a(hidden_states) + + if cache_params is not None: + new_mixed_qkv, new_conv_state = ov_causal_conv1d(conv_state, mixed_qkv, self.conv1d.weight, self.conv1d.bias) + mixed_qkv = F.silu(new_mixed_qkv) + cache_params.conv_states[layer_idx] = new_conv_state + else: + mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len]) + + mixed_qkv 
= mixed_qkv.transpose(1, 2) + query, key, value = torch.split( + mixed_qkv, + [ + self.key_dim, + self.key_dim, + self.value_dim, + ], + dim=-1, + ) + query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim) + key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim) + value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim) + + beta = b.sigmoid() + # If the model is loaded in fp16, without the .float() here, A might be -inf + g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias) + if self.num_v_heads // self.num_k_heads > 1: + query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + + core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule( + self, + query, + key, + value, + g=g, + beta=beta, + initial_state=recurrent_state, + output_final_state=cache_params is not None, + use_qk_l2norm_in_kernel=True, + ) + + # Update cache + if cache_params is not None: + cache_params.recurrent_states[layer_idx] = last_recurrent_state + + # reshape input data into 2D tensor + core_attn_out = core_attn_out.reshape(-1, self.head_v_dim) + z = z.reshape(-1, self.head_v_dim) + core_attn_out = self.norm(core_attn_out, z) + core_attn_out = core_attn_out.reshape(batch_size, seq_len, -1) + + output = self.out_proj(core_attn_out) + return output + + +# This torch.nn.Module represents the GatedDeltaNet layer in its recurrent form. +# It is required for converting the GatedDeltaNet layer with OpenVINO using the ModuleExtension mechanism. 
+class RecurrentAttentionCell(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward( + self, + query, # (B, H, T, D1) + key, # (B, H, T, D1) + value, # (B, H, T, D2) + g, # (B, H, T) + beta, # (B, H, T) + last_recurrent_state, # (B, H, D1, D2) + ): + _, _, sequence_length, _ = key.shape + core_attn_out = torch.zeros_like(value) + + for i in range(sequence_length): + q_t = query[:, :, i] + k_t = key[:, :, i] + v_t = value[:, :, i] + g_t = g[:, :, i].exp().unsqueeze(-1).unsqueeze(-1) + beta_t = beta[:, :, i].unsqueeze(-1) + + last_recurrent_state = last_recurrent_state * g_t + kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) + delta = (v_t - kv_mem) * beta_t + last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2) + core_attn_out[:, :, i] = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) + + # This is a workaround to ensure a single output from the torch.nn.Module. + # The OpenVINO ModuleExtension mechanism has a limitation and expects + # the module to produce only one output. 
+ output_cell = torch.cat([core_attn_out.flatten(), last_recurrent_state.flatten()], dim=0) + return output_cell + + +class Qwen3_5ModelPatcher(OVDecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Optional[Dict[str, Any]] = None, + ): + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5DynamicCache + + from openvino.frontend.pytorch import ConversionExtension, ModuleExtension + + super().__init__(config, model, model_kwargs) + + # Detect VLM vs text-only model + self._is_vlm = hasattr(self._model.model, "language_model") + if self._is_vlm: + self._text_model = self._model.model.language_model + self._text_config = self._model.config.text_config + else: + self._text_model = self._model.model + self._text_config = self._model.model.config + + class Qwen3_5DynamicCacheWrap(Qwen3_5DynamicCache): + def __init__(self, config, conv_states, recurrent_states, key_cache, value_cache): + # Call parent constructor with all required arguments + super().__init__(config=config) + + self.conv_states = conv_states + self.recurrent_states = recurrent_states + self.key_cache = key_cache + self.value_cache = value_cache + self.full_attn_mapping = {} + self.linear_attn_mapping = {} + full_attn_layer_idx = 0 + linear_attn_layer_idx = 0 + for i in range(len(config.layer_types)): + if self.layer_types[i] == "full_attention": + self.full_attn_mapping[i] = full_attn_layer_idx + full_attn_layer_idx += 1 + elif self.layer_types[i] == "linear_attention": + self.linear_attn_mapping[i] = linear_attn_layer_idx + linear_attn_layer_idx += 1 + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[dict[str, Any]] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # map layer_idx to key_cache (value_cache) idx + layer_idx = self.full_attn_mapping[layer_idx] + if self.key_cache[layer_idx] is None: + self.key_cache[layer_idx] = key_states + 
self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # take any layer that contains cache and not empty tensor + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + layer_idx = self.full_attn_mapping[layer_idx] + if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx] is None: + return 0 + return self.key_cache[layer_idx].shape[-2] + + @property + def has_previous_state(self): + """We have a previous state if the last linear (conv) layer was already updated.""" + layer_idx = self.linear_attn_mapping[self.last_linear_layer] + return self.conv_states[layer_idx] is not None + + # the patch is needed to include KV-cache, Conv, and SSM states in the inputs and outputs. 
+ def patched_forward( + input_ids=None, + attention_mask=None, + cache_params=None, + inputs_embeds=None, + position_ids=None, + ): + text_config = self._text_config + num_full_attn_layers = text_config.layer_types.count("full_attention") + num_linear_attn_layers = text_config.layer_types.count("linear_attention") + + use_cache = False + wrapped_cache_params = None + if cache_params is not None: + use_cache = True + conv_states = [] + recurrent_states = [] + key_cache = [] + value_cache = [] + + # decouple ssm_states, conv_states, keys and values from cache_params + for idx in range(num_linear_attn_layers): + conv_states.append(cache_params[2 * idx]) + recurrent_states.append(cache_params[2 * idx + 1]) + + for idx in range(num_full_attn_layers): + key_cache.append(cache_params[2 * num_linear_attn_layers + 2 * idx]) + value_cache.append(cache_params[2 * num_linear_attn_layers + 2 * idx + 1]) + + wrapped_cache_params = Qwen3_5DynamicCacheWrap( + text_config, conv_states, recurrent_states, key_cache, value_cache + ) + + if self._is_vlm: + # VLM case: call language model through the composite model + outputs_lm = self._text_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=wrapped_cache_params, + use_cache=use_cache, + ) + hidden_states = outputs_lm[0] + logits = self._model.lm_head(hidden_states) + past_kv = outputs_lm.past_key_values + else: + causal_lm_output = self.model_orig_forward( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=wrapped_cache_params, + use_cache=use_cache, + ) + logits = causal_lm_output.logits + past_kv = causal_lm_output.past_key_values + outputs = { + "logits": logits, + } + + if use_cache: + present_key_values = [] + for idx in range(num_linear_attn_layers): + present_key_values.append(past_kv.conv_states[idx]) + present_key_values.append(past_kv.recurrent_states[idx]) + + for idx in range(num_full_attn_layers): + 
present_key_values.append(past_kv.key_cache[idx]) + present_key_values.append(past_kv.value_cache[idx]) + + outputs["present_key_values"] = present_key_values + + return outputs + + self.patched_forward = patched_forward + self.model_orig_forward = self.orig_forward + self.orig_forward = patched_forward + + self.module_extensions = { + RecurrentAttentionCell: ModuleExtension(RecurrentAttentionCell, "RecurrentAttentionCellOp"), + } + self.conversion_extensions = [ + ConversionExtension("RecurrentAttentionCellOp", convert_recurrent_attention_cell), + ] + + def __enter__(self): + super().__enter__() + setattr(self._model, self.orig_forward_name, self.patched_forward) + + for idx, decoder_layer in enumerate(self._text_model.layers): + layer_type = self._text_config.layer_types[idx] + if layer_type == "linear_attention": + linear_attn_layer = decoder_layer.linear_attn + linear_attn_layer._orig_forward = linear_attn_layer.forward + linear_attn_layer.forward = types.MethodType(qwen3_5_gated_delta_net_forward, linear_attn_layer) + linear_attn_layer.recurrent_gated_delta_rule = patched_recurrent_gated_delta_rule + linear_attn_layer.recurrent_attention_cell = RecurrentAttentionCell() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + setattr(self._model, self.orig_forward_name, self.model_orig_forward) + for idx, decoder_layer in enumerate(self._text_model.layers): + layer_type = self._text_config.layer_types[idx] + if layer_type == "linear_attention": + linear_attn_layer = decoder_layer.linear_attn + linear_attn_layer.forward = linear_attn_layer._orig_forward + + +class Qwen3_5VisionEmbMergerPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any] = None, + ): + model.__orig_forward = model.forward + + # Adapted from Qwen3.5 VisionModel forward + # added attention_mask input instead of cu_seqlens for its internal calculation + # separated 
patch_embed and rot_pos_emb calls for performing as part of another model + def image_embed_forward( + self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor + ) -> torch.Tensor: + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + position_embeddings = (emb.cos(), emb.sin()) + for blk in self.blocks: + hidden_states = blk(hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings) + return self.merger(hidden_states) + + model.forward = types.MethodType(image_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __enter__(self): + patch_qwen2vl_vision_blocks(self._model) + super().__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + for block in self._model.blocks: + block.forward = block._orig_forward + block.attn.forward = block.attn._orig_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 3d9a854e39..08011e44b2 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -295,6 +295,7 @@ def get_submodels(model): "qwen2_vl", "qwen2_5_vl", "qwen3_vl", + "qwen3_5", "got_ocr2", "gemma3", "idefics3", @@ -305,7 +306,7 @@ def get_submodels(model): "minicpmo", ] -SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid"] +SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_5_text"] # All transformers, diffusers, timm and sentence transformers models that are supported via optimum-onnx OnnxConfigs but that have currently no test # TODO: add tests for all models that are compatible and remove support for all others diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 3b95b5f276..66e036bc37 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ 
b/optimum/intel/openvino/modeling_decoder.py @@ -1449,8 +1449,8 @@ def prepare_inputs_for_generation( # decoding stage so it takes the last token input_ids = input_ids[:, -1].unsqueeze(-1) - if self.config.model_type not in ["lfm2", "granitemoehybrid"]: - # LFM2 and GraniteMoeHybrid (Granite-4.0) require the attention mask + if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_5_text"]: + # LFM2, GraniteMoeHybrid (Granite-4.0), and Qwen3.5 require the attention mask # to be the length of the full context, so default mask from OVModelForCausalLM needs to be used. # Other models like Mamba typically do not require an attention_mask # for the decoding step after the first token so use attention mask of ones. diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 2fe8cb0ea0..ba002befde 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -190,7 +190,7 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] - if (self.config.model_type in ["qwen2_vl", "qwen3_vl"]) and position_ids.ndim != 3: + if (self.config.model_type in ["qwen2_vl", "qwen3_vl", "qwen3_5"]) and position_ids.ndim != 3: position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids @@ -3431,6 +3431,12 @@ def preprocess_inputs( Qwen3VLVisionModel, Qwen3VLVisionRotaryEmbedding, ) + + from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5Model, + Qwen3_5VisionModel, + Qwen3_5VisionRotaryEmbedding, + ) else: class Qwen3VLModel: @@ -3439,6 +3445,12 @@ class Qwen3VLModel: class Qwen3VLVisionModel: pass + class Qwen3_5Model: + pass + + class Qwen3_5VisionModel: + pass + # The inheritance from Qwen3VLModel is needed to get access to methods: # get_placeholder_mask(): 
https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L1066 @@ -4802,6 +4814,355 @@ def preprocess_inputs( return inputs +# The inheritance from Qwen3_5Model is needed to get access to methods: +# get_placeholder_mask(), get_rope_index(), get_image_features(), get_video_features(), compute_3d_position_ids() +# +# and inheritance from Qwen3_5VisionModel is needed for accessing the following method: +# rot_pos_emb() +class _OVQwen3_5ForCausalLM(OVModelForVisualCausalLM, Qwen3_5Model, Qwen3_5VisionModel): + additional_parts = ["vision_embeddings_merger", "vision_embeddings_pos"] + + def __init__( + self, + language_model: ov.Model, + text_embeddings: ov.Model, + vision_embeddings: ov.Model, + config: PretrainedConfig = None, + device: str = "CPU", + dynamic_shapes: bool = None, + ov_config: Optional[Dict[str, str]] = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + **kwargs, + ): + if is_transformers_version("<", "4.57.0"): + raise Exception("Qwen3.5 is not supported in transformers versions earlier than 4.57.0.") + + super().__init__( + language_model=language_model, + text_embeddings=text_embeddings, + vision_embeddings=vision_embeddings, + config=config, + device=device, + dynamic_shapes=dynamic_shapes, + ov_config=ov_config, + model_save_dir=model_save_dir, + quantization_config=quantization_config, + **kwargs, + ) + self.rope_deltas = None # cache rope_deltas here + + self.num_grid_per_side = int(config.vision_config.num_position_embeddings**0.5) + self.spatial_merge_size = config.vision_config.spatial_merge_size + head_dim = config.vision_config.hidden_size // config.vision_config.num_heads + self.rotary_pos_emb = Qwen3_5VisionRotaryEmbedding(head_dim // 2) + + def __setattr__(self, name, value): + OVModelForVisualCausalLM.__setattr__(self, name, value) + + def prepare_inputs_for_generation( + self, + 
input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if past_key_values is not None: + if inputs_embeds is not None and input_ids.shape[1] == 0: # Exception 4 + inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :] + elif inputs_embeds is not None: + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if cache_position[0] != 0: + pixel_values = None + pixel_values_videos = None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + model_inputs = {"input_ids": input_ids, "inputs_embeds": None} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "pixel_values_videos": pixel_values_videos, + "image_grid_thw": image_grid_thw, + "video_grid_thw": video_grid_thw, + "cache_position": cache_position, + } + ) + return model_inputs + + # Adapted from Qwen3_5VisionModel.fast_pos_embed_interpolate + # This method needs to be changed, as instead of running self.pos_embed of type nn.Embedding, openvino model needs to be inferred (self.vision_embeddings_pos) + def fast_pos_embed_interpolate(self, grid_thw): + grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2] + + idx_list = [[] for _ in range(4)] + weight_list = [[] for _ in range(4)] + + for t, h, w in zip(grid_ts, 
grid_hs, grid_ws): + h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h) + w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w) + + h_idxs_floor = h_idxs.int() + w_idxs_floor = w_idxs.int() + h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + + dh = h_idxs - h_idxs_floor + dw = w_idxs - w_idxs_floor + + base_h = h_idxs_floor * self.num_grid_per_side + base_h_ceil = h_idxs_ceil * self.num_grid_per_side + + indices = [ + (base_h[None].T + w_idxs_floor[None]).flatten(), + (base_h[None].T + w_idxs_ceil[None]).flatten(), + (base_h_ceil[None].T + w_idxs_floor[None]).flatten(), + (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(), + ] + + weights = [ + ((1 - dh)[None].T * (1 - dw)[None]).flatten(), + ((1 - dh)[None].T * dw[None]).flatten(), + (dh[None].T * (1 - dw)[None]).flatten(), + (dh[None].T * dw[None]).flatten(), + ] + + for i in range(4): + idx_list[i].extend(indices[i].tolist()) + weight_list[i].extend(weights[i].tolist()) + + idx_tensor = torch.tensor(idx_list) + weight_tensor = torch.tensor(weight_list) + pos_embeds = torch.from_numpy(self.vision_embeddings_pos(idx_tensor)) * weight_tensor[:, :, None] + patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3] + + patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)]) + + patch_pos_embeds_permute = [] + merge_size = self.config.vision_config.spatial_merge_size + for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws): + pos_embed = pos_embed.repeat(t, 1) + pos_embed = ( + pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1) + .permute(0, 1, 3, 2, 4, 5) + .flatten(0, 4) + ) + patch_pos_embeds_permute.append(pos_embed) + patch_pos_embeds = torch.cat(patch_pos_embeds_permute) + return patch_pos_embeds + + def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs): + hidden_states = 
torch.from_numpy(self.vision_embeddings(pixel_values)[0]) + + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + hidden_states = hidden_states + pos_embeds + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32 + ) + cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0) + attention_mask = torch.zeros((1, hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True + + causal_mask.masked_fill_(torch.logical_not(attention_mask), float("-inf")) + + res = self.vision_embeddings_merger( + pixel_values=hidden_states, attention_mask=causal_mask, rotary_pos_emb=rotary_pos_emb + ) + return res[0] + + def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None): + """ + Encodes images into continuous embeddings that can be forwarded to the language model. + """ + image_embeds = torch.from_numpy(self.get_vision_embeddings(pixel_values, image_grid_thw)) + split_sizes = (image_grid_thw.prod(-1) // self.spatial_merge_size**2).tolist() + image_embeds = torch.split(image_embeds, split_sizes) + return image_embeds + + def get_video_features( + self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None + ): + """ + Encodes videos into continuous embeddings that can be forwarded to the language model. 
+ """ + return self.get_image_features(pixel_values_videos, video_grid_thw) + + def get_multimodal_embeddings( + self, + input_ids, + pixel_values=None, + attention_mask=None, + position_ids=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + cache_position=None, + **kwargs, + ): + inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) + if pixel_values is not None and input_ids.shape[1] != 1: + image_embeds = self.get_image_features(pixel_values, image_grid_thw) + image_embeds = torch.cat(image_embeds, dim=0) + n_image_tokens = (input_ids == self.config.image_token_id).sum().item() + n_image_features = image_embeds.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + mask = input_ids == self.config.image_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + image_mask = mask_expanded.to(inputs_embeds.device) + + image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + if pixel_values_videos is not None and input_ids.shape[1] != 1: + video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw) + video_embeds = torch.cat(video_embeds, dim=0) + n_video_tokens = (input_ids == self.config.video_token_id).sum().item() + n_video_features = video_embeds.shape[0] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) + + mask = input_ids == self.config.video_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + video_mask = mask_expanded.to(inputs_embeds.device) + + video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = 
inputs_embeds.masked_scatter(video_mask, video_embeds) + + if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2): + # calculate RoPE index once per generation in the pre-fill stage only + if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None: + # Construct mm_token_type_ids from input_ids + mm_token_type_ids = torch.zeros_like(input_ids, dtype=torch.int) + mm_token_type_ids[input_ids == self.config.image_token_id] = 1 + mm_token_type_ids[input_ids == self.config.video_token_id] = 2 + position_ids, rope_deltas = self.get_rope_index( + input_ids, mm_token_type_ids, image_grid_thw, video_grid_thw, attention_mask + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0 + position_ids = torch.arange(seq_length, device=inputs_embeds.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + if cache_position is not None: # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + + return inputs_embeds, attention_mask, position_ids + + @staticmethod + def preprocess_inputs( + text: str, + image: Optional["Image"] = None, + processor: Optional[AutoImageProcessor] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + config: Optional[PretrainedConfig] = None, + video: Optional["VideoInput"] = None, + audio: Optional[np.ndarray] = None, + ): + if processor is None: + raise ValueError("Processor is required.") + if audio is not None: + raise ValueError("Audio input is not supported") + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + ], + } + ] + if image is not None: + 
conversation[0]["content"].insert(0, {"type": "image"}) + if video is not None: + conversation[0]["content"].insert(0, {"type": "video"}) + + text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt") + return inputs + + def forward( + self, + input_ids, + pixel_values=None, + past_key_values=None, + inputs_embeds=None, + image_sizes=None, + attention_mask=None, + position_ids=None, + image_bound=None, + tgt_sizes=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + rope_deltas=None, + **kwargs, + ): + result = super().forward( + input_ids, + pixel_values, + past_key_values, + inputs_embeds, + image_sizes, + attention_mask, + position_ids, + image_bound, + tgt_sizes, + pixel_values_videos, + image_grid_thw, + video_grid_thw, + rope_deltas, + **kwargs, + ) + final_result = QWen2VLModelOutputWithPast( + logits=result.logits, past_key_values=result.past_key_values, rope_deltas=rope_deltas + ) + return final_result + + def generate(self, *args, **kwargs): + # Clear cached rope delta from previous generations + self.rope_deltas = None + + return super().generate(*args, **kwargs) + + MODEL_TYPE_TO_CLS_MAPPING = { "llava": _OVLlavaForCausalLM, "llava_next": _OVLlavaNextForCausalLM, @@ -4823,5 +5184,7 @@ def preprocess_inputs( "phi4_multimodal": _OVPhi4MMForCausalLM, "llama4": _OVLlama4ForCausalLM, "qwen3_vl": _OVQwen3VLForCausalLM, + "qwen3_5": _OVQwen3_5ForCausalLM, + "qwen3_5_text": _OVQwen3_5ForCausalLM, "minicpmo": _OVMiniCPMOForCausalLM, } diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 0ddb251b22..c3a89fe3aa 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -28,6 +28,7 @@ DeepseekOpenVINOConfig, LFM2OpenVINOConfig, Qwen3VLOpenVINOConfig, + Qwen3_5TextOpenVINOConfig, ) from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from 
optimum.exporters.openvino.utils import ONNX_SUPPORTED_ARCHITECTURES @@ -333,6 +334,10 @@ def test_find_untested_architectures(self): "exaone4", } + # qwen3_5_text is a part of qwen3_5 architecture and is tested in seq2seq group + if is_transformers_version(">=", str(Qwen3_5TextOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + supported_architectures -= {"qwen3_5_text"} + supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 61659fa1ed..a17750c2cc 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -172,6 +172,7 @@ "qwen3": "optimum-intel-internal-testing/tiny-random-qwen3", "qwen3_moe": "optimum-intel-internal-testing/tiny-random-qwen3moe", "qwen3_vl": "optimum-intel-internal-testing/tiny-random-qwen3-vl", + "qwen3_5": "optimum-intel-internal-testing/tiny-random-qwen3.5", "rembert": "optimum-intel-internal-testing/tiny-random-rembert", "resnet": "optimum-intel-internal-testing/tiny-random-resnet", "roberta": "optimum-intel-internal-testing/tiny-random-roberta", @@ -334,6 +335,13 @@ "vision_embeddings_merger_model": 32, "vision_embeddings_pos_model": 1, }, + "qwen3_5": { + "lm_model": 100, + "text_embeddings_model": 1, + "vision_embeddings_model": 1, + "vision_embeddings_merger_model": 32, + "vision_embeddings_pos_model": 1, + }, "sana": { "transformer": 58, "vae_decoder": 28, From 4cbb25e8d713d003a60a981013321ecf3bdadc38 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 18:28:40 +0000 Subject: [PATCH 03/10] Fix Qwen3.5 model patcher and config for VLM text embeddings access Co-authored-by: rkazants <35459624+rkazants@users.noreply.github.com> --- optimum/exporters/openvino/model_configs.py | 5 +++++ optimum/exporters/openvino/model_patcher.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git
a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 527c6321ab..a1e77272ab 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5540,6 +5540,11 @@ def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): vision_emb_pos.config = model.config.vision_config return vision_emb_pos + if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + text_embedding = model.model.language_model.embed_tokens + text_embedding.config = model.config + return text_embedding + return Qwen2VLOpenVINOConfig.get_model_for_behavior(model, behavior) def with_behavior( diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 28110c1040..7321b2371d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -56,8 +56,6 @@ ) from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version -from ._ov_ops import convert_recurrent_attention_cell - if is_transformers_version(">=", "4.53"): from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask @@ -8205,6 +8203,8 @@ def __init__( from openvino.frontend.pytorch import ConversionExtension, ModuleExtension + from ._ov_ops import convert_recurrent_attention_cell + super().__init__(config, model, model_kwargs) # Detect VLM vs text-only model From b660200347700b3d4f6b5fee3c4e9172d03efb75 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 18:30:23 +0000 Subject: [PATCH 04/10] Fix comment grammar in test_decoder.py Co-authored-by: rkazants <35459624+rkazants@users.noreply.github.com> --- docs/source/openvino/models.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 4ab826378b..4365b1ff84 100644 --- 
a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -131,7 +131,7 @@ Here is the list of the supported architectures : - Qwen2VL - Qwen2.5VL - Qwen3VL -- Qwen3.5 +- Qwen3.5 - ResNet - Roberta - Roformer From 07d943dfd90acc944331ba71e009fe4008c4ae3e Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Wed, 11 Mar 2026 10:28:18 +0400 Subject: [PATCH 05/10] Use Qwen3VLOpenVINOConfig Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_configs.py | 24 +++++---------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a1e77272ab..7306a6a9f8 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5434,7 +5434,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int *["text-generation", "text-generation-with-past"], library_name="transformers", ) -class Qwen3_5TextOpenVINOConfig(Qwen3OpenVINOConfig): +class Qwen3_5TextOpenVINOConfig(Qwen3VLTextOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Qwen3_5DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = Qwen3_5DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -5506,9 +5506,9 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): *["image-text-to-text"], library_name="transformers", ) -class Qwen3_5OpenVINOConfig(Qwen2VLOpenVINOConfig): +class Qwen3_5OpenVINOConfig(Qwen3VLOpenVINOConfig): SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior] - DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,) + DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLVisionEmbedInputGenerator,) MIN_TRANSFORMERS_VERSION = "4.57.0" def __init__( @@ -5533,20 +5533,6 @@ def __init__( self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) self._normalized_config.use_embed_dim = True - 
@staticmethod - def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): - if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: - vision_emb_pos = model.visual.pos_embed - vision_emb_pos.config = model.config.vision_config - return vision_emb_pos - - if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = model.model.language_model.embed_tokens - text_embedding.config = model.config - return text_embedding - - return Qwen2VLOpenVINOConfig.get_model_for_behavior(model, behavior) - def with_behavior( self, behavior: Union[str, QwenVLConfigBehavior], @@ -5572,8 +5558,8 @@ def with_behavior( self.int_dtype, self.float_dtype, model_patcher=Qwen3_5ModelPatcher, - dummy_input_generator=DummyQwen2VLLMInputGenerator, - inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + #dummy_input_generator=DummyQwen2VLLMInputGenerator, + #inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, ) if behavior in ( From d8864c45db718f09ac312120502c1340229dc505 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Wed, 11 Mar 2026 14:03:58 +0400 Subject: [PATCH 06/10] Remove redundant functions Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_patcher.py | 94 --------------------- 1 file changed, 94 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 816204b0b6..46cac7047d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -8419,62 +8419,6 @@ def __exit__(self, exc_type, exc_value, traceback): del sparse_moe_block.down_projs, sparse_moe_block.gate_projs, sparse_moe_block.up_projs - - -# Patched implementation of the gated delta rule in recurrent form. 
-# Adapted from: -# https://github.com/huggingface/transformers/blob/v4.57-release/src/transformers/models/qwen3_next/modeling_qwen3_next.py#L522 -# -# To represent the for-loop that generates output embeddings, we use a module -# and the conversion extension mechanism. This is necessary because there is -# no known vectorized form of this loop that would allow it to be correctly -# traced with torch.jit.trace -def patched_recurrent_gated_delta_rule( - self, query, key, value, g, beta, initial_state, output_final_state, use_qk_l2norm_in_kernel=False -): - def l2norm(x: torch.FloatTensor, dim: int = -1, eps: float = 1e-6): - """This function is intended to align with the l2norm implementation in the FLA library.""" - inv_norm = torch.rsqrt((x * x).sum(dim=dim, keepdim=True) + eps) - return x * inv_norm - - initial_dtype = query.dtype - if use_qk_l2norm_in_kernel: - query = l2norm(query, dim=-1, eps=1e-6) - key = l2norm(key, dim=-1, eps=1e-6) - query, key, value, beta, g = [ - x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g) - ] - - batch_size, num_heads, sequence_length, k_head_dim = key.shape - v_head_dim = value.shape[-1] - scale = 1 / (query.shape[-1] ** 0.5) - query = query * scale - - last_recurrent_state = ( - torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value) - if initial_state is None - else initial_state.to(value) - ) - - output_cell = self.recurrent_attention_cell( - query, # (B, H, T, D1) - key, # (B, H, T, D1) - value, # (B, H, T, D2) - g, # (B, H, T) - beta, # (B, H, T) - last_recurrent_state, # (B, H, D1, D2) - ) - - num_elems = value.numel() - core_attn_out = output_cell[:num_elems].reshape(value.shape) - last_recurrent_state = output_cell[num_elems:].reshape(last_recurrent_state.shape) - - if not output_final_state: - last_recurrent_state = None - core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype) - return core_attn_out, last_recurrent_state - - # The CausalConv1D block 
is overridden with a generic patch provided by `ov_causal_conv1d()`. # The GatedDeltaNet block is overridden with a recurrent version of its implementation. # @@ -8576,44 +8520,6 @@ def apply_mask_to_padding_states(hidden_states, attention_mask): return output -# This torch.nn.Module represents the GatedDeltaNet layer in its recurrent form. -# It is required for converting the GatedDeltaNet layer with OpenVINO using the ModuleExtension mechanism. -class RecurrentAttentionCell(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward( - self, - query, # (B, H, T, D1) - key, # (B, H, T, D1) - value, # (B, H, T, D2) - g, # (B, H, T) - beta, # (B, H, T) - last_recurrent_state, # (B, H, D1, D2) - ): - _, _, sequence_length, _ = key.shape - core_attn_out = torch.zeros_like(value) - - for i in range(sequence_length): - q_t = query[:, :, i] - k_t = key[:, :, i] - v_t = value[:, :, i] - g_t = g[:, :, i].exp().unsqueeze(-1).unsqueeze(-1) - beta_t = beta[:, :, i].unsqueeze(-1) - - last_recurrent_state = last_recurrent_state * g_t - kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) - delta = (v_t - kv_mem) * beta_t - last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2) - core_attn_out[:, :, i] = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) - - # This is a workaround to ensure a single output from the torch.nn.Module. - # The OpenVINO ModuleExtension mechanism has a limitation and expects - # the module to produce only one output. 
- output_cell = torch.cat([core_attn_out.flatten(), last_recurrent_state.flatten()], dim=0) - return output_cell - - class Qwen3_5ModelPatcher(OVDecoderModelPatcher): def __init__( self, From 934b32eb9da4bb1b39769562e430f1998f53ec00 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Wed, 18 Mar 2026 22:09:29 +0400 Subject: [PATCH 07/10] Correct patching for vlm Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/stateful.py | 4 ++++ .../openvino/modeling_visual_language.py | 22 ++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py index 3b8642d65a..38ffef5d05 100644 --- a/optimum/exporters/openvino/stateful.py +++ b/optimum/exporters/openvino/stateful.py @@ -310,6 +310,10 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model): return patch_stateful_encoder_decoder(config, ov_model) if config.model_type in SSM_MODELS: return patch_stateful_hybrid_ssm(ov_model) + # For VLM models, the text sub-model may be SSM-based (e.g. 
qwen3_5 VLM with qwen3_5_text language model) + text_config = getattr(config, "text_config", None) + if text_config is not None and getattr(text_config, "model_type", None) in SSM_MODELS: + return patch_stateful_hybrid_ssm(ov_model) return patch_stateful_decoder(config, ov_model) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index bc4d5a76d0..8e7a1c8e21 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -3431,12 +3431,6 @@ def preprocess_inputs( Qwen3VLVisionModel, Qwen3VLVisionRotaryEmbedding, ) - - from transformers.models.qwen3_5.modeling_qwen3_5 import ( - Qwen3_5Model, - Qwen3_5VisionModel, - Qwen3_5VisionRotaryEmbedding, - ) else: class Qwen3VLModel: @@ -3451,6 +3445,22 @@ class Qwen3_5Model: class Qwen3_5VisionModel: pass +if is_transformers_version(">=", "5.2.0"): + from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5Model, + Qwen3_5VisionModel, + Qwen3_5VisionRotaryEmbedding, + ) +else: + + class Qwen3_5Model: + pass + + class Qwen3_5VisionModel: + pass + + class Qwen3_5VisionRotaryEmbedding: + pass # The inheritance from Qwen3VLModel is needed to get access to methods: # get_placeholder_mask(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L1066 From e1f8c28b8575fd63b731e8ab83d8f2f4faa328bc Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Fri, 20 Mar 2026 17:19:53 +0400 Subject: [PATCH 08/10] Fix bf16 patching Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/convert.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index d0efa2259f..23e309efdb 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -447,6 +447,16 @@ def ts_patched_forward(*args, **kwargs): 
extension=conversion_extensions, ) + if patch_16bit_model: + # Undo __make_16bit_traceable patching on sub-modules to avoid corrupting + # forward methods of modules shared across export behaviors (e.g. pos_embed + # Embedding in VLMs that is also exported separately as vision_embeddings_pos). + _orig_forward_attr = "_openvino_module_extension_patch_orig_forward" + for module in model.modules(): + if hasattr(module, _orig_forward_attr): + module.forward = getattr(module, _orig_forward_attr) + delattr(module, _orig_forward_attr) + ov_model.validate_nodes_and_infer_types() # TODO: remove as unnecessary validation? output_names = list(config.outputs.keys()) From 4602e000f4ca2c0e04a03c3633d30703b6bb0b05 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Sun, 22 Mar 2026 20:03:10 +0400 Subject: [PATCH 09/10] Support Qwen3.5-MoE Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_configs.py | 66 +++++++++++++++++- optimum/exporters/openvino/model_patcher.py | 68 +++++++++++++++++++ optimum/exporters/openvino/utils.py | 3 +- optimum/intel/openvino/modeling_decoder.py | 2 +- .../openvino/modeling_visual_language.py | 18 ++++- 5 files changed, 153 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ac48a991ee..17f6074743 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -195,6 +195,7 @@ Qwen2VLLanguageModelPatcher, Qwen2VLVisionEmbMergerPatcher, Qwen3_5ModelPatcher, + Qwen3_5MoeModelPatcher, Qwen3_5VisionEmbMergerPatcher, Qwen3MoeModelPatcher, Qwen3NextModelPatcher, @@ -259,6 +260,10 @@ def init_model_configs(): "transformers", "AutoModelForImageTextToText", ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen3_5_moe", "image-text-to-text")] = ( + "transformers", + "AutoModelForImageTextToText", + ) # since transformers v4.46, model can be loaded using default AutoModelForImageTextToText # 
https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/models/auto/modeling_auto.py#L776 @@ -5746,4 +5751,63 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return get_vlm_internal_text_generation_config( "qwen3_5_text", self._orig_config.text_config, self.int_dtype, self.float_dtype ).outputs - raise Exception("Unknown Qwen3.5 behavior type.") \ No newline at end of file + raise Exception("Unknown Qwen3.5 behavior type.") + + +@register_in_tasks_manager( + "qwen3_5_moe_text", + *["text-generation", "text-generation-with-past"], + library_name="transformers", +) +class Qwen3_5MoeTextOpenVINOConfig(Qwen3_5TextOpenVINOConfig): + _MODEL_PATCHER = Qwen3_5MoeModelPatcher + + +@register_in_tasks_manager( + "qwen3_5_moe", + *["image-text-to-text"], + library_name="transformers", +) +class Qwen3_5MoeOpenVINOConfig(Qwen3_5OpenVINOConfig): + def with_behavior( + self, + behavior: Union[str, QwenVLConfigBehavior], + ): + if isinstance(behavior, str) and not isinstance(behavior, QwenVLConfigBehavior): + behavior = QwenVLConfigBehavior(behavior) + + if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config( + "qwen3_5_moe_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ) + + if behavior == QwenVLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen3_5_moe_text", + self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + model_patcher=Qwen3_5MoeModelPatcher, + ) + + if behavior in ( + QwenVLConfigBehavior.VISION_EMBEDDINGS, + QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER, + QwenVLConfigBehavior.VISION_EMBEDDINGS_POS, + ): + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == QwenVLConfigBehavior.LANGUAGE: + return 
get_vlm_internal_text_generation_config( + "qwen3_5_moe_text", self._orig_config.text_config, self.int_dtype, self.float_dtype + ).outputs + return super().outputs \ No newline at end of file diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 5b66c8c6fb..7fbf431c3c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -8740,3 +8740,71 @@ def __exit__(self, exc_type, exc_value, traceback): block.forward = block._orig_forward block.attn.forward = block.attn._orig_forward + +def patched_qwen3_5_moe_sparse_moe_block(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_experts = self.experts.num_experts + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + # router returns (logits, scores, indices) + _, routing_weights, selected_experts = self.gate(hidden_states) + + new_routing_weights = torch.zeros(batch_size * sequence_length, num_experts, dtype=routing_weights.dtype) + new_routing_weights.scatter_(dim=1, index=selected_experts, src=routing_weights) + + shared_expert_output = self.shared_expert(hidden_states) + shared_expert_output = torch.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output + + hidden_states = hidden_states.repeat(num_experts, 1) + hidden_states = hidden_states.view(num_experts, -1, hidden_dim) + act_fn = self.experts.act_fn + + # compute experts outputs in a vectorized form using torch.bmm + gate = torch.bmm(hidden_states, self.gate_projs.transpose(1, 2)) + up = torch.bmm(hidden_states, self.up_projs.transpose(1, 2)) + gate_up = act_fn(gate) * up + next_states = torch.bmm(gate_up, self.down_projs.transpose(1, 2)) + next_states = next_states.view(num_experts, batch_size, -1, hidden_dim) + next_states = next_states * new_routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None] + next_states = next_states.sum(dim=0) + + 
shared_expert_output = shared_expert_output.view(batch_size, -1, hidden_dim) + output = shared_expert_output + next_states + return output.view(batch_size, sequence_length, hidden_dim) + + +class Qwen3_5MoeModelPatcher(Qwen3_5ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + def __enter__(self): + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeSparseMoeBlock + + super().__enter__() + for decoder_layer in self._text_model.layers: + if isinstance(decoder_layer.mlp, Qwen3_5MoeSparseMoeBlock): + sparse_moe_block = decoder_layer.mlp + intermediate_dim = sparse_moe_block.experts.intermediate_dim + sparse_moe_block._orig_forward = sparse_moe_block.forward + sparse_moe_block.forward = types.MethodType(patched_qwen3_5_moe_sparse_moe_block, sparse_moe_block) + # TODO: remove `float()` casting when CVS-181449 is fixed + # now it is needed to have MoE optimizations to be applied + sparse_moe_block.gate_projs = sparse_moe_block.experts.gate_up_proj[:, :intermediate_dim, :].float() + sparse_moe_block.up_projs = sparse_moe_block.experts.gate_up_proj[:, intermediate_dim:, :].float() + sparse_moe_block.down_projs = sparse_moe_block.experts.down_proj.data.float() + + def __exit__(self, exc_type, exc_value, traceback): + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeSparseMoeBlock + + super().__exit__(exc_type, exc_value, traceback) + for decoder_layer in self._text_model.layers: + if isinstance(decoder_layer.mlp, Qwen3_5MoeSparseMoeBlock): + sparse_moe_block = decoder_layer.mlp + sparse_moe_block.forward = sparse_moe_block._orig_forward + del sparse_moe_block.gate_projs, sparse_moe_block.up_projs, sparse_moe_block.down_projs + diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 5486629fac..061334754c 100644 --- 
a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -296,6 +296,7 @@ def get_submodels(model): "qwen2_5_vl", "qwen3_vl", "qwen3_5", + "qwen3_5_moe", "got_ocr2", "gemma3", "idefics3", @@ -306,7 +307,7 @@ def get_submodels(model): "minicpmo", ] -SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text"] +SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text", "qwen3_5_moe_text"] # All transformers, diffusers, timm and sentence transformers models that are supported via optimum-onnx OnnxConfigs but that have currently no test # TODO: add tests for all models that are compatible and remove support for all others diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index ed75ef1150..8da7ea3738 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -1449,7 +1449,7 @@ def prepare_inputs_for_generation( # decoding stage so it takes the last token input_ids = input_ids[:, -1].unsqueeze(-1) - if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text"]: + if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text", "qwen3_5_moe_text"]: # LFM2, GraniteMoeHybrid (Granite-4.0), Qwen3-Next, and Qwen3.5 require the attention mask # to be the length of the full context, so default mask from OVModelForCausalLM needs to be used. 
# Other models like Mamba typically do not require an attention_mask diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 8e7a1c8e21..82ea10467d 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -190,7 +190,7 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] - if (self.config.model_type in ["qwen2_vl", "qwen3_vl", "qwen3_5"]) and position_ids.ndim != 3: + if (self.config.model_type in ["qwen2_vl", "qwen3_vl", "qwen3_5", "qwen3_5_moe"]) and position_ids.ndim != 3: position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids @@ -3462,6 +3462,20 @@ class Qwen3_5VisionModel: class Qwen3_5VisionRotaryEmbedding: pass + +if is_transformers_version(">=", "5.2.0"): + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import ( + Qwen3_5MoeModel, + Qwen3_5MoeVisionModel, + ) +else: + + class Qwen3_5MoeModel: + pass + + class Qwen3_5MoeVisionModel: + pass + # The inheritance from Qwen3VLModel is needed to get access to methods: # get_placeholder_mask(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L1066 # get_rope_index(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L916 @@ -5196,5 +5210,7 @@ def generate(self, *args, **kwargs): "qwen3_vl": _OVQwen3VLForCausalLM, "qwen3_5": _OVQwen3_5ForCausalLM, "qwen3_5_text": _OVQwen3_5ForCausalLM, + "qwen3_5_moe": _OVQwen3_5ForCausalLM, + "qwen3_5_moe_text": _OVQwen3_5ForCausalLM, "minicpmo": _OVMiniCPMOForCausalLM, } From cbe127ee06e7f8136d0c8d8c80193e41e1fe2ed9 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Wed, 15 Apr 2026 01:51:46 +0400 Subject: [PATCH 10/10] Add position_ids input and its preparation for inference Signed-off-by: Kazantsev, Roman --- 
optimum/exporters/openvino/model_configs.py | 17 +++++- optimum/exporters/openvino/model_patcher.py | 5 +- optimum/exporters/openvino/utils.py | 11 +++- optimum/intel/openvino/modeling_decoder.py | 8 ++- .../openvino/modeling_visual_language.py | 61 ++++++++++++++++--- tests/openvino/test_decoder.py | 2 +- 6 files changed, 88 insertions(+), 16 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 17f6074743..5467e641c3 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3562,6 +3562,14 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return generated_input +class DummyQwen3_5LMInputGenerator(DummyTextInputGenerator): + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + generated_input = super().generate(input_name, framework, int_dtype, float_dtype) + if input_name == "position_ids": + return generated_input.unsqueeze(0).expand(4, -1, -1) + return generated_input + + class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( "hidden_states", @@ -5610,6 +5618,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = { "input_ids": {0: "batch_size", 1: "sequence_length"}, "attention_mask": {0: "batch_size", 1: "sequence_length"}, + "position_ids": {0: "batch_size", 1: "sequence_length"}, } if self.use_past_in_inputs: self.add_past_key_values(common_inputs, direction="inputs") @@ -5700,8 +5709,8 @@ def with_behavior( self.int_dtype, self.float_dtype, model_patcher=Qwen3_5ModelPatcher, - #dummy_input_generator=DummyQwen2VLLMInputGenerator, - #inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + dummy_input_generator=DummyQwen3_5LMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, ) if behavior in ( @@ -5788,6 +5797,8 @@ def with_behavior( 
self.int_dtype, self.float_dtype, model_patcher=Qwen3_5MoeModelPatcher, + dummy_input_generator=DummyQwen3_5LMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, ) if behavior in ( @@ -5810,4 +5821,4 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return get_vlm_internal_text_generation_config( "qwen3_5_moe_text", self._orig_config.text_config, self.int_dtype, self.float_dtype ).outputs - return super().outputs \ No newline at end of file + return super().outputs diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7fbf431c3c..f906fa52d0 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -8723,7 +8723,9 @@ def image_embed_forward( emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) position_embeddings = (emb.cos(), emb.sin()) for blk in self.blocks: - hidden_states = blk(hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings) + hidden_states = blk( + hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings + ) return self.merger(hidden_states) model.forward = types.MethodType(image_embed_forward, model) @@ -8807,4 +8809,3 @@ def __exit__(self, exc_type, exc_value, traceback): sparse_moe_block = decoder_layer.mlp sparse_moe_block.forward = sparse_moe_block._orig_forward del sparse_moe_block.gate_projs, sparse_moe_block.up_projs, sparse_moe_block.down_projs - diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 061334754c..6314803bbc 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -307,7 +307,16 @@ def get_submodels(model): "minicpmo", ] -SSM_MODELS = ["mamba", "falcon_mamba", "zamba2", "lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text", "qwen3_5_moe_text"] +SSM_MODELS = [ + "mamba", + "falcon_mamba", + "zamba2", + "lfm2", + "granitemoehybrid", + 
"qwen3_next", + "qwen3_5_text", + "qwen3_5_moe_text", +] # All transformers, diffusers, timm and sentence transformers models that are supported via optimum-onnx OnnxConfigs but that have currently no test # TODO: add tests for all models that are compatible and remove support for all others diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 8da7ea3738..7044953664 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -1449,7 +1449,13 @@ def prepare_inputs_for_generation( # decoding stage so it takes the last token input_ids = input_ids[:, -1].unsqueeze(-1) - if self.config.model_type not in ["lfm2", "granitemoehybrid", "qwen3_next", "qwen3_5_text", "qwen3_5_moe_text"]: + if self.config.model_type not in [ + "lfm2", + "granitemoehybrid", + "qwen3_next", + "qwen3_5_text", + "qwen3_5_moe_text", + ]: # LFM2, GraniteMoeHybrid (Granite-4.0), Qwen3-Next, and Qwen3.5 require the attention mask # to be the length of the full context, so default mask from OVModelForCausalLM needs to be used. 
# Other models like Mamba typically do not require an attention_mask diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 82ea10467d..427860775e 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -188,9 +188,11 @@ def prepare_inputs( position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 if past_len: - position_ids = position_ids[:, -inputs_embeds.shape[1] :] + position_ids = position_ids[..., -inputs_embeds.shape[1] :] - if (self.config.model_type in ["qwen2_vl", "qwen3_vl", "qwen3_5", "qwen3_5_moe"]) and position_ids.ndim != 3: + if self.config.model_type in ["qwen3_5", "qwen3_5_moe"] and position_ids.ndim != 3: + position_ids = np.repeat(np.expand_dims(position_ids, 0), 4, axis=0) + elif self.config.model_type in ["qwen2_vl", "qwen3_vl"] and position_ids.ndim != 3: position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids @@ -3445,6 +3447,7 @@ class Qwen3_5Model: class Qwen3_5VisionModel: pass + if is_transformers_version(">=", "5.2.0"): from transformers.models.qwen3_5.modeling_qwen3_5 import ( Qwen3_5Model, @@ -3476,6 +3479,7 @@ class Qwen3_5MoeModel: class Qwen3_5MoeVisionModel: pass + # The inheritance from Qwen3VLModel is needed to get access to methods: # get_placeholder_mask(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L1066 # get_rope_index(): https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L916 @@ -5089,14 +5093,25 @@ def get_multimodal_embeddings( if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2): # calculate RoPE index once per generation in the pre-fill stage only if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas 
is None: - # Construct mm_token_type_ids from input_ids - mm_token_type_ids = torch.zeros_like(input_ids, dtype=torch.int) - mm_token_type_ids[input_ids == self.config.image_token_id] = 1 - mm_token_type_ids[input_ids == self.config.video_token_id] = 2 - position_ids, rope_deltas = self.get_rope_index( - input_ids, mm_token_type_ids, image_grid_thw, video_grid_thw, attention_mask + vision_positions, rope_deltas = self.get_rope_index( + input_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + attention_mask=attention_mask, ) self.rope_deltas = rope_deltas + # Compute text positions (simple cumsum) and concatenate as dim 0 + # to create shape (4, batch, seq_len): [text_pos, temporal, height, width] + if attention_mask is not None: + text_positions = attention_mask.long().cumsum(-1) - 1 + text_positions = text_positions.masked_fill(attention_mask == 0, 1) + else: + text_positions = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .unsqueeze(0) + .expand(input_ids.shape[0], -1) + ) + position_ids = torch.cat([text_positions.unsqueeze(0), vision_positions], dim=0) # then use the prev pre-calculated rope-deltas to get the correct position ids else: batch_size, seq_length, _ = inputs_embeds.shape @@ -5107,6 +5122,12 @@ def get_multimodal_embeddings( delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) position_ids = position_ids.add(delta) position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + # Prepend text positions for shape (4, batch, seq_len) + text_positions = torch.arange(seq_length, device=inputs_embeds.device) + text_positions = text_positions.view(1, -1).expand(batch_size, -1) + if cache_position is not None: + text_positions = text_positions + cache_position[0] + position_ids = torch.cat([text_positions.unsqueeze(0), position_ids], dim=0) return inputs_embeds, attention_mask, position_ids @@ -5180,6 +5201,30 @@ def forward( ) return final_result + def _prepare_position_ids_for_generation(self, 
inputs_tensor, model_kwargs): + # Mirrors Qwen3_5ForConditionalGeneration._prepare_position_ids_for_generation + # Creates proper 4D position_ids: [text_positions, temporal, height, width] + text_positions = GenerationMixin._prepare_position_ids_for_generation(self, inputs_tensor, model_kwargs) + + if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0: + inputs_tensor = model_kwargs["input_ids"] + + is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long] + if is_input_ids and ( + model_kwargs.get("image_grid_thw") is not None or model_kwargs.get("video_grid_thw") is not None + ): + filtered_kwargs = {k: v for k, v in model_kwargs.items() if k != "input_ids"} + vision_positions, rope_deltas = self.get_rope_index(inputs_tensor, **filtered_kwargs) + self.rope_deltas = rope_deltas + else: + vision_positions = text_positions.unsqueeze(0).expand(3, -1, -1) + self.rope_deltas = torch.zeros(inputs_tensor.shape[0], 1, dtype=torch.long, device=inputs_tensor.device) + + # Concatenate "text + vision" positions into [4, bs, seq-len] + text_positions = text_positions[None, ...] + position_ids = torch.cat([text_positions, vision_positions], dim=0) + return position_ids + def generate(self, *args, **kwargs): # Clear cached rope delta from previous generations self.rope_deltas = None diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 72b5b1077d..a1e47c7451 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -27,8 +27,8 @@ BitnetOpenVINOConfig, DeepseekOpenVINOConfig, LFM2OpenVINOConfig, - Qwen3VLOpenVINOConfig, Qwen3_5TextOpenVINOConfig, + Qwen3VLOpenVINOConfig, ) from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.exporters.openvino.utils import ONNX_SUPPORTED_ARCHITECTURES