Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 23 additions & 6 deletions src/cpp/src/lm_encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,35 @@ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention
}

void update_3d_position_ids(ov::Tensor&& position_ids, const ov::Tensor& attention_mask, const int64_t rope_delta) {
    // 3D rotary position ids laid out as [dim0, batch, 1].
    // dim0 == 3: THW (temporal/height/width) layout.
    // dim0 == 4: text + THW layout (e.g. Qwen3.5) — a text row is prepended at index 0.
    constexpr size_t thw_dim_size = 3;
    constexpr size_t text_thw_dim_size = 4;

    const size_t batch_size = attention_mask.get_shape().at(0);
    const size_t sequence_length = attention_mask.get_shape().at(1);
    const size_t dim_0_size = position_ids.get_shape().at(0);

    OPENVINO_ASSERT(dim_0_size == thw_dim_size || dim_0_size == text_thw_dim_size,
                    "Unsupported first dimension in 3D position ids: ", dim_0_size);

    // One position id per batch element for the next generated token.
    position_ids.set_shape({dim_0_size, batch_size, 1});
    int64_t* position_ids_data = position_ids.data<int64_t>();

    // Vision rows include the rope delta accumulated during prefill.
    const int64_t vision_position_id = static_cast<int64_t>(sequence_length) - 1 + rope_delta;

    // For THW-only layout, all dims use vision_position_id.
    // For text + THW layout (e.g. Qwen3.5), text position id (without rope_delta) is prepended to dim 0.
    const size_t vision_dim_idx = (dim_0_size == text_thw_dim_size) ? 1 : 0;

    if (dim_0_size == text_thw_dim_size) {
        const int64_t text_position_id = static_cast<int64_t>(sequence_length) - 1;
        for (size_t batch = 0; batch < batch_size; ++batch) {
            position_ids_data[batch] = text_position_id;
        }
    }

    for (size_t dim = vision_dim_idx; dim < dim_0_size; ++dim) {
        for (size_t batch = 0; batch < batch_size; ++batch) {
            position_ids_data[dim * batch_size + batch] = vision_position_id;
        }
    }
}
Comment on lines 34 to 66
Copy link

Copilot AI Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change introduces support for an additional 3D position_ids layout (dim0=4 for text+THW, used by Qwen3.5). Please add/extend automated coverage (e.g., in tests/python_tests/test_vlm_pipeline.py) to validate both the new dim0=4 behavior and that existing dim0=3 models are unaffected, including at least one tiny-random Qwen3.5 export path as per project testing guidelines.

Copilot generated this review using guidance from repository custom instructions.
Expand Down
16 changes: 0 additions & 16 deletions src/cpp/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,22 +224,6 @@ ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config
return std::nullopt;
}

// Build a ProcessorConfig from an AnyMap, starting from `initial` (or from a whole
// "processor_config" entry if the map carries one) and then overriding individual
// fields that are present as top-level keys.
// NOTE(review): this free-function version appears to be superseded by the static
// ProcessorConfig::from_any_map with identical logic elsewhere in this change —
// confirm there is only one live copy before keeping both.
ProcessorConfig from_any_map(
    const ov::AnyMap& config_map,
    const ProcessorConfig& initial
) {
    // A full "processor_config" entry, when present, replaces `initial` wholesale.
    auto iter = config_map.find("processor_config");
    ProcessorConfig extracted_config = config_map.end() != iter ?
        iter->second.as<ProcessorConfig>() : initial;
    // Individual keys take precedence over the base config chosen above.
    using utils::read_anymap_param;
    read_anymap_param(config_map, "patch_size", extracted_config.patch_size);
    read_anymap_param(config_map, "scale_resolution", extracted_config.scale_resolution);
    read_anymap_param(config_map, "max_slice_nums", extracted_config.max_slice_nums);
    read_anymap_param(config_map, "norm_mean", extracted_config.norm_mean);
    read_anymap_param(config_map, "norm_std", extracted_config.norm_std);
    return extracted_config;
}

ov::genai::ModelDesc get_draft_model_from_config(const ov::AnyMap& config) {
ov::genai::ModelDesc draft_model;
if (config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) {
Expand Down
6 changes: 0 additions & 6 deletions src/cpp/src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "openvino/genai/generation_handle.hpp"
#include "openvino/genai/scheduler_config.hpp"
#include "openvino/genai/generation_config.hpp"
#include "visual_language/processor_config.hpp"

#include "openvino/genai/streamer_base.hpp"

Expand Down Expand Up @@ -118,11 +117,6 @@ ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map);

ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config_map);

ProcessorConfig from_any_map(
const ov::AnyMap& config_map,
const ProcessorConfig& initial
);

ov::genai::ModelDesc get_draft_model_from_config(const ov::AnyMap& config);

ov::genai::ModelDesc extract_draft_model_from_config(ov::AnyMap& config);
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/gemma3/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ EncodedImage VisionEncoderGemma3::encode(const ov::Tensor& image, const ov::AnyM
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();

ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Tensor pixel_values = get_pixel_values_gemma3(image, config);

Expand Down
5 changes: 5 additions & 0 deletions src/cpp/src/visual_language/inputs_embedder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "visual_language/qwen2vl/classes.hpp"
#include "visual_language/qwen2_5_vl/classes.hpp"
#include "visual_language/qwen3_vl/classes.hpp"
#include "visual_language/qwen3_5/classes.hpp"
#include "visual_language/phi3_vision/classes.hpp"
#include "visual_language/phi4mm/classes.hpp"
#include "visual_language/minicpm/classes.hpp"
Expand Down Expand Up @@ -286,6 +287,8 @@ InputsEmbedder::InputsEmbedder(const std::filesystem::path& model_dir,
m_impl = std::make_shared<InputsEmbedderQwen2_5_VL>(vlm_config, model_dir, device, device_config);
} else if (vlm_config.model_type == VLMModelType::QWEN3_VL) {
m_impl = std::make_shared<InputsEmbedderQwen3VL>(vlm_config, model_dir, device, device_config);
} else if (vlm_config.model_type == VLMModelType::QWEN3_5 || vlm_config.model_type == VLMModelType::QWEN3_5_MOE) {
m_impl = std::make_shared<InputsEmbedderQwen3_5>(vlm_config, model_dir, device, device_config);
} else if (vlm_config.model_type == VLMModelType::GEMMA3) {
Comment thread
yatarkan marked this conversation as resolved.
m_impl = std::make_shared<InputsEmbedderGemma3>(vlm_config, model_dir, device, device_config);
} else {
Expand Down Expand Up @@ -322,6 +325,8 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map,
m_impl = std::make_shared<InputsEmbedderQwen2_5_VL>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
} else if (vlm_config.model_type == VLMModelType::QWEN3_VL) {
m_impl = std::make_shared<InputsEmbedderQwen3VL>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
} else if (vlm_config.model_type == VLMModelType::QWEN3_5 || vlm_config.model_type == VLMModelType::QWEN3_5_MOE) {
m_impl = std::make_shared<InputsEmbedderQwen3_5>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
} else if (vlm_config.model_type == VLMModelType::GEMMA3) {
m_impl = std::make_shared<InputsEmbedderGemma3>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
} else {
Expand Down
1 change: 1 addition & 0 deletions src/cpp/src/visual_language/inputs_embedder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ class InputsEmbedder {
friend class InputsEmbedderQwen2VL;
friend class InputsEmbedderQwen2_5_VL;
friend class InputsEmbedderQwen3VL;
friend class InputsEmbedderQwen3_5;
friend class InputsEmbedderGemma3;
};

Expand Down
4 changes: 1 addition & 3 deletions src/cpp/src/visual_language/internvl_chat/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@

#include "visual_language/clip.hpp"

#include "utils.hpp"

namespace ov::genai {

namespace {
Expand Down Expand Up @@ -133,7 +131,7 @@ ov::Tensor get_pixel_values_internvl(const ov::Tensor& image, const ProcessorCon
EncodedImage VisionEncoderInternVLChat::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Tensor pixel_values = get_pixel_values_internvl(image, config);

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/llava/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ ov::Tensor get_pixel_values_llava(const ov::Tensor& image, const ProcessorConfig
EncodedImage VisionEncoderLLaVA::encode( const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Tensor pixel_values = get_pixel_values_llava(image, config);

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/llava_next/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ ov::Tensor VisionEncoderLLaVANext::get_pixel_values_llava_next(const ov::Tensor&
EncodedImage VisionEncoderLLaVANext::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Tensor pixel_values = get_pixel_values_llava_next(image, config);

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/llava_next_video/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ EncodedImage VisionEncoderLLaVANextVideo::encode(const ov::Tensor& image, const
ov::InferRequest& encoder = infer_request_guard.get();
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard_mm_projector(this->m_ireq_queue_multi_modal_projector.get());
ov::InferRequest& mm_projector = infer_request_guard_mm_projector.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Shape pixel_values_shape;
if (use_ov_vision_preprocess) {
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/minicpm/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ std::pair<EncodedImage, ImageSliceResult> llava_image_embed_make_with_bytes_slic
EncodedImage VisionEncoderMiniCPM::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

clip_ctx ctx_clip;
ctx_clip.image_size = config.image_size;
Expand Down
3 changes: 1 addition & 2 deletions src/cpp/src/visual_language/nanollava/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

#include "visual_language/nanollava/classes.hpp"
#include "visual_language/clip.hpp"
#include "utils.hpp"

namespace ov::genai {

Expand Down Expand Up @@ -86,7 +85,7 @@ EncodedImage VisionEncoderNanoLLaVA::encode(const ov::Tensor& image, const ov::A
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();

ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

// nanollava specific preprocess params
config.image_mean = std::array<float, 3>{0.5f, 0.5f, 0.5f};
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/phi3_vision/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::
EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ImageSize image_size;

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/phi4mm/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ m_separator_inserters{create_separator_inserters(device)} {
}

EncodedImage VisionEncoderPhi4MM::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);
ov::Tensor input_image_embeds{ov::element::f32, {}}, image_attention_mask{ov::element::f32, {}};
int32_t image_height = 0, image_width = 0, num_img_tokens = 0;

Expand Down
5 changes: 4 additions & 1 deletion src/cpp/src/visual_language/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -753,9 +753,12 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
};

// TODO: remove it when GEMMA3 ticket-171180 is fixed
// TODO: remove Qwen3.5 limitation once ticket-183791 is fixed
// Returns true for model types that currently must run with the SDPA backend
// (see the tickets above for lifting these restrictions).
bool requires_sdpa(const std::filesystem::path& models_dir) {
    auto vlm_config = utils::from_config_json_if_exists<VLMConfig>(models_dir, "config.json");
    return vlm_config.model_type == VLMModelType::GEMMA3
        || vlm_config.model_type == VLMModelType::QWEN3_5
        || vlm_config.model_type == VLMModelType::QWEN3_5_MOE;
}
Comment on lines 755 to 762
Copy link

Copilot AI Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR description checklist indicates tests and documentation updates are still TBD/in progress. Before merging, please update the PR description/checklist to reflect completed test and documentation work (or explicitly scope them out) to align with the repository PR template expectations.

Copilot uses AI. Check for mistakes.

VLMPipeline::VLMPipeline(
Expand Down
33 changes: 27 additions & 6 deletions src/cpp/src/visual_language/processor_config.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
// Copyright (C) 2023-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <fstream>

#include "processor_config.hpp"
#include "json_utils.hpp"
#include "utils.hpp"

#include <fstream>

ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_path) {
std::ifstream stream(json_path);
OPENVINO_ASSERT(stream.is_open(), "Failed to open '", json_path, "' with processor config");
nlohmann::json parsed = nlohmann::json::parse(stream);
ov::genai::ProcessorConfig::ProcessorConfig(const nlohmann::json& parsed) {
using ov::genai::utils::read_json_param;
read_json_param(parsed, "patch_size", patch_size); // For llava - stored in config.json vision_config
read_json_param(parsed, "scale_resolution", scale_resolution);
Expand Down Expand Up @@ -51,3 +49,26 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa
read_json_param(parsed, "size.height", size_height);
read_json_param(parsed, "size.width", size_width);
}

/// Construct a ProcessorConfig from a JSON file on disk.
/// Parsing happens inside an immediately-invoked lambda so the resulting
/// json object can be forwarded to the delegating (json-based) constructor
/// as a single expression.
ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_path)
    : ProcessorConfig([&] {
          std::ifstream file_stream(json_path);
          OPENVINO_ASSERT(file_stream.is_open(), "Failed to open '", json_path, "' with processor config");
          return nlohmann::json::parse(file_stream);
      }()) {}

/// Build a ProcessorConfig from an AnyMap.
/// The base config is either a whole "processor_config" entry from the map
/// (when present) or the supplied `initial`; individual top-level keys then
/// override the corresponding fields of that base.
ov::genai::ProcessorConfig ov::genai::ProcessorConfig::from_any_map(
    const ov::AnyMap& config_map,
    const ProcessorConfig& initial
) {
    // Pick the base config: an explicit "processor_config" entry wins over `initial`.
    ProcessorConfig result = initial;
    const auto found = config_map.find("processor_config");
    if (found != config_map.end()) {
        result = found->second.as<ProcessorConfig>();
    }

    // Per-field overrides take precedence over the base chosen above.
    using ov::genai::utils::read_anymap_param;
    read_anymap_param(config_map, "patch_size", result.patch_size);
    read_anymap_param(config_map, "scale_resolution", result.scale_resolution);
    read_anymap_param(config_map, "max_slice_nums", result.max_slice_nums);
    read_anymap_param(config_map, "norm_mean", result.norm_mean);
    read_anymap_param(config_map, "norm_std", result.norm_std);
    return result;
}
12 changes: 10 additions & 2 deletions src/cpp/src/visual_language/processor_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

#pragma once

#include "openvino/genai/visibility.hpp"
#include <openvino/runtime/properties.hpp>
#include <array>
#include <filesystem>
#include <nlohmann/json_fwd.hpp>
#include <openvino/core/any.hpp>

namespace ov::genai {
/// @brief A Configuration class passed to VisionEncoder and used to
Expand Down Expand Up @@ -62,9 +62,17 @@ class ProcessorConfig {

/// @brief Default constructor
ProcessorConfig() = default;

explicit ProcessorConfig(const nlohmann::json& parsed);

/// @brief Construct ProcessorConfig from values in json_path.
/// Keys in the file must match the ProcessorConfig's members.
/// @param json_path A path to a file to extract the values from.
explicit ProcessorConfig(const std::filesystem::path& json_path);

static ProcessorConfig from_any_map(
const ov::AnyMap& config_map,
const ProcessorConfig& initial
);
};
} // namespace ov::genai
26 changes: 20 additions & 6 deletions src/cpp/src/visual_language/qwen2vl/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1125,10 +1125,17 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p
int64_t image_pad_token_id = m_vision_token_ids["image_pad"];
int64_t video_pad_token_id = m_vision_token_ids["video_pad"];

m_position_ids = create_position_ids(input_ids, images_grid_thw, images_sequence, 0, video_grid_thw, videos_sequence, 0, vision_start_token_id, history_vision_count);

int64_t position_ids_max_element = *std::max_element(m_position_ids.data<int64_t>(), m_position_ids.data<int64_t>() + m_position_ids.get_size());
m_rope_delta = position_ids_max_element + 1 - static_cast<int64_t>(input_ids.get_shape().at(1));
std::tie(m_position_ids, m_rope_delta) = create_position_ids(
input_ids,
images_grid_thw,
images_sequence,
0,
video_grid_thw,
videos_sequence,
0,
vision_start_token_id,
history_vision_count
);

if (images.empty() && videos.empty()) {
ov::Tensor inputs_embeds(text_embeds.get_element_type(), text_embeds.get_shape());
Expand Down Expand Up @@ -1466,7 +1473,7 @@ std::vector<std::array<size_t, 3>> InputsEmbedderQwen2VL::get_vision_grid_thw_fo
return reordered_vision_grid_thw;
}

ov::Tensor InputsEmbedderQwen2VL::create_position_ids(
std::pair<ov::Tensor, int64_t> InputsEmbedderQwen2VL::create_position_ids(
const ov::Tensor& input_ids_tensor,
const std::vector<std::array<size_t, 3>>& images_grid_thw,
const std::vector<size_t>& images_sequence,
Expand Down Expand Up @@ -1572,7 +1579,14 @@ ov::Tensor InputsEmbedderQwen2VL::create_position_ids(
}
}

return position_ids;
// Calculate rope delta
const int64_t position_ids_max_element = *std::max_element(
position_ids.data<int64_t>(),
position_ids.data<int64_t>() + position_ids.get_size()
);
const int64_t rope_delta = position_ids_max_element + 1 - static_cast<int64_t>(input_ids_tensor.get_shape().at(1));

return {position_ids, rope_delta};
}

} // namespace ov::genai
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/qwen2vl/classes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
const std::vector<std::pair<std::size_t, std::size_t>>& history_vision_count
) const;

ov::Tensor create_position_ids(
virtual std::pair<ov::Tensor, int64_t> create_position_ids(
const ov::Tensor& input_ids_tensor,
const std::vector<std::array<size_t, 3>>& images_grid_thw,
const std::vector<size_t>& images_sequence,
Expand Down
Loading
Loading