Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 23 additions & 6 deletions src/cpp/src/lm_encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,35 @@ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention
}

void update_3d_position_ids(ov::Tensor&& position_ids, const ov::Tensor& attention_mask, const int64_t rope_delta) {
    // 3D rotary position ids laid out as [dim0, batch, 1].
    // dim0 == 3: THW (temporal/height/width) layout.
    // dim0 == 4: text + THW layout (e.g. Qwen3.5) — a text row is prepended at index 0.
    constexpr size_t thw_dim_size = 3;
    constexpr size_t text_thw_dim_size = 4;

    const size_t batch_size = attention_mask.get_shape().at(0);
    const size_t sequence_length = attention_mask.get_shape().at(1);
    const size_t dim_0_size = position_ids.get_shape().at(0);

    OPENVINO_ASSERT(dim_0_size == thw_dim_size || dim_0_size == text_thw_dim_size,
                    "Unsupported first dimension in 3D position ids: ", dim_0_size);

    // One position id per batch element for the next generated token.
    position_ids.set_shape({dim_0_size, batch_size, 1});
    int64_t* position_ids_data = position_ids.data<int64_t>();

    // Vision rows include the rope delta accumulated during prefill.
    const int64_t vision_position_id = static_cast<int64_t>(sequence_length) - 1 + rope_delta;

    // For THW-only layout, all dims use vision_position_id.
    // For text + THW layout (e.g. Qwen3.5), text position id (without rope_delta) is prepended to dim 0.
    const size_t vision_dim_idx = (dim_0_size == text_thw_dim_size) ? 1 : 0;

    if (dim_0_size == text_thw_dim_size) {
        const int64_t text_position_id = static_cast<int64_t>(sequence_length) - 1;
        for (size_t batch = 0; batch < batch_size; ++batch) {
            position_ids_data[batch] = text_position_id;
        }
    }

    for (size_t dim = vision_dim_idx; dim < dim_0_size; ++dim) {
        for (size_t batch = 0; batch < batch_size; ++batch) {
            position_ids_data[dim * batch_size + batch] = vision_position_id;
        }
    }
}
Comment on lines 34 to 66
Copy link

Copilot AI Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change introduces support for an additional 3D position_ids layout (dim0=4 for text+THW, used by Qwen3.5). Please add/extend automated coverage (e.g., in tests/python_tests/test_vlm_pipeline.py) to validate both the new dim0=4 behavior and that existing dim0=3 models are unaffected, including at least one tiny-random Qwen3.5 export path as per project testing guidelines.

Copilot generated this review using guidance from repository custom instructions.
Expand Down
16 changes: 0 additions & 16 deletions src/cpp/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,22 +224,6 @@ ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config
return std::nullopt;
}

// Build a ProcessorConfig from an AnyMap, starting from `initial` (or from a whole
// "processor_config" entry if the map carries one) and then overriding individual
// fields that are present as top-level keys.
// NOTE(review): this free-function version appears to be superseded by the static
// ProcessorConfig::from_any_map with identical logic elsewhere in this change —
// confirm there is only one live copy before keeping both.
ProcessorConfig from_any_map(
    const ov::AnyMap& config_map,
    const ProcessorConfig& initial
) {
    // A full "processor_config" entry, when present, replaces `initial` wholesale.
    auto iter = config_map.find("processor_config");
    ProcessorConfig extracted_config = config_map.end() != iter ?
        iter->second.as<ProcessorConfig>() : initial;
    // Individual keys take precedence over the base config chosen above.
    using utils::read_anymap_param;
    read_anymap_param(config_map, "patch_size", extracted_config.patch_size);
    read_anymap_param(config_map, "scale_resolution", extracted_config.scale_resolution);
    read_anymap_param(config_map, "max_slice_nums", extracted_config.max_slice_nums);
    read_anymap_param(config_map, "norm_mean", extracted_config.norm_mean);
    read_anymap_param(config_map, "norm_std", extracted_config.norm_std);
    return extracted_config;
}

ov::genai::ModelDesc get_draft_model_from_config(const ov::AnyMap& config) {
ov::genai::ModelDesc draft_model;
if (config.find(utils::DRAFT_MODEL_ARG_NAME) != config.end()) {
Expand Down
6 changes: 0 additions & 6 deletions src/cpp/src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include "openvino/genai/generation_handle.hpp"
#include "openvino/genai/scheduler_config.hpp"
#include "openvino/genai/generation_config.hpp"
#include "visual_language/processor_config.hpp"

#include "openvino/genai/streamer_base.hpp"

Expand Down Expand Up @@ -118,11 +117,6 @@ ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map);

ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config_map);

ProcessorConfig from_any_map(
const ov::AnyMap& config_map,
const ProcessorConfig& initial
);

ov::genai::ModelDesc get_draft_model_from_config(const ov::AnyMap& config);

ov::genai::ModelDesc extract_draft_model_from_config(ov::AnyMap& config);
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/gemma3/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ EncodedImage VisionEncoderGemma3::encode(const ov::Tensor& image, const ov::AnyM
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();

ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Tensor pixel_values = get_pixel_values_gemma3(image, config);

Expand Down
5 changes: 5 additions & 0 deletions src/cpp/src/visual_language/inputs_embedder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "visual_language/qwen2vl/classes.hpp"
#include "visual_language/qwen2_5_vl/classes.hpp"
#include "visual_language/qwen3_vl/classes.hpp"
#include "visual_language/qwen3_5/classes.hpp"
#include "visual_language/phi3_vision/classes.hpp"
#include "visual_language/phi4mm/classes.hpp"
#include "visual_language/minicpm/classes.hpp"
Expand Down Expand Up @@ -286,6 +287,8 @@ InputsEmbedder::InputsEmbedder(const std::filesystem::path& model_dir,
m_impl = std::make_shared<InputsEmbedderQwen2_5_VL>(vlm_config, model_dir, device, device_config);
} else if (vlm_config.model_type == VLMModelType::QWEN3_VL) {
m_impl = std::make_shared<InputsEmbedderQwen3VL>(vlm_config, model_dir, device, device_config);
} else if (vlm_config.model_type == VLMModelType::QWEN3_5 || vlm_config.model_type == VLMModelType::QWEN3_5_MOE) {
m_impl = std::make_shared<InputsEmbedderQwen3_5>(vlm_config, model_dir, device, device_config);
} else if (vlm_config.model_type == VLMModelType::GEMMA3) {
Comment thread
yatarkan marked this conversation as resolved.
m_impl = std::make_shared<InputsEmbedderGemma3>(vlm_config, model_dir, device, device_config);
} else {
Expand Down Expand Up @@ -322,6 +325,8 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map,
m_impl = std::make_shared<InputsEmbedderQwen2_5_VL>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
} else if (vlm_config.model_type == VLMModelType::QWEN3_VL) {
m_impl = std::make_shared<InputsEmbedderQwen3VL>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
} else if (vlm_config.model_type == VLMModelType::QWEN3_5 || vlm_config.model_type == VLMModelType::QWEN3_5_MOE) {
m_impl = std::make_shared<InputsEmbedderQwen3_5>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
} else if (vlm_config.model_type == VLMModelType::GEMMA3) {
m_impl = std::make_shared<InputsEmbedderGemma3>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
} else {
Expand Down
1 change: 1 addition & 0 deletions src/cpp/src/visual_language/inputs_embedder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ class InputsEmbedder {
friend class InputsEmbedderQwen2VL;
friend class InputsEmbedderQwen2_5_VL;
friend class InputsEmbedderQwen3VL;
friend class InputsEmbedderQwen3_5;
friend class InputsEmbedderGemma3;
};

Expand Down
4 changes: 1 addition & 3 deletions src/cpp/src/visual_language/internvl_chat/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@

#include "visual_language/clip.hpp"

#include "utils.hpp"

namespace ov::genai {

namespace {
Expand Down Expand Up @@ -133,7 +131,7 @@ ov::Tensor get_pixel_values_internvl(const ov::Tensor& image, const ProcessorCon
EncodedImage VisionEncoderInternVLChat::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Tensor pixel_values = get_pixel_values_internvl(image, config);

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/llava/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ ov::Tensor get_pixel_values_llava(const ov::Tensor& image, const ProcessorConfig
EncodedImage VisionEncoderLLaVA::encode( const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Tensor pixel_values = get_pixel_values_llava(image, config);

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/llava_next/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ ov::Tensor VisionEncoderLLaVANext::get_pixel_values_llava_next(const ov::Tensor&
EncodedImage VisionEncoderLLaVANext::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Tensor pixel_values = get_pixel_values_llava_next(image, config);

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/llava_next_video/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ EncodedImage VisionEncoderLLaVANextVideo::encode(const ov::Tensor& image, const
ov::InferRequest& encoder = infer_request_guard.get();
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard_mm_projector(this->m_ireq_queue_multi_modal_projector.get());
ov::InferRequest& mm_projector = infer_request_guard_mm_projector.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ov::Shape pixel_values_shape;
if (use_ov_vision_preprocess) {
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/minicpm/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ std::pair<EncodedImage, ImageSliceResult> llava_image_embed_make_with_bytes_slic
EncodedImage VisionEncoderMiniCPM::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

clip_ctx ctx_clip;
ctx_clip.image_size = config.image_size;
Expand Down
3 changes: 1 addition & 2 deletions src/cpp/src/visual_language/nanollava/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

#include "visual_language/nanollava/classes.hpp"
#include "visual_language/clip.hpp"
#include "utils.hpp"

namespace ov::genai {

Expand Down Expand Up @@ -86,7 +85,7 @@ EncodedImage VisionEncoderNanoLLaVA::encode(const ov::Tensor& image, const ov::A
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();

ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

// nanollava specific preprocess params
config.image_mean = std::array<float, 3>{0.5f, 0.5f, 0.5f};
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/phi3_vision/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::
EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
ov::InferRequest& encoder = infer_request_guard.get();
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);

ImageSize image_size;

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/phi4mm/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ m_separator_inserters{create_separator_inserters(device)} {
}

EncodedImage VisionEncoderPhi4MM::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
ProcessorConfig config = ProcessorConfig::from_any_map(config_map, m_processor_config);
ov::Tensor input_image_embeds{ov::element::f32, {}}, image_attention_mask{ov::element::f32, {}};
int32_t image_height = 0, image_width = 0, num_img_tokens = 0;

Expand Down
5 changes: 4 additions & 1 deletion src/cpp/src/visual_language/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -753,9 +753,12 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
};

// TODO: remove it when GEMMA3 ticket-171180 is fixed
// TODO: remove Qwen3.5 limitation once ticket-183791 is fixed
// Returns true for model types that currently must run with the SDPA backend
// (see the tickets above for lifting these restrictions).
bool requires_sdpa(const std::filesystem::path& models_dir) {
    auto vlm_config = utils::from_config_json_if_exists<VLMConfig>(models_dir, "config.json");
    return vlm_config.model_type == VLMModelType::GEMMA3
        || vlm_config.model_type == VLMModelType::QWEN3_5
        || vlm_config.model_type == VLMModelType::QWEN3_5_MOE;
}
Comment on lines 755 to 762
Copy link

Copilot AI Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR description checklist indicates tests and documentation updates are still TBD/in progress. Before merging, please update the PR description/checklist to reflect completed test and documentation work (or explicitly scope them out) to align with the repository PR template expectations.

Copilot uses AI. Check for mistakes.

VLMPipeline::VLMPipeline(
Expand Down
33 changes: 27 additions & 6 deletions src/cpp/src/visual_language/processor_config.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
// Copyright (C) 2023-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <fstream>

#include "processor_config.hpp"
#include "json_utils.hpp"
#include "utils.hpp"

#include <fstream>

ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_path) {
std::ifstream stream(json_path);
OPENVINO_ASSERT(stream.is_open(), "Failed to open '", json_path, "' with processor config");
nlohmann::json parsed = nlohmann::json::parse(stream);
ov::genai::ProcessorConfig::ProcessorConfig(const nlohmann::json& parsed) {
using ov::genai::utils::read_json_param;
read_json_param(parsed, "patch_size", patch_size); // For llava - stored in config.json vision_config
read_json_param(parsed, "scale_resolution", scale_resolution);
Expand Down Expand Up @@ -51,3 +49,26 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa
read_json_param(parsed, "size.height", size_height);
read_json_param(parsed, "size.width", size_width);
}

/// Construct a ProcessorConfig from a JSON file on disk.
/// Parsing happens inside an immediately-invoked lambda so the resulting
/// json object can be forwarded to the delegating (json-based) constructor
/// as a single expression.
ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_path)
    : ProcessorConfig([&] {
          std::ifstream file_stream(json_path);
          OPENVINO_ASSERT(file_stream.is_open(), "Failed to open '", json_path, "' with processor config");
          return nlohmann::json::parse(file_stream);
      }()) {}

/// Build a ProcessorConfig from an AnyMap.
/// The base config is either a whole "processor_config" entry from the map
/// (when present) or the supplied `initial`; individual top-level keys then
/// override the corresponding fields of that base.
ov::genai::ProcessorConfig ov::genai::ProcessorConfig::from_any_map(
    const ov::AnyMap& config_map,
    const ProcessorConfig& initial
) {
    // Pick the base config: an explicit "processor_config" entry wins over `initial`.
    ProcessorConfig result = initial;
    const auto found = config_map.find("processor_config");
    if (found != config_map.end()) {
        result = found->second.as<ProcessorConfig>();
    }

    // Per-field overrides take precedence over the base chosen above.
    using ov::genai::utils::read_anymap_param;
    read_anymap_param(config_map, "patch_size", result.patch_size);
    read_anymap_param(config_map, "scale_resolution", result.scale_resolution);
    read_anymap_param(config_map, "max_slice_nums", result.max_slice_nums);
    read_anymap_param(config_map, "norm_mean", result.norm_mean);
    read_anymap_param(config_map, "norm_std", result.norm_std);
    return result;
}
12 changes: 10 additions & 2 deletions src/cpp/src/visual_language/processor_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

#pragma once

#include "openvino/genai/visibility.hpp"
#include <openvino/runtime/properties.hpp>
#include <array>
#include <filesystem>
#include <nlohmann/json_fwd.hpp>
#include <openvino/core/any.hpp>

namespace ov::genai {
/// @brief A Configuration class passed to VisionEncoder and used to
Expand Down Expand Up @@ -62,9 +62,17 @@ class ProcessorConfig {

/// @brief Default constructor
ProcessorConfig() = default;

explicit ProcessorConfig(const nlohmann::json& parsed);

/// @brief Construct ProcessorConfig from values in json_path.
/// Keys in the file must match the ProcessorConfig's members.
/// @param json_path A path to a file to extract the values from.
explicit ProcessorConfig(const std::filesystem::path& json_path);

static ProcessorConfig from_any_map(
const ov::AnyMap& config_map,
const ProcessorConfig& initial
);
};
} // namespace ov::genai
26 changes: 20 additions & 6 deletions src/cpp/src/visual_language/qwen2vl/classes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1125,10 +1125,17 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p
int64_t image_pad_token_id = m_vision_token_ids["image_pad"];
int64_t video_pad_token_id = m_vision_token_ids["video_pad"];

m_position_ids = create_position_ids(input_ids, images_grid_thw, images_sequence, 0, video_grid_thw, videos_sequence, 0, vision_start_token_id, history_vision_count);

int64_t position_ids_max_element = *std::max_element(m_position_ids.data<int64_t>(), m_position_ids.data<int64_t>() + m_position_ids.get_size());
m_rope_delta = position_ids_max_element + 1 - static_cast<int64_t>(input_ids.get_shape().at(1));
std::tie(m_position_ids, m_rope_delta) = create_position_ids(
input_ids,
images_grid_thw,
images_sequence,
0,
video_grid_thw,
videos_sequence,
0,
vision_start_token_id,
history_vision_count
);

if (images.empty() && videos.empty()) {
ov::Tensor inputs_embeds(text_embeds.get_element_type(), text_embeds.get_shape());
Expand Down Expand Up @@ -1466,7 +1473,7 @@ std::vector<std::array<size_t, 3>> InputsEmbedderQwen2VL::get_vision_grid_thw_fo
return reordered_vision_grid_thw;
}

ov::Tensor InputsEmbedderQwen2VL::create_position_ids(
std::pair<ov::Tensor, int64_t> InputsEmbedderQwen2VL::create_position_ids(
const ov::Tensor& input_ids_tensor,
const std::vector<std::array<size_t, 3>>& images_grid_thw,
const std::vector<size_t>& images_sequence,
Expand Down Expand Up @@ -1572,7 +1579,14 @@ ov::Tensor InputsEmbedderQwen2VL::create_position_ids(
}
}

return position_ids;
// Calculate rope delta
const int64_t position_ids_max_element = *std::max_element(
position_ids.data<int64_t>(),
position_ids.data<int64_t>() + position_ids.get_size()
);
const int64_t rope_delta = position_ids_max_element + 1 - static_cast<int64_t>(input_ids_tensor.get_shape().at(1));

return {position_ids, rope_delta};
}

} // namespace ov::genai
2 changes: 1 addition & 1 deletion src/cpp/src/visual_language/qwen2vl/classes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
const std::vector<std::pair<std::size_t, std::size_t>>& history_vision_count
) const;

ov::Tensor create_position_ids(
virtual std::pair<ov::Tensor, int64_t> create_position_ids(
const ov::Tensor& input_ids_tensor,
const std::vector<std::array<size_t, 3>>& images_grid_thw,
const std::vector<size_t>& images_sequence,
Expand Down
Loading
Loading