diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b5b3c319..b6952153 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -19,7 +19,6 @@ jobs:
         DISTRO: [
           { "tag": "jammy_gtsam4.3a0", "base_image": "ubuntu:jammy", "suffix": "gcc" },
           { "tag": "jammy_gtsam4.3a0", "base_image": "ubuntu:jammy", "suffix": "llvm" },
-          { "tag": "jammy_cuda12.2_gtsam4.3a0", "base_image": "nvidia/cuda:12.2.2-devel-ubuntu22.04", "suffix": "gcc.cuda" },
           { "tag": "jammy_cuda12.5_gtsam4.3a0", "base_image": "nvidia/cuda:12.5.1-devel-ubuntu22.04", "suffix": "gcc.cuda" },
           { "tag": "jammy_cuda13.1_gtsam4.3a0", "base_image": "nvidia/cuda:13.1.0-devel-ubuntu22.04", "suffix": "gcc.cuda" },
           { "tag": "noble_gtsam4.3a0", "base_image": "ubuntu:noble", "suffix": "gcc" },
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5222ac7e..39a34af1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -251,12 +251,20 @@ if(BUILD_WITH_CUDA)
     src/gtsam_points/cuda/nonlinear_factor_set_gpu_create.cpp
     src/gtsam_points/cuda/stream_roundrobin.cu
     src/gtsam_points/cuda/stream_temp_buffer_roundrobin.cu
+    # ann
+    src/gtsam_points/ann/kdtree_gpu.cpp
+    src/gtsam_points/ann/kdtree_gpu.cu
     # types
     src/gtsam_points/types/point_cloud.cu
     src/gtsam_points/types/point_cloud_gpu.cu
     src/gtsam_points/types/gaussian_voxelmap_gpu.cu
     src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu
     # factors
+    src/gtsam_points/factors/integrated_gicp_derivatives.cu
+    src/gtsam_points/factors/integrated_gicp_derivatives_inliers.cu
+    src/gtsam_points/factors/integrated_gicp_derivatives_linearize.cu
+    src/gtsam_points/factors/integrated_gicp_derivatives_compute.cu
+    src/gtsam_points/factors/integrated_gicp_factor_gpu.cpp
     src/gtsam_points/factors/integrated_vgicp_derivatives.cu
     src/gtsam_points/factors/integrated_vgicp_derivatives_inliers.cu
     src/gtsam_points/factors/integrated_vgicp_derivatives_compute.cu
@@ -267,6 +275,7 @@ if(BUILD_WITH_CUDA)
   )
   target_include_directories(gtsam_points_cuda PUBLIC
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
     $<INSTALL_INTERFACE:include>
   )
   target_link_libraries(gtsam_points_cuda
diff --git a/include/gtsam_points/ann/kdtree_gpu.hpp b/include/gtsam_points/ann/kdtree_gpu.hpp
new file mode 100644
index 00000000..e5f116dc
--- /dev/null
+++ b/include/gtsam_points/ann/kdtree_gpu.hpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+#pragma once
+
+#include <limits>
+#include <cstdint>
+#include <gtsam_points/types/point_cloud.hpp>
+#include <gtsam_points/ann/small_kdtree.hpp>
+
+struct CUstream_st;
+
+namespace gtsam_points {
+
+struct KdTreeNodeGPU {
+  NodeIndexType left = INVALID_NODE;   ///< Left child node index.
+  NodeIndexType right = INVALID_NODE;  ///< Right child node index.
+
+  union {
+    struct Leaf {
+      NodeIndexType first;  ///< First point index in the leaf node.
+      NodeIndexType last;   ///< Last point index in the leaf node.
+    } lr;                   ///< Leaf node.
+    struct NonLeaf {
+      NodeIndexType axis;  ///< Projection axis.
+      float thresh;        ///< Threshold value.
+    } sub;                 ///< Non-leaf node.
+  } node_type;
+};
+
+class KdTreeGPU {
+public:
+  using Ptr = std::shared_ptr<KdTreeGPU>;
+  using ConstPtr = std::shared_ptr<const KdTreeGPU>;
+
+  KdTreeGPU(const PointCloud::ConstPtr& points, CUstream_st* stream = nullptr);
+  ~KdTreeGPU();
+
+  void nearest_neighbor_search(
+    const Eigen::Vector3f* queries,
+    size_t num_queries,
+    std::uint32_t* nn_indices,
+    float* nn_sq_dists,
+    CUstream_st* stream = nullptr);
+
+  void nearest_neighbor_search_cpu(
+    const Eigen::Vector3f* h_queries,
+    size_t num_queries,
+    std::uint32_t* h_nn_indices,
+    float* h_nn_sq_dists,
+    CUstream_st* stream = nullptr);
+
+  /// @brief Get the GPU pointer to the point indices
+  const std::uint32_t* get_indices() const { return indices; }
+
+  /// @brief Get the GPU pointer to the KdTree nodes
+  const KdTreeNodeGPU* get_nodes() const { return nodes; }
+
+private:
+  PointCloud::ConstPtr points;
+  size_t num_indices;
+  size_t num_nodes;
+  std::uint32_t* indices;
+  KdTreeNodeGPU* nodes;
+};
+
+}  // namespace gtsam_points
diff --git a/include/gtsam_points/cuda/kernels/correspondence.hpp b/include/gtsam_points/cuda/kernels/correspondence.hpp
new file mode 100644
index 00000000..d5102697
--- /dev/null
+++ b/include/gtsam_points/cuda/kernels/correspondence.hpp
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+
+#pragma once
+
+#ifdef __CUDACC__
+#define GTSAM_POINTS_HOST_DEVICE __host__ __device__
+#else
+#define GTSAM_POINTS_HOST_DEVICE
+#endif
+
+namespace gtsam_points {
+
+/// @brief A pair of source and target point indices representing a correspondence.
+struct Correspondence {
+  GTSAM_POINTS_HOST_DEVICE Correspondence() : source_idx(-1), target_idx(-1) {}
+  GTSAM_POINTS_HOST_DEVICE Correspondence(int source_idx, int target_idx) : source_idx(source_idx), target_idx(target_idx) {}
+
+  int source_idx;
+  int target_idx;
+};
+
+}  // namespace gtsam_points
diff --git a/include/gtsam_points/cuda/kernels/gicp_derivatives.cuh b/include/gtsam_points/cuda/kernels/gicp_derivatives.cuh
new file mode 100644
index 00000000..fb3013a0
--- /dev/null
+++ b/include/gtsam_points/cuda/kernels/gicp_derivatives.cuh
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+
+#pragma once
+
+#include <Eigen/Core>
+#include <thrust/device_vector.h>
+
+#include <gtsam_points/cuda/kernels/correspondence.hpp>
+#include <gtsam_points/cuda/kernels/pose.cuh>
+#include <gtsam_points/cuda/kernels/linearized_system.cuh>
+
+namespace gtsam_points {
+
+struct gicp_derivatives_kernel {
+  gicp_derivatives_kernel(
+    const Eigen::Isometry3f* linearization_point_ptr,
+    const Eigen::Vector3f* target_means,
+    const Eigen::Matrix3f* target_covs,
+    const Eigen::Vector3f* source_means,
+    const Eigen::Matrix3f* source_covs)
+  : linearization_point_ptr(linearization_point_ptr),
+    target_means_ptr(target_means),
+    target_covs_ptr(target_covs),
+    source_means_ptr(source_means),
+    source_covs_ptr(source_covs) {}
+
+  __device__ LinearizedSystem6 operator()(const Correspondence& source_target_correspondence) const {
+    const int source_idx = source_target_correspondence.source_idx;
+    const int target_idx = source_target_correspondence.target_idx;
+    if (source_idx < 0 || target_idx < 0) {
+      return LinearizedSystem6::zero();
+    }
+
+    const Eigen::Isometry3f& x = *linearization_point_ptr;
+    const Eigen::Matrix3f R = x.linear();
+    const Eigen::Vector3f t = x.translation();
+
+    const Eigen::Vector3f mean_A = source_means_ptr[source_idx];
+    const Eigen::Matrix3f cov_A = source_covs_ptr[source_idx];
+    const Eigen::Vector3f transed_mean_A = R * mean_A + t;
+
+    const Eigen::Vector3f mean_B = target_means_ptr[target_idx];
+    const Eigen::Matrix3f cov_B = target_covs_ptr[target_idx];
+
+    const Eigen::Matrix3f RCR = (R * cov_A * R.transpose());
+    const Eigen::Matrix3f RCR_inv = (cov_B + RCR).inverse();
+    Eigen::Vector3f error = mean_B - transed_mean_A;
+
+    Eigen::Matrix<float, 3, 6> J_target;
+    J_target.block<3, 3>(0, 0) = -skew_symmetric(transed_mean_A);
+    J_target.block<3, 3>(0, 3) = Eigen::Matrix3f::Identity();
+
+    Eigen::Matrix<float, 3, 6> J_source;
+    J_source.block<3, 3>(0, 0) = R * skew_symmetric(mean_A);
+    J_source.block<3, 3>(0, 3) = -R;
+
+    Eigen::Matrix<float, 6, 3> J_target_RCR_inv = J_target.transpose() * RCR_inv;
+    Eigen::Matrix<float, 6, 3> J_source_RCR_inv = J_source.transpose() * RCR_inv;
+
+    LinearizedSystem6 linearized;
+    linearized.num_inliers = 1;
+    linearized.error = error.transpose() * RCR_inv * error;
+    linearized.H_target = J_target_RCR_inv * J_target;
+    linearized.H_source = J_source_RCR_inv * J_source;
+    linearized.H_target_source = J_target_RCR_inv * J_source;
+    linearized.b_target = J_target_RCR_inv * error;
+    linearized.b_source = J_source_RCR_inv * error;
+
+    return linearized;
+  }
+
+  const Eigen::Isometry3f* linearization_point_ptr;
+
+  const Eigen::Vector3f* target_means_ptr;
+  const Eigen::Matrix3f* target_covs_ptr;
+
+  const Eigen::Vector3f* source_means_ptr;
+  const Eigen::Matrix3f* source_covs_ptr;
+};
+
+struct gicp_error_kernel {
+  gicp_error_kernel(
+    const Eigen::Isometry3f* linearization_point_ptr,
+    const Eigen::Isometry3f* evaluation_point_ptr,
+    const Eigen::Vector3f* target_means,
+    const Eigen::Matrix3f* target_covs,
+    const Eigen::Vector3f* source_means,
+    const Eigen::Matrix3f* source_covs)
+  : linearization_point_ptr(linearization_point_ptr),
+    evaluation_point_ptr(evaluation_point_ptr),
+    target_means_ptr(target_means),
+    target_covs_ptr(target_covs),
+    source_means_ptr(source_means),
+    source_covs_ptr(source_covs) {}
+
+  __device__ float operator()(const Correspondence& source_target_correspondence) const {
+    const int source_idx = source_target_correspondence.source_idx;
+    const int target_idx = source_target_correspondence.target_idx;
+    if (source_idx < 0 || target_idx < 0) {
+      return 0.0f;
+    }
+
+    const Eigen::Isometry3f& xl = *linearization_point_ptr;
+    const Eigen::Matrix3f Rl = xl.linear();
+
+    const Eigen::Isometry3f& xe = *evaluation_point_ptr;
+    const Eigen::Matrix3f Re = xe.linear();
+    const Eigen::Vector3f te = xe.translation();
+
+    const Eigen::Vector3f mean_A = source_means_ptr[source_idx];
+    const Eigen::Matrix3f cov_A = source_covs_ptr[source_idx];
+    const Eigen::Vector3f transed_mean_A = Re * mean_A + te;
+
+    const Eigen::Vector3f mean_B = target_means_ptr[target_idx];
+    const Eigen::Matrix3f cov_B = target_covs_ptr[target_idx];
+
+    const Eigen::Matrix3f RCR = (Rl * cov_A * Rl.transpose());
+    const Eigen::Matrix3f RCR_inv = (cov_B + RCR).inverse();
+    Eigen::Vector3f error = mean_B - transed_mean_A;
+
+    return error.transpose() * RCR_inv * error;
+  }
+
+  const Eigen::Isometry3f* linearization_point_ptr;
+  const Eigen::Isometry3f* evaluation_point_ptr;
+
+  const Eigen::Vector3f* target_means_ptr;
+  const Eigen::Matrix3f* target_covs_ptr;
+  const Eigen::Vector3f* source_means_ptr;
+  const Eigen::Matrix3f* source_covs_ptr;
+};
+
+}  // namespace gtsam_points
diff --git a/include/gtsam_points/cuda/kernels/kdtree.cuh b/include/gtsam_points/cuda/kernels/kdtree.cuh
new file mode 100644
index 00000000..013c8628
--- /dev/null
+++ b/include/gtsam_points/cuda/kernels/kdtree.cuh
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+#pragma once
+#include <gtsam_points/ann/kdtree_gpu.hpp>
+
+namespace gtsam_points {
+
+struct kdtree_nearest_neighbor_search_kernel {
+public:
+  static constexpr int MAX_STACK_SIZE = 20;
+
+  __device__ thrust::pair<NodeIndexType, float> operator()(const Eigen::Vector3f query) const {
+    thrust::pair<NodeIndexType, float> result = {INVALID_NODE, std::numeric_limits<float>::max()};
+
+    int stack_size = 1;
+    thrust::pair<int, float> search_stack[MAX_STACK_SIZE] = {{0, 0.0f}};
+
+    while (stack_size > 0) {
+      const auto [node_index, sq_dist] = search_stack[--stack_size];
+      if (sq_dist > result.second) {
+        continue;
+      }
+
+      const KdTreeNodeGPU node = nodes[node_index];
+
+      // Leaf node
+      if (node.left == INVALID_NODE) {
+        for (NodeIndexType i = node.node_type.lr.first; i < node.node_type.lr.last; i++) {
+          const NodeIndexType pt_index = indices[i];
+          const float sq_dist = (points[pt_index] - query).squaredNorm();
+          if (sq_dist < result.second) {
+            result = {pt_index, sq_dist};
+          }
+        }
+        continue;
+      }
+
+      const float val = query[node.node_type.sub.axis];
+      const float diff = val - node.node_type.sub.thresh;
+      const float cut_sq_dist = diff * diff;
+
+      int best_child;
+      int other_child;
+
+      if (diff < 0.0f) {
+        best_child = node.left;
+        other_child = node.right;
+      } else {
+        best_child = node.right;
+        other_child = node.left;
+      }
+
+      if (stack_size > MAX_STACK_SIZE - 2) {
+        printf("kdtree stack overflow!!");
+      } else if (cut_sq_dist < result.second) {
+        search_stack[stack_size].first = other_child;
+        search_stack[stack_size++].second = cut_sq_dist;
+      }
+
+      search_stack[stack_size].first = best_child;
+      search_stack[stack_size++].second = 0.0f;
+    }
+
+    return result;
+  }
+
+public:
+  const Eigen::Vector3f* __restrict__ points;
+  const std::uint32_t* __restrict__ indices;
+  const KdTreeNodeGPU* __restrict__ nodes;
+};
+
+}  // namespace gtsam_points
diff --git a/include/gtsam_points/factors/integrated_gicp_derivatives.cuh b/include/gtsam_points/factors/integrated_gicp_derivatives.cuh
new file mode 100644
index 00000000..e219b4b0
--- /dev/null
+++ b/include/gtsam_points/factors/integrated_gicp_derivatives.cuh
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+
+#pragma once
+
+#include <memory>
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+
+#include <gtsam_points/ann/kdtree_gpu.hpp>
+#include <gtsam_points/cuda/kernels/correspondence.hpp>
+#include <gtsam_points/types/point_cloud_gpu.hpp>
+
+struct CUstream_st;
+
+namespace gtsam_points {
+
+class LinearizedSystem6;
+class TempBufferManager;
+
+class IntegratedGICPDerivatives {
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+  IntegratedGICPDerivatives(
+    const PointCloud::ConstPtr& target,
+    const PointCloud::ConstPtr& source,
+    const KdTreeGPU::ConstPtr& target_tree,
+    CUstream_st* ext_stream,
+    std::shared_ptr<TempBufferManager> temp_buffer);
+  ~IntegratedGICPDerivatives();
+
+  void set_inlier_update_thresh(double trans, double angle) {
+    inlier_update_thresh_trans = trans;
+    inlier_update_thresh_angle = angle;
+  }
+
+  void set_max_correspondence_distance(double dist) { max_correspondence_distance_sq = dist * dist; }
+
+  void set_enable_offloading(bool enable) { enable_offloading = enable; }
+
+  void set_enable_surface_validation(bool enable) { enable_surface_validation = enable; }
+
+  int get_num_inliers() const { return num_inliers; }
+
+  void touch_points();
+
+  // synchronized interface
+  LinearizedSystem6 linearize(const Eigen::Isometry3f& x);
+  double compute_error(const Eigen::Isometry3f& xl, const Eigen::Isometry3f& xe);
+
+  void reset_inliers(const Eigen::Isometry3f& x, const Eigen::Isometry3f* d_x, bool force_update = false);
+  void update_inliers(int num_inliers);
+
+  // async interface
+  void sync_stream();
+  void issue_linearize(const Eigen::Isometry3f* d_x, LinearizedSystem6* d_output);
+  void issue_compute_error(const Eigen::Isometry3f* d_xl, const Eigen::Isometry3f* d_xe, float* d_output);
+
+private:
+  bool enable_offloading;
+
+  bool enable_surface_validation;
+  double max_correspondence_distance_sq;
+  double inlier_update_thresh_trans;
+  double inlier_update_thresh_angle;
+
+  bool external_stream;
+  CUstream_st* stream;
+  std::shared_ptr<TempBufferManager> temp_buffer;
+
+  PointCloud::ConstPtr target;
+  PointCloud::ConstPtr source;
+  KdTreeGPU::ConstPtr target_tree;
+
+  Eigen::Isometry3f inlier_evaluation_point;
+  const Eigen::Isometry3f* inlier_evaluation_point_gpu;
+
+  int num_inliers;
+  int* num_inliers_gpu;
+  Correspondence* source_target_correspondences;
+};
+}  // namespace gtsam_points
\ No newline at end of file
diff --git a/include/gtsam_points/factors/integrated_gicp_factor_gpu.hpp b/include/gtsam_points/factors/integrated_gicp_factor_gpu.hpp
new file mode 100644
index 00000000..334345d2
--- /dev/null
+++ b/include/gtsam_points/factors/integrated_gicp_factor_gpu.hpp
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+
+#pragma once
+
+#include <optional>
+#include <gtsam_points/types/point_cloud.hpp>
+#include <gtsam_points/ann/kdtree_gpu.hpp>
+#include <gtsam_points/factors/nonlinear_factor_gpu.hpp>
+
+struct CUstream_st;
+
+namespace gtsam {
+class Pose3;
+}
+
+namespace gtsam_points {
+
+class LinearizedSystem6;
+class IntegratedGICPDerivatives;
+class TempBufferManager;
+
+/**
+ * @brief GPU-accelerated GICP matching cost factor
+ */
+class IntegratedGICPFactorGPU : public gtsam_points::NonlinearFactorGPU {
+public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+  using shared_ptr = std::shared_ptr<IntegratedGICPFactorGPU>;
+
+  /**
+   * @brief Create a binary GICP_GPU factor between target and source poses.
+   * @param target_key    Target key
+   * @param source_key    Source key
+   * @param target        Target frame
+   * @param source        Source frame
+   * @param target_tree   Target kdtree
+   * @param stream        CUDA stream
+   * @param temp_buffer   CUDA temporary buffer manager
+   */
+  IntegratedGICPFactorGPU(
+    gtsam::Key target_key,
+    gtsam::Key source_key,
+    const PointCloud::ConstPtr& target,
+    const PointCloud::ConstPtr& source,
+    const KdTreeGPU::ConstPtr& target_tree,
+    CUstream_st* stream = nullptr,
+    std::shared_ptr<TempBufferManager> temp_buffer = nullptr);
+
+  /**
+   * @brief Create a unary GICP_GPU factor between a fixed target pose and an active source pose.
+   * @param targfixed_target_pose  Fixed target pose
+   * @param source_key             Source key
+   * @param target                 Target kdtree
+   * @param source                 Source frame
+   * @param stream                 CUDA stream
+   * @param temp_buffer            CUDA temporary buffer manager
+   */
+  IntegratedGICPFactorGPU(
+    const gtsam::Pose3& fixed_target_pose,
+    gtsam::Key source_key,
+    const PointCloud::ConstPtr& target,
+    const PointCloud::ConstPtr& source,
+    const KdTreeGPU::ConstPtr& target_tree,
+    CUstream_st* stream = nullptr,
+    std::shared_ptr<TempBufferManager> temp_buffer = nullptr);
+
+  virtual ~IntegratedGICPFactorGPU() override;
+
+  /// @brief Print the factor information.
+  virtual void print(const std::string& s = "", const gtsam::KeyFormatter& keyFormatter = gtsam::DefaultKeyFormatter) const override;
+
+  /// @brief  Calculate the CPU memory usage of this factor
+  /// @note   The result is approximate and does not account for objects not owned by this factor (e.g., point clouds)
+  /// @return Approximate CPU memory usage in bytes
+  size_t memory_usage() const;
+
+  /// @brief  Calculate the GPU memory usage of this factor
+  /// @note   The result is approximate and does not account for objects not owned by this factor (e.g., point clouds)
+  /// @return Approximate GPU memory usage in bytes
+  size_t memory_usage_gpu() const;
+
+  /// @brief Enable or disable GPU memory offloading.
+  void set_enable_offloading(bool enable);
+
+  /// @brief Enable or disable surface orientation validation for correspondence search
+  /// @note  To enable surface orientation validation, source frame must have point normals
+  void set_enable_surface_validation(bool enable);
+
+  /// @brief Set the threshold values to trigger inlier points update.
+  ///        Setting larger values reduces GPU sync but may affect the registration accuracy.
+  void set_inlier_update_thresh(double trans, double angle);
+
+  /// @brief Set the maximum distance between corresponding points.
+  ///        Correspondences with distances larger than this will be rejected (i.e., correspondence trimming).
+  void set_max_correspondence_distance(double dist);
+
+  /// @brief  Get the number of inlier points.
+  /// @note   This function must be called after the factor is linearized.
+  int num_inliers() const;
+
+  /// @brief Get the fraction of inlier points.
+  /// @note  This function must be called after the factor is linearized.
+  double inlier_fraction() const;
+
+  /// @brief Get the pose of the fixed target. This function is only valid for unary factors.
+  Eigen::Isometry3f get_fixed_target_pose() const { return fixed_target_pose; }
+
+  // forbid copy
+  IntegratedGICPFactorGPU(const IntegratedGICPFactorGPU&) = delete;
+  IntegratedGICPFactorGPU& operator=(const IntegratedGICPFactorGPU&) = delete;
+
+  virtual gtsam::NonlinearFactor::shared_ptr clone() const override;
+
+  virtual size_t dim() const override { return 6; }
+  virtual double error(const gtsam::Values& values) const override;
+  virtual std::shared_ptr<gtsam::GaussianFactor> linearize(const gtsam::Values& values) const override;
+
+  virtual size_t linearization_input_size() const override;
+  virtual size_t linearization_output_size() const override;
+  virtual size_t evaluation_input_size() const override;
+  virtual size_t evaluation_output_size() const override;
+
+  virtual void set_linearization_point(const gtsam::Values& values, void* lin_input_cpu) override;
+  virtual void issue_linearize(const void* lin_input_cpu, const void* lin_input_gpu, void* lin_output_gpu) override;
+  virtual void store_linearized(const void* lin_output_cpu) override;
+
+  virtual void set_evaluation_point(const gtsam::Values& values, void* eval_input_cpu) override;
+  virtual void issue_compute_error(
+    const void* lin_input_cpu,
+    const void* eval_input_cpu,
+    const void* lin_input_gpu,
+    const void* eval_input_gpu,
+    void* eval_output_gpu) override;
+  virtual void store_computed_error(const void* eval_output_cpu) override;
+
+  virtual void sync() override;
+
+private:
+  Eigen::Isometry3f calc_delta(const gtsam::Values& values) const;
+
+private:
+  bool is_binary;
+  Eigen::Isometry3f fixed_target_pose;
+
+  PointCloud::ConstPtr target;
+  PointCloud::ConstPtr source;
+  KdTreeGPU::ConstPtr target_tree;
+
+  std::unique_ptr<IntegratedGICPDerivatives> derivatives;
+
+  mutable bool linearized;
+  mutable Eigen::Isometry3f linearization_point;
+
+  mutable std::optional<float> evaluation_result;
+  mutable std::unique_ptr<LinearizedSystem6> linearization_result;
+};
+
+}  // namespace gtsam_points
\ No newline at end of file
diff --git a/src/demo/demo_matching_cost_factors.cpp b/src/demo/demo_matching_cost_factors.cpp
index fb8bf2b0..1bfcf6a9 100644
--- a/src/demo/demo_matching_cost_factors.cpp
+++ b/src/demo/demo_matching_cost_factors.cpp
@@ -23,8 +23,10 @@
 
 #include <gtsam_points/factors/integrated_icp_factor.hpp>
 #include <gtsam_points/factors/integrated_gicp_factor.hpp>
+#include <gtsam_points/factors/integrated_gicp_factor_gpu.hpp>
 #include <gtsam_points/factors/integrated_vgicp_factor.hpp>
 #include <gtsam_points/factors/integrated_vgicp_factor_gpu.hpp>
+#include <gtsam_points/ann/kdtree_gpu.hpp>
 #include <gtsam_points/optimizers/isam2_ext.hpp>
 #include <gtsam_points/optimizers/levenberg_marquardt_ext.hpp>
 #include <gtsam_points/optimizers/linearization_hook.hpp>
@@ -56,6 +58,7 @@ class MatchingCostFactorDemo {
     frames.resize(5);
     voxelmaps.resize(5);
     voxelmaps_gpu.resize(5);
+    kdtrees_gpu.resize(5);
 
     for (int i = 0; i < 5; i++) {
       std::string token;
@@ -103,6 +106,9 @@ class MatchingCostFactorDemo {
       auto voxelmap_gpu = std::make_shared<gtsam_points::GaussianVoxelMapGPU>(2.0);
       voxelmap_gpu->insert(*frame);
       voxelmaps_gpu[i] = voxelmap_gpu;
+
+      auto kdtree_gpu = std::make_shared<gtsam_points::KdTreeGPU>(frame);
+      kdtrees_gpu[i] = kdtree_gpu;
 #endif
 
       viewer->update_drawable("frame_" + std::to_string(i), std::make_shared<glk::PointCloudBuffer>(frame->points, frame->size()), guik::Rainbow());
@@ -122,6 +128,7 @@ class MatchingCostFactorDemo {
     factor_types.push_back("GICP");
     factor_types.push_back("VGICP");
 #ifdef GTSAM_POINTS_USE_CUDA
+    factor_types.push_back("GICP_GPU");
     factor_types.push_back("VGICP_GPU");
 #endif
 
@@ -220,6 +227,10 @@ class MatchingCostFactorDemo {
       auto factor = gtsam::make_shared<gtsam_points::IntegratedVGICPFactor>(target_key, source_key, target_voxelmap, source);
       factor->set_num_threads(num_threads);
       return factor;
+    } else if (factor_types[factor_type] == std::string("GICP_GPU")) {
+#ifdef GTSAM_POINTS_USE_CUDA
+      return gtsam::make_shared<gtsam_points::IntegratedGICPFactorGPU>(target_key, source_key, target, source, kdtrees_gpu[target_key]);
+#endif
     } else if (factor_types[factor_type] == std::string("VGICP_GPU")) {
 #ifdef GTSAM_POINTS_USE_CUDA
       return gtsam::make_shared<gtsam_points::IntegratedVGICPFactorGPU>(target_key, source_key, target_voxelmap_gpu, source);
@@ -301,6 +312,7 @@ class MatchingCostFactorDemo {
   std::vector<gtsam_points::PointCloud::Ptr> frames;
   std::vector<gtsam_points::GaussianVoxelMap::Ptr> voxelmaps;
   std::vector<gtsam_points::GaussianVoxelMap::Ptr> voxelmaps_gpu;
+  std::vector<gtsam_points::KdTreeGPU::Ptr> kdtrees_gpu;
 };
 
 int main(int argc, char** argv) {
diff --git a/src/gtsam_points/ann/kdtree_gpu.cpp b/src/gtsam_points/ann/kdtree_gpu.cpp
new file mode 100644
index 00000000..96163654
--- /dev/null
+++ b/src/gtsam_points/ann/kdtree_gpu.cpp
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+#include <gtsam_points/ann/kdtree_gpu.hpp>
+
+#include <gtsam_points/cuda/check_error.cuh>
+#include <gtsam_points/ann/small_kdtree.hpp>
+
+namespace gtsam_points {
+
+KdTreeGPU::KdTreeGPU(const PointCloud::ConstPtr& points, CUstream_st* stream)
+: points(points),
+  num_indices(0),
+  num_nodes(0),
+  indices(nullptr),
+  nodes(nullptr) {
+  //
+  if (!points->has_points()) {
+    std::cerr << "error: empty point cloud is given for KdTreeGPU" << std::endl;
+    return;
+  }
+  if (!points->has_points_gpu()) {
+    std::cerr << "error: point cloud does not have GPU points for KdTreeGPU" << std::endl;
+    return;
+  }
+
+  //
+  KdTreeBuilder builder;
+  UnsafeKdTree<PointCloud> kdtree(*points, builder);
+
+  // copy to GPU
+  std::vector<std::uint32_t> h_indices(kdtree.indices.begin(), kdtree.indices.end());
+  std::vector<KdTreeNodeGPU> h_nodes(kdtree.nodes.size());
+
+  for (int i = 0; i < kdtree.nodes.size(); i++) {
+    const auto& in = kdtree.nodes[i];
+    auto& out = h_nodes[i];
+
+    out.left = in.left;
+    out.right = in.right;
+
+    if (in.left == INVALID_NODE) {
+      out.node_type.lr.first = in.node_type.lr.first;
+      out.node_type.lr.last = in.node_type.lr.last;
+    } else {
+      out.node_type.sub.axis = in.node_type.sub.proj.axis;
+      out.node_type.sub.thresh = in.node_type.sub.thresh;
+    }
+  }
+
+  num_indices = kdtree.indices.size();
+  num_nodes = kdtree.nodes.size();
+  check_error << cudaMallocAsync(&indices, sizeof(std::uint32_t) * num_indices, stream);
+  check_error << cudaMallocAsync(&nodes, sizeof(KdTreeNodeGPU) * num_nodes, stream);
+  check_error << cudaMemcpyAsync(indices, h_indices.data(), sizeof(std::uint32_t) * num_indices, cudaMemcpyHostToDevice, stream);
+  check_error << cudaMemcpyAsync(nodes, h_nodes.data(), sizeof(KdTreeNodeGPU) * num_nodes, cudaMemcpyHostToDevice, stream);
+}
+
+KdTreeGPU::~KdTreeGPU() {
+  check_error << cudaFreeAsync(indices, nullptr);
+  check_error << cudaFreeAsync(nodes, nullptr);
+}
+
+void KdTreeGPU::nearest_neighbor_search_cpu(
+  const Eigen::Vector3f* h_queries,
+  size_t num_queries,
+  std::uint32_t* h_nn_indices,
+  float* h_nn_sq_dists,
+  CUstream_st* stream) {
+  //
+  Eigen::Vector3f* d_queries;
+  std::uint32_t* d_nn_indices;
+  float* d_nn_sq_dists;
+
+  check_error << cudaMallocAsync(&d_queries, sizeof(Eigen::Vector3f) * num_queries, stream);
+  check_error << cudaMallocAsync(&d_nn_indices, sizeof(std::uint32_t) * num_queries, stream);
+  check_error << cudaMallocAsync(&d_nn_sq_dists, sizeof(float) * num_queries, stream);
+  check_error << cudaMemcpyAsync(d_queries, h_queries, sizeof(Eigen::Vector3f) * num_queries, cudaMemcpyHostToDevice, stream);
+
+  nearest_neighbor_search(d_queries, num_queries, d_nn_indices, d_nn_sq_dists, stream);
+
+  check_error << cudaMemcpyAsync(h_nn_indices, d_nn_indices, sizeof(std::uint32_t) * num_queries, cudaMemcpyDeviceToHost, stream);
+  check_error << cudaMemcpyAsync(h_nn_sq_dists, d_nn_sq_dists, sizeof(float) * num_queries, cudaMemcpyDeviceToHost, stream);
+
+  check_error << cudaFreeAsync(d_queries, stream);
+  check_error << cudaFreeAsync(d_nn_indices, stream);
+  check_error << cudaFreeAsync(d_nn_sq_dists, stream);
+}
+
+}  // namespace gtsam_points
\ No newline at end of file
diff --git a/src/gtsam_points/ann/kdtree_gpu.cu b/src/gtsam_points/ann/kdtree_gpu.cu
new file mode 100644
index 00000000..e9fbede2
--- /dev/null
+++ b/src/gtsam_points/ann/kdtree_gpu.cu
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+#include <gtsam_points/ann/kdtree_gpu.hpp>
+
+#include <algorithm>
+#include <thrust/copy.h>
+
+#include <thrust/pair.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <gtsam_points/cuda/check_error.cuh>
+#include <gtsam_points/cuda/kernels/kdtree.cuh>
+
+namespace gtsam_points {
+
+namespace {
+
+struct nn_search_kernel {
+public:
+  __device__ void operator()(std::uint32_t i) const {
+    const Eigen::Vector3f query = queries[i];
+
+    kdtree_nearest_neighbor_search_kernel search{points, indices, nodes};
+    const auto result = search(query);
+
+    nn_indices[i] = result.first;
+    nn_sq_dists[i] = result.second;
+  }
+
+public:
+  const Eigen::Vector3f* __restrict__ points;
+  const std::uint32_t* __restrict__ indices;
+  const KdTreeNodeGPU* __restrict__ nodes;
+
+  const Eigen::Vector3f* __restrict__ queries;
+
+  std::uint32_t* nn_indices;
+  float* nn_sq_dists;
+};
+}  // namespace
+
+void KdTreeGPU::nearest_neighbor_search(
+  const Eigen::Vector3f* queries,
+  const size_t num_queries,
+  std::uint32_t* nn_indices,
+  float* nn_sq_dists,
+  CUstream_st* stream) {
+  thrust::for_each(
+    thrust::cuda::par.on(stream),
+    thrust::counting_iterator<std::uint32_t>(0),
+    thrust::counting_iterator<std::uint32_t>(num_queries),
+    nn_search_kernel{points->points_gpu, indices, nodes, queries, nn_indices, nn_sq_dists});
+}
+
+}  // namespace gtsam_points
\ No newline at end of file
diff --git a/src/gtsam_points/factors/integrated_gicp_derivatives.cu b/src/gtsam_points/factors/integrated_gicp_derivatives.cu
new file mode 100644
index 00000000..c7830b87
--- /dev/null
+++ b/src/gtsam_points/factors/integrated_gicp_derivatives.cu
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+
+#include <gtsam_points/factors/integrated_gicp_derivatives.cuh>
+
+#include <iostream>
+#include <Eigen/Geometry>
+
+#include <gtsam_points/cuda/check_error.cuh>
+#include <gtsam_points/cuda/kernels/linearized_system.cuh>
+#include <gtsam_points/cuda/kernels/vgicp_derivatives.cuh>
+#include <gtsam_points/cuda/stream_temp_buffer_roundrobin.hpp>
+#include <gtsam_points/cuda/cuda_malloc_async.hpp>
+
+#include <gtsam_points/types/point_cloud_gpu.hpp>
+
+namespace gtsam_points {
+
+IntegratedGICPDerivatives::IntegratedGICPDerivatives(
+  const PointCloud::ConstPtr& target,
+  const PointCloud::ConstPtr& source,
+  const KdTreeGPU::ConstPtr& target_tree,
+  CUstream_st* ext_stream,
+  std::shared_ptr<TempBufferManager> temp_buffer)
+: enable_offloading(false),
+  enable_surface_validation(false),
+  max_correspondence_distance_sq(1.0),
+  inlier_update_thresh_trans(1e-6),
+  inlier_update_thresh_angle(1e-6),
+  target(target),
+  source(source),
+  target_tree(target_tree),
+  external_stream(true),
+  stream(ext_stream),
+  temp_buffer(temp_buffer),
+  num_inliers(0),
+  source_target_correspondences(nullptr) {
+  //
+  if (stream == nullptr) {
+    external_stream = false;
+    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  }
+
+  if (this->temp_buffer == nullptr) {
+    this->temp_buffer.reset(new TempBufferManager());
+  }
+
+  check_error << cudaMallocAsync(&num_inliers_gpu, sizeof(int), stream);
+  // check_error << cudaHostRegister(&num_inliers, sizeof(int), cudaHostRegisterDefault);
+}
+
+IntegratedGICPDerivatives::~IntegratedGICPDerivatives() {
+  check_error << cudaFreeAsync(source_target_correspondences, stream);
+  check_error << cudaFreeAsync(num_inliers_gpu, stream);
+  // check_error << cudaHostUnregister(&num_inliers);
+
+  if (!external_stream) {
+    cudaStreamDestroy(stream);
+  }
+}
+
+void IntegratedGICPDerivatives::sync_stream() {
+  check_error << cudaStreamSynchronize(stream);
+}
+
+void IntegratedGICPDerivatives::touch_points() {
+  if (!enable_offloading) {
+    return;
+  }
+
+  // auto target_ = const_cast<GaussianVoxelMapGPU*>(target.get());
+  // target_->touch(stream);
+
+  // auto source_gpu_const = dynamic_cast<const PointCloudGPU*>(source.get());
+  // if (!source_gpu_const) {
+  //   return;
+  // }
+
+  // auto source_gpu = const_cast<PointCloudGPU*>(source_gpu_const);
+  // source_gpu->touch(stream);
+}
+
+LinearizedSystem6 IntegratedGICPDerivatives::linearize(const Eigen::Isometry3f& x) {
+  thrust::device_vector<Eigen::Isometry3f> x_ptr(1);
+  thrust::device_vector<LinearizedSystem6> output_ptr(1);
+
+  x_ptr[0] = x;
+
+  reset_inliers(x, thrust::raw_pointer_cast(x_ptr.data()));
+  issue_linearize(thrust::raw_pointer_cast(x_ptr.data()), thrust::raw_pointer_cast(output_ptr.data()));
+  sync_stream();
+
+  LinearizedSystem6 linearized = output_ptr[0];
+
+  return linearized;
+}
+
+double IntegratedGICPDerivatives::compute_error(const Eigen::Isometry3f& d_xl, const Eigen::Isometry3f& d_xe) {
+  thrust::device_vector<Eigen::Isometry3f> xs_ptr(2);
+  xs_ptr[0] = d_xl;
+  xs_ptr[1] = d_xe;
+  thrust::device_vector<float> output_ptr(1);
+
+  issue_compute_error(
+    thrust::raw_pointer_cast(xs_ptr.data()),
+    thrust::raw_pointer_cast(xs_ptr.data() + 1),
+    thrust::raw_pointer_cast(output_ptr.data()));
+  sync_stream();
+
+  float error = output_ptr[0];
+  return error;
+}
+
+}  // namespace gtsam_points
\ No newline at end of file
diff --git a/src/gtsam_points/factors/integrated_gicp_derivatives_compute.cu b/src/gtsam_points/factors/integrated_gicp_derivatives_compute.cu
new file mode 100644
index 00000000..db187c8e
--- /dev/null
+++ b/src/gtsam_points/factors/integrated_gicp_derivatives_compute.cu
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+
+#include <gtsam_points/factors/integrated_gicp_derivatives.cuh>
+
+#include <iostream>
+#include <thrust/remove.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <cub/device/device_reduce.cuh>
+
+#include <gtsam_points/cuda/kernels/pose.cuh>
+#include <gtsam_points/cuda/kernels/untie.cuh>
+#include <gtsam_points/cuda/kernels/kdtree.cuh>
+#include <gtsam_points/cuda/kernels/linearized_system.cuh>
+#include <gtsam_points/cuda/kernels/gicp_derivatives.cuh>
+#include <gtsam_points/cuda/stream_temp_buffer_roundrobin.hpp>
+
+namespace gtsam_points {
+
+namespace {
+
+/// @brief Kernel to compute source-target correspondence by KdTree NN search
+struct kdtree_correspondence_kernel {
+  kdtree_correspondence_kernel(
+    const Eigen::Isometry3f* linearization_point_ptr,
+    const Eigen::Vector3f* source_points,
+    const Eigen::Vector3f* target_points,
+    const std::uint32_t* target_indices,
+    const KdTreeNodeGPU* target_nodes)
+  : linearization_point_ptr(linearization_point_ptr),
+    source_points(source_points),
+    target_points(target_points),
+    target_indices(target_indices),
+    target_nodes(target_nodes) {}
+
+  __device__ Correspondence operator()(const Correspondence& source_target) const {
+    const int source_idx = source_target.source_idx;
+    if (source_idx < 0) {
+      return source_target;
+    }
+
+    const Eigen::Isometry3f& x = *linearization_point_ptr;
+    const Eigen::Vector3f transed_pt = x.linear() * source_points[source_idx] + x.translation();
+
+    kdtree_nearest_neighbor_search_kernel nn_search;
+    nn_search.points = target_points;
+    nn_search.indices = target_indices;
+    nn_search.nodes = target_nodes;
+
+    const auto [nn_idx, sq_dist] = nn_search(transed_pt);
+
+    return Correspondence(source_idx, static_cast<int>(nn_idx));
+  }
+
+  const Eigen::Isometry3f* linearization_point_ptr;
+  const Eigen::Vector3f* source_points;
+  const Eigen::Vector3f* target_points;
+  const std::uint32_t* target_indices;
+  const KdTreeNodeGPU* target_nodes;
+};
+
+}  // namespace
+
+void IntegratedGICPDerivatives::issue_compute_error(const Eigen::Isometry3f* d_xl, const Eigen::Isometry3f* d_xe, float* d_output) {
+  // Reuse correspondences computed in issue_linearize instead of recomputing KdTree search
+  // Compute GICP error using the correspondences
+  gicp_error_kernel error_kernel(
+    d_xl,
+    d_xe,
+    reinterpret_cast<const Eigen::Vector3f*>(target->points_gpu),
+    reinterpret_cast<const Eigen::Matrix3f*>(target->covs_gpu),
+    reinterpret_cast<const Eigen::Vector3f*>(source->points_gpu),
+    reinterpret_cast<const Eigen::Matrix3f*>(source->covs_gpu));
+
+  auto first = thrust::make_transform_iterator(source_target_correspondences, error_kernel);
+
+  void* temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+
+  cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, first, d_output, num_inliers, thrust::plus<float>(), 0.0f, stream);
+
+  temp_storage = temp_buffer->get_buffer(temp_storage_bytes);
+
+  cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, first, d_output, num_inliers, thrust::plus<float>(), 0.0f, stream);
+}
+
+}  // namespace gtsam_points
diff --git a/src/gtsam_points/factors/integrated_gicp_derivatives_inliers.cu b/src/gtsam_points/factors/integrated_gicp_derivatives_inliers.cu
new file mode 100644
index 00000000..43a1fdf6
--- /dev/null
+++ b/src/gtsam_points/factors/integrated_gicp_derivatives_inliers.cu
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+
+#include <gtsam_points/factors/integrated_gicp_derivatives.cuh>
+
+#include <iostream>
+#include <thrust/remove.h>
+#include <thrust/sequence.h>
+#include <thrust/host_vector.h>
+#include <thrust/transform.h>
+
+#include <gtsam_points/cuda/check_error.cuh>
+#include <gtsam_points/cuda/kernels/pose.cuh>
+#include <gtsam_points/cuda/stream_temp_buffer_roundrobin.hpp>
+#include <gtsam_points/cuda/cuda_malloc_async.hpp>
+
+namespace gtsam_points {
+
+namespace {
+
+struct init_source_target_indices_kernel {
+  __host__ __device__ Correspondence operator()(const int i) const { return Correspondence(i, -1); }
+};
+
+}  // namespace
+
+void IntegratedGICPDerivatives::reset_inliers(const Eigen::Isometry3f& x, const Eigen::Isometry3f* d_x, bool force_update) {
+  touch_points();
+
+  if (
+    force_update || source_target_correspondences == nullptr ||
+    large_displacement(inlier_evaluation_point, x, inlier_update_thresh_trans, inlier_update_thresh_angle)) {
+    inlier_evaluation_point = x;
+    inlier_evaluation_point_gpu = d_x;
+
+    check_error << cudaFreeAsync(source_target_correspondences, stream);
+    check_error << cudaMallocAsync(&source_target_correspondences, sizeof(Correspondence) * source->size(), stream);
+    thrust::transform(
+      thrust::cuda::par_nosync.on(stream),
+      thrust::make_counting_iterator<int>(0),
+      thrust::make_counting_iterator<int>(source->size()),
+      source_target_correspondences,
+      init_source_target_indices_kernel());
+    num_inliers = source->size();
+  } else {
+    inlier_evaluation_point_gpu = nullptr;
+  }
+}
+
+void IntegratedGICPDerivatives::update_inliers(int new_num_inliers) {
+  if (inlier_evaluation_point_gpu == nullptr) {
+    return;
+  }
+
+  // For now, we keep all correspondences without filtering
+  // The correspondence filtering can be added later with validity checks
+  this->num_inliers = new_num_inliers;
+}
+
+}  // namespace gtsam_points
\ No newline at end of file
diff --git a/src/gtsam_points/factors/integrated_gicp_derivatives_linearize.cu b/src/gtsam_points/factors/integrated_gicp_derivatives_linearize.cu
new file mode 100644
index 00000000..275f98ab
--- /dev/null
+++ b/src/gtsam_points/factors/integrated_gicp_derivatives_linearize.cu
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+
+#include <gtsam_points/factors/integrated_gicp_derivatives.cuh>
+
+#include <iostream>
+#include <thrust/remove.h>
+#include <thrust/transform.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <cub/device/device_reduce.cuh>
+
+#include <gtsam_points/cuda/check_error.cuh>
+#include <gtsam_points/cuda/kernels/pose.cuh>
+#include <gtsam_points/cuda/kernels/untie.cuh>
+#include <gtsam_points/cuda/kernels/kdtree.cuh>
+#include <gtsam_points/cuda/kernels/linearized_system.cuh>
+#include <gtsam_points/cuda/kernels/gicp_derivatives.cuh>
+#include <gtsam_points/cuda/stream_temp_buffer_roundrobin.hpp>
+#include <gtsam_points/cuda/cuda_malloc_async.hpp>
+
+namespace gtsam_points {
+
+namespace {
+
+/// @brief Kernel to compute source-target correspondence by KdTree NN search
+struct kdtree_correspondence_kernel {
+  kdtree_correspondence_kernel(
+    const Eigen::Isometry3f* linearization_point_ptr,
+    const Eigen::Vector3f* source_points,
+    const Eigen::Vector3f* target_points,
+    const std::uint32_t* target_indices,
+    const KdTreeNodeGPU* target_nodes,
+    float max_correspondence_distance_sq)
+  : linearization_point_ptr(linearization_point_ptr),
+    source_points(source_points),
+    target_points(target_points),
+    target_indices(target_indices),
+    target_nodes(target_nodes),
+    max_correspondence_distance_sq(max_correspondence_distance_sq) {}
+
+  __device__ Correspondence operator()(const Correspondence& source_target) const {
+    const int source_idx = source_target.source_idx;
+    if (source_idx < 0) {
+      return source_target;
+    }
+
+    const Eigen::Isometry3f& x = *linearization_point_ptr;
+    const Eigen::Vector3f transed_pt = x.linear() * source_points[source_idx] + x.translation();
+
+    kdtree_nearest_neighbor_search_kernel nn_search;
+    nn_search.points = target_points;
+    nn_search.indices = target_indices;
+    nn_search.nodes = target_nodes;
+
+    const auto [nn_idx, sq_dist] = nn_search(transed_pt);
+
+    if (sq_dist > max_correspondence_distance_sq) {
+      return Correspondence(source_idx, -1);
+    }
+
+    return Correspondence(source_idx, static_cast<int>(nn_idx));
+  }
+
+  const Eigen::Isometry3f* linearization_point_ptr;
+  const Eigen::Vector3f* source_points;
+  const Eigen::Vector3f* target_points;
+  const std::uint32_t* target_indices;
+  const KdTreeNodeGPU* target_nodes;
+  float max_correspondence_distance_sq;
+};
+
+}  // namespace
+
+void IntegratedGICPDerivatives::issue_linearize(const Eigen::Isometry3f* d_x, LinearizedSystem6* d_output) {
+  // Compute correspondences using KdTree nearest neighbor search
+  kdtree_correspondence_kernel corr_kernel(
+    d_x,
+    reinterpret_cast<const Eigen::Vector3f*>(source->points_gpu),
+    reinterpret_cast<const Eigen::Vector3f*>(target->points_gpu),
+    target_tree->get_indices(),
+    target_tree->get_nodes(),
+    max_correspondence_distance_sq);
+
+  // Materialize correspondences in-place for reuse in issue_compute_error
+  thrust::transform(
+    thrust::cuda::par_nosync.on(stream),
+    source_target_correspondences,
+    source_target_correspondences + num_inliers,
+    source_target_correspondences,
+    corr_kernel);
+
+  // Compute GICP derivatives using the computed correspondences
+  gicp_derivatives_kernel deriv_kernel(
+    d_x,
+    reinterpret_cast<const Eigen::Vector3f*>(target->points_gpu),
+    reinterpret_cast<const Eigen::Matrix3f*>(target->covs_gpu),
+    reinterpret_cast<const Eigen::Vector3f*>(source->points_gpu),
+    reinterpret_cast<const Eigen::Matrix3f*>(source->covs_gpu));
+
+  auto first = thrust::make_transform_iterator(source_target_correspondences, deriv_kernel);
+
+  void* temp_storage = nullptr;
+  size_t temp_storage_bytes = 0;
+
+  cub::DeviceReduce::Reduce(
+    temp_storage,
+    temp_storage_bytes,
+    first,
+    d_output,
+    num_inliers,
+    thrust::plus<LinearizedSystem6>(),
+    LinearizedSystem6::zero(),
+    stream);
+
+  temp_storage = temp_buffer->get_buffer(temp_storage_bytes);
+
+  cub::DeviceReduce::Reduce(
+    temp_storage,
+    temp_storage_bytes,
+    first,
+    d_output,
+    num_inliers,
+    thrust::plus<LinearizedSystem6>(),
+    LinearizedSystem6::zero(),
+    stream);
+}
+
+}  // namespace gtsam_points
diff --git a/src/gtsam_points/factors/integrated_gicp_factor_gpu.cpp b/src/gtsam_points/factors/integrated_gicp_factor_gpu.cpp
new file mode 100644
index 00000000..e56ee112
--- /dev/null
+++ b/src/gtsam_points/factors/integrated_gicp_factor_gpu.cpp
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025  Kenji Koide (k.koide@aist.go.jp)
+
+#include <gtsam_points/factors/integrated_gicp_factor_gpu.hpp>
+
+#include <cuda_runtime.h>
+
+#include <gtsam/geometry/Pose3.h>
+#include <gtsam/linear/HessianFactor.h>
+
+#include <gtsam_points/cuda/kernels/linearized_system.cuh>
+#include <gtsam_points/cuda/stream_temp_buffer_roundrobin.hpp>
+#include <gtsam_points/factors/integrated_gicp_derivatives.cuh>
+
+namespace gtsam_points {
+
+IntegratedGICPFactorGPU::IntegratedGICPFactorGPU(
+  gtsam::Key target_key,
+  gtsam::Key source_key,
+  const PointCloud::ConstPtr& target,
+  const PointCloud::ConstPtr& source,
+  const KdTreeGPU::ConstPtr& target_tree,
+  CUstream_st* stream,
+  std::shared_ptr<TempBufferManager> temp_buffer)
+: gtsam_points::NonlinearFactorGPU(gtsam::KeyVector{target_key, source_key}),
+  is_binary(true),
+  fixed_target_pose(Eigen::Isometry3f::Identity()),
+  target(target),
+  source(source),
+  target_tree(target_tree),
+  derivatives(new IntegratedGICPDerivatives(target, source, target_tree, stream, temp_buffer)),
+  linearized(false),
+  linearization_point(Eigen::Isometry3f::Identity()) {
+  //
+  if (!source->points_gpu) {
+    std::cerr << "error: GPU source points have not been allocated!!" << std::endl;
+    abort();
+  }
+
+  if (!source->covs_gpu) {
+    std::cerr << "error: GPU source covs have not been allocated!!" << std::endl;
+    abort();
+  }
+
+  if (!target->points_gpu) {
+    std::cerr << "error: GPU target points have not been allocated!!" << std::endl;
+    abort();
+  }
+
+  if (!target->covs_gpu) {
+    std::cerr << "error: GPU target covs have not been allocated!!" << std::endl;
+    abort();
+  }
+
+  if (!target_tree) {
+    std::cerr << "error: GPU target kdtree has not been created!!" << std::endl;
+    abort();
+  }
+}
+
+IntegratedGICPFactorGPU::IntegratedGICPFactorGPU(
+  const gtsam::Pose3& fixed_target_pose,
+  gtsam::Key source_key,
+  const PointCloud::ConstPtr& target,
+  const PointCloud::ConstPtr& source,
+  const KdTreeGPU::ConstPtr& target_tree,
+  CUstream_st* stream,
+  std::shared_ptr<TempBufferManager> temp_buffer)
+: gtsam_points::NonlinearFactorGPU(gtsam::KeyVector{source_key}),
+  is_binary(false),
+  fixed_target_pose(fixed_target_pose.matrix().cast<float>()),
+  target(target),
+  source(source),
+  target_tree(target_tree),
+  derivatives(new IntegratedGICPDerivatives(target, source, target_tree, stream, temp_buffer)),
+  linearized(false),
+  linearization_point(Eigen::Isometry3f::Identity()) {
+  //
+  if (!source->points_gpu) {
+    std::cerr << "error: GPU source points have not been allocated!!" << std::endl;
+    abort();
+  }
+
+  if (!source->covs_gpu) {
+    std::cerr << "error: GPU source covs have not been allocated!!" << std::endl;
+    abort();
+  }
+
+  if (!target->points_gpu) {
+    std::cerr << "error: GPU target points have not been allocated!!" << std::endl;
+    abort();
+  }
+
+  if (!target->covs_gpu) {
+    std::cerr << "error: GPU target covs have not been allocated!!" << std::endl;
+    abort();
+  }
+
+  if (!target_tree) {
+    std::cerr << "error: GPU target kdtree has not been created!!" << std::endl;
+    abort();
+  }
+}
+
+IntegratedGICPFactorGPU::~IntegratedGICPFactorGPU() {}
+
+void IntegratedGICPFactorGPU::print(const std::string& s, const gtsam::KeyFormatter& keyFormatter) const {
+  std::cout << s << "IntegratedGICPFactorGPU";
+  if (is_binary) {
+    std::cout << "(" << keyFormatter(this->keys()[0]) << ", " << keyFormatter(this->keys()[1]) << ")" << std::endl;
+  } else {
+    std::cout << "(fixed, " << keyFormatter(this->keys()[0]) << ")" << std::endl;
+  }
+
+  std::cout << "|target|=" << frame::size(*target) << "pts, |source|=" << frame::size(*source) << "pts" << std::endl;
+}
+
+size_t IntegratedGICPFactorGPU::memory_usage() const {
+  return sizeof(*this) + sizeof(IntegratedGICPDerivatives);
+}
+
+size_t IntegratedGICPFactorGPU::memory_usage_gpu() const {
+  return sizeof(Eigen::Isometry3f) + sizeof(int) + sizeof(Correspondence) * derivatives->get_num_inliers();
+}
+
+void IntegratedGICPFactorGPU::set_enable_offloading(bool enable) {
+  derivatives->set_enable_offloading(enable);
+}
+
+void IntegratedGICPFactorGPU::set_enable_surface_validation(bool enable) {
+  derivatives->set_enable_surface_validation(enable);
+}
+
+void IntegratedGICPFactorGPU::set_inlier_update_thresh(double trans, double angle) {
+  derivatives->set_inlier_update_thresh(trans, angle);
+}
+
+void IntegratedGICPFactorGPU::set_max_correspondence_distance(double dist) {
+  derivatives->set_max_correspondence_distance(dist);
+}
+
+int IntegratedGICPFactorGPU::num_inliers() const {
+  return derivatives->get_num_inliers();
+}
+
+double IntegratedGICPFactorGPU::inlier_fraction() const {
+  return derivatives->get_num_inliers() / static_cast<double>(frame::size(*source));
+}
+
+gtsam::NonlinearFactor::shared_ptr IntegratedGICPFactorGPU::clone() const {
+  if (is_binary) {
+    return gtsam::make_shared<IntegratedGICPFactorGPU>(keys()[0], keys()[1], target, source, target_tree, nullptr, nullptr);
+  }
+
+  return gtsam::make_shared<IntegratedGICPFactorGPU>(
+    gtsam::Pose3(fixed_target_pose.cast<double>().matrix()),
+    keys()[0],
+    target,
+    source,
+    target_tree,
+    nullptr,
+    nullptr);
+}
+
+size_t IntegratedGICPFactorGPU::linearization_input_size() const {
+  return sizeof(Eigen::Isometry3f);
+}
+
+size_t IntegratedGICPFactorGPU::linearization_output_size() const {
+  return sizeof(LinearizedSystem6);
+}
+
+size_t IntegratedGICPFactorGPU::evaluation_input_size() const {
+  return sizeof(Eigen::Isometry3f);
+}
+
+size_t IntegratedGICPFactorGPU::evaluation_output_size() const {
+  return sizeof(float);
+}
+
+Eigen::Isometry3f IntegratedGICPFactorGPU::calc_delta(const gtsam::Values& values) const {
+  if (!is_binary) {
+    gtsam::Pose3 source_pose = values.at<gtsam::Pose3>(keys_[0]);
+    gtsam::Pose3 delta = gtsam::Pose3(fixed_target_pose.inverse().cast<double>().matrix()) * source_pose;
+    return Eigen::Isometry3f(delta.matrix().cast<float>());
+  }
+
+  gtsam::Pose3 target_pose = values.at<gtsam::Pose3>(keys_[0]);
+  gtsam::Pose3 source_pose = values.at<gtsam::Pose3>(keys_[1]);
+  gtsam::Pose3 delta = target_pose.inverse() * source_pose;
+
+  return Eigen::Isometry3f(delta.matrix().cast<float>());
+}
+
+double IntegratedGICPFactorGPU::error(const gtsam::Values& values) const {
+  double err;
+  if (evaluation_result) {
+    err = evaluation_result.value();
+    evaluation_result = {};
+  } else {
+    std::cerr << "warning: computing error in sync mode seriously affects the processing speed!!" << std::endl;
+
+    if (!linearized) {
+      linearize(values);
+    }
+
+    Eigen::Isometry3f evaluation_point = calc_delta(values);
+    err = derivatives->compute_error(linearization_point, evaluation_point);
+  }
+
+  return err;
+}
+
+gtsam::GaussianFactor::shared_ptr IntegratedGICPFactorGPU::linearize(const gtsam::Values& values) const {
+  linearized = true;
+  linearization_point = calc_delta(values);
+
+  LinearizedSystem6 l;
+
+  if (linearization_result) {
+    l = *linearization_result;
+    linearization_result.reset();
+  } else {
+    std::cerr << "warning: performing linearization in sync mode seriously affects the processing speed!!" << std::endl;
+    l = derivatives->linearize(linearization_point);
+  }
+
+  gtsam::HessianFactor::shared_ptr factor;
+
+  if (is_binary) {
+    factor.reset(new gtsam::HessianFactor(
+      keys_[0],
+      keys_[1],
+      l.H_target.cast<double>(),
+      l.H_target_source.cast<double>(),
+      -l.b_target.cast<double>(),
+      l.H_source.cast<double>(),
+      -l.b_source.cast<double>(),
+      l.error));
+  } else {
+    factor.reset(new gtsam::HessianFactor(keys_[0], l.H_source.cast<double>(), -l.b_source.cast<double>(), l.error));
+  }
+
+  return factor;
+}
+
+void IntegratedGICPFactorGPU::set_linearization_point(const gtsam::Values& values, void* lin_input_cpu) {
+  // If -march=native is used and some GPU factor requests a storage size that causes memory misalignment,
+  // the following lines that directly operates on the input buffer may cause segfaults
+  Eigen::Isometry3f* linearization_point = reinterpret_cast<Eigen::Isometry3f*>(lin_input_cpu);
+  *linearization_point = calc_delta(values);
+}
+
+void IntegratedGICPFactorGPU::set_evaluation_point(const gtsam::Values& values, void* eval_input_cpu) {
+  Eigen::Isometry3f* evaluation_point = reinterpret_cast<Eigen::Isometry3f*>(eval_input_cpu);
+  *evaluation_point = calc_delta(values);
+}
+
+void IntegratedGICPFactorGPU::issue_linearize(const void* lin_input_cpu, const void* lin_input_gpu, void* lin_output_gpu) {
+  auto linearization_point = reinterpret_cast<const Eigen::Isometry3f*>(lin_input_cpu);
+  auto linearization_point_gpu = reinterpret_cast<const Eigen::Isometry3f*>(lin_input_gpu);
+  auto linearized_gpu = reinterpret_cast<LinearizedSystem6*>(lin_output_gpu);
+
+  derivatives->reset_inliers(*linearization_point, linearization_point_gpu);
+  derivatives->issue_linearize(linearization_point_gpu, linearized_gpu);
+}
+
+void IntegratedGICPFactorGPU::store_linearized(const void* lin_output_cpu) {
+  auto linearized = reinterpret_cast<const LinearizedSystem6*>(lin_output_cpu);
+  linearization_result.reset(new LinearizedSystem6(*linearized));
+  evaluation_result = linearized->error;
+
+  derivatives->update_inliers(linearized->num_inliers);
+}
+
+void IntegratedGICPFactorGPU::issue_compute_error(
+  const void* lin_input_cpu,
+  const void* eval_input_cpu,
+  const void* lin_input_gpu,
+  const void* eval_input_gpu,
+  void* eval_output_gpu) {
+  //
+  auto linearization_point = reinterpret_cast<const Eigen::Isometry3f*>(lin_input_cpu);
+  auto evaluation_point = reinterpret_cast<const Eigen::Isometry3f*>(eval_input_cpu);
+
+  auto linearization_point_gpu = reinterpret_cast<const Eigen::Isometry3f*>(lin_input_gpu);
+  auto evaluation_point_gpu = reinterpret_cast<const Eigen::Isometry3f*>(eval_input_gpu);
+
+  auto error_gpu = reinterpret_cast<float*>(eval_output_gpu);
+
+  derivatives->issue_compute_error(linearization_point_gpu, evaluation_point_gpu, error_gpu);
+}
+
+void IntegratedGICPFactorGPU::store_computed_error(const void* eval_output_cpu) {
+  auto evaluated = reinterpret_cast<const float*>(eval_output_cpu);
+  evaluation_result = *evaluated;
+}
+
+void IntegratedGICPFactorGPU::sync() {
+  derivatives->sync_stream();
+}
+}  // namespace gtsam_points
diff --git a/src/test/test_kdtree.cpp b/src/test/test_kdtree.cpp
index 8089ad5b..4bd61934 100644
--- a/src/test/test_kdtree.cpp
+++ b/src/test/test_kdtree.cpp
@@ -9,8 +9,11 @@
 #include <gtsam_points/ann/kdtree.hpp>
 #include <gtsam_points/ann/kdtree2.hpp>
 #include <gtsam_points/ann/kdtreex.hpp>
+#include <gtsam_points/ann/kdtree_gpu.hpp>
 #include <gtsam_points/types/point_cloud_cpu.hpp>
+#include <gtsam_points/types/point_cloud_gpu.hpp>
 #include <gtsam_points/util/parallelism.hpp>
+#include <gtsam_points/util/easy_profiler.hpp>
 
 class KdTreeTest : public testing::Test, public testing::WithParamInterface<std::string> {
   virtual void SetUp() {
@@ -239,6 +242,46 @@ TEST_P(KdTreeTest, RadiusTest) {
   }
 }
 
+#ifdef GTSAM_POINTS_USE_CUDA
+
+TEST_F(KdTreeTest, KdTreeGPU) {
+  gtsam_points::KdTree kdtree(points.data(), points.size());
+
+  auto points_gpu = std::make_shared<gtsam_points::PointCloudGPU>(points);
+  gtsam_points::KdTreeGPU kdtree_gpu(points_gpu);
+
+  std::vector<Eigen::Vector3f> points_f(points.size());
+  std::transform(points.begin(), points.end(), points_f.begin(), [](const Eigen::Vector4d& p) { return p.head<3>().cast<float>(); });
+
+  std::vector<Eigen::Vector3f> queries_f(queries.size());
+  std::transform(queries.begin(), queries.end(), queries_f.begin(), [](const Eigen::Vector4d& p) { return p.head<3>().cast<float>(); });
+
+  // self-check
+  std::vector<std::uint32_t> nn_indices(points.size());
+  std::vector<float> nn_sq_dists(points.size());
+  kdtree_gpu.nearest_neighbor_search_cpu(points_f.data(), points_f.size(), nn_indices.data(), nn_sq_dists.data());
+
+  for (int i = 0; i < points.size(); i++) {
+    EXPECT_NEAR(nn_sq_dists[i], 0.0, 1e-6);
+    EXPECT_EQ(nn_indices[i], i);
+  }
+
+  // query check
+  nn_indices.resize(queries.size());
+  nn_sq_dists.resize(queries.size());
+  kdtree_gpu.nearest_neighbor_search_cpu(queries_f.data(), queries_f.size(), nn_indices.data(), nn_sq_dists.data());
+
+  for (int i = 0; i < queries.size(); i++) {
+    EXPECT_NEAR(nn_sq_dists[i], gt_sq_dists[i][0], 1e-2);
+
+    const double d1 = (points[nn_indices[i]] - queries[i]).squaredNorm();
+    const double d2 = (points[gt_indices[i][0]] - queries[i]).squaredNorm();
+    EXPECT_NEAR(d1, d2, 1e-3);
+  }
+}
+
+#endif
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/src/test/test_matching_cost_factors.cpp b/src/test/test_matching_cost_factors.cpp
index 427078ab..55e49d6b 100644
--- a/src/test/test_matching_cost_factors.cpp
+++ b/src/test/test_matching_cost_factors.cpp
@@ -22,8 +22,10 @@
 #include <gtsam_points/types/gaussian_voxelmap_gpu.hpp>
 #include <gtsam_points/factors/integrated_icp_factor.hpp>
 #include <gtsam_points/factors/integrated_gicp_factor.hpp>
+#include <gtsam_points/factors/integrated_gicp_factor_gpu.hpp>
 #include <gtsam_points/factors/integrated_vgicp_factor.hpp>
 #include <gtsam_points/factors/integrated_vgicp_factor_gpu.hpp>
+#include <gtsam_points/ann/kdtree_gpu.hpp>
 #include <gtsam_points/cuda/stream_temp_buffer_roundrobin.hpp>
 #include <gtsam_points/cuda/nonlinear_factor_set_gpu_create.hpp>
 
@@ -89,8 +91,12 @@ struct MatchingCostFactorsTestBase : public testing::Test {
       auto voxelmap_gpu = std::make_shared<gtsam_points::GaussianVoxelMapGPU>(1.0);
       voxelmap_gpu->insert(*frames.back());
       voxelmaps_gpu.push_back(voxelmap_gpu);
+
+      auto kdtree_gpu = std::make_shared<gtsam_points::KdTreeGPU>(frames.back());
+      kdtrees_gpu.push_back(kdtree_gpu);
 #else
       voxelmaps_gpu.push_back(nullptr);
+      kdtrees_gpu.push_back(nullptr);
 #endif
     }
 
@@ -103,6 +109,7 @@ struct MatchingCostFactorsTestBase : public testing::Test {
   std::vector<gtsam_points::PointCloud::Ptr> frames;
   std::vector<gtsam_points::GaussianVoxelMap::Ptr> voxelmaps;
   std::vector<gtsam_points::GaussianVoxelMap::Ptr> voxelmaps_gpu;
+  std::vector<gtsam_points::KdTreeGPU::Ptr> kdtrees_gpu;
   gtsam::Values poses;
   gtsam::Values poses_gt;
 
@@ -144,6 +151,14 @@ class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testin
       auto f = gtsam::make_shared<gtsam_points::IntegratedVGICPFactor>(target_key, source_key, target_voxelmap, source);
       f->set_num_threads(num_threads);
       factor = f;
+    } else if (method == "GICP_CUDA") {
+#ifdef GTSAM_POINTS_USE_CUDA
+      auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer();
+      const auto& stream = stream_buffer.first;
+      const auto& buffer = stream_buffer.second;
+      auto kdtree_gpu = std::make_shared<gtsam_points::KdTreeGPU>(target, stream);
+      factor.reset(new gtsam_points::IntegratedGICPFactorGPU(target_key, source_key, target, source, kdtree_gpu, stream, buffer));
+#endif
     } else if (method == "VGICP_CUDA") {
 #ifdef GTSAM_POINTS_USE_CUDA
       auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer();
@@ -181,6 +196,14 @@ class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testin
       auto f = gtsam::make_shared<gtsam_points::IntegratedVGICPFactor>(fixed_target_pose, source_key, target_voxelmap, source);
       f->set_num_threads(num_threads);
       factor = f;
+    } else if (method == "GICP_CUDA") {
+#ifdef GTSAM_POINTS_USE_CUDA
+      auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer();
+      const auto& stream = stream_buffer.first;
+      const auto& buffer = stream_buffer.second;
+      auto kdtree_gpu = std::make_shared<gtsam_points::KdTreeGPU>(target, stream);
+      factor.reset(new gtsam_points::IntegratedGICPFactorGPU(fixed_target_pose, source_key, target, source, kdtree_gpu, stream, buffer));
+#endif
     } else if (method == "VGICP_CUDA") {
 #ifdef GTSAM_POINTS_USE_CUDA
       auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer();
@@ -233,7 +256,7 @@ class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testin
 INSTANTIATE_TEST_SUITE_P(
   gtsam_points,
   MatchingCostFactorTest,
-  testing::Combine(testing::Values("ICP", "GICP", "VGICP", "VGICP_CUDA"), testing::Values("NONE", "OMP", "TBB")),
+  testing::Combine(testing::Values("ICP", "GICP", "VGICP", "GICP_CUDA", "VGICP_CUDA"), testing::Values("NONE", "OMP", "TBB")),
   [](const auto& info) { return std::get<0>(info.param) + "_" + std::get<1>(info.param); });
 
 TEST_P(MatchingCostFactorTest, AlignmentTest) {