diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b5b3c319..b6952153 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,6 @@ jobs: DISTRO: [ { "tag": "jammy_gtsam4.3a0", "base_image": "ubuntu:jammy", "suffix": "gcc" }, { "tag": "jammy_gtsam4.3a0", "base_image": "ubuntu:jammy", "suffix": "llvm" }, - { "tag": "jammy_cuda12.2_gtsam4.3a0", "base_image": "nvidia/cuda:12.2.2-devel-ubuntu22.04", "suffix": "gcc.cuda" }, { "tag": "jammy_cuda12.5_gtsam4.3a0", "base_image": "nvidia/cuda:12.5.1-devel-ubuntu22.04", "suffix": "gcc.cuda" }, { "tag": "jammy_cuda13.1_gtsam4.3a0", "base_image": "nvidia/cuda:13.1.0-devel-ubuntu22.04", "suffix": "gcc.cuda" }, { "tag": "noble_gtsam4.3a0", "base_image": "ubuntu:noble", "suffix": "gcc" }, diff --git a/CMakeLists.txt b/CMakeLists.txt index 5222ac7e..39a34af1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,12 +251,20 @@ if(BUILD_WITH_CUDA) src/gtsam_points/cuda/nonlinear_factor_set_gpu_create.cpp src/gtsam_points/cuda/stream_roundrobin.cu src/gtsam_points/cuda/stream_temp_buffer_roundrobin.cu + # ann + src/gtsam_points/ann/kdtree_gpu.cpp + src/gtsam_points/ann/kdtree_gpu.cu # types src/gtsam_points/types/point_cloud.cu src/gtsam_points/types/point_cloud_gpu.cu src/gtsam_points/types/gaussian_voxelmap_gpu.cu src/gtsam_points/types/gaussian_voxelmap_gpu_funcs.cu # factors + src/gtsam_points/factors/integrated_gicp_derivatives.cu + src/gtsam_points/factors/integrated_gicp_derivatives_inliers.cu + src/gtsam_points/factors/integrated_gicp_derivatives_linearize.cu + src/gtsam_points/factors/integrated_gicp_derivatives_compute.cu + src/gtsam_points/factors/integrated_gicp_factor_gpu.cpp src/gtsam_points/factors/integrated_vgicp_derivatives.cu src/gtsam_points/factors/integrated_vgicp_derivatives_inliers.cu src/gtsam_points/factors/integrated_vgicp_derivatives_compute.cu @@ -267,6 +275,7 @@ if(BUILD_WITH_CUDA) ) target_include_directories(gtsam_points_cuda PUBLIC $ + $ $ ) target_link_libraries(gtsam_points_cuda diff --git a/include/gtsam_points/ann/kdtree_gpu.hpp b/include/gtsam_points/ann/kdtree_gpu.hpp new file mode 100644 index 00000000..e5f116dc --- /dev/null +++ b/include/gtsam_points/ann/kdtree_gpu.hpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) +#pragma once + +#include +#include +#include +#include + +struct CUstream_st; + +namespace gtsam_points { + +struct KdTreeNodeGPU { + NodeIndexType left = INVALID_NODE; ///< Left child node index. + NodeIndexType right = INVALID_NODE; ///< Right child node index. + + union { + struct Leaf { + NodeIndexType first; ///< First point index in the leaf node. + NodeIndexType last; ///< Last point index in the leaf node. + } lr; ///< Leaf node. + struct NonLeaf { + NodeIndexType axis; ///< Projection axis. + float thresh; ///< Threshold value. + } sub; ///< Non-leaf node. + } node_type; +}; + +class KdTreeGPU { +public: + using Ptr = std::shared_ptr; + using ConstPtr = std::shared_ptr; + + KdTreeGPU(const PointCloud::ConstPtr& points, CUstream_st* stream = nullptr); + ~KdTreeGPU(); + + void nearest_neighbor_search( + const Eigen::Vector3f* queries, + size_t num_queries, + std::uint32_t* nn_indices, + float* nn_sq_dists, + CUstream_st* stream = nullptr); + + void nearest_neighbor_search_cpu( + const Eigen::Vector3f* h_queries, + size_t num_queries, + std::uint32_t* h_nn_indices, + float* h_nn_sq_dists, + CUstream_st* stream = nullptr); + + /// @brief Get the GPU pointer to the point indices + const std::uint32_t* get_indices() const { return indices; } + + /// @brief Get the GPU pointer to the KdTree nodes + const KdTreeNodeGPU* get_nodes() const { return nodes; } + +private: + PointCloud::ConstPtr points; + size_t num_indices; + size_t num_nodes; + std::uint32_t* indices; + KdTreeNodeGPU* nodes; +}; + +} // namespace gtsam_points diff --git a/include/gtsam_points/cuda/kernels/correspondence.hpp b/include/gtsam_points/cuda/kernels/correspondence.hpp new file mode 100644 index 00000000..d5102697 --- /dev/null +++ b/include/gtsam_points/cuda/kernels/correspondence.hpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) + +#pragma once + +#ifdef __CUDACC__ +#define GTSAM_POINTS_HOST_DEVICE __host__ __device__ +#else +#define GTSAM_POINTS_HOST_DEVICE +#endif + +namespace gtsam_points { + +/// @brief A pair of source and target point indices representing a correspondence. +struct Correspondence { + GTSAM_POINTS_HOST_DEVICE Correspondence() : source_idx(-1), target_idx(-1) {} + GTSAM_POINTS_HOST_DEVICE Correspondence(int source_idx, int target_idx) : source_idx(source_idx), target_idx(target_idx) {} + + int source_idx; + int target_idx; +}; + +} // namespace gtsam_points diff --git a/include/gtsam_points/cuda/kernels/gicp_derivatives.cuh b/include/gtsam_points/cuda/kernels/gicp_derivatives.cuh new file mode 100644 index 00000000..fb3013a0 --- /dev/null +++ b/include/gtsam_points/cuda/kernels/gicp_derivatives.cuh @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) + +#pragma once + +#include +#include + +#include +#include +#include + +namespace gtsam_points { + +struct gicp_derivatives_kernel { + gicp_derivatives_kernel( + const Eigen::Isometry3f* linearization_point_ptr, + const Eigen::Vector3f* target_means, + const Eigen::Matrix3f* target_covs, + const Eigen::Vector3f* source_means, + const Eigen::Matrix3f* source_covs) + : linearization_point_ptr(linearization_point_ptr), + target_means_ptr(target_means), + target_covs_ptr(target_covs), + source_means_ptr(source_means), + source_covs_ptr(source_covs) {} + + __device__ LinearizedSystem6 operator()(const Correspondence& source_target_correspondence) const { + const int source_idx = source_target_correspondence.source_idx; + const int target_idx = source_target_correspondence.target_idx; + if (source_idx < 0 || target_idx < 0) { + return LinearizedSystem6::zero(); + } + + const Eigen::Isometry3f& x = *linearization_point_ptr; + const Eigen::Matrix3f R = x.linear(); + const Eigen::Vector3f t = x.translation(); + + const Eigen::Vector3f mean_A = source_means_ptr[source_idx]; + const Eigen::Matrix3f cov_A = source_covs_ptr[source_idx]; + const Eigen::Vector3f transed_mean_A = R * mean_A + t; + + const Eigen::Vector3f mean_B = target_means_ptr[target_idx]; + const Eigen::Matrix3f cov_B = target_covs_ptr[target_idx]; + + const Eigen::Matrix3f RCR = (R * cov_A * R.transpose()); + const Eigen::Matrix3f RCR_inv = (cov_B + RCR).inverse(); + Eigen::Vector3f error = mean_B - transed_mean_A; + + Eigen::Matrix J_target; + J_target.block<3, 3>(0, 0) = -skew_symmetric(transed_mean_A); + J_target.block<3, 3>(0, 3) = Eigen::Matrix3f::Identity(); + + Eigen::Matrix J_source; + J_source.block<3, 3>(0, 0) = R * skew_symmetric(mean_A); + J_source.block<3, 3>(0, 3) = -R; + + Eigen::Matrix J_target_RCR_inv = J_target.transpose() * RCR_inv; + Eigen::Matrix J_source_RCR_inv = J_source.transpose() * RCR_inv; + + LinearizedSystem6 linearized; + linearized.num_inliers = 1; + linearized.error = error.transpose() * RCR_inv * error; + linearized.H_target = J_target_RCR_inv * J_target; + linearized.H_source = J_source_RCR_inv * J_source; + linearized.H_target_source = J_target_RCR_inv * J_source; + linearized.b_target = J_target_RCR_inv * error; + linearized.b_source = J_source_RCR_inv * error; + + return linearized; + } + + const Eigen::Isometry3f* linearization_point_ptr; + + const Eigen::Vector3f* target_means_ptr; + const Eigen::Matrix3f* target_covs_ptr; + + const Eigen::Vector3f* source_means_ptr; + const Eigen::Matrix3f* source_covs_ptr; +}; + +struct gicp_error_kernel { + gicp_error_kernel( + const Eigen::Isometry3f* linearization_point_ptr, + const Eigen::Isometry3f* evaluation_point_ptr, + const Eigen::Vector3f* target_means, + const Eigen::Matrix3f* target_covs, + const Eigen::Vector3f* source_means, + const Eigen::Matrix3f* source_covs) + : linearization_point_ptr(linearization_point_ptr), + evaluation_point_ptr(evaluation_point_ptr), + target_means_ptr(target_means), + target_covs_ptr(target_covs), + source_means_ptr(source_means), + source_covs_ptr(source_covs) {} + + __device__ float operator()(const Correspondence& source_target_correspondence) const { + const int source_idx = source_target_correspondence.source_idx; + const int target_idx = source_target_correspondence.target_idx; + if (source_idx < 0 || target_idx < 0) { + return 0.0f; + } + + const Eigen::Isometry3f& xl = *linearization_point_ptr; + const Eigen::Matrix3f Rl = xl.linear(); + + const Eigen::Isometry3f& xe = *evaluation_point_ptr; + const Eigen::Matrix3f Re = xe.linear(); + const Eigen::Vector3f te = xe.translation(); + + const Eigen::Vector3f mean_A = source_means_ptr[source_idx]; + const Eigen::Matrix3f cov_A = source_covs_ptr[source_idx]; + const Eigen::Vector3f transed_mean_A = Re * mean_A + te; + + const Eigen::Vector3f mean_B = target_means_ptr[target_idx]; + const Eigen::Matrix3f cov_B = target_covs_ptr[target_idx]; + + const Eigen::Matrix3f RCR = (Rl * cov_A * Rl.transpose()); + const Eigen::Matrix3f RCR_inv = (cov_B + RCR).inverse(); + Eigen::Vector3f error = mean_B - transed_mean_A; + + return error.transpose() * RCR_inv * error; + } + + const Eigen::Isometry3f* linearization_point_ptr; + const Eigen::Isometry3f* evaluation_point_ptr; + + const Eigen::Vector3f* target_means_ptr; + const Eigen::Matrix3f* target_covs_ptr; + const Eigen::Vector3f* source_means_ptr; + const Eigen::Matrix3f* source_covs_ptr; +}; + +} // namespace gtsam_points diff --git a/include/gtsam_points/cuda/kernels/kdtree.cuh b/include/gtsam_points/cuda/kernels/kdtree.cuh new file mode 100644 index 00000000..013c8628 --- /dev/null +++ b/include/gtsam_points/cuda/kernels/kdtree.cuh @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) +#pragma once +#include + +namespace gtsam_points { + +struct kdtree_nearest_neighbor_search_kernel { +public: + static constexpr int MAX_STACK_SIZE = 20; + + __device__ thrust::pair operator()(const Eigen::Vector3f query) const { + thrust::pair result = {INVALID_NODE, std::numeric_limits::max()}; + + int stack_size = 1; + thrust::pair search_stack[MAX_STACK_SIZE] = {{0, 0.0f}}; + + while (stack_size > 0) { + const auto [node_index, sq_dist] = search_stack[--stack_size]; + if (sq_dist > result.second) { + continue; + } + + const KdTreeNodeGPU node = nodes[node_index]; + + // Leaf node + if (node.left == INVALID_NODE) { + for (NodeIndexType i = node.node_type.lr.first; i < node.node_type.lr.last; i++) { + const NodeIndexType pt_index = indices[i]; + const float sq_dist = (points[pt_index] - query).squaredNorm(); + if (sq_dist < result.second) { + result = {pt_index, sq_dist}; + } + } + continue; + } + + const float val = query[node.node_type.sub.axis]; + const float diff = val - node.node_type.sub.thresh; + const float cut_sq_dist = diff * diff; + + int best_child; + int other_child; + + if (diff < 0.0f) { + best_child = node.left; + other_child = node.right; + } else { + best_child = node.right; + other_child = node.left; + } + + if (stack_size > MAX_STACK_SIZE - 2) { + printf("kdtree stack overflow!!"); + } else if (cut_sq_dist < result.second) { + search_stack[stack_size].first = other_child; + search_stack[stack_size++].second = cut_sq_dist; + } + + search_stack[stack_size].first = best_child; + search_stack[stack_size++].second = 0.0f; + } + + return result; + } + +public: + const Eigen::Vector3f* __restrict__ points; + const std::uint32_t* __restrict__ indices; + const KdTreeNodeGPU* __restrict__ nodes; +}; + +} // namespace gtsam_points diff --git a/include/gtsam_points/factors/integrated_gicp_derivatives.cuh b/include/gtsam_points/factors/integrated_gicp_derivatives.cuh new file mode 100644 index 00000000..e219b4b0 --- /dev/null +++ b/include/gtsam_points/factors/integrated_gicp_derivatives.cuh @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) + +#pragma once + +#include +#include +#include + +#include +#include +#include + +struct CUstream_st; + +namespace gtsam_points { + +class LinearizedSystem6; +class TempBufferManager; + +class IntegratedGICPDerivatives { +public: + EIGEN_MAKE_ALIGNED_OPERATOR_NEW + + IntegratedGICPDerivatives( + const PointCloud::ConstPtr& target, + const PointCloud::ConstPtr& source, + const KdTreeGPU::ConstPtr& target_tree, + CUstream_st* ext_stream, + std::shared_ptr temp_buffer); + ~IntegratedGICPDerivatives(); + + void set_inlier_update_thresh(double trans, double angle) { + inlier_update_thresh_trans = trans; + inlier_update_thresh_angle = angle; + } + + void set_max_correspondence_distance(double dist) { max_correspondence_distance_sq = dist * dist; } + + void set_enable_offloading(bool enable) { enable_offloading = enable; } + + void set_enable_surface_validation(bool enable) { enable_surface_validation = enable; } + + int get_num_inliers() const { return num_inliers; } + + void touch_points(); + + // synchronized interface + LinearizedSystem6 linearize(const Eigen::Isometry3f& x); + double compute_error(const Eigen::Isometry3f& xl, const Eigen::Isometry3f& xe); + + void reset_inliers(const Eigen::Isometry3f& x, const Eigen::Isometry3f* d_x, bool force_update = false); + void update_inliers(int num_inliers); + + // async interface + void sync_stream(); + void issue_linearize(const Eigen::Isometry3f* d_x, LinearizedSystem6* d_output); + void issue_compute_error(const Eigen::Isometry3f* d_xl, const Eigen::Isometry3f* d_xe, float* d_output); + +private: + bool enable_offloading; + + bool enable_surface_validation; + double max_correspondence_distance_sq; + double inlier_update_thresh_trans; + double inlier_update_thresh_angle; + + bool external_stream; + CUstream_st* stream; + std::shared_ptr temp_buffer; + + PointCloud::ConstPtr target; + PointCloud::ConstPtr source; + KdTreeGPU::ConstPtr target_tree; + + Eigen::Isometry3f inlier_evaluation_point; + const Eigen::Isometry3f* inlier_evaluation_point_gpu; + + int num_inliers; + int* num_inliers_gpu; + Correspondence* source_target_correspondences; +}; +} // namespace gtsam_points \ No newline at end of file diff --git a/include/gtsam_points/factors/integrated_gicp_factor_gpu.hpp b/include/gtsam_points/factors/integrated_gicp_factor_gpu.hpp new file mode 100644 index 00000000..334345d2 --- /dev/null +++ b/include/gtsam_points/factors/integrated_gicp_factor_gpu.hpp @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) + +#pragma once + +#include +#include +#include +#include + +struct CUstream_st; + +namespace gtsam { +class Pose3; +} + +namespace gtsam_points { + +class LinearizedSystem6; +class IntegratedGICPDerivatives; +class TempBufferManager; + +/** + * @brief GPU-accelerated GICP matching cost factor + */ +class IntegratedGICPFactorGPU : public gtsam_points::NonlinearFactorGPU { +public: + EIGEN_MAKE_ALIGNED_OPERATOR_NEW + using shared_ptr = std::shared_ptr; + + /** + * @brief Create a binary GICP_GPU factor between target and source poses. + * @param target_key Target key + * @param source_key Source key + * @param target Target frame + * @param source Source frame + * @param target_tree Target kdtree + * @param stream CUDA stream + * @param temp_buffer CUDA temporary buffer manager + */ + IntegratedGICPFactorGPU( + gtsam::Key target_key, + gtsam::Key source_key, + const PointCloud::ConstPtr& target, + const PointCloud::ConstPtr& source, + const KdTreeGPU::ConstPtr& target_tree, + CUstream_st* stream = nullptr, + std::shared_ptr temp_buffer = nullptr); + + /** + * @brief Create a unary GICP_GPU factor between a fixed target pose and an active source pose. + * @param targfixed_target_pose Fixed target pose + * @param source_key Source key + * @param target Target kdtree + * @param source Source frame + * @param stream CUDA stream + * @param temp_buffer CUDA temporary buffer manager + */ + IntegratedGICPFactorGPU( + const gtsam::Pose3& fixed_target_pose, + gtsam::Key source_key, + const PointCloud::ConstPtr& target, + const PointCloud::ConstPtr& source, + const KdTreeGPU::ConstPtr& target_tree, + CUstream_st* stream = nullptr, + std::shared_ptr temp_buffer = nullptr); + + virtual ~IntegratedGICPFactorGPU() override; + + /// @brief Print the factor information. + virtual void print(const std::string& s = "", const gtsam::KeyFormatter& keyFormatter = gtsam::DefaultKeyFormatter) const override; + + /// @brief Calculate the CPU memory usage of this factor + /// @note The result is approximate and does not account for objects not owned by this factor (e.g., point clouds) + /// @return Approximate CPU memory usage in bytes + size_t memory_usage() const; + + /// @brief Calculate the GPU memory usage of this factor + /// @note The result is approximate and does not account for objects not owned by this factor (e.g., point clouds) + /// @return Approximate GPU memory usage in bytes + size_t memory_usage_gpu() const; + + /// @brief Enable or disable GPU memory offloading. + void set_enable_offloading(bool enable); + + /// @brief Enable or disable surface orientation validation for correspondence search + /// @note To enable surface orientation validation, source frame must have point normals + void set_enable_surface_validation(bool enable); + + /// @brief Set the threshold values to trigger inlier points update. + /// Setting larger values reduces GPU sync but may affect the registration accuracy. + void set_inlier_update_thresh(double trans, double angle); + + /// @brief Set the maximum distance between corresponding points. + /// Correspondences with distances larger than this will be rejected (i.e., correspondence trimming). + void set_max_correspondence_distance(double dist); + + /// @brief Get the number of inlier points. + /// @note This function must be called after the factor is linearized. + int num_inliers() const; + + /// @brief Get the fraction of inlier points. + /// @note This function must be called after the factor is linearized. + double inlier_fraction() const; + + /// @brief Get the pose of the fixed target. This function is only valid for unary factors. + Eigen::Isometry3f get_fixed_target_pose() const { return fixed_target_pose; } + + // forbid copy + IntegratedGICPFactorGPU(const IntegratedGICPFactorGPU&) = delete; + IntegratedGICPFactorGPU& operator=(const IntegratedGICPFactorGPU&) = delete; + + virtual gtsam::NonlinearFactor::shared_ptr clone() const override; + + virtual size_t dim() const override { return 6; } + virtual double error(const gtsam::Values& values) const override; + virtual std::shared_ptr linearize(const gtsam::Values& values) const override; + + virtual size_t linearization_input_size() const override; + virtual size_t linearization_output_size() const override; + virtual size_t evaluation_input_size() const override; + virtual size_t evaluation_output_size() const override; + + virtual void set_linearization_point(const gtsam::Values& values, void* lin_input_cpu) override; + virtual void issue_linearize(const void* lin_input_cpu, const void* lin_input_gpu, void* lin_output_gpu) override; + virtual void store_linearized(const void* lin_output_cpu) override; + + virtual void set_evaluation_point(const gtsam::Values& values, void* eval_input_cpu) override; + virtual void issue_compute_error( + const void* lin_input_cpu, + const void* eval_input_cpu, + const void* lin_input_gpu, + const void* eval_input_gpu, + void* eval_output_gpu) override; + virtual void store_computed_error(const void* eval_output_cpu) override; + + virtual void sync() override; + +private: + Eigen::Isometry3f calc_delta(const gtsam::Values& values) const; + +private: + bool is_binary; + Eigen::Isometry3f fixed_target_pose; + + PointCloud::ConstPtr target; + PointCloud::ConstPtr source; + KdTreeGPU::ConstPtr target_tree; + + std::unique_ptr derivatives; + + mutable bool linearized; + mutable Eigen::Isometry3f linearization_point; + + mutable std::optional evaluation_result; + mutable std::unique_ptr linearization_result; +}; + +} // namespace gtsam_points \ No newline at end of file diff --git a/src/demo/demo_matching_cost_factors.cpp b/src/demo/demo_matching_cost_factors.cpp index fb8bf2b0..1bfcf6a9 100644 --- a/src/demo/demo_matching_cost_factors.cpp +++ b/src/demo/demo_matching_cost_factors.cpp @@ -23,8 +23,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -56,6 +58,7 @@ class MatchingCostFactorDemo { frames.resize(5); voxelmaps.resize(5); voxelmaps_gpu.resize(5); + kdtrees_gpu.resize(5); for (int i = 0; i < 5; i++) { std::string token; @@ -103,6 +106,9 @@ class MatchingCostFactorDemo { auto voxelmap_gpu = std::make_shared(2.0); voxelmap_gpu->insert(*frame); voxelmaps_gpu[i] = voxelmap_gpu; + + auto kdtree_gpu = std::make_shared(frame); + kdtrees_gpu[i] = kdtree_gpu; #endif viewer->update_drawable("frame_" + std::to_string(i), std::make_shared(frame->points, frame->size()), guik::Rainbow()); @@ -122,6 +128,7 @@ class MatchingCostFactorDemo { factor_types.push_back("GICP"); factor_types.push_back("VGICP"); #ifdef GTSAM_POINTS_USE_CUDA + factor_types.push_back("GICP_GPU"); factor_types.push_back("VGICP_GPU"); #endif @@ -220,6 +227,10 @@ class MatchingCostFactorDemo { auto factor = gtsam::make_shared(target_key, source_key, target_voxelmap, source); factor->set_num_threads(num_threads); return factor; + } else if (factor_types[factor_type] == std::string("GICP_GPU")) { +#ifdef GTSAM_POINTS_USE_CUDA + return gtsam::make_shared(target_key, source_key, target, source, kdtrees_gpu[target_key]); +#endif } else if (factor_types[factor_type] == std::string("VGICP_GPU")) { #ifdef GTSAM_POINTS_USE_CUDA return gtsam::make_shared(target_key, source_key, target_voxelmap_gpu, source); @@ -301,6 +312,7 @@ class MatchingCostFactorDemo { std::vector frames; std::vector voxelmaps; std::vector voxelmaps_gpu; + std::vector kdtrees_gpu; }; int main(int argc, char** argv) { diff --git a/src/gtsam_points/ann/kdtree_gpu.cpp b/src/gtsam_points/ann/kdtree_gpu.cpp new file mode 100644 index 00000000..96163654 --- /dev/null +++ b/src/gtsam_points/ann/kdtree_gpu.cpp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) +#include + +#include +#include + +namespace gtsam_points { + +KdTreeGPU::KdTreeGPU(const PointCloud::ConstPtr& points, CUstream_st* stream) +: points(points), + num_indices(0), + num_nodes(0), + indices(nullptr), + nodes(nullptr) { + // + if (!points->has_points()) { + std::cerr << "error: empty point cloud is given for KdTreeGPU" << std::endl; + return; + } + if (!points->has_points_gpu()) { + std::cerr << "error: point cloud does not have GPU points for KdTreeGPU" << std::endl; + return; + } + + // + KdTreeBuilder builder; + UnsafeKdTree kdtree(*points, builder); + + // copy to GPU + std::vector h_indices(kdtree.indices.begin(), kdtree.indices.end()); + std::vector h_nodes(kdtree.nodes.size()); + + for (int i = 0; i < kdtree.nodes.size(); i++) { + const auto& in = kdtree.nodes[i]; + auto& out = h_nodes[i]; + + out.left = in.left; + out.right = in.right; + + if (in.left == INVALID_NODE) { + out.node_type.lr.first = in.node_type.lr.first; + out.node_type.lr.last = in.node_type.lr.last; + } else { + out.node_type.sub.axis = in.node_type.sub.proj.axis; + out.node_type.sub.thresh = in.node_type.sub.thresh; + } + } + + num_indices = kdtree.indices.size(); + num_nodes = kdtree.nodes.size(); + check_error << cudaMallocAsync(&indices, sizeof(std::uint32_t) * num_indices, stream); + check_error << cudaMallocAsync(&nodes, sizeof(KdTreeNodeGPU) * num_nodes, stream); + check_error << cudaMemcpyAsync(indices, h_indices.data(), sizeof(std::uint32_t) * num_indices, cudaMemcpyHostToDevice, stream); + check_error << cudaMemcpyAsync(nodes, h_nodes.data(), sizeof(KdTreeNodeGPU) * num_nodes, cudaMemcpyHostToDevice, stream); +} + +KdTreeGPU::~KdTreeGPU() { + check_error << cudaFreeAsync(indices, nullptr); + check_error << cudaFreeAsync(nodes, nullptr); +} + +void KdTreeGPU::nearest_neighbor_search_cpu( + const Eigen::Vector3f* h_queries, + size_t num_queries, + std::uint32_t* h_nn_indices, + float* h_nn_sq_dists, + CUstream_st* stream) { + // + Eigen::Vector3f* d_queries; + std::uint32_t* d_nn_indices; + float* d_nn_sq_dists; + + check_error << cudaMallocAsync(&d_queries, sizeof(Eigen::Vector3f) * num_queries, stream); + check_error << cudaMallocAsync(&d_nn_indices, sizeof(std::uint32_t) * num_queries, stream); + check_error << cudaMallocAsync(&d_nn_sq_dists, sizeof(float) * num_queries, stream); + check_error << cudaMemcpyAsync(d_queries, h_queries, sizeof(Eigen::Vector3f) * num_queries, cudaMemcpyHostToDevice, stream); + + nearest_neighbor_search(d_queries, num_queries, d_nn_indices, d_nn_sq_dists, stream); + + check_error << cudaMemcpyAsync(h_nn_indices, d_nn_indices, sizeof(std::uint32_t) * num_queries, cudaMemcpyDeviceToHost, stream); + check_error << cudaMemcpyAsync(h_nn_sq_dists, d_nn_sq_dists, sizeof(float) * num_queries, cudaMemcpyDeviceToHost, stream); + + check_error << cudaFreeAsync(d_queries, stream); + check_error << cudaFreeAsync(d_nn_indices, stream); + check_error << cudaFreeAsync(d_nn_sq_dists, stream); +} + +} // namespace gtsam_points \ No newline at end of file diff --git a/src/gtsam_points/ann/kdtree_gpu.cu b/src/gtsam_points/ann/kdtree_gpu.cu new file mode 100644 index 00000000..e9fbede2 --- /dev/null +++ b/src/gtsam_points/ann/kdtree_gpu.cu @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace gtsam_points { + +namespace { + +struct nn_search_kernel { +public: + __device__ void operator()(std::uint32_t i) const { + const Eigen::Vector3f query = queries[i]; + + kdtree_nearest_neighbor_search_kernel search{points, indices, nodes}; + const auto result = search(query); + + nn_indices[i] = result.first; + nn_sq_dists[i] = result.second; + } + +public: + const Eigen::Vector3f* __restrict__ points; + const std::uint32_t* __restrict__ indices; + const KdTreeNodeGPU* __restrict__ nodes; + + const Eigen::Vector3f* __restrict__ queries; + + std::uint32_t* nn_indices; + float* nn_sq_dists; +}; +} // namespace + +void KdTreeGPU::nearest_neighbor_search( + const Eigen::Vector3f* queries, + const size_t num_queries, + std::uint32_t* nn_indices, + float* nn_sq_dists, + CUstream_st* stream) { + thrust::for_each( + thrust::cuda::par.on(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_queries), + nn_search_kernel{points->points_gpu, indices, nodes, queries, nn_indices, nn_sq_dists}); +} + +} // namespace gtsam_points \ No newline at end of file diff --git a/src/gtsam_points/factors/integrated_gicp_derivatives.cu b/src/gtsam_points/factors/integrated_gicp_derivatives.cu new file mode 100644 index 00000000..c7830b87 --- /dev/null +++ b/src/gtsam_points/factors/integrated_gicp_derivatives.cu @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace gtsam_points { + +IntegratedGICPDerivatives::IntegratedGICPDerivatives( + const PointCloud::ConstPtr& target, + const PointCloud::ConstPtr& source, + const KdTreeGPU::ConstPtr& target_tree, + CUstream_st* ext_stream, + std::shared_ptr temp_buffer) +: enable_offloading(false), + enable_surface_validation(false), + max_correspondence_distance_sq(1.0), + inlier_update_thresh_trans(1e-6), + inlier_update_thresh_angle(1e-6), + target(target), + source(source), + target_tree(target_tree), + external_stream(true), + stream(ext_stream), + temp_buffer(temp_buffer), + num_inliers(0), + source_target_correspondences(nullptr) { + // + if (stream == nullptr) { + external_stream = false; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + } + + if (this->temp_buffer == nullptr) { + this->temp_buffer.reset(new TempBufferManager()); + } + + check_error << cudaMallocAsync(&num_inliers_gpu, sizeof(int), stream); + // check_error << cudaHostRegister(&num_inliers, sizeof(int), cudaHostRegisterDefault); +} + +IntegratedGICPDerivatives::~IntegratedGICPDerivatives() { + check_error << cudaFreeAsync(source_target_correspondences, stream); + check_error << cudaFreeAsync(num_inliers_gpu, stream); + // check_error << cudaHostUnregister(&num_inliers); + + if (!external_stream) { + cudaStreamDestroy(stream); + } +} + +void IntegratedGICPDerivatives::sync_stream() { + check_error << cudaStreamSynchronize(stream); +} + +void IntegratedGICPDerivatives::touch_points() { + if (!enable_offloading) { + return; + } + + // auto target_ = const_cast(target.get()); + // target_->touch(stream); + + // auto source_gpu_const = dynamic_cast(source.get()); + // if (!source_gpu_const) { + // return; + // } + + // auto source_gpu = const_cast(source_gpu_const); + // source_gpu->touch(stream); +} + +LinearizedSystem6 IntegratedGICPDerivatives::linearize(const Eigen::Isometry3f& x) { + thrust::device_vector x_ptr(1); + thrust::device_vector output_ptr(1); + + x_ptr[0] = x; + + reset_inliers(x, thrust::raw_pointer_cast(x_ptr.data())); + issue_linearize(thrust::raw_pointer_cast(x_ptr.data()), thrust::raw_pointer_cast(output_ptr.data())); + sync_stream(); + + LinearizedSystem6 linearized = output_ptr[0]; + + return linearized; +} + +double IntegratedGICPDerivatives::compute_error(const Eigen::Isometry3f& d_xl, const Eigen::Isometry3f& d_xe) { + thrust::device_vector xs_ptr(2); + xs_ptr[0] = d_xl; + xs_ptr[1] = d_xe; + thrust::device_vector output_ptr(1); + + issue_compute_error( + thrust::raw_pointer_cast(xs_ptr.data()), + thrust::raw_pointer_cast(xs_ptr.data() + 1), + thrust::raw_pointer_cast(output_ptr.data())); + sync_stream(); + + float error = output_ptr[0]; + return error; +} + +} // namespace gtsam_points \ No newline at end of file diff --git a/src/gtsam_points/factors/integrated_gicp_derivatives_compute.cu b/src/gtsam_points/factors/integrated_gicp_derivatives_compute.cu new file mode 100644 index 00000000..db187c8e --- /dev/null +++ b/src/gtsam_points/factors/integrated_gicp_derivatives_compute.cu @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace gtsam_points { + +namespace { + +/// @brief Kernel to compute source-target correspondence by KdTree NN search +struct kdtree_correspondence_kernel { + kdtree_correspondence_kernel( + const Eigen::Isometry3f* linearization_point_ptr, + const Eigen::Vector3f* source_points, + const Eigen::Vector3f* target_points, + const std::uint32_t* target_indices, + const KdTreeNodeGPU* target_nodes) + : linearization_point_ptr(linearization_point_ptr), + source_points(source_points), + target_points(target_points), + target_indices(target_indices), + target_nodes(target_nodes) {} + + __device__ Correspondence operator()(const Correspondence& source_target) const { + const int source_idx = source_target.source_idx; + if (source_idx < 0) { + return source_target; + } + + const Eigen::Isometry3f& x = *linearization_point_ptr; + const Eigen::Vector3f transed_pt = x.linear() * source_points[source_idx] + x.translation(); + + kdtree_nearest_neighbor_search_kernel nn_search; + nn_search.points = target_points; + nn_search.indices = target_indices; + nn_search.nodes = target_nodes; + + const auto [nn_idx, sq_dist] = nn_search(transed_pt); + + return Correspondence(source_idx, static_cast(nn_idx)); + } + + const Eigen::Isometry3f* linearization_point_ptr; + const Eigen::Vector3f* source_points; + const Eigen::Vector3f* target_points; + const std::uint32_t* target_indices; + const KdTreeNodeGPU* target_nodes; +}; + +} // namespace + +void IntegratedGICPDerivatives::issue_compute_error(const Eigen::Isometry3f* d_xl, const Eigen::Isometry3f* d_xe, float* d_output) { + // Reuse correspondences computed in issue_linearize instead of recomputing KdTree search + // Compute GICP error using the correspondences + gicp_error_kernel error_kernel( + d_xl, + d_xe, + reinterpret_cast(target->points_gpu), + reinterpret_cast(target->covs_gpu), + reinterpret_cast(source->points_gpu), + reinterpret_cast(source->covs_gpu)); + + auto first = thrust::make_transform_iterator(source_target_correspondences, error_kernel); + + void* temp_storage = nullptr; + size_t temp_storage_bytes = 0; + + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, first, d_output, num_inliers, thrust::plus(), 0.0f, stream); + + temp_storage = temp_buffer->get_buffer(temp_storage_bytes); + + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, first, d_output, num_inliers, thrust::plus(), 0.0f, stream); +} + +} // namespace gtsam_points diff --git a/src/gtsam_points/factors/integrated_gicp_derivatives_inliers.cu b/src/gtsam_points/factors/integrated_gicp_derivatives_inliers.cu new file mode 100644 index 00000000..43a1fdf6 --- /dev/null +++ b/src/gtsam_points/factors/integrated_gicp_derivatives_inliers.cu @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace gtsam_points { + +namespace { + +struct init_source_target_indices_kernel { + __host__ __device__ Correspondence operator()(const int i) const { return Correspondence(i, -1); } +}; + +} // namespace + +void IntegratedGICPDerivatives::reset_inliers(const Eigen::Isometry3f& x, const Eigen::Isometry3f* d_x, bool force_update) { + touch_points(); + + if ( + force_update || source_target_correspondences == nullptr || + large_displacement(inlier_evaluation_point, x, inlier_update_thresh_trans, inlier_update_thresh_angle)) { + inlier_evaluation_point = x; + inlier_evaluation_point_gpu = d_x; + + check_error << cudaFreeAsync(source_target_correspondences, stream); + check_error << cudaMallocAsync(&source_target_correspondences, sizeof(Correspondence) * source->size(), stream); + thrust::transform( + thrust::cuda::par_nosync.on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(source->size()), + source_target_correspondences, + init_source_target_indices_kernel()); + num_inliers = source->size(); + } else { + inlier_evaluation_point_gpu = nullptr; + } +} + +void IntegratedGICPDerivatives::update_inliers(int new_num_inliers) { + if (inlier_evaluation_point_gpu == nullptr) { + return; + } + + // For now, we keep all correspondences without filtering + // The correspondence filtering can be added later with validity checks + this->num_inliers = new_num_inliers; +} + +} // namespace gtsam_points \ No newline at end of file diff --git a/src/gtsam_points/factors/integrated_gicp_derivatives_linearize.cu b/src/gtsam_points/factors/integrated_gicp_derivatives_linearize.cu new file mode 100644 index 00000000..275f98ab --- /dev/null +++ b/src/gtsam_points/factors/integrated_gicp_derivatives_linearize.cu @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) + +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace gtsam_points { + +namespace { + +/// @brief Kernel to compute source-target correspondence by KdTree NN search +struct kdtree_correspondence_kernel { + kdtree_correspondence_kernel( + const Eigen::Isometry3f* linearization_point_ptr, + const Eigen::Vector3f* source_points, + const Eigen::Vector3f* target_points, + const std::uint32_t* target_indices, + const KdTreeNodeGPU* target_nodes, + float max_correspondence_distance_sq) + : linearization_point_ptr(linearization_point_ptr), + source_points(source_points), + target_points(target_points), + target_indices(target_indices), + target_nodes(target_nodes), + max_correspondence_distance_sq(max_correspondence_distance_sq) {} + + __device__ Correspondence operator()(const Correspondence& source_target) const { + const int source_idx = source_target.source_idx; + if (source_idx < 0) { + return source_target; + } + + const Eigen::Isometry3f& x = *linearization_point_ptr; + const Eigen::Vector3f transed_pt = x.linear() * source_points[source_idx] + x.translation(); + + kdtree_nearest_neighbor_search_kernel nn_search; + nn_search.points = target_points; + nn_search.indices = target_indices; + nn_search.nodes = target_nodes; + + const auto [nn_idx, sq_dist] = nn_search(transed_pt); + + if (sq_dist > max_correspondence_distance_sq) { + return Correspondence(source_idx, -1); + } + + return Correspondence(source_idx, static_cast(nn_idx)); + } + + const Eigen::Isometry3f* linearization_point_ptr; + const Eigen::Vector3f* source_points; + const Eigen::Vector3f* target_points; + const std::uint32_t* target_indices; + const KdTreeNodeGPU* target_nodes; + float max_correspondence_distance_sq; +}; + +} // namespace + +void IntegratedGICPDerivatives::issue_linearize(const Eigen::Isometry3f* d_x, LinearizedSystem6* d_output) { + // Compute correspondences using KdTree nearest neighbor search + kdtree_correspondence_kernel corr_kernel( + d_x, + reinterpret_cast(source->points_gpu), + reinterpret_cast(target->points_gpu), + target_tree->get_indices(), + target_tree->get_nodes(), + max_correspondence_distance_sq); + + // Materialize correspondences in-place for reuse in issue_compute_error + thrust::transform( + thrust::cuda::par_nosync.on(stream), + source_target_correspondences, + source_target_correspondences + num_inliers, + source_target_correspondences, + corr_kernel); + + // Compute GICP derivatives using the computed correspondences + gicp_derivatives_kernel deriv_kernel( + d_x, + reinterpret_cast(target->points_gpu), + reinterpret_cast(target->covs_gpu), + reinterpret_cast(source->points_gpu), + reinterpret_cast(source->covs_gpu)); + + auto first = thrust::make_transform_iterator(source_target_correspondences, deriv_kernel); + + void* temp_storage = nullptr; + size_t temp_storage_bytes = 0; + + cub::DeviceReduce::Reduce( + temp_storage, + temp_storage_bytes, + first, + d_output, + num_inliers, + thrust::plus(), + LinearizedSystem6::zero(), + stream); + + temp_storage = temp_buffer->get_buffer(temp_storage_bytes); + + cub::DeviceReduce::Reduce( + temp_storage, + temp_storage_bytes, + first, + d_output, + num_inliers, + thrust::plus(), + LinearizedSystem6::zero(), + stream); +} + +} // namespace gtsam_points diff --git a/src/gtsam_points/factors/integrated_gicp_factor_gpu.cpp b/src/gtsam_points/factors/integrated_gicp_factor_gpu.cpp new file mode 100644 index 00000000..e56ee112 --- /dev/null +++ b/src/gtsam_points/factors/integrated_gicp_factor_gpu.cpp @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025 Kenji Koide (k.koide@aist.go.jp) + +#include + +#include + +#include +#include + +#include +#include +#include + +namespace gtsam_points { + +IntegratedGICPFactorGPU::IntegratedGICPFactorGPU( + gtsam::Key target_key, + gtsam::Key source_key, + const PointCloud::ConstPtr& target, + const PointCloud::ConstPtr& source, + const KdTreeGPU::ConstPtr& target_tree, + CUstream_st* stream, + std::shared_ptr temp_buffer) +: gtsam_points::NonlinearFactorGPU(gtsam::KeyVector{target_key, source_key}), + is_binary(true), + fixed_target_pose(Eigen::Isometry3f::Identity()), + target(target), + source(source), + target_tree(target_tree), + derivatives(new IntegratedGICPDerivatives(target, source, target_tree, stream, temp_buffer)), + linearized(false), + linearization_point(Eigen::Isometry3f::Identity()) { + // + if (!source->points_gpu) { + std::cerr << "error: GPU source points have not been allocated!!" << std::endl; + abort(); + } + + if (!source->covs_gpu) { + std::cerr << "error: GPU source covs have not been allocated!!" << std::endl; + abort(); + } + + if (!target->points_gpu) { + std::cerr << "error: GPU target points have not been allocated!!" << std::endl; + abort(); + } + + if (!target->covs_gpu) { + std::cerr << "error: GPU target covs have not been allocated!!" << std::endl; + abort(); + } + + if (!target_tree) { + std::cerr << "error: GPU target kdtree has not been created!!" << std::endl; + abort(); + } +} + +IntegratedGICPFactorGPU::IntegratedGICPFactorGPU( + const gtsam::Pose3& fixed_target_pose, + gtsam::Key source_key, + const PointCloud::ConstPtr& target, + const PointCloud::ConstPtr& source, + const KdTreeGPU::ConstPtr& target_tree, + CUstream_st* stream, + std::shared_ptr temp_buffer) +: gtsam_points::NonlinearFactorGPU(gtsam::KeyVector{source_key}), + is_binary(false), + fixed_target_pose(fixed_target_pose.matrix().cast()), + target(target), + source(source), + target_tree(target_tree), + derivatives(new IntegratedGICPDerivatives(target, source, target_tree, stream, temp_buffer)), + linearized(false), + linearization_point(Eigen::Isometry3f::Identity()) { + // + if (!source->points_gpu) { + std::cerr << "error: GPU source points have not been allocated!!" << std::endl; + abort(); + } + + if (!source->covs_gpu) { + std::cerr << "error: GPU source covs have not been allocated!!" << std::endl; + abort(); + } + + if (!target->points_gpu) { + std::cerr << "error: GPU target points have not been allocated!!" << std::endl; + abort(); + } + + if (!target->covs_gpu) { + std::cerr << "error: GPU target covs have not been allocated!!" << std::endl; + abort(); + } + + if (!target_tree) { + std::cerr << "error: GPU target kdtree has not been created!!" << std::endl; + abort(); + } +} + +IntegratedGICPFactorGPU::~IntegratedGICPFactorGPU() {} + +void IntegratedGICPFactorGPU::print(const std::string& s, const gtsam::KeyFormatter& keyFormatter) const { + std::cout << s << "IntegratedGICPFactorGPU"; + if (is_binary) { + std::cout << "(" << keyFormatter(this->keys()[0]) << ", " << keyFormatter(this->keys()[1]) << ")" << std::endl; + } else { + std::cout << "(fixed, " << keyFormatter(this->keys()[0]) << ")" << std::endl; + } + + std::cout << "|target|=" << frame::size(*target) << "pts, |source|=" << frame::size(*source) << "pts" << std::endl; +} + +size_t IntegratedGICPFactorGPU::memory_usage() const { + return sizeof(*this) + sizeof(IntegratedGICPDerivatives); +} + +size_t IntegratedGICPFactorGPU::memory_usage_gpu() const { + return sizeof(Eigen::Isometry3f) + sizeof(int) + sizeof(Correspondence) * derivatives->get_num_inliers(); +} + +void IntegratedGICPFactorGPU::set_enable_offloading(bool enable) { + derivatives->set_enable_offloading(enable); +} + +void IntegratedGICPFactorGPU::set_enable_surface_validation(bool enable) { + derivatives->set_enable_surface_validation(enable); +} + +void IntegratedGICPFactorGPU::set_inlier_update_thresh(double trans, double angle) { + derivatives->set_inlier_update_thresh(trans, angle); +} + +void IntegratedGICPFactorGPU::set_max_correspondence_distance(double dist) { + derivatives->set_max_correspondence_distance(dist); +} + +int IntegratedGICPFactorGPU::num_inliers() const { + return derivatives->get_num_inliers(); +} + +double IntegratedGICPFactorGPU::inlier_fraction() const { + return derivatives->get_num_inliers() / static_cast(frame::size(*source)); +} + +gtsam::NonlinearFactor::shared_ptr IntegratedGICPFactorGPU::clone() const { + if (is_binary) { + return gtsam::make_shared(keys()[0], keys()[1], target, source, target_tree, nullptr, nullptr); + } + + return gtsam::make_shared( + gtsam::Pose3(fixed_target_pose.cast().matrix()), + keys()[0], + target, + source, + target_tree, + nullptr, + nullptr); +} + +size_t IntegratedGICPFactorGPU::linearization_input_size() const { + return sizeof(Eigen::Isometry3f); +} + +size_t IntegratedGICPFactorGPU::linearization_output_size() const { + return sizeof(LinearizedSystem6); +} + +size_t IntegratedGICPFactorGPU::evaluation_input_size() const { + return sizeof(Eigen::Isometry3f); +} + +size_t IntegratedGICPFactorGPU::evaluation_output_size() const { + return sizeof(float); +} + +Eigen::Isometry3f IntegratedGICPFactorGPU::calc_delta(const gtsam::Values& values) const { + if (!is_binary) { + gtsam::Pose3 source_pose = values.at(keys_[0]); + gtsam::Pose3 delta = gtsam::Pose3(fixed_target_pose.inverse().cast().matrix()) * source_pose; + return Eigen::Isometry3f(delta.matrix().cast()); + } + + gtsam::Pose3 target_pose = values.at(keys_[0]); + gtsam::Pose3 source_pose = values.at(keys_[1]); + gtsam::Pose3 delta = target_pose.inverse() * source_pose; + + return Eigen::Isometry3f(delta.matrix().cast()); +} + +double IntegratedGICPFactorGPU::error(const gtsam::Values& values) const { + double err; + if (evaluation_result) { + err = evaluation_result.value(); + evaluation_result = {}; + } else { + std::cerr << "warning: computing error in sync mode seriously affects the processing speed!!" << std::endl; + + if (!linearized) { + linearize(values); + } + + Eigen::Isometry3f evaluation_point = calc_delta(values); + err = derivatives->compute_error(linearization_point, evaluation_point); + } + + return err; +} + +gtsam::GaussianFactor::shared_ptr IntegratedGICPFactorGPU::linearize(const gtsam::Values& values) const { + linearized = true; + linearization_point = calc_delta(values); + + LinearizedSystem6 l; + + if (linearization_result) { + l = *linearization_result; + linearization_result.reset(); + } else { + std::cerr << "warning: performing linearization in sync mode seriously affects the processing speed!!" << std::endl; + l = derivatives->linearize(linearization_point); + } + + gtsam::HessianFactor::shared_ptr factor; + + if (is_binary) { + factor.reset(new gtsam::HessianFactor( + keys_[0], + keys_[1], + l.H_target.cast(), + l.H_target_source.cast(), + -l.b_target.cast(), + l.H_source.cast(), + -l.b_source.cast(), + l.error)); + } else { + factor.reset(new gtsam::HessianFactor(keys_[0], l.H_source.cast(), -l.b_source.cast(), l.error)); + } + + return factor; +} + +void IntegratedGICPFactorGPU::set_linearization_point(const gtsam::Values& values, void* lin_input_cpu) { + // If -march=native is used and some GPU factor requests a storage size that causes memory misalignment, + // the following lines that directly operates on the input buffer may cause segfaults + Eigen::Isometry3f* linearization_point = reinterpret_cast(lin_input_cpu); + *linearization_point = calc_delta(values); +} + +void IntegratedGICPFactorGPU::set_evaluation_point(const gtsam::Values& values, void* eval_input_cpu) { + Eigen::Isometry3f* evaluation_point = reinterpret_cast(eval_input_cpu); + *evaluation_point = calc_delta(values); +} + +void IntegratedGICPFactorGPU::issue_linearize(const void* lin_input_cpu, const void* lin_input_gpu, void* lin_output_gpu) { + auto linearization_point = reinterpret_cast(lin_input_cpu); + auto linearization_point_gpu = reinterpret_cast(lin_input_gpu); + auto linearized_gpu = reinterpret_cast(lin_output_gpu); + + derivatives->reset_inliers(*linearization_point, linearization_point_gpu); + derivatives->issue_linearize(linearization_point_gpu, linearized_gpu); +} + +void IntegratedGICPFactorGPU::store_linearized(const void* lin_output_cpu) { + auto linearized = reinterpret_cast(lin_output_cpu); + linearization_result.reset(new LinearizedSystem6(*linearized)); + evaluation_result = linearized->error; + + derivatives->update_inliers(linearized->num_inliers); +} + +void IntegratedGICPFactorGPU::issue_compute_error( + const void* lin_input_cpu, + const void* eval_input_cpu, + const void* lin_input_gpu, + const void* eval_input_gpu, + void* eval_output_gpu) { + // + auto linearization_point = reinterpret_cast(lin_input_cpu); + auto evaluation_point = reinterpret_cast(eval_input_cpu); + + auto linearization_point_gpu = reinterpret_cast(lin_input_gpu); + auto evaluation_point_gpu = reinterpret_cast(eval_input_gpu); + + auto error_gpu = reinterpret_cast(eval_output_gpu); + + derivatives->issue_compute_error(linearization_point_gpu, evaluation_point_gpu, error_gpu); +} + +void IntegratedGICPFactorGPU::store_computed_error(const void* eval_output_cpu) { + auto evaluated = reinterpret_cast(eval_output_cpu); + evaluation_result = *evaluated; +} + +void IntegratedGICPFactorGPU::sync() { + derivatives->sync_stream(); +} +} // namespace gtsam_points diff --git a/src/test/test_kdtree.cpp b/src/test/test_kdtree.cpp index 8089ad5b..4bd61934 100644 --- a/src/test/test_kdtree.cpp +++ b/src/test/test_kdtree.cpp @@ -9,8 +9,11 @@ #include #include #include +#include #include +#include #include +#include class KdTreeTest : public testing::Test, public testing::WithParamInterface { virtual void SetUp() { @@ -239,6 +242,46 @@ TEST_P(KdTreeTest, RadiusTest) { } } +#ifdef GTSAM_POINTS_USE_CUDA + +TEST_F(KdTreeTest, KdTreeGPU) { + gtsam_points::KdTree kdtree(points.data(), points.size()); + + auto points_gpu = std::make_shared(points); + gtsam_points::KdTreeGPU kdtree_gpu(points_gpu); + + std::vector points_f(points.size()); + std::transform(points.begin(), points.end(), points_f.begin(), [](const Eigen::Vector4d& p) { return p.head<3>().cast(); }); + + std::vector queries_f(queries.size()); + std::transform(queries.begin(), queries.end(), queries_f.begin(), [](const Eigen::Vector4d& p) { return p.head<3>().cast(); }); + + // self-check + std::vector nn_indices(points.size()); + std::vector nn_sq_dists(points.size()); + kdtree_gpu.nearest_neighbor_search_cpu(points_f.data(), points_f.size(), nn_indices.data(), nn_sq_dists.data()); + + for (int i = 0; i < points.size(); i++) { + EXPECT_NEAR(nn_sq_dists[i], 0.0, 1e-6); + EXPECT_EQ(nn_indices[i], i); + } + + // query check + nn_indices.resize(queries.size()); + nn_sq_dists.resize(queries.size()); + kdtree_gpu.nearest_neighbor_search_cpu(queries_f.data(), queries_f.size(), nn_indices.data(), nn_sq_dists.data()); + + for (int i = 0; i < queries.size(); i++) { + EXPECT_NEAR(nn_sq_dists[i], gt_sq_dists[i][0], 1e-2); + + const double d1 = (points[nn_indices[i]] - queries[i]).squaredNorm(); + const double d2 = (points[gt_indices[i][0]] - queries[i]).squaredNorm(); + EXPECT_NEAR(d1, d2, 1e-3); + } +} + +#endif + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/test/test_matching_cost_factors.cpp b/src/test/test_matching_cost_factors.cpp index 427078ab..55e49d6b 100644 --- a/src/test/test_matching_cost_factors.cpp +++ b/src/test/test_matching_cost_factors.cpp @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -89,8 +91,12 @@ struct MatchingCostFactorsTestBase : public testing::Test { auto voxelmap_gpu = std::make_shared(1.0); voxelmap_gpu->insert(*frames.back()); voxelmaps_gpu.push_back(voxelmap_gpu); + + auto kdtree_gpu = std::make_shared(frames.back()); + kdtrees_gpu.push_back(kdtree_gpu); #else voxelmaps_gpu.push_back(nullptr); + kdtrees_gpu.push_back(nullptr); #endif } @@ -103,6 +109,7 @@ struct MatchingCostFactorsTestBase : public testing::Test { std::vector frames; std::vector voxelmaps; std::vector voxelmaps_gpu; + std::vector kdtrees_gpu; gtsam::Values poses; gtsam::Values poses_gt; @@ -144,6 +151,14 @@ class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testin auto f = gtsam::make_shared(target_key, source_key, target_voxelmap, source); f->set_num_threads(num_threads); factor = f; + } else if (method == "GICP_CUDA") { +#ifdef GTSAM_POINTS_USE_CUDA + auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer(); + const auto& stream = stream_buffer.first; + const auto& buffer = stream_buffer.second; + auto kdtree_gpu = std::make_shared(target, stream); + factor.reset(new gtsam_points::IntegratedGICPFactorGPU(target_key, source_key, target, source, kdtree_gpu, stream, buffer)); +#endif } else if (method == "VGICP_CUDA") { #ifdef GTSAM_POINTS_USE_CUDA auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer(); @@ -181,6 +196,14 @@ class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testin auto f = gtsam::make_shared(fixed_target_pose, source_key, target_voxelmap, source); f->set_num_threads(num_threads); factor = f; + } else if (method == "GICP_CUDA") { +#ifdef GTSAM_POINTS_USE_CUDA + auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer(); + const auto& stream = stream_buffer.first; + const auto& buffer = stream_buffer.second; + auto kdtree_gpu = std::make_shared(target, stream); + factor.reset(new gtsam_points::IntegratedGICPFactorGPU(fixed_target_pose, source_key, target, source, kdtree_gpu, stream, buffer)); +#endif } else if (method == "VGICP_CUDA") { #ifdef GTSAM_POINTS_USE_CUDA auto stream_buffer = stream_buffer_roundrobin->get_stream_buffer(); @@ -233,7 +256,7 @@ class MatchingCostFactorTest : public MatchingCostFactorsTestBase, public testin INSTANTIATE_TEST_SUITE_P( gtsam_points, MatchingCostFactorTest, - testing::Combine(testing::Values("ICP", "GICP", "VGICP", "VGICP_CUDA"), testing::Values("NONE", "OMP", "TBB")), + testing::Combine(testing::Values("ICP", "GICP", "VGICP", "GICP_CUDA", "VGICP_CUDA"), testing::Values("NONE", "OMP", "TBB")), [](const auto& info) { return std::get<0>(info.param) + "_" + std::get<1>(info.param); }); TEST_P(MatchingCostFactorTest, AlignmentTest) {