CUDA Cleanup

* Renamed several CudaBuffer interfaces for clarity and consistency.
* Added unit tests for the custom CUDA kernels.
* Set specific CUDA architectures if the CMake version supports it.

Change-Id: I269fb1089b80b25e17bca772ef8d70e7894214b8
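Note (not part of the change): a minimal sketch of how the renamed CudaBuffer copy methods fit together, assuming CudaBuffer lives in ceres::internal (as the tests below suggest) and that cuda_buffer.h brings in the CUDA runtime types:

    #include <vector>

    #include "ceres/cuda_buffer.h"

    // Round-trip a host vector through GPU memory using the renamed methods.
    void RoundTripExample(cudaStream_t stream) {
      std::vector<double> host = {1.0, 2.0, 3.0};
      ceres::internal::CudaBuffer<double> buffer;
      // Asynchronous host-to-device copy of a std::vector; grows the buffer
      // as needed (replaces the old CopyToGpu vector overload).
      buffer.CopyFromCpuVector(host, stream);
      // Synchronous device-to-host copy (formerly CopyToHost).
      std::vector<double> result(host.size());
      buffer.CopyToCpu(result.data(), result.size());
    }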
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c2e1e4e..b1fcddd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -237,6 +237,11 @@ "${CUDA_LIBRARIES};" "${CUDA_cusolver_LIBRARY};" "${CUDA_cusparse_LIBRARY}") + if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18") + # Support Maxwell, Pascal, Volta, Turing, and Ampere GPUs. + set(CMAKE_CUDA_ARCHITECTURES "50;60;70;80") + message("-- Setting CUDA Architecture to ${CMAKE_CUDA_ARCHITECTURES}") + endif() enable_language(CUDA) else (CUDA_FOUND) message("-- Did not find CUDA library, disabling CUDA support.")
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 75df3c6..ea21df2 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -482,6 +482,7 @@ ceres_test(cubic_interpolation) ceres_test(cuda_dense_cholesky) ceres_test(cuda_dense_qr) + ceres_test(cuda_kernels) ceres_test(dense_linear_solver) ceres_test(dense_cholesky) ceres_test(dense_qr)
diff --git a/internal/ceres/ceres_cuda_kernels.cu b/internal/ceres/ceres_cuda_kernels.cu
index 2fd76b5..cd045e3 100644
--- a/internal/ceres/ceres_cuda_kernels.cu
+++ b/internal/ceres/ceres_cuda_kernels.cu
@@ -106,4 +106,24 @@ <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size); } +__global__ void CudaDtDxpyKernel(double* __restrict__ y, + const double* D, + const double* __restrict__ x, + const int size) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + y[i] = y[i] + D[i] * D[i] * x[i]; + } +} + +void CudaDtDxpy(double* y, + const double* D, + const double* x, + const int size, + cudaStream_t stream) { + const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>( + y, D, x, size); +} + } // namespace ceres_cuda_kernels \ No newline at end of file
diff --git a/internal/ceres/ceres_cuda_kernels.h b/internal/ceres/ceres_cuda_kernels.h
index f033147..91675e8 100644
--- a/internal/ceres/ceres_cuda_kernels.h
+++ b/internal/ceres/ceres_cuda_kernels.h
@@ -62,6 +62,13 @@ // double (FP64). Both arrays must already be on GPU memory. void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream); +// Compute y[i] = y[i] + d[i]^2 x[i]. All arrays must already be on GPU memory. +void CudaDtDxpy(double* y, + const double* D, + const double* x, + const int size, + cudaStream_t stream); + } // namespace ceres::internal #endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index 64774fa..89828f9 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -45,8 +45,7 @@ // the appropriate GPU device is selected before each subroutine is called. This // is particularly important when using multiple GPU devices on different CPU // threads, since active Cuda devices are determined by the cuda runtime on a -// per-thread basis. Note that unless otherwise specified, all methods use the -// default stream, and are synchronous. +// per-thread basis. template <typename T> class CudaBuffer { public: @@ -72,17 +71,30 @@ } } - // Perform an asynchronous copy from CPU memory to GPU memory using the stream - // provided. - void CopyToGpuAsync(const T* data, const size_t size, cudaStream_t stream) { + // Perform an asynchronous copy from CPU memory to GPU memory managed by this + // CudaBuffer instance using the stream provided. + void CopyFromCpu(const T* data, const size_t size, cudaStream_t stream) { Reserve(size); CHECK_EQ(cudaMemcpyAsync( data_, data, size * sizeof(T), cudaMemcpyHostToDevice, stream), cudaSuccess); } - // Perform an asynchronous copy from GPU memory using the stream provided. - void CopyFromGpuAsync(const T* data, const size_t size, cudaStream_t stream) { + // Perform an asynchronous copy from a vector in CPU memory to GPU memory + // managed by this CudaBuffer instance. + void CopyFromCpuVector(const std::vector<T>& data, cudaStream_t stream) { + Reserve(data.size()); + CHECK_EQ(cudaMemcpyAsync(data_, + data.data(), + data.size() * sizeof(T), + cudaMemcpyHostToDevice, + stream), + cudaSuccess); + } + + // Perform an asynchronous copy from another GPU memory array to the GPU + // memory managed by this CudaBuffer instance using the stream provided. + void CopyFromGPUArray(const T* data, const size_t size, cudaStream_t stream) { Reserve(size); CHECK_EQ( cudaMemcpyAsync( @@ -90,21 +102,38 @@ cudaSuccess); } - // Copy data from the GPU to CPU memory. This is necessarily synchronous since - // any potential GPU kernels that may be writing to the buffer must finish - // before the transfer happens. - void CopyToHost(T* data, const size_t size) { + // Copy data from the GPU memory managed by this CudaBuffer instance to CPU + // memory. It is the caller's responsibility to ensure that the CPU memory + // pointer is valid, i.e. it is not null, and that it points to memory of + // at least this->size() size. This copy is necessarily synchronous since any + // potential GPU kernels that may be writing to the buffer must finish before + // the transfer happens. + void CopyToCpu(T* data, const size_t size) const { CHECK(data_ != nullptr); CHECK_EQ(cudaMemcpy(data, data_, size * sizeof(T), cudaMemcpyDeviceToHost), cudaSuccess); } - void CopyToGpu(const std::vector<T>& data) { - CopyToGpu(data.data(), data.size()); + // Copy N items from another GPU memory array to the GPU memory managed by + // this CudaBuffer instance, growing this buffer's size if needed. This copy + // is asynchronous, and operates on the stream provided. + void CopyNItemsFrom(int n, const CudaBuffer<T>& other, cudaStream_t stream) { + Reserve(n); + CHECK(other.data_ != nullptr); + CHECK(data_ != nullptr); + CHECK_EQ(cudaMemcpyAsync(data_, + other.data_, + size_ * sizeof(T), + cudaMemcpyDeviceToDevice, + stream), + cudaSuccess); } + // Return a pointer to the GPU memory managed by this CudaBuffer instance. T* data() { return data_; } const T* data() const { return data_; } + // Return the number of items of type T that can fit in the GPU memory + // allocated so far by this CudaBuffer instance. size_t size() const { return size_; } private:
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc
new file mode 100644
index 0000000..fbdc4b3
--- /dev/null
+++ b/internal/ceres/cuda_kernels_test.cc
@@ -0,0 +1,179 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) + +#include <math.h> + +#include <limits> +#include <string> +#include <vector> + +#include "ceres/ceres_cuda_kernels.h" +#include "ceres/cuda_buffer.h" +#include "ceres/internal/config.h" +#include "ceres/internal/eigen.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace ceres { +namespace internal { + +#ifndef CERES_NO_CUDA + +TEST(CudaFP64ToFP32, SimpleConversions) { + std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; + CudaBuffer<double> fp64_gpu; + fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault); + CudaBuffer<float> fp32_gpu; + fp32_gpu.Reserve(fp64_cpu.size()); + CudaFP64ToFP32(fp64_gpu.data(), + fp32_gpu.data(), + fp64_cpu.size(), + cudaStreamDefault); + std::vector<float> fp32_cpu(fp64_cpu.size()); + fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size()); + for (int i = 0; i < fp32_cpu.size(); ++i) { + EXPECT_EQ(fp32_cpu[i], static_cast<float>(fp64_cpu[i])); + } +} + +TEST(CudaFP64ToFP32, NumericallyExtremeValues) { + std::vector<double> fp64_cpu = { + DBL_MIN, + 10.0 * DBL_MIN, + DBL_MAX, + 0.1 * DBL_MAX + }; + // First just make sure that the compiler has represented these values + // accurately as fp64. 
+ EXPECT_GT(fp64_cpu[0], 0.0); + EXPECT_GT(fp64_cpu[1], 0.0); + EXPECT_TRUE(std::isfinite(fp64_cpu[2])); + EXPECT_TRUE(std::isfinite(fp64_cpu[3])); + CudaBuffer<double> fp64_gpu; + fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault); + CudaBuffer<float> fp32_gpu; + fp32_gpu.Reserve(fp64_cpu.size()); + CudaFP64ToFP32(fp64_gpu.data(), + fp32_gpu.data(), + fp64_cpu.size(), + cudaStreamDefault); + std::vector<float> fp32_cpu(fp64_cpu.size()); + fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size()); + EXPECT_EQ(fp32_cpu[0], 0.0f); + EXPECT_EQ(fp32_cpu[1], 0.0f); + EXPECT_EQ(fp32_cpu[2], std::numeric_limits<float>::infinity()); + EXPECT_EQ(fp32_cpu[3], std::numeric_limits<float>::infinity()); +} + +TEST(CudaFP32ToFP64, SimpleConversions) { + std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; + CudaBuffer<float> fp32_gpu; + fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault); + CudaBuffer<double> fp64_gpu; + fp64_gpu.Reserve(fp32_cpu.size()); + CudaFP32ToFP64(fp32_gpu.data(), + fp64_gpu.data(), + fp32_cpu.size(), + cudaStreamDefault); + std::vector<double> fp64_cpu(fp32_cpu.size()); + fp64_gpu.CopyToCpu(fp64_cpu.data(), fp64_cpu.size()); + for (int i = 0; i < fp64_cpu.size(); ++i) { + EXPECT_EQ(fp64_cpu[i], static_cast<double>(fp32_cpu[i])); + } +} + +TEST(CudaSetZeroFP32, NonZeroInput) { + std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; + CudaBuffer<float> fp32_gpu; + fp32_gpu.CopyFromCpuVector(fp32_cpu, cudaStreamDefault); + CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), cudaStreamDefault); + std::vector<float> fp32_cpu_zero(fp32_cpu.size()); + fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size()); + for (int i = 0; i < fp32_cpu_zero.size(); ++i) { + EXPECT_EQ(fp32_cpu_zero[i], 0.0f); + } +} + +TEST(CudaSetZeroFP64, NonZeroInput) { + std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; + CudaBuffer<double> fp64_gpu; + fp64_gpu.CopyFromCpuVector(fp64_cpu, cudaStreamDefault); + CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), cudaStreamDefault); + std::vector<double> fp64_cpu_zero(fp64_cpu.size()); + fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size()); + for (int i = 0; i < fp64_cpu_zero.size(); ++i) { + EXPECT_EQ(fp64_cpu_zero[i], 0.0); + } +} + +TEST(CudaDsxpy, DoubleValues) { + std::vector<float> fp32_cpu_a = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; + std::vector<double> fp64_cpu_b = + {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; + CudaBuffer<float> fp32_gpu_a; + fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a, cudaStreamDefault); + CudaBuffer<double> fp64_gpu_b; + fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b, cudaStreamDefault); + CudaDsxpy(fp64_gpu_b.data(), + fp32_gpu_a.data(), + fp32_gpu_a.size(), + cudaStreamDefault); + fp64_gpu_b.CopyToCpu(fp64_cpu_b.data(), fp64_cpu_b.size()); + for (int i = 0; i < fp64_cpu_b.size(); ++i) { + EXPECT_DOUBLE_EQ(fp64_cpu_b[i], 2.0 * fp32_cpu_a[i]); + } +} + +TEST(CudaDtDxpy, ComputeFourItems) { + std::vector<double> x_cpu = {1, 2, 3, 4}; + std::vector<double> y_cpu = {4, 3, 2, 1}; + std::vector<double> d_cpu = {10, 20, 30, 40}; + CudaBuffer<double> x_gpu; + x_gpu.CopyFromCpuVector(x_cpu, cudaStreamDefault); + CudaBuffer<double> y_gpu; + y_gpu.CopyFromCpuVector(y_cpu, cudaStreamDefault); + CudaBuffer<double> d_gpu; + d_gpu.CopyFromCpuVector(d_cpu, cudaStreamDefault); + CudaDtDxpy(y_gpu.data(), + d_gpu.data(), + x_gpu.data(), + y_gpu.size(), + cudaStreamDefault); + y_gpu.CopyToCpu(y_cpu.data(), y_cpu.size()); + EXPECT_DOUBLE_EQ(y_cpu[0], 4.0 + 10.0 * 
10.0 * 1.0); + EXPECT_DOUBLE_EQ(y_cpu[1], 3.0 + 20.0 * 20.0 * 2.0); + EXPECT_DOUBLE_EQ(y_cpu[2], 2.0 + 30.0 * 30.0 * 3.0); + EXPECT_DOUBLE_EQ(y_cpu[3], 1.0 + 40.0 * 40.0 * 4.0); +} + +#endif // CERES_NO_CUDA + +} // namespace internal +} // namespace ceres
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index 16d3e1a..a6220e4 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -363,7 +363,7 @@ factorize_result_ = LinearSolverTerminationType::FATAL_ERROR; lhs_.Reserve(num_cols * num_cols); num_cols_ = num_cols; - lhs_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_); + lhs_.CopyFromCpu(lhs, num_cols * num_cols, stream_); int device_workspace_size = 0; if (cusolverDnDpotrf_bufferSize(cusolver_handle_, CUBLAS_FILL_MODE_LOWER, @@ -393,7 +393,7 @@ return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; - error_.CopyToHost(&error, 1); + error_.CopyToCpu(&error, 1); if (error < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres - " << "please report it. " @@ -422,7 +422,7 @@ *message = "Factorize did not complete successfully previously."; return factorize_result_; } - rhs_.CopyToGpuAsync(rhs, num_cols_, stream_); + rhs_.CopyFromCpu(rhs, num_cols_, stream_); if (cusolverDnDpotrs(cusolver_handle_, CUBLAS_FILL_MODE_LOWER, num_cols_, @@ -441,14 +441,14 @@ return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; - error_.CopyToHost(&error, 1); + error_.CopyToCpu(&error, 1); if (error != 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it." << "cuSolverDN::cusolverDnDpotrs fatal error. " << "Argument: " << -error << " is invalid."; } - rhs_.CopyToHost(solution, num_cols_); + rhs_.CopyToCpu(solution, num_cols_); *message = "Success"; return LinearSolverTerminationType::SUCCESS; } @@ -533,7 +533,7 @@ return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; - error_.CopyToHost(&error, 1); + error_.CopyToCpu(&error, 1); if (error < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres - " << "please report it. " @@ -581,7 +581,7 @@ return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; - error_.CopyToHost(&error, 1); + error_.CopyToCpu(&error, 1); if (error != 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it." @@ -598,7 +598,7 @@ // Copy fp64 version of lhs to GPU. lhs_fp64_.Reserve(num_cols * num_cols); - lhs_fp64_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_); + lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols, stream_); // Create an fp32 copy of lhs, lhs_fp32. lhs_fp32_.Reserve(num_cols * num_cols); @@ -629,8 +629,8 @@ CudaSetZeroFP64(x_fp64_.data(), num_cols_, stream_); // Initialize residual = rhs. - rhs_fp64_.CopyToGpuAsync(rhs, num_cols_, stream_); - residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_); + rhs_fp64_.CopyFromCpu(rhs, num_cols_, stream_); + residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_); for (int i = 0; i <= max_num_refinement_iterations_; ++i) { // Cast residual from fp64 to fp32. @@ -647,7 +647,7 @@ // [fp64] residual = rhs - lhs * x // This is done in two steps: // 1. [fp64] residual = rhs - residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_); + residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_, stream_); // 2. [fp64] residual = residual - lhs * x double alpha = -1.0; double beta = 1.0; @@ -664,7 +664,7 @@ 1); } } - x_fp64_.CopyToHost(solution, num_cols_); + x_fp64_.CopyToCpu(solution, num_cols_); *message = "Success."; return LinearSolverTerminationType::SUCCESS; }
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index cbe7533..775073d 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -331,7 +331,7 @@ tau_.Reserve(std::min(num_rows, num_cols)); num_rows_ = num_rows; num_cols_ = num_cols; - lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_); + lhs_.CopyFromCpu(lhs, num_rows * num_cols, stream_); int device_workspace_size = 0; if (cusolverDnDgeqrf_bufferSize(cusolver_handle_, num_rows, @@ -362,7 +362,7 @@ return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; - error_.CopyToHost(&error, 1); + error_.CopyToCpu(&error, 1); if (error < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres - " << "please report it. " @@ -385,7 +385,7 @@ *message = "Factorize did not complete successfully previously."; return factorize_result_; } - rhs_.CopyToGpuAsync(rhs, num_rows_, stream_); + rhs_.CopyFromCpu(rhs, num_rows_, stream_); int device_workspace_size = 0; if (cusolverDnDormqr_bufferSize(cusolver_handle_, CUBLAS_SIDE_LEFT, @@ -424,7 +424,7 @@ return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; - error_.CopyToHost(&error, 1); + error_.CopyToCpu(&error, 1); if (error < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it." @@ -450,7 +450,7 @@ *message = "Cuda device synchronization failed."; return LinearSolverTerminationType::FATAL_ERROR; } - rhs_.CopyToHost(solution, num_cols_); + rhs_.CopyToCpu(solution, num_cols_); *message = "Success"; return LinearSolverTerminationType::SUCCESS; }