Fix a missing CERES_NO_CUDA guard Also run format_all.sh. Change-Id: I13902c1d3eb0d3a97548540fee13ec67c490a5ff
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc index 2f857e2..da64981 100644 --- a/internal/ceres/cuda_sparse_matrix.cc +++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -38,41 +38,37 @@ #include "ceres/cuda_sparse_matrix.h" #include <math.h> + #include <memory> -#include "ceres/internal/export.h" #include "ceres/block_sparse_matrix.h" #include "ceres/compressed_row_sparse_matrix.h" -#include "ceres/crs_matrix.h" -#include "ceres/types.h" #include "ceres/context_impl.h" +#include "ceres/crs_matrix.h" +#include "ceres/internal/export.h" +#include "ceres/types.h" #include "ceres/wall_time.h" #ifndef CERES_NO_CUDA +#include "ceres/ceres_cuda_kernels.h" #include "ceres/cuda_buffer.h" #include "ceres/cuda_vector.h" -#include "ceres/ceres_cuda_kernels.h" #include "cusparse.h" - namespace ceres::internal { CudaSparseMatrix::CudaSparseMatrix( - ContextImpl* context, - const CompressedRowSparseMatrix& crs_matrix) { + ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) { DCHECK_NE(context, nullptr); CHECK(context->IsCUDAInitialized()); context_ = context; num_rows_ = crs_matrix.num_rows(); num_cols_ = crs_matrix.num_cols(); num_nonzeros_ = crs_matrix.num_nonzeros(); - rows_.CopyFromCpu( - crs_matrix.rows(), num_rows_ + 1, context_->stream_); - cols_.CopyFromCpu( - crs_matrix.cols(), num_nonzeros_, context_->stream_); - values_.CopyFromCpu( - crs_matrix.values(), num_nonzeros_, context_->stream_); + rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1, context_->stream_); + cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_, context_->stream_); + values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_); cusparseCreateCsr(&descr_, num_rows_, num_cols_, @@ -99,8 +95,7 @@ CHECK_EQ(num_rows_, crs_matrix.num_rows()); CHECK_EQ(num_cols_, crs_matrix.num_cols()); CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros()); - values_.CopyFromCpu( - crs_matrix.values(), num_nonzeros_, context_->stream_); + values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_); } void CudaSparseMatrix::SpMv(cusparseOperation_t op, @@ -135,11 +130,13 @@ CUSPARSE_STATUS_SUCCESS); } -void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) { +void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, + CudaVector* y) { SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y); } -void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) { +void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, + CudaVector* y) { // TODO(Joydeep Biswas): We should consider storing a transposed copy of the // matrix by converting CSR to CSC. From the cuSPARSE documentation: // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h index 7661fb9..62f6b77 100644 --- a/internal/ceres/cuda_sparse_matrix.h +++ b/internal/ceres/cuda_sparse_matrix.h
@@ -43,9 +43,9 @@ #include <string> #include "ceres/compressed_row_sparse_matrix.h" +#include "ceres/context_impl.h" #include "ceres/internal/export.h" #include "ceres/types.h" -#include "ceres/context_impl.h" #ifndef CERES_NO_CUDA #include "ceres/cuda_buffer.h" @@ -58,7 +58,6 @@ // CUDA-accelerated operations. class CERES_NO_EXPORT CudaSparseMatrix { public: - // Create a GPU copy of the matrix provided. The caller must ensure that // InitCuda() has already been successfully called on context before calling // this constructor. @@ -86,7 +85,6 @@ const cusparseSpMatDescr_t& descr() const { return descr_; } private: - // Disable copy and assignment. CudaSparseMatrix(const CudaSparseMatrix&) = delete; CudaSparseMatrix& operator=(const CudaSparseMatrix&) = delete;
diff --git a/internal/ceres/cuda_sparse_matrix_test.cc b/internal/ceres/cuda_sparse_matrix_test.cc index 3a01ff9..ae76b8f 100644 --- a/internal/ceres/cuda_sparse_matrix_test.cc +++ b/internal/ceres/cuda_sparse_matrix_test.cc
@@ -28,15 +28,16 @@ // // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) +#include "ceres/cuda_sparse_matrix.h" + #include <string> +#include "ceres/block_sparse_matrix.h" #include "ceres/casts.h" +#include "ceres/cuda_vector.h" #include "ceres/internal/config.h" #include "ceres/internal/eigen.h" #include "ceres/linear_least_squares_problems.h" -#include "ceres/block_sparse_matrix.h" -#include "ceres/cuda_vector.h" -#include "ceres/cuda_sparse_matrix.h" #include "ceres/triplet_sparse_matrix.h" #include "glog/logging.h" #include "gtest/gtest.h" @@ -113,24 +114,11 @@ // b: [1 2 3 4]' // A1 * b = [3 5]' // A2 * b = [5 18]' - TripletSparseMatrix A1( - 2, - 4, - {0, 0, 1, 1}, - {0, 1, 1, 2}, - {1, 1, 1, 1} - ); - TripletSparseMatrix A2( - 2, - 4, - {0, 0, 1, 1}, - {0, 1, 1, 2}, - {1, 2, 3, 4} - ); + TripletSparseMatrix A1(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 1, 1, 1}); + TripletSparseMatrix A2(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4}); Vector b(4); b << 1, 2, 3, 4; - ContextImpl context; std::string message; CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; @@ -163,13 +151,7 @@ // 0 3 4 0] // b: [1 2 3 4]' // A * b = [5 18]' - TripletSparseMatrix A( - 2, - 4, - {0, 0, 1, 1}, - {0, 1, 1, 2}, - {1, 2, 3, 4} - ); + TripletSparseMatrix A(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4}); Vector b(4); b << 1, 2, 3, 4; Vector x_expected(2); @@ -199,13 +181,7 @@ // 0 3 4 0] // b: [1 2]' // A'* b = [1 8 8 0]' - TripletSparseMatrix A( - 2, - 4, - {0, 0, 1, 1}, - {0, 1, 1, 2}, - {1, 2, 3, 4} - ); + TripletSparseMatrix A(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4}); Vector b(2); b << 1, 2; Vector x_expected(4); @@ -213,8 +189,7 @@ ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A); CudaSparseMatrix A_gpu(&context, *A_crs); CudaVector b_gpu(&context, A.num_rows()); @@ -269,15 +244,13 @@ ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A); CudaSparseMatrix A_gpu(&context, *A_crs); CudaVector b_gpu(&context, N); CudaVector x_gpu(&context, N); x_gpu.CopyFromCpu(x); - // First check RightMultiply. { b_gpu.SetZero();
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc index 5a128de..7debeba 100644 --- a/internal/ceres/cuda_vector.cc +++ b/internal/ceres/cuda_vector.cc
@@ -37,15 +37,15 @@ #include <math.h> +#include "ceres/context_impl.h" #include "ceres/internal/export.h" #include "ceres/types.h" -#include "ceres/context_impl.h" #ifndef CERES_NO_CUDA +#include "ceres/ceres_cuda_kernels.h" #include "ceres/cuda_buffer.h" #include "ceres/cuda_vector.h" -#include "ceres/ceres_cuda_kernels.h" #include "cublas_v2.h" namespace ceres::internal { @@ -72,18 +72,14 @@ } } -CudaVector::~CudaVector() { - DestroyDescriptor(); -} +CudaVector::~CudaVector() { DestroyDescriptor(); } void CudaVector::Resize(int size) { data_.Reserve(size); num_rows_ = size; DestroyDescriptor(); - CHECK_EQ(cusparseCreateDnVec(&descr_, - num_rows_, - data_.data(), - CUDA_R_64F), CUSPARSE_STATUS_SUCCESS); + CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F), + CUSPARSE_STATUS_SUCCESS); } double CudaVector::Dot(const CudaVector& x) const { @@ -93,19 +89,19 @@ data_.data(), 1, x.data().data(), - 1, &result), - CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDdot failed."; + 1, + &result), + CUBLAS_STATUS_SUCCESS) + << "CuBLAS cublasDdot failed."; return result; } double CudaVector::Norm() const { double result = 0; - CHECK_EQ(cublasDnrm2(context_->cublas_handle_, - num_rows_, - data_.data(), - 1, - &result), - CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDnrm2 failed."; + CHECK_EQ(cublasDnrm2( + context_->cublas_handle_, num_rows_, data_.data(), 1, &result), + CUBLAS_STATUS_SUCCESS) + << "CuBLAS cublasDnrm2 failed."; return result; } @@ -114,10 +110,8 @@ data_.CopyFromCpu(x.data(), x.rows(), context_->stream_); num_rows_ = x.rows(); DestroyDescriptor(); - CHECK_EQ(cusparseCreateDnVec(&descr_, - num_rows_, - data_.data(), - CUDA_R_64F), CUSPARSE_STATUS_SUCCESS); + CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F), + CUSPARSE_STATUS_SUCCESS); } void CudaVector::CopyTo(Vector* x) const { @@ -150,12 +144,10 @@ CHECK_EQ(num_rows_, x.num_rows_); if (b != 1.0) { // First scale y by b. - CHECK_EQ(cublasDscal(context_->cublas_handle_, - num_rows_, - &b, - data_.data(), - 1), - CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDscal failed."; + CHECK_EQ( + cublasDscal(context_->cublas_handle_, num_rows_, &b, data_.data(), 1), + CUBLAS_STATUS_SUCCESS) + << "CuBLAS cublasDscal failed."; } // Then add a * x to y. CHECK_EQ(cublasDaxpy(context_->cublas_handle_, @@ -165,7 +157,8 @@ 1, data_.data(), 1), - CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDaxpy failed."; + CUBLAS_STATUS_SUCCESS) + << "CuBLAS cublasDaxpy failed."; } void CudaVector::DtDxpy(const CudaVector& D, const CudaVector& x) { @@ -177,12 +170,10 @@ } void CudaVector::Scale(double s) { - CHECK_EQ(cublasDscal(context_->cublas_handle_, - num_rows_, - &s, - data_.data(), - 1), - CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDscal failed."; + CHECK_EQ( + cublasDscal(context_->cublas_handle_, num_rows_, &s, data_.data(), 1), + CUBLAS_STATUS_SUCCESS) + << "CuBLAS cublasDscal failed."; } } // namespace ceres::internal
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h index 4018c1b..e7c4b81 100644 --- a/internal/ceres/cuda_vector.h +++ b/internal/ceres/cuda_vector.h
@@ -39,17 +39,18 @@ // clang-format on #include <math.h> + #include <memory> #include <string> +#include "ceres/context_impl.h" #include "ceres/internal/export.h" #include "ceres/types.h" -#include "ceres/context_impl.h" #ifndef CERES_NO_CUDA -#include "ceres/cuda_buffer.h" #include "ceres/ceres_cuda_kernels.h" +#include "ceres/cuda_buffer.h" #include "ceres/internal/eigen.h" #include "cublas_v2.h" #include "cusparse.h" @@ -59,7 +60,6 @@ // An Nx1 vector, denoted y hosted on the GPU, with CUDA-accelerated operations. class CERES_NO_EXPORT CudaVector { public: - // Create a pre-allocated vector of size N and return a pointer to it. The // caller must ensure that InitCuda() has already been successfully called on // context before calling this method. @@ -123,12 +123,11 @@ // object in the conjugate gradients linear solver. inline double Norm(const CudaVector& x) { return x.Norm(); } inline void SetZero(CudaVector& x) { x.SetZero(); } -inline void Axpby( - double a, - const CudaVector& x, - double b, - const CudaVector& y, - CudaVector& z) { +inline void Axpby(double a, + const CudaVector& x, + double b, + const CudaVector& y, + CudaVector& z) { if (&x == &y && &y == &z) { // z = (a + b) * z; z.Scale(a + b);
diff --git a/internal/ceres/cuda_vector_test.cc b/internal/ceres/cuda_vector_test.cc index db930ed..84193c0 100644 --- a/internal/ceres/cuda_vector_test.cc +++ b/internal/ceres/cuda_vector_test.cc
@@ -28,11 +28,12 @@ // // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) +#include "ceres/cuda_vector.h" + #include <string> #include "ceres/internal/config.h" #include "ceres/internal/eigen.h" -#include "ceres/cuda_vector.h" #include "glog/logging.h" #include "gtest/gtest.h" @@ -130,23 +131,15 @@ CudaVector x_gpu(&context, 10); x_gpu.CopyFromCpu(x); - EXPECT_NEAR(x_gpu.Norm(), - 2.0, - std::numeric_limits<double>::epsilon()); + EXPECT_NEAR(x_gpu.Norm(), 2.0, std::numeric_limits<double>::epsilon()); x_gpu.SetZero(); - EXPECT_NEAR(x_gpu.Norm(), - 0.0, - std::numeric_limits<double>::epsilon()); + EXPECT_NEAR(x_gpu.Norm(), 0.0, std::numeric_limits<double>::epsilon()); x_gpu.CopyFromCpu(x); - EXPECT_NEAR(x_gpu.Norm(), - 2.0, - std::numeric_limits<double>::epsilon()); + EXPECT_NEAR(x_gpu.Norm(), 2.0, std::numeric_limits<double>::epsilon()); SetZero(x_gpu); - EXPECT_NEAR(x_gpu.Norm(), - 0.0, - std::numeric_limits<double>::epsilon()); + EXPECT_NEAR(x_gpu.Norm(), 0.0, std::numeric_limits<double>::epsilon()); } TEST(CudaVector, Resize) { @@ -187,8 +180,7 @@ y << 100, 10, 1, 0; ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; CudaVector x_gpu(&context, 4); CudaVector y_gpu(&context, 4); x_gpu.CopyFromCpu(x); @@ -209,8 +201,7 @@ y << 100, 10, 1, 0; ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; CudaVector x_gpu(&context, 4); CudaVector y_gpu(&context, 4); x_gpu.CopyFromCpu(x); @@ -231,8 +222,7 @@ y << 100, 10, 1, 0; ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; CudaVector x_gpu(&context, 4); CudaVector y_gpu(&context, 4); x_gpu.CopyFromCpu(x); @@ -251,8 +241,7 @@ x << 100, 10, 1, 0; ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; CudaVector x_gpu(&context, 4); CudaVector y_gpu(&context, 4); x_gpu.CopyFromCpu(x); @@ -273,8 +262,7 @@ y << 100, 10, 1, 0; ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; CudaVector x_gpu(&context, 4); CudaVector y_gpu(&context, 4); CudaVector z_gpu(&context, 4); @@ -296,8 +284,7 @@ x << 100, 10, 1, 0; ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; CudaVector x_gpu(&context, 4); CudaVector z_gpu(&context, 4); x_gpu.CopyFromCpu(x); @@ -318,8 +305,7 @@ y << 100, 10, 1, 0; ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; CudaVector x_gpu(&context, 10); CudaVector y_gpu(&context, 10); x_gpu.CopyFromCpu(x); @@ -340,8 +326,7 @@ y << 100, 10, 1, 0; ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; CudaVector x_gpu(&context, 4); CudaVector y_gpu(&context, 4); x_gpu.CopyFromCpu(x); @@ -360,8 +345,7 @@ x << 100, 10, 1, 0; ContextImpl context; std::string message; - CHECK(context.InitCUDA(&message)) - << "InitCUDA() failed because: " << message; + CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message; CudaVector x_gpu(&context, 10); x_gpu.CopyFromCpu(x);
diff --git a/internal/ceres/implicit_schur_complement_test.cc b/internal/ceres/implicit_schur_complement_test.cc index 66c6707..f7abca1 100644 --- a/internal/ceres/implicit_schur_complement_test.cc +++ b/internal/ceres/implicit_schur_complement_test.cc
@@ -155,8 +155,8 @@ // Here, assuming that block_diagonal(F'F) == diagonal(F'F) Matrix Z_reference = (F.transpose() * F + DF).diagonal().asDiagonal().inverse() * - F.transpose() * E * (E.transpose() * E + DE).inverse() * - E.transpose() * F; + F.transpose() * E * (E.transpose() * E + DE).inverse() * E.transpose() * + F; for (int i = 0; i < num_f_cols; ++i) { Vector x(num_f_cols); @@ -166,7 +166,6 @@ Vector y(num_f_cols); y = lhs * x; - Vector z(num_f_cols); isc.RightMultiplyAndAccumulate(x.data(), z.data());
diff --git a/internal/ceres/sparse_linear_operator_benchmark.cc b/internal/ceres/sparse_linear_operator_benchmark.cc index c224e35..e2e22c9 100644 --- a/internal/ceres/sparse_linear_operator_benchmark.cc +++ b/internal/ceres/sparse_linear_operator_benchmark.cc
@@ -34,16 +34,19 @@ #include <string> #include "Eigen/Dense" -#include "gflags/gflags.h" #include "benchmark/benchmark.h" -#include "ceres/context_impl.h" #include "ceres/block_sparse_matrix.h" +#include "ceres/context_impl.h" #include "ceres/cuda_sparse_matrix.h" #include "ceres/cuda_vector.h" #include "ceres/internal/config.h" #include "ceres/internal/eigen.h" #include "ceres/linear_solver.h" +#include "gflags/gflags.h" + +#ifndef CERES_NO_CUDA #include "cuda_runtime.h" +#endif namespace ceres::internal { @@ -153,6 +156,10 @@ CHECK_NE(sum, 0.0); } +BENCHMARK(BM_CpuRightMultiplyAndAccumulate); +BENCHMARK(BM_CpuLeftMultiplyAndAccumulate); + +#ifndef CERES_NO_CUDA static void BM_CudaRightMultiplyAndAccumulate(benchmark::State& state) { // Perform setup here std::unique_ptr<BlockSparseMatrix> jacobian = @@ -165,9 +172,7 @@ std::string message; context.InitCUDA(&message); CompressedRowSparseMatrix jacobian_crs( - jacobian->num_rows(), - jacobian->num_cols(), - jacobian->num_nonzeros()); + jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros()); jacobian->ToCompressedRowSparseMatrix(&jacobian_crs); CudaSparseMatrix cuda_jacobian(&context, jacobian_crs); CudaVector cuda_x(&context, 0); @@ -202,9 +207,7 @@ std::string message; context.InitCUDA(&message); CompressedRowSparseMatrix jacobian_crs( - jacobian->num_rows(), - jacobian->num_cols(), - jacobian->num_nonzeros()); + jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros()); jacobian->ToCompressedRowSparseMatrix(&jacobian_crs); CudaSparseMatrix cuda_jacobian(&context, jacobian_crs); CudaVector cuda_x(&context, 0); @@ -227,10 +230,9 @@ CHECK_NE(sum, 0.0); } -BENCHMARK(BM_CpuRightMultiplyAndAccumulate); -BENCHMARK(BM_CpuLeftMultiplyAndAccumulate); BENCHMARK(BM_CudaRightMultiplyAndAccumulate); BENCHMARK(BM_CudaLeftMultiplyAndAccumulate); +#endif BENCHMARK_MAIN();