Bugfix to CUDA workspace handling

* Fix the workspace buffer type in CUDADenseQR and CUDADenseCholesky:
  workspace sizes are given as a number of elements, not bytes.
* Add cuda-memcheck tests to catch such CUDA memory errors in the
  future.

Change-Id: I3dd0f0947daba9e4c6cd0216bef81d694547d505
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index fa538fe..cc96af5 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -136,6 +136,18 @@
       ${CUDA_cublas_LIBRARY}
       ${CUDA_cusolver_LIBRARY}
       ${CUDA_cusparse_LIBRARY})
+  # Run the CUDA dense solver tests under cuda-memcheck. --error-exitcode makes
+  # cuda-memcheck return non-zero when it detects errors or leaks even though
+  # the test binary itself exited successfully; without it CTest reports a pass.
+  # NOTE(review): newer CUDA toolkits replace cuda-memcheck with compute-sanitizer.
+  add_test(
+      NAME cuda_memcheck_dense_qr_test
+      COMMAND cuda-memcheck --leak-check full --error-exitcode 1
+          ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/cuda_dense_qr_test)
+  add_test(
+      NAME cuda_memcheck_dense_cholesky_test
+      COMMAND cuda-memcheck --leak-check full --error-exitcode 1
+          ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/cuda_dense_cholesky_test)
 endif (CUDA_FOUND)
 
 if (LAPACK_FOUND)
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc index cca97d8..5837e5c 100644 --- a/internal/ceres/cuda_dense_cholesky_test.cc +++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -128,6 +128,48 @@
             LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
 }
 
+TEST(CUDADenseCholesky, Randomized1600x1600Tests) {  // Random 1600x1600 solves checked against a known solution.
+  const int kNumCols = 1600;
+  using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
+  using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+  using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+
+  LinearSolver::Options options;
+  ContextImpl context;
+  options.context = &context;
+  options.dense_linear_algebra_library_type = ceres::CUDA;
+  std::unique_ptr<DenseCholesky> dense_cholesky = CUDADenseCholesky::Create(options);
+
+  const int kNumTrials = 20;
+  for (int i = 0; i < kNumTrials; ++i) {
+    LhsType lhs = LhsType::Random(kNumCols, kNumCols);
+    lhs = lhs.transpose() * lhs;  // A^T * A is symmetric positive semi-definite.
+    lhs += 1e-3 * LhsType::Identity(kNumCols, kNumCols);  // Diagonal shift makes lhs positive definite.
+    SolutionType x_expected = SolutionType::Random(kNumCols);
+    RhsType rhs = lhs * x_expected;  // Built so that x_expected solves lhs * x = rhs.
+    SolutionType x_computed = SolutionType::Zero(kNumCols);
+    // Sanity check the random matrix sizes.
+    EXPECT_EQ(lhs.rows(), kNumCols);
+    EXPECT_EQ(lhs.cols(), kNumCols);
+    EXPECT_EQ(rhs.rows(), kNumCols);
+    EXPECT_EQ(rhs.cols(), 1);
+    EXPECT_EQ(x_expected.rows(), kNumCols);
+    EXPECT_EQ(x_expected.cols(), 1);
+    EXPECT_EQ(x_computed.rows(), kNumCols);
+    EXPECT_EQ(x_computed.cols(), 1);
+    LinearSolver::Summary summary;
+    summary.termination_type = dense_cholesky->FactorAndSolve(kNumCols,
+                                                              lhs.data(),
+                                                              rhs.data(),
+                                                              x_computed.data(),
+                                                              &summary.message);
+    ASSERT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS);
+    ASSERT_NEAR((x_computed - x_expected).norm() / x_expected.norm(),  // Relative error of the computed solution.
+                0.0,
+                1e-10);
+  }
+}
+
 #endif  // CERES_NO_CUDA
 
 }  // namespace internal
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc index 6a64298..5d7b48c 100644 --- a/internal/ceres/cuda_dense_qr_test.cc +++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -135,10 +135,6 @@ std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options); const int kNumTrials = 100; - const int kMinNumCols = 1; - const int kMaxNumCols = 10; - const int kMinRowsFactor = 1; - const int kMaxRowsFactor = 3; for (int i = 0; i < kNumTrials; ++i) { LhsType lhs = LhsType::Random(kNumRows, kNumCols); SolutionType x_expected = SolutionType::Random(kNumCols);
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h index d056d85..655a2f8 100644 --- a/internal/ceres/dense_cholesky.h +++ b/internal/ceres/dense_cholesky.h
@@ -167,7 +167,7 @@ // GPU memory allocated for the B matrix (rhs vector). CudaBuffer<double> rhs_; // Scratch space for cuSOLVER on the GPU. - CudaBuffer<uint8_t> device_workspace_; + CudaBuffer<double> device_workspace_; // Required for error handling with cuSOLVER. CudaBuffer<int> error_; // Cache the result of Factorize to ensure that when Solve is called, the
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h index 1a3bc81..7a2ffb5 100644 --- a/internal/ceres/dense_qr.h +++ b/internal/ceres/dense_qr.h
@@ -189,7 +189,7 @@ // GPU memory allocated for the TAU matrix (scaling of householder vectors). CudaBuffer<double> tau_; // Scratch space for cuSOLVER on the GPU. - CudaBuffer<uint8_t> device_workspace_; + CudaBuffer<double> device_workspace_; // Required for error handling with cuSOLVER. CudaBuffer<int> error_; // Cache the result of Factorize to ensure that when Solve is called, the