Bugfix to CUDA workspace handling
* Fix the workspace buffer type in CUDADenseQR and CUDADenseCholesky
(CudaBuffer<uint8_t> -> CudaBuffer<double>): workspace sizes are given
as a number of elements, not as a number of bytes.
* Add cuda-memcheck tests to catch such CUDA memory errors in
the future.
Change-Id: I3dd0f0947daba9e4c6cd0216bef81d694547d505
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index fa538fe..cc96af5 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -136,6 +136,14 @@
${CUDA_cublas_LIBRARY}
${CUDA_cusolver_LIBRARY}
${CUDA_cusparse_LIBRARY})
+ add_test(
+ NAME cuda_memcheck_dense_qr_test
+ COMMAND cuda-memcheck --leak-check full
+ ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/cuda_dense_qr_test)
+ add_test(
+ NAME cuda_memcheck_dense_cholesky_test
+ COMMAND cuda-memcheck --leak-check full
+ ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/cuda_dense_cholesky_test)
endif (CUDA_FOUND)
if (LAPACK_FOUND)
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
index cca97d8..5837e5c 100644
--- a/internal/ceres/cuda_dense_cholesky_test.cc
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -128,6 +128,48 @@
LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
}
+TEST(CUDADenseCholesky, Randomized1600x1600Tests) {
+ const int kNumCols = 1600;  // Dimension of the square system solved in every trial.
+ using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
+ using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+ using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+
+ LinearSolver::Options options;
+ ContextImpl context;
+ options.context = &context;
+ options.dense_linear_algebra_library_type = ceres::CUDA;
+ std::unique_ptr<DenseCholesky> dense_cholesky = CUDADenseCholesky::Create(options);
+
+ const int kNumTrials = 20;  // Each trial draws a fresh random system and reuses the same solver.
+ for (int i = 0; i < kNumTrials; ++i) {
+ LhsType lhs = LhsType::Random(kNumCols, kNumCols);
+ lhs = lhs.transpose() * lhs;  // A^T * A is symmetric positive semi-definite.
+ lhs += 1e-3 * LhsType::Identity(kNumCols, kNumCols);  // Diagonal shift makes it strictly positive definite.
+ SolutionType x_expected = SolutionType::Random(kNumCols);
+ RhsType rhs = lhs * x_expected;  // Build the rhs from a known solution so the result can be checked.
+ SolutionType x_computed = SolutionType::Zero(kNumCols);
+ // Sanity check the dimensions of the randomly generated matrix and vectors.
+ EXPECT_EQ(lhs.rows(), kNumCols);
+ EXPECT_EQ(lhs.cols(), kNumCols);
+ EXPECT_EQ(rhs.rows(), kNumCols);
+ EXPECT_EQ(rhs.cols(), 1);
+ EXPECT_EQ(x_expected.rows(), kNumCols);
+ EXPECT_EQ(x_expected.cols(), 1);
+ EXPECT_EQ(x_computed.rows(), kNumCols);
+ EXPECT_EQ(x_computed.cols(), 1);
+ LinearSolver::Summary summary;
+ summary.termination_type = dense_cholesky->FactorAndSolve(kNumCols,
+ lhs.data(),
+ rhs.data(),
+ x_computed.data(),
+ &summary.message);
+ ASSERT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS);
+ ASSERT_NEAR((x_computed - x_expected).norm() / x_expected.norm(),  // Relative error of the computed solution.
+ 0.0,
+ 1e-10);
+ }
+}
+
#endif // CERES_NO_CUDA
} // namespace internal
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc
index 6a64298..5d7b48c 100644
--- a/internal/ceres/cuda_dense_qr_test.cc
+++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -135,10 +135,6 @@
std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options);
const int kNumTrials = 100;
- const int kMinNumCols = 1;
- const int kMaxNumCols = 10;
- const int kMinRowsFactor = 1;
- const int kMaxRowsFactor = 3;
for (int i = 0; i < kNumTrials; ++i) {
LhsType lhs = LhsType::Random(kNumRows, kNumCols);
SolutionType x_expected = SolutionType::Random(kNumCols);
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index d056d85..655a2f8 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -167,7 +167,7 @@
// GPU memory allocated for the B matrix (rhs vector).
CudaBuffer<double> rhs_;
// Scratch space for cuSOLVER on the GPU.
- CudaBuffer<uint8_t> device_workspace_;
+ CudaBuffer<double> device_workspace_;
// Required for error handling with cuSOLVER.
CudaBuffer<int> error_;
// Cache the result of Factorize to ensure that when Solve is called, the
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h
index 1a3bc81..7a2ffb5 100644
--- a/internal/ceres/dense_qr.h
+++ b/internal/ceres/dense_qr.h
@@ -189,7 +189,7 @@
// GPU memory allocated for the TAU matrix (scaling of householder vectors).
CudaBuffer<double> tau_;
// Scratch space for cuSOLVER on the GPU.
- CudaBuffer<uint8_t> device_workspace_;
+ CudaBuffer<double> device_workspace_;
// Required for error handling with cuSOLVER.
CudaBuffer<int> error_;
// Cache the result of Factorize to ensure that when Solve is called, the