Bugfix to CUDA workspace handling

* Fix the workspace buffer type in CUDADenseQR and CUDADenseCholesky:
  cuSOLVER workspace sizes are expressed as a number of elements, not
  bytes, so the buffer is now CudaBuffer<double> instead of
  CudaBuffer<uint8_t>.
* Add cuda-memcheck tests to catch such CUDA memory errors in
  the future.

Change-Id: I3dd0f0947daba9e4c6cd0216bef81d694547d505
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index fa538fe..cc96af5 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -136,6 +136,14 @@
        ${CUDA_cublas_LIBRARY}
        ${CUDA_cusolver_LIBRARY}
        ${CUDA_cusparse_LIBRARY})
+  add_test(  # Runs the dense QR test binary under cuda-memcheck with full leak checking.
+      NAME cuda_memcheck_dense_qr_test
+      COMMAND cuda-memcheck --leak-check full  # NOTE(review): cuda-memcheck is deprecated in newer CUDA toolkits; consider compute-sanitizer.
+          ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/cuda_dense_qr_test)
+  add_test(  # Runs the dense Cholesky test binary under cuda-memcheck with full leak checking.
+      NAME cuda_memcheck_dense_cholesky_test
+      COMMAND cuda-memcheck --leak-check full
+          ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/cuda_dense_cholesky_test)
 endif (CUDA_FOUND)
 
 if (LAPACK_FOUND)
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
index cca97d8..5837e5c 100644
--- a/internal/ceres/cuda_dense_cholesky_test.cc
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -128,6 +128,48 @@
             LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR);
 }
 
+TEST(CUDADenseCholesky, Randomized1600x1600Tests) {  // Random SPD solves.
+  const int kNumCols = 1600;  // Dimension of the square linear system.
+  using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
+  using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+  using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+
+  LinearSolver::Options options;
+  ContextImpl context;
+  options.context = &context;
+  options.dense_linear_algebra_library_type = ceres::CUDA;
+  std::unique_ptr<DenseCholesky> dense_cholesky = CUDADenseCholesky::Create(options);
+
+  const int kNumTrials = 20;
+  for (int i = 0; i < kNumTrials; ++i) {
+    LhsType lhs = LhsType::Random(kNumCols, kNumCols);
+    lhs = lhs.transpose() * lhs;  // Gram matrix: symmetric positive semi-definite.
+    lhs += 1e-3 * LhsType::Identity(kNumCols, kNumCols);  // Shift to positive definite.
+    SolutionType x_expected = SolutionType::Random(kNumCols);
+    RhsType rhs = lhs * x_expected;  // rhs matches x_expected by construction.
+    SolutionType x_computed = SolutionType::Zero(kNumCols);
+    // Sanity check the dimensions of the randomly generated problem.
+    EXPECT_EQ(lhs.rows(), kNumCols);
+    EXPECT_EQ(lhs.cols(), kNumCols);
+    EXPECT_EQ(rhs.rows(), kNumCols);
+    EXPECT_EQ(rhs.cols(), 1);
+    EXPECT_EQ(x_expected.rows(), kNumCols);
+    EXPECT_EQ(x_expected.cols(), 1);
+    EXPECT_EQ(x_computed.rows(), kNumCols);
+    EXPECT_EQ(x_computed.cols(), 1);
+    LinearSolver::Summary summary;
+    summary.termination_type = dense_cholesky->FactorAndSolve(kNumCols,
+                                                              lhs.data(),
+                                                              rhs.data(),
+                                                              x_computed.data(),
+                                                              &summary.message);
+    ASSERT_EQ(summary.termination_type, LINEAR_SOLVER_SUCCESS);
+    ASSERT_NEAR((x_computed - x_expected).norm() / x_expected.norm(),
+                0.0,
+                1e-10);  // Tolerance on the relative solution error.
+  }
+}
+
 #endif  // CERES_NO_CUDA
 
 }  // namespace internal
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc
index 6a64298..5d7b48c 100644
--- a/internal/ceres/cuda_dense_qr_test.cc
+++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -135,10 +135,6 @@
   std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options);
 
   const int kNumTrials = 100;
-  const int kMinNumCols = 1;
-  const int kMaxNumCols = 10;
-  const int kMinRowsFactor = 1;
-  const int kMaxRowsFactor = 3;
   for (int i = 0; i < kNumTrials; ++i) {
     LhsType lhs = LhsType::Random(kNumRows, kNumCols);
     SolutionType x_expected = SolutionType::Random(kNumCols);
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index d056d85..655a2f8 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -167,7 +167,7 @@
   // GPU memory allocated for the B matrix (rhs vector).
   CudaBuffer<double> rhs_;
   // Scratch space for cuSOLVER on the GPU.
-  CudaBuffer<uint8_t> device_workspace_;
+  CudaBuffer<double> device_workspace_;
   // Required for error handling with cuSOLVER.
   CudaBuffer<int> error_;
   // Cache the result of Factorize to ensure that when Solve is called, the
diff --git a/internal/ceres/dense_qr.h b/internal/ceres/dense_qr.h
index 1a3bc81..7a2ffb5 100644
--- a/internal/ceres/dense_qr.h
+++ b/internal/ceres/dense_qr.h
@@ -189,7 +189,7 @@
   // GPU memory allocated for the TAU matrix (scaling of householder vectors).
   CudaBuffer<double> tau_;
   // Scratch space for cuSOLVER on the GPU.
-  CudaBuffer<uint8_t> device_workspace_;
+  CudaBuffer<double> device_workspace_;
   // Required for error handling with cuSOLVER.
   CudaBuffer<int> error_;
   // Cache the result of Factorize to ensure that when Solve is called, the