Use page locked memory in BlockSparseMatrix

When using CUDA_SPARSE for an iterative solve on the GPU,
allocate the values array in BlockSparseMatrix in page locked
(pinned) memory to make copying it to the GPU faster.
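
For context, a minimal sketch of why pinned memory helps (names
like num_nonzeros, d_values and stream are illustrative, not part
of this change): a copy from ordinary pageable host memory is
staged through an internal pinned buffer by the driver, while a
buffer from cudaHostAlloc can be DMA'd from directly, which also
makes cudaMemcpyAsync genuinely asynchronous.

  double* values = nullptr;
  // Page locked (pinned) allocation; the GPU can DMA from it directly.
  CHECK_EQ(cudaSuccess,
           cudaHostAlloc(&values, sizeof(double) * num_nonzeros,
                         cudaHostAllocDefault));
  // ... fill values on the CPU ...
  // Only truly asynchronous because the source buffer is pinned.
  CHECK_EQ(cudaSuccess,
           cudaMemcpyAsync(d_values, values,
                           sizeof(double) * num_nonzeros,
                           cudaMemcpyHostToDevice, stream));
  CHECK_EQ(cudaSuccess, cudaFreeHost(values));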

Change-Id: I63c1d2512babd74fc275b277ac8c3eabf3ec1144
diff --git a/internal/ceres/block_jacobian_writer.cc b/internal/ceres/block_jacobian_writer.cc
index 29fe688..f74d64d 100644
--- a/internal/ceres/block_jacobian_writer.cc
+++ b/internal/ceres/block_jacobian_writer.cc
@@ -125,7 +125,7 @@
 
 BlockJacobianWriter::BlockJacobianWriter(const Evaluator::Options& options,
                                          Program* program)
-    : program_(program) {
+    : options_(options), program_(program) {
   CHECK_GE(options.num_eliminate_blocks, 0)
       << "num_eliminate_blocks must be greater than 0.";
 
@@ -207,7 +207,8 @@
     std::sort(row->cells.begin(), row->cells.end(), CellLessThan);
   }
 
-  return std::make_unique<BlockSparseMatrix>(bs);
+  return std::make_unique<BlockSparseMatrix>(
+      bs, options_.sparse_linear_algebra_library_type == CUDA_SPARSE);
 }
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/block_jacobian_writer.h b/internal/ceres/block_jacobian_writer.h
index 7f5c50b..61f69b3 100644
--- a/internal/ceres/block_jacobian_writer.h
+++ b/internal/ceres/block_jacobian_writer.h
@@ -74,6 +74,7 @@
   }
 
  private:
+  Evaluator::Options options_;
   Program* program_;
 
   // Stores the position of each residual / parameter jacobian.
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index b3d4efd..ab1d746 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -46,6 +46,10 @@
 #include "ceres/triplet_sparse_matrix.h"
 #include "glog/logging.h"
 
+#ifndef CERES_NO_CUDA
+#include "cuda_runtime.h"
+#endif
+
 namespace ceres::internal {
 
 namespace {
@@ -171,8 +175,9 @@
 }  // namespace
 
 BlockSparseMatrix::BlockSparseMatrix(
-    CompressedRowBlockStructure* block_structure)
-    : num_rows_(0),
+    CompressedRowBlockStructure* block_structure, bool use_page_locked_memory)
+    : use_page_locked_memory_(use_page_locked_memory),
+      num_rows_(0),
       num_cols_(0),
       num_nonzeros_(0),
       block_structure_(block_structure) {
@@ -202,12 +207,15 @@
   CHECK_GE(num_nonzeros_, 0);
   VLOG(2) << "Allocating values array with " << num_nonzeros_ * sizeof(double)
           << " bytes.";  // NOLINT
-  values_ = std::make_unique<double[]>(num_nonzeros_);
+
+  values_ = AllocateValues(num_nonzeros_);
   max_num_nonzeros_ = num_nonzeros_;
   CHECK(values_ != nullptr);
   AddTransposeBlockStructure();
 }
 
+BlockSparseMatrix::~BlockSparseMatrix() { FreeValues(values_); }
+
 void BlockSparseMatrix::AddTransposeBlockStructure() {
   if (transpose_block_structure_ == nullptr) {
     transpose_block_structure_ = CreateTranspose(*block_structure_);
@@ -215,11 +223,11 @@
 }
 
 void BlockSparseMatrix::SetZero() {
-  std::fill(values_.get(), values_.get() + num_nonzeros_, 0.0);
+  std::fill(values_, values_ + num_nonzeros_, 0.0);
 }
 
 void BlockSparseMatrix::SetZero(ContextImpl* context, int num_threads) {
-  ParallelSetZero(context, num_threads, values_.get(), num_nonzeros_);
+  ParallelSetZero(context, num_threads, values_, num_nonzeros_);
 }
 
 void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
@@ -234,7 +242,7 @@
   CHECK(x != nullptr);
   CHECK(y != nullptr);
 
-  const auto values = values_.get();
+  const auto values = values_;
   const auto block_structure = block_structure_.get();
   const auto num_row_blocks = block_structure->rows.size();
 
@@ -282,7 +290,7 @@
   }
 
   auto transpose_bs = transpose_block_structure_.get();
-  const auto values = values_.get();
+  const auto values = values_;
   const int num_col_blocks = transpose_bs->rows.size();
   if (!num_col_blocks) {
     return;
@@ -330,7 +338,7 @@
       int col_block_size = block_structure_->cols[col_block_id].size;
       int col_block_pos = block_structure_->cols[col_block_id].position;
       MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
-          values_.get() + cell.position,
+          values_ + cell.position,
           row_block_size,
           col_block_size,
           x + row_block_pos,
@@ -350,7 +358,7 @@
       int col_block_size = block_structure_->cols[col_block_id].size;
       int col_block_pos = block_structure_->cols[col_block_id].position;
       const MatrixRef m(
-          values_.get() + cell.position, row_block_size, col_block_size);
+          values_ + cell.position, row_block_size, col_block_size);
       VectorRef(x + col_block_pos, col_block_size) += m.colwise().squaredNorm();
     }
   }
@@ -370,7 +378,7 @@
   ParallelSetZero(context, num_threads, x, num_cols_);
 
   auto transpose_bs = transpose_block_structure_.get();
-  const auto values = values_.get();
+  const auto values = values_;
   const int num_col_blocks = transpose_bs->rows.size();
   ParallelFor(
       context,
@@ -401,8 +409,7 @@
       int col_block_id = cell.block_id;
       int col_block_size = block_structure_->cols[col_block_id].size;
       int col_block_pos = block_structure_->cols[col_block_id].position;
-      MatrixRef m(
-          values_.get() + cell.position, row_block_size, col_block_size);
+      MatrixRef m(values_ + cell.position, row_block_size, col_block_size);
       m *= ConstVectorRef(scale + col_block_pos, col_block_size).asDiagonal();
     }
   }
@@ -420,7 +427,7 @@
 
   CHECK(scale != nullptr);
   auto transpose_bs = transpose_block_structure_.get();
-  auto values = values_.get();
+  auto values = values_;
   const int num_col_blocks = transpose_bs->rows.size();
   ParallelFor(
       context,
@@ -500,7 +507,7 @@
       int col_block_pos = block_structure_->cols[col_block_id].position;
       int jac_pos = cell.position;
       m.block(row_block_pos, col_block_pos, row_block_size, col_block_size) +=
-          MatrixRef(values_.get() + jac_pos, row_block_size, col_block_size);
+          MatrixRef(values_ + jac_pos, row_block_size, col_block_size);
     }
   }
 }
@@ -643,15 +650,15 @@
   }
 
   if (num_nonzeros_ > max_num_nonzeros_) {
-    auto new_values = std::make_unique<double[]>(num_nonzeros_);
-    std::copy_n(values_.get(), old_num_nonzeros, new_values.get());
-    values_ = std::move(new_values);
+    double* old_values = values_;
+    values_ = AllocateValues(num_nonzeros_);
+    std::copy_n(old_values, old_num_nonzeros, values_);
     max_num_nonzeros_ = num_nonzeros_;
+    FreeValues(old_values);
   }
 
-  std::copy(m.values(),
-            m.values() + m.num_nonzeros(),
-            values_.get() + old_num_nonzeros);
+  std::copy(
+      m.values(), m.values() + m.num_nonzeros(), values_ + old_num_nonzeros);
 
   if (transpose_block_structure_ == nullptr) {
     return;
@@ -796,4 +803,39 @@
   return transpose;
 }
 
+double* BlockSparseMatrix::AllocateValues(int size) {
+  if (!use_page_locked_memory_) {
+    return new double[size];
+  }
+
+#ifndef CERES_NO_CUDA
+
+  double* values = nullptr;
+  CHECK_EQ(cudaSuccess,
+           cudaHostAlloc(&values, sizeof(double) * size, cudaHostAllocDefault));
+  return values;
+#else
+  LOG(FATAL) << "Page locked memory requested when CUDA is not available. "
+             << "This is a Ceres bug; please contact the developers!";
+  return nullptr;
+#endif
+}
+
+void BlockSparseMatrix::FreeValues(double*& values) {
+  if (!use_page_locked_memory_) {
+    delete[] values;
+    values = nullptr;
+    return;
+  }
+
+#ifndef CERES_NO_CUDA
+  CHECK_EQ(cudaSuccess, cudaFreeHost(values));
+#else
+  LOG(FATAL) << "Page locked memory requested when CUDA is not available. "
+             << "This is a Ceres bug; please contact the developers!";
+#endif
+
+  values = nullptr;
+}
+
 }  // namespace ceres::internal
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h
index 55f1cc4..0d99e15 100644
--- a/internal/ceres/block_sparse_matrix.h
+++ b/internal/ceres/block_sparse_matrix.h
@@ -65,7 +65,9 @@
   //
   // TODO(sameeragarwal): Add a function which will validate legal
   // CompressedRowBlockStructure objects.
-  explicit BlockSparseMatrix(CompressedRowBlockStructure* block_structure);
+  explicit BlockSparseMatrix(CompressedRowBlockStructure* block_structure,
+                             bool use_page_locked_memory = false);
+  ~BlockSparseMatrix();
 
   BlockSparseMatrix(const BlockSparseMatrix&) = delete;
   void operator=(const BlockSparseMatrix&) = delete;
@@ -114,8 +116,8 @@
   int num_rows()         const final { return num_rows_;     }
   int num_cols()         const final { return num_cols_;     }
   int num_nonzeros()     const final { return num_nonzeros_; }
-  const double* values() const final { return values_.get(); }
-  double* mutable_values()     final { return values_.get(); }
+  const double* values() const final { return values_; }
+  double* mutable_values()     final { return values_; }
   // clang-format on
 
   void ToTripletSparseMatrix(TripletSparseMatrix* matrix) const;
@@ -158,11 +160,15 @@
       const RandomMatrixOptions& options, std::mt19937& prng);
 
  private:
+  double* AllocateValues(int size);
+  void FreeValues(double*& values);
+
+  const bool use_page_locked_memory_;
   int num_rows_;
   int num_cols_;
   int num_nonzeros_;
   int max_num_nonzeros_;
-  std::unique_ptr<double[]> values_;
+  double* values_;
   std::unique_ptr<CompressedRowBlockStructure> block_structure_;
   std::unique_ptr<CompressedRowBlockStructure> transpose_block_structure_;
 };