Use page locked memory in BlockSparseMatrix. If using CUDA_SPARSE for an iterative solve on the GPU, allocate the values array in BlockSparseMatrix in page locked memory to make copying to the GPU faster. Change-Id: I63c1d2512babd74fc275b277ac8c3eabf3ec1144
diff --git a/internal/ceres/block_jacobian_writer.cc b/internal/ceres/block_jacobian_writer.cc index 29fe688..f74d64d 100644 --- a/internal/ceres/block_jacobian_writer.cc +++ b/internal/ceres/block_jacobian_writer.cc
@@ -125,7 +125,7 @@ BlockJacobianWriter::BlockJacobianWriter(const Evaluator::Options& options, Program* program) - : program_(program) { + : options_(options), program_(program) { CHECK_GE(options.num_eliminate_blocks, 0) << "num_eliminate_blocks must be greater than 0."; @@ -207,7 +207,8 @@ std::sort(row->cells.begin(), row->cells.end(), CellLessThan); } - return std::make_unique<BlockSparseMatrix>(bs); + return std::make_unique<BlockSparseMatrix>( + bs, options_.sparse_linear_algebra_library_type == CUDA_SPARSE); } } // namespace ceres::internal
diff --git a/internal/ceres/block_jacobian_writer.h b/internal/ceres/block_jacobian_writer.h index 7f5c50b..61f69b3 100644 --- a/internal/ceres/block_jacobian_writer.h +++ b/internal/ceres/block_jacobian_writer.h
@@ -74,6 +74,7 @@ } private: + Evaluator::Options options_; Program* program_; // Stores the position of each residual / parameter jacobian.
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc index b3d4efd..ab1d746 100644 --- a/internal/ceres/block_sparse_matrix.cc +++ b/internal/ceres/block_sparse_matrix.cc
@@ -46,6 +46,10 @@ #include "ceres/triplet_sparse_matrix.h" #include "glog/logging.h" +#ifndef CERES_NO_CUDA +#include "cuda_runtime.h" +#endif + namespace ceres::internal { namespace { @@ -171,8 +175,9 @@ } // namespace BlockSparseMatrix::BlockSparseMatrix( - CompressedRowBlockStructure* block_structure) - : num_rows_(0), + CompressedRowBlockStructure* block_structure, bool use_page_locked_memory) + : use_page_locked_memory_(use_page_locked_memory), + num_rows_(0), num_cols_(0), num_nonzeros_(0), block_structure_(block_structure) { @@ -202,12 +207,15 @@ CHECK_GE(num_nonzeros_, 0); VLOG(2) << "Allocating values array with " << num_nonzeros_ * sizeof(double) << " bytes."; // NOLINT - values_ = std::make_unique<double[]>(num_nonzeros_); + + values_ = AllocateValues(num_nonzeros_); max_num_nonzeros_ = num_nonzeros_; CHECK(values_ != nullptr); AddTransposeBlockStructure(); } +BlockSparseMatrix::~BlockSparseMatrix() { FreeValues(values_); } + void BlockSparseMatrix::AddTransposeBlockStructure() { if (transpose_block_structure_ == nullptr) { transpose_block_structure_ = CreateTranspose(*block_structure_); @@ -215,11 +223,11 @@ } void BlockSparseMatrix::SetZero() { - std::fill(values_.get(), values_.get() + num_nonzeros_, 0.0); + std::fill(values_, values_ + num_nonzeros_, 0.0); } void BlockSparseMatrix::SetZero(ContextImpl* context, int num_threads) { - ParallelSetZero(context, num_threads, values_.get(), num_nonzeros_); + ParallelSetZero(context, num_threads, values_, num_nonzeros_); } void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x, @@ -234,7 +242,7 @@ CHECK(x != nullptr); CHECK(y != nullptr); - const auto values = values_.get(); + const auto values = values_; const auto block_structure = block_structure_.get(); const auto num_row_blocks = block_structure->rows.size(); @@ -282,7 +290,7 @@ } auto transpose_bs = transpose_block_structure_.get(); - const auto values = values_.get(); + const auto values = values_; const int num_col_blocks = 
transpose_bs->rows.size(); if (!num_col_blocks) { return; @@ -330,7 +338,7 @@ int col_block_size = block_structure_->cols[col_block_id].size; int col_block_pos = block_structure_->cols[col_block_id].position; MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>( - values_.get() + cell.position, + values_ + cell.position, row_block_size, col_block_size, x + row_block_pos, @@ -350,7 +358,7 @@ int col_block_size = block_structure_->cols[col_block_id].size; int col_block_pos = block_structure_->cols[col_block_id].position; const MatrixRef m( - values_.get() + cell.position, row_block_size, col_block_size); + values_ + cell.position, row_block_size, col_block_size); VectorRef(x + col_block_pos, col_block_size) += m.colwise().squaredNorm(); } } @@ -370,7 +378,7 @@ ParallelSetZero(context, num_threads, x, num_cols_); auto transpose_bs = transpose_block_structure_.get(); - const auto values = values_.get(); + const auto values = values_; const int num_col_blocks = transpose_bs->rows.size(); ParallelFor( context, @@ -401,8 +409,7 @@ int col_block_id = cell.block_id; int col_block_size = block_structure_->cols[col_block_id].size; int col_block_pos = block_structure_->cols[col_block_id].position; - MatrixRef m( - values_.get() + cell.position, row_block_size, col_block_size); + MatrixRef m(values_ + cell.position, row_block_size, col_block_size); m *= ConstVectorRef(scale + col_block_pos, col_block_size).asDiagonal(); } } @@ -420,7 +427,7 @@ CHECK(scale != nullptr); auto transpose_bs = transpose_block_structure_.get(); - auto values = values_.get(); + auto values = values_; const int num_col_blocks = transpose_bs->rows.size(); ParallelFor( context, @@ -500,7 +507,7 @@ int col_block_pos = block_structure_->cols[col_block_id].position; int jac_pos = cell.position; m.block(row_block_pos, col_block_pos, row_block_size, col_block_size) += - MatrixRef(values_.get() + jac_pos, row_block_size, col_block_size); + MatrixRef(values_ + jac_pos, row_block_size, 
col_block_size); } } } @@ -643,15 +650,15 @@ } if (num_nonzeros_ > max_num_nonzeros_) { - auto new_values = std::make_unique<double[]>(num_nonzeros_); - std::copy_n(values_.get(), old_num_nonzeros, new_values.get()); - values_ = std::move(new_values); + double* old_values = values_; + values_ = AllocateValues(num_nonzeros_); + std::copy_n(old_values, old_num_nonzeros, values_); max_num_nonzeros_ = num_nonzeros_; + FreeValues(old_values); } - std::copy(m.values(), - m.values() + m.num_nonzeros(), - values_.get() + old_num_nonzeros); + std::copy( + m.values(), m.values() + m.num_nonzeros(), values_ + old_num_nonzeros); if (transpose_block_structure_ == nullptr) { return; @@ -796,4 +803,41 @@ return transpose; } +// Allocates storage for `size` doubles. Uses new[] unless page locked memory +// was requested, in which case the buffer comes from cudaHostAlloc. +double* BlockSparseMatrix::AllocateValues(int size) { + if (!use_page_locked_memory_) { + return new double[size]; + } + +#ifndef CERES_NO_CUDA + + double* values = nullptr; + CHECK_EQ(cudaSuccess, + cudaHostAlloc(&values, sizeof(double) * size, cudaHostAllocDefault)); + return values; +#else + LOG(FATAL) << "Page locked memory requested when CUDA is not available. " + << "This is a Ceres bug; please contact the developers!"; + return nullptr; +#endif +} + +// Releases a buffer obtained from AllocateValues. The pointer is passed by +// value, so the caller's own pointer is left unchanged. +void BlockSparseMatrix::FreeValues(double* values) { + if (!use_page_locked_memory_) { + // The buffer was created with new[], so it must be released with delete[]. + delete[] values; + return; + } + +#ifndef CERES_NO_CUDA + CHECK_EQ(cudaSuccess, cudaFreeHost(values)); +#else + LOG(FATAL) << "Page locked memory requested when CUDA is not available. " + << "This is a Ceres bug; please contact the developers!"; +#endif +} + } // namespace ceres::internal
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h index 55f1cc4..0d99e15 100644 --- a/internal/ceres/block_sparse_matrix.h +++ b/internal/ceres/block_sparse_matrix.h
@@ -65,7 +65,9 @@ // // TODO(sameeragarwal): Add a function which will validate legal // CompressedRowBlockStructure objects. - explicit BlockSparseMatrix(CompressedRowBlockStructure* block_structure); + explicit BlockSparseMatrix(CompressedRowBlockStructure* block_structure, + bool use_page_locked_memory = false); + ~BlockSparseMatrix(); BlockSparseMatrix(const BlockSparseMatrix&) = delete; void operator=(const BlockSparseMatrix&) = delete; @@ -114,8 +116,8 @@ int num_rows() const final { return num_rows_; } int num_cols() const final { return num_cols_; } int num_nonzeros() const final { return num_nonzeros_; } - const double* values() const final { return values_.get(); } - double* mutable_values() final { return values_.get(); } + const double* values() const final { return values_; } + double* mutable_values() final { return values_; } // clang-format on void ToTripletSparseMatrix(TripletSparseMatrix* matrix) const; @@ -158,11 +160,15 @@ const RandomMatrixOptions& options, std::mt19937& prng); private: + double* AllocateValues(int size); + void FreeValues(double* values); + + const bool use_page_locked_memory_; int num_rows_; int num_cols_; int num_nonzeros_; int max_num_nonzeros_; - std::unique_ptr<double[]> values_; + double* values_; std::unique_ptr<CompressedRowBlockStructure> block_structure_; std::unique_ptr<CompressedRowBlockStructure> transpose_block_structure_; };