// blob: 1e361d2b0b9c3f81260b7b3eb023254d70cf9225 [file] [log] [blame]
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
//
// A CUDA sparse matrix linear operator.
// This include must come before any #ifndef check on Ceres compile options.
// clang-format off
#include "ceres/internal/config.h"
// clang-format on
#include "ceres/cuda_sparse_matrix.h"
#include <math.h>
#include <memory>
#include "ceres/block_sparse_matrix.h"
#include "ceres/compressed_row_sparse_matrix.h"
#include "ceres/context_impl.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/export.h"
#include "ceres/types.h"
#include "ceres/wall_time.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_kernels.h"
#include "ceres/cuda_vector.h"
#include "cuda_runtime_api.h"
#include "cusparse.h"
namespace ceres::internal {
// Constructs a CudaSparseMatrix from a CPU-side compressed row sparse matrix.
//
// The row offsets (num_rows + 1 entries), column indices and values (one entry
// per nonzero each) of `crs_matrix` are handed to CopyFromCpu on
// context->stream_, and a cuSPARSE CSR descriptor is then created over the
// rows_/cols_/values_ buffers using 32-bit, zero-based indices and
// double-precision values.
//
// `context` must be non-null (DCHECK-ed) and must have CUDA initialized
// (CHECK-ed). Failure to create the cuSPARSE descriptor is fatal.
CudaSparseMatrix::CudaSparseMatrix(
    ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) {
  DCHECK_NE(context, nullptr);
  CHECK(context->IsCudaInitialized());
  context_ = context;
  num_rows_ = crs_matrix.num_rows();
  num_cols_ = crs_matrix.num_cols();
  num_nonzeros_ = crs_matrix.num_nonzeros();
  rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1, context_->stream_);
  cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_, context_->stream_);
  values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
  // Check the returned status like every other cuSPARSE call in this file:
  // if creation fails, descr_ is not a valid descriptor and the failure would
  // otherwise only show up later, in SpMv() or in the destructor.
  CHECK_EQ(cusparseCreateCsr(&descr_,
                             num_rows_,
                             num_cols_,
                             num_nonzeros_,
                             rows_.data(),
                             cols_.data(),
                             values_.data(),
                             CUSPARSE_INDEX_32I,
                             CUSPARSE_INDEX_32I,
                             CUSPARSE_INDEX_BASE_ZERO,
                             CUDA_R_64F),
           CUSPARSE_STATUS_SUCCESS);
}
// Destroys the cuSPARSE sparse matrix descriptor. A non-success status from
// cuSPARSE is treated as fatal.
CudaSparseMatrix::~CudaSparseMatrix() {
  const cusparseStatus_t destroy_status = cusparseDestroySpMat(descr_);
  CHECK_EQ(destroy_status, CUSPARSE_STATUS_SUCCESS);
  // Clear the handle so it does not dangle after destruction.
  descr_ = nullptr;
}
// Overwrites the stored values with those of `crs_matrix`; the row and column
// index buffers are left as they are.
//
// `crs_matrix` must have the same number of rows, columns and nonzeros as this
// matrix (CHECK-ed).
void CudaSparseMatrix::CopyValuesFromCpu(
    const CompressedRowSparseMatrix& crs_matrix) {
  // The sparsity structure itself cannot be cheaply compared, so only the
  // dimensions and the nonzero count are validated before copying.
  CHECK_EQ(num_rows_, crs_matrix.num_rows());
  CHECK_EQ(num_cols_, crs_matrix.num_cols());
  CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());

  const auto* host_values = crs_matrix.values();
  values_.CopyFromCpu(host_values, num_nonzeros_, context_->stream_);
}
// Sparse matrix-vector product via cusparseSpMV with alpha = beta = 1, i.e.
// the product of op(this matrix) and `x` is accumulated into `y`.
//
//   op: cuSPARSE operation applied to this matrix (e.g. transpose or not).
//   x:  input vector.
//   y:  vector that is both read and updated.
//
// The workspace size is queried on every call and spmv_buffer_ is asked to
// reserve that much before the product is evaluated. Any cuSPARSE failure is
// fatal.
void CudaSparseMatrix::SpMv(cusparseOperation_t op,
                            const CudaVector& x,
                            CudaVector* y) {
  // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of
  // CUSPARSE_SPMV_ALG_DEFAULT.
#if CUDART_VERSION >= 11021
  const auto spmv_algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
#else   // CUDART_VERSION >= 11021
  const auto spmv_algorithm = CUSPARSE_MV_ALG_DEFAULT;
#endif  // CUDART_VERSION >= 11021

  // Both scaling factors are one, so the result is added to y.
  const double kAlpha = 1.0;
  const double kBeta = 1.0;
  const auto cusparse_handle = context_->cusparse_handle_;

  // Step 1: ask cuSPARSE how much scratch space this product needs.
  size_t workspace_bytes = 0;
  const cusparseStatus_t query_status =
      cusparseSpMV_bufferSize(cusparse_handle,
                              op,
                              &kAlpha,
                              descr_,
                              x.descr(),
                              &kBeta,
                              y->descr(),
                              CUDA_R_64F,
                              spmv_algorithm,
                              &workspace_bytes);
  CHECK_EQ(query_status, CUSPARSE_STATUS_SUCCESS);
  spmv_buffer_.Reserve(workspace_bytes);

  // Step 2: evaluate the product using the reserved scratch buffer.
  const cusparseStatus_t spmv_status = cusparseSpMV(cusparse_handle,
                                                    op,
                                                    &kAlpha,
                                                    descr_,
                                                    x.descr(),
                                                    &kBeta,
                                                    y->descr(),
                                                    CUDA_R_64F,
                                                    spmv_algorithm,
                                                    spmv_buffer_.data());
  CHECK_EQ(spmv_status, CUSPARSE_STATUS_SUCCESS);
}
void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
CudaVector* y) {
SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y);
}
void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
CudaVector* y) {
// TODO(Joydeep Biswas): We should consider storing a transposed copy of the
// matrix by converting CSR to CSC. From the cuSPARSE documentation:
// "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
// != CUSPARSE_OPERATION_NON_TRANSPOSE"
SpMv(CUSPARSE_OPERATION_TRANSPOSE, x, y);
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA