blob: 5a3e7e2cad19c5ea225f84fef9e9fbddf8b97fc6 [file] [log] [blame]
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
#include "ceres/dense_cholesky.h"
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "ceres/internal/config.h"
#include "ceres/iterative_refiner.h"
#ifndef CERES_NO_CUDA
#include "ceres/context_impl.h"
#include "ceres/cuda_kernels_vector_ops.h"
#include "cuda_runtime.h"
#include "cusolverDn.h"
#endif // CERES_NO_CUDA
#ifndef CERES_NO_LAPACK
// C interface to the LAPACK Cholesky factorization and triangular solve.
extern "C" void dpotrf_(
const char* uplo, const int* n, double* a, const int* lda, int* info);
extern "C" void dpotrs_(const char* uplo,
const int* n,
const int* nrhs,
const double* a,
const int* lda,
double* b,
const int* ldb,
int* info);
extern "C" void spotrf_(
const char* uplo, const int* n, float* a, const int* lda, int* info);
extern "C" void spotrs_(const char* uplo,
const int* n,
const int* nrhs,
const float* a,
const int* lda,
float* b,
const int* ldb,
int* info);
#endif
namespace ceres::internal {
DenseCholesky::~DenseCholesky() = default;
std::unique_ptr<DenseCholesky> DenseCholesky::Create(
const LinearSolver::Options& options) {
std::unique_ptr<DenseCholesky> dense_cholesky;
switch (options.dense_linear_algebra_library_type) {
case EIGEN:
// Eigen mixed precision solver not yet implemented.
if (options.use_mixed_precision_solves) {
dense_cholesky = std::make_unique<FloatEigenDenseCholesky>();
} else {
dense_cholesky = std::make_unique<EigenDenseCholesky>();
}
break;
case LAPACK:
#ifndef CERES_NO_LAPACK
// LAPACK mixed precision solver not yet implemented.
if (options.use_mixed_precision_solves) {
dense_cholesky = std::make_unique<FloatLAPACKDenseCholesky>();
} else {
dense_cholesky = std::make_unique<LAPACKDenseCholesky>();
}
break;
#else
LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
#endif
case CUDA:
#ifndef CERES_NO_CUDA
if (options.use_mixed_precision_solves) {
dense_cholesky = CUDADenseCholeskyMixedPrecision::Create(options);
} else {
dense_cholesky = CUDADenseCholesky::Create(options);
}
break;
#else
LOG(FATAL) << "Ceres was compiled without support for CUDA.";
#endif
default:
LOG(FATAL) << "Unknown dense linear algebra library type : "
<< DenseLinearAlgebraLibraryTypeToString(
options.dense_linear_algebra_library_type);
}
if (options.max_num_refinement_iterations > 0) {
auto refiner = std::make_unique<DenseIterativeRefiner>(
options.max_num_refinement_iterations);
dense_cholesky = std::make_unique<RefinedDenseCholesky>(
std::move(dense_cholesky), std::move(refiner));
}
return dense_cholesky;
}
LinearSolverTerminationType DenseCholesky::FactorAndSolve(
int num_cols,
double* lhs,
const double* rhs,
double* solution,
std::string* message) {
LinearSolverTerminationType termination_type =
Factorize(num_cols, lhs, message);
if (termination_type == LinearSolverTerminationType::SUCCESS) {
termination_type = Solve(rhs, solution, message);
}
return termination_type;
}
LinearSolverTerminationType EigenDenseCholesky::Factorize(
int num_cols, double* lhs, std::string* message) {
Eigen::Map<Eigen::MatrixXd> m(lhs, num_cols, num_cols);
llt_ = std::make_unique<LLTType>(m);
if (llt_->info() != Eigen::Success) {
*message = "Eigen failure. Unable to perform dense Cholesky factorization.";
return LinearSolverTerminationType::FAILURE;
}
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType EigenDenseCholesky::Solve(const double* rhs,
double* solution,
std::string* message) {
if (llt_->info() != Eigen::Success) {
*message = "Eigen failure. Unable to perform dense Cholesky factorization.";
return LinearSolverTerminationType::FAILURE;
}
VectorRef(solution, llt_->cols()) =
llt_->solve(ConstVectorRef(rhs, llt_->cols()));
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType FloatEigenDenseCholesky::Factorize(
int num_cols, double* lhs, std::string* message) {
// TODO(sameeragarwal): Check if this causes a double allocation.
lhs_ = Eigen::Map<Eigen::MatrixXd>(lhs, num_cols, num_cols).cast<float>();
llt_ = std::make_unique<LLTType>(lhs_);
if (llt_->info() != Eigen::Success) {
*message = "Eigen failure. Unable to perform dense Cholesky factorization.";
return LinearSolverTerminationType::FAILURE;
}
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType FloatEigenDenseCholesky::Solve(
const double* rhs, double* solution, std::string* message) {
if (llt_->info() != Eigen::Success) {
*message = "Eigen failure. Unable to perform dense Cholesky factorization.";
return LinearSolverTerminationType::FAILURE;
}
rhs_ = ConstVectorRef(rhs, llt_->cols()).cast<float>();
solution_ = llt_->solve(rhs_);
VectorRef(solution, llt_->cols()) = solution_.cast<double>();
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
#ifndef CERES_NO_LAPACK
LinearSolverTerminationType LAPACKDenseCholesky::Factorize(
int num_cols, double* lhs, std::string* message) {
lhs_ = lhs;
num_cols_ = num_cols;
const char uplo = 'L';
int info = 0;
dpotrf_(&uplo, &num_cols_, lhs_, &num_cols_, &info);
if (info < 0) {
termination_type_ = LinearSolverTerminationType::FATAL_ERROR;
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it. "
<< "LAPACK::dpotrf fatal error. "
<< "Argument: " << -info << " is invalid.";
} else if (info > 0) {
termination_type_ = LinearSolverTerminationType::FAILURE;
*message = StringPrintf(
"LAPACK::dpotrf numerical failure. "
"The leading minor of order %d is not positive definite.",
info);
} else {
termination_type_ = LinearSolverTerminationType::SUCCESS;
*message = "Success.";
}
return termination_type_;
}
LinearSolverTerminationType LAPACKDenseCholesky::Solve(const double* rhs,
double* solution,
std::string* message) {
const char uplo = 'L';
const int nrhs = 1;
int info = 0;
VectorRef(solution, num_cols_) = ConstVectorRef(rhs, num_cols_);
dpotrs_(
&uplo, &num_cols_, &nrhs, lhs_, &num_cols_, solution, &num_cols_, &info);
if (info < 0) {
termination_type_ = LinearSolverTerminationType::FATAL_ERROR;
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it. "
<< "LAPACK::dpotrs fatal error. "
<< "Argument: " << -info << " is invalid.";
}
*message = "Success";
termination_type_ = LinearSolverTerminationType::SUCCESS;
return termination_type_;
}
LinearSolverTerminationType FloatLAPACKDenseCholesky::Factorize(
int num_cols, double* lhs, std::string* message) {
num_cols_ = num_cols;
lhs_ = Eigen::Map<Eigen::MatrixXd>(lhs, num_cols, num_cols).cast<float>();
const char uplo = 'L';
int info = 0;
spotrf_(&uplo, &num_cols_, lhs_.data(), &num_cols_, &info);
if (info < 0) {
termination_type_ = LinearSolverTerminationType::FATAL_ERROR;
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it. "
<< "LAPACK::spotrf fatal error. "
<< "Argument: " << -info << " is invalid.";
} else if (info > 0) {
termination_type_ = LinearSolverTerminationType::FAILURE;
*message = StringPrintf(
"LAPACK::spotrf numerical failure. "
"The leading minor of order %d is not positive definite.",
info);
} else {
termination_type_ = LinearSolverTerminationType::SUCCESS;
*message = "Success.";
}
return termination_type_;
}
LinearSolverTerminationType FloatLAPACKDenseCholesky::Solve(
const double* rhs, double* solution, std::string* message) {
const char uplo = 'L';
const int nrhs = 1;
int info = 0;
rhs_and_solution_ = ConstVectorRef(rhs, num_cols_).cast<float>();
spotrs_(&uplo,
&num_cols_,
&nrhs,
lhs_.data(),
&num_cols_,
rhs_and_solution_.data(),
&num_cols_,
&info);
if (info < 0) {
termination_type_ = LinearSolverTerminationType::FATAL_ERROR;
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it. "
<< "LAPACK::dpotrs fatal error. "
<< "Argument: " << -info << " is invalid.";
}
*message = "Success";
termination_type_ = LinearSolverTerminationType::SUCCESS;
VectorRef(solution, num_cols_) =
rhs_and_solution_.head(num_cols_).cast<double>();
return termination_type_;
}
#endif // CERES_NO_LAPACK
RefinedDenseCholesky::RefinedDenseCholesky(
std::unique_ptr<DenseCholesky> dense_cholesky,
std::unique_ptr<DenseIterativeRefiner> iterative_refiner)
: dense_cholesky_(std::move(dense_cholesky)),
iterative_refiner_(std::move(iterative_refiner)) {}
RefinedDenseCholesky::~RefinedDenseCholesky() = default;
LinearSolverTerminationType RefinedDenseCholesky::Factorize(
const int num_cols, double* lhs, std::string* message) {
lhs_ = lhs;
num_cols_ = num_cols;
return dense_cholesky_->Factorize(num_cols, lhs, message);
}
LinearSolverTerminationType RefinedDenseCholesky::Solve(const double* rhs,
double* solution,
std::string* message) {
CHECK(lhs_ != nullptr);
auto termination_type = dense_cholesky_->Solve(rhs, solution, message);
if (termination_type != LinearSolverTerminationType::SUCCESS) {
return termination_type;
}
iterative_refiner_->Refine(
num_cols_, lhs_, rhs, dense_cholesky_.get(), solution);
return LinearSolverTerminationType::SUCCESS;
}
#ifndef CERES_NO_CUDA
CUDADenseCholesky::CUDADenseCholesky(ContextImpl* context)
: context_(context),
lhs_{context},
rhs_{context},
device_workspace_{context},
error_(context, 1) {}
LinearSolverTerminationType CUDADenseCholesky::Factorize(int num_cols,
double* lhs,
std::string* message) {
factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
lhs_.Reserve(num_cols * num_cols);
num_cols_ = num_cols;
lhs_.CopyFromCpu(lhs, num_cols * num_cols);
int device_workspace_size = 0;
if (cusolverDnDpotrf_bufferSize(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols,
lhs_.data(),
num_cols,
&device_workspace_size) !=
CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDpotrf_bufferSize failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
if (cusolverDnDpotrf(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols,
lhs_.data(),
num_cols,
reinterpret_cast<double*>(device_workspace_.data()),
device_workspace_.size(),
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDpotrf failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
<< "please report it. "
<< "cuSolverDN::cusolverDnXpotrf fatal error. "
<< "Argument: " << -error << " is invalid.";
// The following line is unreachable, but return failure just to be
// pedantic, since the compiler does not know that.
return LinearSolverTerminationType::FATAL_ERROR;
} else if (error > 0) {
*message = StringPrintf(
"cuSolverDN::cusolverDnDpotrf numerical failure. "
"The leading minor of order %d is not positive definite.",
error);
factorize_result_ = LinearSolverTerminationType::FAILURE;
return LinearSolverTerminationType::FAILURE;
}
*message = "Success";
factorize_result_ = LinearSolverTerminationType::SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType CUDADenseCholesky::Solve(const double* rhs,
double* solution,
std::string* message) {
if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
rhs_.CopyFromCpu(rhs, num_cols_);
if (cusolverDnDpotrs(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
1,
lhs_.data(),
num_cols_,
rhs_.data(),
num_cols_,
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDpotrs failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToCpu(&error, 1);
if (error != 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
<< "cuSolverDN::cusolverDnDpotrs fatal error. "
<< "Argument: " << -error << " is invalid.";
}
rhs_.CopyToCpu(solution, num_cols_);
*message = "Success";
return LinearSolverTerminationType::SUCCESS;
}
std::unique_ptr<CUDADenseCholesky> CUDADenseCholesky::Create(
const LinearSolver::Options& options) {
if (options.dense_linear_algebra_library_type != CUDA ||
options.context == nullptr || !options.context->IsCudaInitialized()) {
return nullptr;
}
return std::unique_ptr<CUDADenseCholesky>(
new CUDADenseCholesky(options.context));
}
std::unique_ptr<CUDADenseCholeskyMixedPrecision>
CUDADenseCholeskyMixedPrecision::Create(const LinearSolver::Options& options) {
if (options.dense_linear_algebra_library_type != CUDA ||
!options.use_mixed_precision_solves || options.context == nullptr ||
!options.context->IsCudaInitialized()) {
return nullptr;
}
return std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
new CUDADenseCholeskyMixedPrecision(
options.context, options.max_num_refinement_iterations));
}
LinearSolverTerminationType
CUDADenseCholeskyMixedPrecision::CudaCholeskyFactorize(std::string* message) {
int device_workspace_size = 0;
if (cusolverDnSpotrf_bufferSize(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
lhs_fp32_.data(),
num_cols_,
&device_workspace_size) !=
CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnSpotrf_bufferSize failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
if (cusolverDnSpotrf(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
lhs_fp32_.data(),
num_cols_,
device_workspace_.data(),
device_workspace_.size(),
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnSpotrf failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToCpu(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
<< "please report it. "
<< "cuSolverDN::cusolverDnSpotrf fatal error. "
<< "Argument: " << -error << " is invalid.";
// The following line is unreachable, but return failure just to be
// pedantic, since the compiler does not know that.
return LinearSolverTerminationType::FATAL_ERROR;
}
if (error > 0) {
*message = StringPrintf(
"cuSolverDN::cusolverDnSpotrf numerical failure. "
"The leading minor of order %d is not positive definite.",
error);
factorize_result_ = LinearSolverTerminationType::FAILURE;
return LinearSolverTerminationType::FAILURE;
}
*message = "Success";
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::CudaCholeskySolve(
std::string* message) {
CHECK_EQ(cudaMemcpyAsync(correction_fp32_.data(),
residual_fp32_.data(),
num_cols_ * sizeof(float),
cudaMemcpyDeviceToDevice,
context_->DefaultStream()),
cudaSuccess);
if (cusolverDnSpotrs(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
1,
lhs_fp32_.data(),
num_cols_,
correction_fp32_.data(),
num_cols_,
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDpotrs failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToCpu(&error, 1);
if (error != 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
<< "cuSolverDN::cusolverDnDpotrs fatal error. "
<< "Argument: " << -error << " is invalid.";
}
*message = "Success";
return LinearSolverTerminationType::SUCCESS;
}
CUDADenseCholeskyMixedPrecision::CUDADenseCholeskyMixedPrecision(
ContextImpl* context, int max_num_refinement_iterations)
: context_(context),
lhs_fp64_{context},
rhs_fp64_{context},
lhs_fp32_{context},
device_workspace_{context},
error_(context, 1),
x_fp64_{context},
correction_fp32_{context},
residual_fp32_{context},
residual_fp64_{context},
max_num_refinement_iterations_(max_num_refinement_iterations) {}
LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Factorize(
int num_cols, double* lhs, std::string* message) {
num_cols_ = num_cols;
// Copy fp64 version of lhs to GPU.
lhs_fp64_.Reserve(num_cols * num_cols);
lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols);
// Create an fp32 copy of lhs, lhs_fp32.
lhs_fp32_.Reserve(num_cols * num_cols);
CudaFP64ToFP32(lhs_fp64_.data(),
lhs_fp32_.data(),
num_cols * num_cols,
context_->DefaultStream());
// Factorize lhs_fp32.
factorize_result_ = CudaCholeskyFactorize(message);
return factorize_result_;
}
LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Solve(
const double* rhs, double* solution, std::string* message) {
// If factorization failed, return failure.
if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
// Reserve memory for all arrays.
rhs_fp64_.Reserve(num_cols_);
x_fp64_.Reserve(num_cols_);
correction_fp32_.Reserve(num_cols_);
residual_fp32_.Reserve(num_cols_);
residual_fp64_.Reserve(num_cols_);
// Initialize x = 0.
CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->DefaultStream());
// Initialize residual = rhs.
rhs_fp64_.CopyFromCpu(rhs, num_cols_);
residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
for (int i = 0; i <= max_num_refinement_iterations_; ++i) {
// Cast residual from fp64 to fp32.
CudaFP64ToFP32(residual_fp64_.data(),
residual_fp32_.data(),
num_cols_,
context_->DefaultStream());
// [fp32] c = lhs^-1 * residual.
auto result = CudaCholeskySolve(message);
if (result != LinearSolverTerminationType::SUCCESS) {
return result;
}
// [fp64] x += c.
CudaDsxpy(x_fp64_.data(),
correction_fp32_.data(),
num_cols_,
context_->DefaultStream());
if (i < max_num_refinement_iterations_) {
// [fp64] residual = rhs - lhs * x
// This is done in two steps:
// 1. [fp64] residual = rhs
residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_);
// 2. [fp64] residual = residual - lhs * x
double alpha = -1.0;
double beta = 1.0;
cublasDsymv(context_->cublas_handle_,
CUBLAS_FILL_MODE_LOWER,
num_cols_,
&alpha,
lhs_fp64_.data(),
num_cols_,
x_fp64_.data(),
1,
&beta,
residual_fp64_.data(),
1);
}
}
x_fp64_.CopyToCpu(solution, num_cols_);
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
#endif // CERES_NO_CUDA
} // namespace ceres::internal