blob: cbe7533316eefd2bfa438a8ecb341c53e7b07493 [file] [log] [blame]
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
#include "ceres/dense_qr.h"
#include <algorithm>
#include <memory>
#include <string>
#ifndef CERES_NO_CUDA
#include "ceres/context_impl.h"
#include "cublas_v2.h"
#include "cusolverDn.h"
#endif // CERES_NO_CUDA
#ifndef CERES_NO_LAPACK
// LAPACK routines for solving a linear least squares problem using QR
// factorization. This is done in three stages:
//
// A * x = b
// Q * R * x = b (dgeqrf)
// R * x = Q' * b (dormqr)
// x = R^{-1} * Q'* b (dtrtrs)
// clang-format off
// Compute the QR factorization of a.
//
// a is an m x n column major matrix (Denoted by "A" in the above description)
// lda is the leading dimension of a. lda >= max(1, num_rows)
// tau is an array of size min(m,n). It contains the scalar factors of the
// elementary reflectors.
// work is an array of size max(1,lwork). On exit, if info=0, work[0] contains
// the optimal size of work.
//
// if lwork >= 1 it is the size of work. If lwork = -1, then a workspace query is assumed.
// dgeqrf computes the optimal size of the work array and returns it as work[0].
//
// info = 0, successful exit.
// info < 0, if info = -i, then the i^th argument had illegal value.
extern "C" void dgeqrf_(const int* m, const int* n, double* a, const int* lda,
double* tau, double* work, const int* lwork, int* info);
// Apply Q or Q' to b.
//
// b is a m times n column major matrix.
// size = 'L' applies Q or Q' on the left, size = 'R' applies Q or Q' on the right.
// trans = 'N', applies Q, trans = 'T', applies Q'.
// k is the number of elementary reflectors whose product defines the matrix Q.
// If size = 'L', m >= k >= 0 and if side = 'R', n >= k >= 0.
// a is an lda x k column major matrix containing the reflectors as returned by dgeqrf.
// ldb is the leading dimension of b.
// work is an array of size max(1, lwork)
// lwork if positive is the size of work. If lwork = -1, then a
// workspace query is assumed.
//
// info = 0, successful exit.
// info < 0, if info = -i, then the i^th argument had illegal value.
extern "C" void dormqr_(const char* side, const char* trans, const int* m,
const int* n ,const int* k, double* a, const int* lda,
double* tau, double* b, const int* ldb, double* work,
const int* lwork, int* info);
// Solve a triangular system of the form A * x = b
//
// uplo = 'U', A is upper triangular. uplo = 'L' is lower triangular.
// trans = 'N', 'T', 'C' specifies the form - A, A^T, A^H.
// DIAG = 'N', A is not unit triangular. 'U' is unit triangular.
// n is the order of the matrix A.
// nrhs number of columns of b.
// a is a column major lda x n.
// b is a column major matrix of ldb x nrhs
//
// info = 0 successful.
// = -i < 0 i^th argument is an illegal value.
// = i > 0, i^th diagonal element of A is zero.
extern "C" void dtrtrs_(const char* uplo, const char* trans, const char* diag,
const int* n, const int* nrhs, double* a, const int* lda,
double* b, const int* ldb, int* info);
// clang-format on
#endif
namespace ceres::internal {
DenseQR::~DenseQR() = default;
std::unique_ptr<DenseQR> DenseQR::Create(const LinearSolver::Options& options) {
std::unique_ptr<DenseQR> dense_qr;
switch (options.dense_linear_algebra_library_type) {
case EIGEN:
dense_qr = std::make_unique<EigenDenseQR>();
break;
case LAPACK:
#ifndef CERES_NO_LAPACK
dense_qr = std::make_unique<LAPACKDenseQR>();
break;
#else
LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
#endif
case CUDA:
#ifndef CERES_NO_CUDA
dense_qr = CUDADenseQR::Create(options);
break;
#else
LOG(FATAL) << "Ceres was compiled without support for CUDA.";
#endif
default:
LOG(FATAL) << "Unknown dense linear algebra library type : "
<< DenseLinearAlgebraLibraryTypeToString(
options.dense_linear_algebra_library_type);
}
return dense_qr;
}
LinearSolverTerminationType DenseQR::FactorAndSolve(int num_rows,
int num_cols,
double* lhs,
const double* rhs,
double* solution,
std::string* message) {
LinearSolverTerminationType termination_type =
Factorize(num_rows, num_cols, lhs, message);
if (termination_type == LinearSolverTerminationType::SUCCESS) {
termination_type = Solve(rhs, solution, message);
}
return termination_type;
}
LinearSolverTerminationType EigenDenseQR::Factorize(int num_rows,
int num_cols,
double* lhs,
std::string* message) {
Eigen::Map<ColMajorMatrix> m(lhs, num_rows, num_cols);
qr_ = std::make_unique<QRType>(m);
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType EigenDenseQR::Solve(const double* rhs,
double* solution,
std::string* message) {
VectorRef(solution, qr_->cols()) =
qr_->solve(ConstVectorRef(rhs, qr_->rows()));
*message = "Success.";
return LinearSolverTerminationType::SUCCESS;
}
#ifndef CERES_NO_LAPACK
LinearSolverTerminationType LAPACKDenseQR::Factorize(int num_rows,
int num_cols,
double* lhs,
std::string* message) {
int lwork = -1;
double work_size;
int info = 0;
// Compute the size of the temporary workspace needed to compute the QR
// factorization in the dgeqrf call below.
dgeqrf_(&num_rows,
&num_cols,
lhs_,
&num_rows,
tau_.data(),
&work_size,
&lwork,
&info);
if (info < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres."
<< "Please report it."
<< "LAPACK::dgels fatal error."
<< "Argument: " << -info << " is invalid.";
}
lhs_ = lhs;
num_rows_ = num_rows;
num_cols_ = num_cols;
lwork = static_cast<int>(work_size);
if (work_.size() < lwork) {
work_.resize(lwork);
}
if (tau_.size() < num_cols) {
tau_.resize(num_cols);
}
if (q_transpose_rhs_.size() < num_rows) {
q_transpose_rhs_.resize(num_rows);
}
// Factorize the lhs_ using the workspace that we just constructed above.
dgeqrf_(&num_rows,
&num_cols,
lhs_,
&num_rows,
tau_.data(),
work_.data(),
&lwork,
&info);
if (info < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres."
<< "Please report it. dgeqrf fatal error."
<< "Argument: " << -info << " is invalid.";
}
termination_type_ = LinearSolverTerminationType::SUCCESS;
*message = "Success.";
return termination_type_;
}
LinearSolverTerminationType LAPACKDenseQR::Solve(const double* rhs,
double* solution,
std::string* message) {
if (termination_type_ != LinearSolverTerminationType::SUCCESS) {
*message = "QR factorization failed and solve called.";
return termination_type_;
}
std::copy_n(rhs, num_rows_, q_transpose_rhs_.data());
const char side = 'L';
char trans = 'T';
const int num_c_cols = 1;
const int lwork = work_.size();
int info = 0;
dormqr_(&side,
&trans,
&num_rows_,
&num_c_cols,
&num_cols_,
lhs_,
&num_rows_,
tau_.data(),
q_transpose_rhs_.data(),
&num_rows_,
work_.data(),
&lwork,
&info);
if (info < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres."
<< "Please report it. dormr fatal error."
<< "Argument: " << -info << " is invalid.";
}
const char uplo = 'U';
trans = 'N';
const char diag = 'N';
dtrtrs_(&uplo,
&trans,
&diag,
&num_cols_,
&num_c_cols,
lhs_,
&num_rows_,
q_transpose_rhs_.data(),
&num_rows_,
&info);
if (info < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres."
<< "Please report it. dormr fatal error."
<< "Argument: " << -info << " is invalid.";
} else if (info > 0) {
*message =
"QR factorization failure. The factorization is not full rank. R has "
"zeros on the diagonal.";
termination_type_ = LinearSolverTerminationType::FAILURE;
} else {
std::copy_n(q_transpose_rhs_.data(), num_cols_, solution);
termination_type_ = LinearSolverTerminationType::SUCCESS;
}
return termination_type_;
}
#endif // CERES_NO_LAPACK
#ifndef CERES_NO_CUDA
bool CUDADenseQR::Init(ContextImpl* context, std::string* message) {
if (!context->InitCUDA(message)) {
return false;
}
cublas_handle_ = context->cublas_handle_;
cusolver_handle_ = context->cusolver_handle_;
stream_ = context->stream_;
error_.Reserve(1);
*message = "CUDADenseQR::Init Success.";
return true;
}
LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows,
int num_cols,
double* lhs,
std::string* message) {
factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
lhs_.Reserve(num_rows * num_cols);
tau_.Reserve(std::min(num_rows, num_cols));
num_rows_ = num_rows;
num_cols_ = num_cols;
lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_);
int device_workspace_size = 0;
if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
num_rows,
num_cols,
lhs_.data(),
num_rows,
&device_workspace_size) !=
CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
if (cusolverDnDgeqrf(cusolver_handle_,
num_rows,
num_cols,
lhs_.data(),
num_rows,
tau_.data(),
reinterpret_cast<double*>(device_workspace_.data()),
device_workspace_.size(),
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDgeqrf failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
if (cudaDeviceSynchronize() != cudaSuccess ||
cudaStreamSynchronize(stream_) != cudaSuccess) {
*message = "Cuda device synchronization failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToHost(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
<< "please report it. "
<< "cuSolverDN::cusolverDnDgeqrf fatal error. "
<< "Argument: " << -error << " is invalid.";
// The following line is unreachable, but return failure just to be
// pedantic, since the compiler does not know that.
return LinearSolverTerminationType::FATAL_ERROR;
}
*message = "Success";
factorize_result_ = LinearSolverTerminationType::SUCCESS;
return LinearSolverTerminationType::SUCCESS;
}
LinearSolverTerminationType CUDADenseQR::Solve(const double* rhs,
double* solution,
std::string* message) {
if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
*message = "Factorize did not complete successfully previously.";
return factorize_result_;
}
rhs_.CopyToGpuAsync(rhs, num_rows_, stream_);
int device_workspace_size = 0;
if (cusolverDnDormqr_bufferSize(cusolver_handle_,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
num_rows_,
1,
num_cols_,
lhs_.data(),
num_rows_,
tau_.data(),
rhs_.data(),
num_rows_,
&device_workspace_size) !=
CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDormqr_bufferSize failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
device_workspace_.Reserve(device_workspace_size);
// Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
// The result of factorization would have stored Q in a packed form in lhs_.
if (cusolverDnDormqr(cusolver_handle_,
CUBLAS_SIDE_LEFT,
CUBLAS_OP_T,
num_rows_,
1,
num_cols_,
lhs_.data(),
num_rows_,
tau_.data(),
rhs_.data(),
num_rows_,
reinterpret_cast<double*>(device_workspace_.data()),
device_workspace_.size(),
error_.data()) != CUSOLVER_STATUS_SUCCESS) {
*message = "cuSolverDN::cusolverDnDormqr failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
int error = 0;
error_.CopyToHost(&error, 1);
if (error < 0) {
LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
<< "Please report it."
<< "cuSolverDN::cusolverDnDormqr fatal error. "
<< "Argument: " << -error << " is invalid.";
}
// Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
// replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
if (cublasDtrsv(cublas_handle_,
CUBLAS_FILL_MODE_UPPER,
CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
num_cols_,
lhs_.data(),
num_rows_,
rhs_.data(),
1) != CUBLAS_STATUS_SUCCESS) {
*message = "cuBLAS::cublasDtrsv failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
if (cudaDeviceSynchronize() != cudaSuccess ||
cudaStreamSynchronize(stream_) != cudaSuccess) {
*message = "Cuda device synchronization failed.";
return LinearSolverTerminationType::FATAL_ERROR;
}
rhs_.CopyToHost(solution, num_cols_);
*message = "Success";
return LinearSolverTerminationType::SUCCESS;
}
std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
const LinearSolver::Options& options) {
if (options.dense_linear_algebra_library_type != CUDA) {
// The user called the wrong factory method.
return nullptr;
}
auto cuda_dense_qr = std::unique_ptr<CUDADenseQR>(new CUDADenseQR());
std::string cuda_error;
if (cuda_dense_qr->Init(options.context, &cuda_error)) {
return cuda_dense_qr;
}
// Initialization failed, destroy the object (done automatically) and return a
// nullptr.
LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error;
return nullptr;
}
CUDADenseQR::CUDADenseQR() = default;
#endif // CERES_NO_CUDA
} // namespace ceres::internal