internal/ceres/dense_qr.cc - ceres-solver - Git at Google

 // Ceres Solver - A fast non-linear least squares minimizer
 // Copyright 2022 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // * Redistributions of source code must retain the above copyright notice,
 //   this list of conditions and the following disclaimer.
 // * Redistributions in binary form must reproduce the above copyright notice,
 //   this list of conditions and the following disclaimer in the documentation
 //   and/or other materials provided with the distribution.
 // * Neither the name of Google Inc. nor the names of its contributors may be
 //   used to endorse or promote products derived from this software without
 //   specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Author: sameeragarwal@google.com (Sameer Agarwal)

 #include "ceres/dense_qr.h"

 #include <algorithm>
 #include <memory>
 #include <string>
 #ifndef CERES_NO_CUDA
 #include "ceres/context_impl.h"
 #include "cusolverDn.h"
 #include "cublas_v2.h"
 #endif  // CERES_NO_CUDA

 #ifndef CERES_NO_LAPACK

 // LAPACK routines for solving a linear least squares problem using QR
 // factorization. This is done in three stages:
 //
 // A * x     = b
 // Q * R * x = b               (dgeqrf)
 //     R * x = Q' * b          (dormqr)
 //         x = R^{-1} * Q'* b  (dtrtrs)

 // clang-format off

 // Compute the QR factorization of a.
 //
 // a is an m x n column major matrix (Denoted by "A" in the above description)
 // lda is the leading dimension of a. lda >= max(1, num_rows)
 // tau is an array of size min(m,n). It contains the scalar factors of the
 // elementary reflectors.
 // work is an array of size max(1,lwork). On exit, if info=0, work[0] contains
 // the optimal size of work.
 //
 // if lwork >= 1 it is the size of work. If lwork = -1, then a workspace query is assumed.
 // dgeqrf computes the optimal size of the work array and returns it as work[0].
 //
 // info = 0, successful exit.
 // info < 0, if info = -i, then the i^th argument had illegal value.
 extern "C" void dgeqrf_(const int* m, const int* n, double* a, const int* lda,
                         double* tau, double* work, const int* lwork, int* info);

 // Apply Q or Q' to b.
 //
 // b is a m times n column major matrix.
 // size = 'L' applies Q or Q' on the left, size = 'R' applies Q or Q' on the right.
 // trans = 'N', applies Q, trans = 'T', applies Q'.
 // k is the number of elementary reflectors whose product defines the matrix Q.
 // If size = 'L', m >= k >= 0 and if side = 'R', n >= k >= 0.
 // a is an lda x k column major matrix containing the reflectors as returned by dgeqrf.
 // ldb is the leading dimension of b.
 // work is an array of size max(1, lwork)
 // lwork if positive is the size of work. If lwork = -1, then a
 // workspace query is assumed.
 //
 // info = 0, successful exit.
 // info < 0, if info = -i, then the i^th argument had illegal value.
 extern "C" void dormqr_(const char* side, const char* trans, const int* m,
                         const int* n ,const int* k, double* a, const int* lda,
                         double* tau, double* b, const int* ldb, double* work,
                         const int* lwork, int* info);

 // Solve a triangular system of the form A * x = b
 //
 // uplo = 'U', A is upper triangular. uplo = 'L' is lower triangular.
 // trans = 'N', 'T', 'C' specifies the form  - A, A^T, A^H.
 // DIAG = 'N', A is not unit triangular. 'U' is unit triangular.
 // n is the order of the matrix A.
 // nrhs number of columns of b.
 // a is a column major lda x n.
 // b is a column major matrix of ldb x nrhs
 //
 // info = 0 succesful.
 //      = -i < 0 i^th argument is an illegal value.
 //      = i > 0, i^th diagonal element of A is zero.
 extern "C" void dtrtrs_(const char* uplo, const char* trans, const char* diag,
                         const int* n, const int* nrhs, double* a, const int* lda,
                         double* b, const int* ldb, int* info);
 // clang-format on

 #endif

 namespace ceres {
 namespace internal {

 DenseQR::~DenseQR() = default;

 std::unique_ptr<DenseQR> DenseQR::Create(const LinearSolver::Options& options) {
   std::unique_ptr<DenseQR> dense_qr;

   switch (options.dense_linear_algebra_library_type) {
     case EIGEN:
       dense_qr = std::make_unique<EigenDenseQR>();
       break;

     case LAPACK:
 #ifndef CERES_NO_LAPACK
       dense_qr = std::make_unique<LAPACKDenseQR>();
       break;
 #else
       LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
 #endif

     case CUDA:
 #ifndef CERES_NO_CUDA
       dense_qr = CUDADenseQR::Create(options);
       break;
 #else
       LOG(FATAL) << "Ceres was compiled without support for CUDA.";
 #endif

     default:
       LOG(FATAL) << "Unknown dense linear algebra library type : "
                  << DenseLinearAlgebraLibraryTypeToString(
                         options.dense_linear_algebra_library_type);
   }
   return dense_qr;
 }

 LinearSolverTerminationType DenseQR::FactorAndSolve(int num_rows,
                                                     int num_cols,
                                                     double* lhs,
                                                     const double* rhs,
                                                     double* solution,
                                                     std::string* message) {
   LinearSolverTerminationType termination_type =
       Factorize(num_rows, num_cols, lhs, message);
   if (termination_type == LINEAR_SOLVER_SUCCESS) {
     termination_type = Solve(rhs, solution, message);
   }
   return termination_type;
 }

 LinearSolverTerminationType EigenDenseQR::Factorize(int num_rows,
                                                     int num_cols,
                                                     double* lhs,
                                                     std::string* message) {
   Eigen::Map<ColMajorMatrix> m(lhs, num_rows, num_cols);
   qr_ = std::make_unique<QRType>(m);
   *message = "Success.";
   return LINEAR_SOLVER_SUCCESS;
 }

 LinearSolverTerminationType EigenDenseQR::Solve(const double* rhs,
                                                 double* solution,
                                                 std::string* message) {
   VectorRef(solution, qr_->cols()) =
       qr_->solve(ConstVectorRef(rhs, qr_->rows()));
   *message = "Success.";
   return LINEAR_SOLVER_SUCCESS;
 }

 #ifndef CERES_NO_LAPACK
 LinearSolverTerminationType LAPACKDenseQR::Factorize(int num_rows,
                                                      int num_cols,
                                                      double* lhs,
                                                      std::string* message) {
   int lwork = -1;
   double work_size;
   int info = 0;

   // Compute the size of the temporary workspace needed to compute the QR
   // factorization in the dgeqrf call below.
   dgeqrf_(&num_rows,
           &num_cols,
           lhs_,
           &num_rows,
           tau_.data(),
           &work_size,
           &lwork,
           &info);
   if (info < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres."
                << "Please report it."
                << "LAPACK::dgels fatal error."
                << "Argument: " << -info << " is invalid.";
   }

   lhs_ = lhs;
   num_rows_ = num_rows;
   num_cols_ = num_cols;

   lwork = static_cast<int>(work_size);

   if (work_.size() < lwork) {
     work_.resize(lwork);
   }
   if (tau_.size() < num_cols) {
     tau_.resize(num_cols);
   }

   if (q_transpose_rhs_.size() < num_rows) {
     q_transpose_rhs_.resize(num_rows);
   }

   // Factorize the lhs_ using the workspace that we just constructed above.
   dgeqrf_(&num_rows,
           &num_cols,
           lhs_,
           &num_rows,
           tau_.data(),
           work_.data(),
           &lwork,
           &info);

   if (info < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres."
                << "Please report it. dgeqrf fatal error."
                << "Argument: " << -info << " is invalid.";
   }

   termination_type_ = LINEAR_SOLVER_SUCCESS;
   *message = "Success.";
   return termination_type_;
 }

 LinearSolverTerminationType LAPACKDenseQR::Solve(const double* rhs,
                                                  double* solution,
                                                  std::string* message) {
   if (termination_type_ != LINEAR_SOLVER_SUCCESS) {
     *message = "QR factorization failed and solve called.";
     return termination_type_;
   }

   std::copy_n(rhs, num_rows_, q_transpose_rhs_.data());

   const char side = 'L';
   char trans = 'T';
   const int num_c_cols = 1;
   const int lwork = work_.size();
   int info = 0;
   dormqr_(&side,
           &trans,
           &num_rows_,
           &num_c_cols,
           &num_cols_,
           lhs_,
           &num_rows_,
           tau_.data(),
           q_transpose_rhs_.data(),
           &num_rows_,
           work_.data(),
           &lwork,
           &info);
   if (info < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres."
                << "Please report it. dormr fatal error."
                << "Argument: " << -info << " is invalid.";
   }

   const char uplo = 'U';
   trans = 'N';
   const char diag = 'N';
   dtrtrs_(&uplo,
           &trans,
           &diag,
           &num_cols_,
           &num_c_cols,
           lhs_,
           &num_rows_,
           q_transpose_rhs_.data(),
           &num_rows_,
           &info);

   if (info < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres."
                << "Please report it. dormr fatal error."
                << "Argument: " << -info << " is invalid.";
   } else if (info > 0) {
     *message =
         "QR factorization failure. The factorization is not full rank. R has "
         "zeros on the diagonal.";
     termination_type_ = LINEAR_SOLVER_FAILURE;
   } else {
     std::copy_n(q_transpose_rhs_.data(), num_cols_, solution);
     termination_type_ = LINEAR_SOLVER_SUCCESS;
   }

   return termination_type_;
 }

 #endif  // CERES_NO_LAPACK

 #ifndef CERES_NO_CUDA

 bool CUDADenseQR::Init(ContextImpl* context, std::string* message) {
   if (!context->InitCUDA(message)) {
     return false;
   }
   cublas_handle_ = context->cublas_handle_;
   cusolver_handle_ = context->cusolver_handle_;
   stream_ = context->stream_;
   error_.Reserve(1);
   *message = "CUDADenseQR::Init Success.";
   return true;
 }

 LinearSolverTerminationType CUDADenseQR::Factorize(
     int num_rows, int num_cols, double* lhs, std::string* message) {
   factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   lhs_.Reserve(num_rows * num_cols);
   tau_.Reserve(std::min(num_rows, num_cols));
   num_rows_ = num_rows;
   num_cols_ = num_cols;
   lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_);
   int device_workspace_size = 0;
   if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
                                   num_rows,
                                   num_cols,
                                   lhs_.data(),
                                   num_rows,
                                   &device_workspace_size) !=
       CUSOLVER_STATUS_SUCCESS) {
     *message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
   device_workspace_.Reserve(device_workspace_size);
   if (cusolverDnDgeqrf(cusolver_handle_,
                        num_rows,
                        num_cols,
                        lhs_.data(),
                        num_rows,
                        tau_.data(),
                        reinterpret_cast<double*>(device_workspace_.data()),
                        device_workspace_.size(),
                        error_.data()) != CUSOLVER_STATUS_SUCCESS) {
     *message = "cuSolverDN::cusolverDnDgeqrf failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
   if (cudaDeviceSynchronize() != cudaSuccess ||
       cudaStreamSynchronize(stream_) != cudaSuccess) {
     *message = "Cuda device synchronization failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
   int error = 0;
   error_.CopyToHost(&error, 1);
   if (error < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
                << "please report it. "
                << "cuSolverDN::cusolverDnDgeqrf fatal error. "
                << "Argument: " << -error << " is invalid.";
     // The following line is unreachable, but return failure just to be
     // pedantic, since the compiler does not know that.
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }

   *message = "Success";
   factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
   return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
 }

 LinearSolverTerminationType CUDADenseQR::Solve(
     const double* rhs, double* solution, std::string* message) {
   if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
     *message = "Factorize did not complete succesfully previously.";
     return factorize_result_;
   }
   rhs_.CopyToGpuAsync(rhs, num_rows_, stream_);
   int device_workspace_size = 0;
   if (cusolverDnDormqr_bufferSize(cusolver_handle_,
                                   CUBLAS_SIDE_LEFT,
                                   CUBLAS_OP_T,
                                   num_rows_,
                                   1,
                                   num_cols_,
                                   lhs_.data(),
                                   num_rows_,
                                   tau_.data(),
                                   rhs_.data(),
                                   num_rows_,
                                   &device_workspace_size) !=
       CUSOLVER_STATUS_SUCCESS) {
     *message = "cuSolverDN::cusolverDnDormqr_bufferSize failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
   device_workspace_.Reserve(device_workspace_size);
   // Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
   // The result of factorization would have stored Q in a packed form in lhs_.
   if (cusolverDnDormqr(cusolver_handle_,
                        CUBLAS_SIDE_LEFT,
                        CUBLAS_OP_T,
                        num_rows_,
                        1,
                        num_cols_,
                        lhs_.data(),
                        num_rows_,
                        tau_.data(),
                        rhs_.data(),
                        num_rows_,
                        reinterpret_cast<double*>(device_workspace_.data()),
                        device_workspace_.size(),
                        error_.data()) != CUSOLVER_STATUS_SUCCESS) {
     *message = "cuSolverDN::cusolverDnDormqr failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
   int error = 0;
   error_.CopyToHost(&error, 1);
   if (error < 0) {
     LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
                << "Please report it."
                << "cuSolverDN::cusolverDnDormqr fatal error. "
                << "Argument: " << -error << " is invalid.";
   }
   // Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
   // replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
   if (cublasDtrsv(cublas_handle_,
                   CUBLAS_FILL_MODE_UPPER,
                   CUBLAS_OP_N,
                   CUBLAS_DIAG_NON_UNIT,
                   num_cols_,
                   lhs_.data(),
                   num_rows_,
                   rhs_.data(),
                   1) != CUBLAS_STATUS_SUCCESS) {
     *message = "cuBLAS::cublasDtrsv failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
   if (cudaDeviceSynchronize() != cudaSuccess ||
       cudaStreamSynchronize(stream_) != cudaSuccess) {
     *message = "Cuda device synchronization failed.";
     return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
   }
   rhs_.CopyToHost(solution, num_cols_);
   *message = "Success";
   return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
 }

 std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
     const LinearSolver::Options& options) {
   if (options.dense_linear_algebra_library_type != CUDA) {
     // The user called the wrong factory method.
     return nullptr;
   }
   auto cuda_dense_qr =
       std::unique_ptr<CUDADenseQR>(new CUDADenseQR());
   std::string cuda_error;
   if (cuda_dense_qr->Init(options.context, &cuda_error)) {
     return cuda_dense_qr;
   }
   // Initialization failed, destroy the object (done automatically) and return a
   // nullptr.
   LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error;
   return nullptr;
 }

 CUDADenseQR::CUDADenseQR()  = default;

 #endif  // CERES_NO_CUDA


 }  // namespace internal
 }  // namespace ceres
	// Ceres Solver - A fast non-linear least squares minimizer
	// Copyright 2022 Google Inc. All rights reserved.
	// http://ceres-solver.org/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice,
	// this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	// * Neither the name of Google Inc. nor the names of its contributors may be
	// used to endorse or promote products derived from this software without
	// specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Author: sameeragarwal@google.com (Sameer Agarwal)

	#include "ceres/dense_qr.h"

	#include <algorithm>
	#include <memory>
	#include <string>
	#ifndef CERES_NO_CUDA
	#include "ceres/context_impl.h"
	#include "cusolverDn.h"
	#include "cublas_v2.h"
	#endif // CERES_NO_CUDA

	#ifndef CERES_NO_LAPACK

	// LAPACK routines for solving a linear least squares problem using QR
	// factorization. This is done in three stages:
	//
	// A * x = b
	// Q * R * x = b (dgeqrf)
	// R * x = Q' * b (dormqr)
	// x = R^{-1} * Q'* b (dtrtrs)

	// clang-format off

	// Compute the QR factorization of a.
	//
	// a is an m x n column major matrix (Denoted by "A" in the above description)
	// lda is the leading dimension of a. lda >= max(1, num_rows)
	// tau is an array of size min(m,n). It contains the scalar factors of the
	// elementary reflectors.
	// work is an array of size max(1,lwork). On exit, if info=0, work[0] contains
	// the optimal size of work.
	//
	// if lwork >= 1 it is the size of work. If lwork = -1, then a workspace query is assumed.
	// dgeqrf computes the optimal size of the work array and returns it as work[0].
	//
	// info = 0, successful exit.
	// info < 0, if info = -i, then the i^th argument had illegal value.
	extern "C" void dgeqrf_(const int* m, const int* n, double* a, const int* lda,
	double* tau, double* work, const int* lwork, int* info);

	// Apply Q or Q' to b.
	//
	// b is a m times n column major matrix.
	// size = 'L' applies Q or Q' on the left, size = 'R' applies Q or Q' on the right.
	// trans = 'N', applies Q, trans = 'T', applies Q'.
	// k is the number of elementary reflectors whose product defines the matrix Q.
	// If size = 'L', m >= k >= 0 and if side = 'R', n >= k >= 0.
	// a is an lda x k column major matrix containing the reflectors as returned by dgeqrf.
	// ldb is the leading dimension of b.
	// work is an array of size max(1, lwork)
	// lwork if positive is the size of work. If lwork = -1, then a
	// workspace query is assumed.
	//
	// info = 0, successful exit.
	// info < 0, if info = -i, then the i^th argument had illegal value.
	extern "C" void dormqr_(const char* side, const char* trans, const int* m,
	const int* n ,const int* k, double* a, const int* lda,
	double* tau, double* b, const int* ldb, double* work,
	const int* lwork, int* info);

	// Solve a triangular system of the form A * x = b
	//
	// uplo = 'U', A is upper triangular. uplo = 'L' is lower triangular.
	// trans = 'N', 'T', 'C' specifies the form - A, A^T, A^H.
	// DIAG = 'N', A is not unit triangular. 'U' is unit triangular.
	// n is the order of the matrix A.
	// nrhs number of columns of b.
	// a is a column major lda x n.
	// b is a column major matrix of ldb x nrhs
	//
	// info = 0 succesful.
	// = -i < 0 i^th argument is an illegal value.
	// = i > 0, i^th diagonal element of A is zero.
	extern "C" void dtrtrs_(const char* uplo, const char* trans, const char* diag,
	const int* n, const int* nrhs, double* a, const int* lda,
	double* b, const int* ldb, int* info);
	// clang-format on

	#endif

	namespace ceres {
	namespace internal {

	DenseQR::~DenseQR() = default;

	std::unique_ptr<DenseQR> DenseQR::Create(const LinearSolver::Options& options) {
	std::unique_ptr<DenseQR> dense_qr;

	switch (options.dense_linear_algebra_library_type) {
	case EIGEN:
	dense_qr = std::make_unique<EigenDenseQR>();
	break;

	case LAPACK:
	#ifndef CERES_NO_LAPACK
	dense_qr = std::make_unique<LAPACKDenseQR>();
	break;
	#else
	LOG(FATAL) << "Ceres was compiled without support for LAPACK.";
	#endif

	case CUDA:
	#ifndef CERES_NO_CUDA
	dense_qr = CUDADenseQR::Create(options);
	break;
	#else
	LOG(FATAL) << "Ceres was compiled without support for CUDA.";
	#endif

	default:
	LOG(FATAL) << "Unknown dense linear algebra library type : "
	<< DenseLinearAlgebraLibraryTypeToString(
	options.dense_linear_algebra_library_type);
	}
	return dense_qr;
	}

	LinearSolverTerminationType DenseQR::FactorAndSolve(int num_rows,
	int num_cols,
	double* lhs,
	const double* rhs,
	double* solution,
	std::string* message) {
	LinearSolverTerminationType termination_type =
	Factorize(num_rows, num_cols, lhs, message);
	if (termination_type == LINEAR_SOLVER_SUCCESS) {
	termination_type = Solve(rhs, solution, message);
	}
	return termination_type;
	}

	LinearSolverTerminationType EigenDenseQR::Factorize(int num_rows,
	int num_cols,
	double* lhs,
	std::string* message) {
	Eigen::Map<ColMajorMatrix> m(lhs, num_rows, num_cols);
	qr_ = std::make_unique<QRType>(m);
	*message = "Success.";
	return LINEAR_SOLVER_SUCCESS;
	}

	LinearSolverTerminationType EigenDenseQR::Solve(const double* rhs,
	double* solution,
	std::string* message) {
	VectorRef(solution, qr_->cols()) =
	qr_->solve(ConstVectorRef(rhs, qr_->rows()));
	*message = "Success.";
	return LINEAR_SOLVER_SUCCESS;
	}

	#ifndef CERES_NO_LAPACK
	LinearSolverTerminationType LAPACKDenseQR::Factorize(int num_rows,
	int num_cols,
	double* lhs,
	std::string* message) {
	int lwork = -1;
	double work_size;
	int info = 0;

	// Compute the size of the temporary workspace needed to compute the QR
	// factorization in the dgeqrf call below.
	dgeqrf_(&num_rows,
	&num_cols,
	lhs_,
	&num_rows,
	tau_.data(),
	&work_size,
	&lwork,
	&info);
	if (info < 0) {
	LOG(FATAL) << "Congratulations, you found a bug in Ceres."
	<< "Please report it."
	<< "LAPACK::dgels fatal error."
	<< "Argument: " << -info << " is invalid.";
	}

	lhs_ = lhs;
	num_rows_ = num_rows;
	num_cols_ = num_cols;

	lwork = static_cast<int>(work_size);

	if (work_.size() < lwork) {
	work_.resize(lwork);
	}
	if (tau_.size() < num_cols) {
	tau_.resize(num_cols);
	}

	if (q_transpose_rhs_.size() < num_rows) {
	q_transpose_rhs_.resize(num_rows);
	}

	// Factorize the lhs_ using the workspace that we just constructed above.
	dgeqrf_(&num_rows,
	&num_cols,
	lhs_,
	&num_rows,
	tau_.data(),
	work_.data(),
	&lwork,
	&info);

	if (info < 0) {
	LOG(FATAL) << "Congratulations, you found a bug in Ceres."
	<< "Please report it. dgeqrf fatal error."
	<< "Argument: " << -info << " is invalid.";
	}

	termination_type_ = LINEAR_SOLVER_SUCCESS;
	*message = "Success.";
	return termination_type_;
	}

	LinearSolverTerminationType LAPACKDenseQR::Solve(const double* rhs,
	double* solution,
	std::string* message) {
	if (termination_type_ != LINEAR_SOLVER_SUCCESS) {
	*message = "QR factorization failed and solve called.";
	return termination_type_;
	}

	std::copy_n(rhs, num_rows_, q_transpose_rhs_.data());

	const char side = 'L';
	char trans = 'T';
	const int num_c_cols = 1;
	const int lwork = work_.size();
	int info = 0;
	dormqr_(&side,
	&trans,
	&num_rows_,
	&num_c_cols,
	&num_cols_,
	lhs_,
	&num_rows_,
	tau_.data(),
	q_transpose_rhs_.data(),
	&num_rows_,
	work_.data(),
	&lwork,
	&info);
	if (info < 0) {
	LOG(FATAL) << "Congratulations, you found a bug in Ceres."
	<< "Please report it. dormr fatal error."
	<< "Argument: " << -info << " is invalid.";
	}

	const char uplo = 'U';
	trans = 'N';
	const char diag = 'N';
	dtrtrs_(&uplo,
	&trans,
	&diag,
	&num_cols_,
	&num_c_cols,
	lhs_,
	&num_rows_,
	q_transpose_rhs_.data(),
	&num_rows_,
	&info);

	if (info < 0) {
	LOG(FATAL) << "Congratulations, you found a bug in Ceres."
	<< "Please report it. dormr fatal error."
	<< "Argument: " << -info << " is invalid.";
	} else if (info > 0) {
	*message =
	"QR factorization failure. The factorization is not full rank. R has "
	"zeros on the diagonal.";
	termination_type_ = LINEAR_SOLVER_FAILURE;
	} else {
	std::copy_n(q_transpose_rhs_.data(), num_cols_, solution);
	termination_type_ = LINEAR_SOLVER_SUCCESS;
	}

	return termination_type_;
	}

	#endif // CERES_NO_LAPACK

	#ifndef CERES_NO_CUDA

	bool CUDADenseQR::Init(ContextImpl* context, std::string* message) {
	if (!context->InitCUDA(message)) {
	return false;
	}
	cublas_handle_ = context->cublas_handle_;
	cusolver_handle_ = context->cusolver_handle_;
	stream_ = context->stream_;
	error_.Reserve(1);
	*message = "CUDADenseQR::Init Success.";
	return true;
	}

	LinearSolverTerminationType CUDADenseQR::Factorize(
	int num_rows, int num_cols, double* lhs, std::string* message) {
	factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
	lhs_.Reserve(num_rows * num_cols);
	tau_.Reserve(std::min(num_rows, num_cols));
	num_rows_ = num_rows;
	num_cols_ = num_cols;
	lhs_.CopyToGpuAsync(lhs, num_rows * num_cols, stream_);
	int device_workspace_size = 0;
	if (cusolverDnDgeqrf_bufferSize(cusolver_handle_,
	num_rows,
	num_cols,
	lhs_.data(),
	num_rows,
	&device_workspace_size) !=
	CUSOLVER_STATUS_SUCCESS) {
	*message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed.";
	return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
	}
	device_workspace_.Reserve(device_workspace_size);
	if (cusolverDnDgeqrf(cusolver_handle_,
	num_rows,
	num_cols,
	lhs_.data(),
	num_rows,
	tau_.data(),
	reinterpret_cast<double*>(device_workspace_.data()),
	device_workspace_.size(),
	error_.data()) != CUSOLVER_STATUS_SUCCESS) {
	*message = "cuSolverDN::cusolverDnDgeqrf failed.";
	return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
	}
	if (cudaDeviceSynchronize() != cudaSuccess \|\|
	cudaStreamSynchronize(stream_) != cudaSuccess) {
	*message = "Cuda device synchronization failed.";
	return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
	}
	int error = 0;
	error_.CopyToHost(&error, 1);
	if (error < 0) {
	LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
	<< "please report it. "
	<< "cuSolverDN::cusolverDnDgeqrf fatal error. "
	<< "Argument: " << -error << " is invalid.";
	// The following line is unreachable, but return failure just to be
	// pedantic, since the compiler does not know that.
	return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
	}

	*message = "Success";
	factorize_result_ = LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
	return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
	}

	LinearSolverTerminationType CUDADenseQR::Solve(
	const double* rhs, double* solution, std::string* message) {
	if (factorize_result_ != LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS) {
	*message = "Factorize did not complete succesfully previously.";
	return factorize_result_;
	}
	rhs_.CopyToGpuAsync(rhs, num_rows_, stream_);
	int device_workspace_size = 0;
	if (cusolverDnDormqr_bufferSize(cusolver_handle_,
	CUBLAS_SIDE_LEFT,
	CUBLAS_OP_T,
	num_rows_,
	1,
	num_cols_,
	lhs_.data(),
	num_rows_,
	tau_.data(),
	rhs_.data(),
	num_rows_,
	&device_workspace_size) !=
	CUSOLVER_STATUS_SUCCESS) {
	*message = "cuSolverDN::cusolverDnDormqr_bufferSize failed.";
	return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
	}
	device_workspace_.Reserve(device_workspace_size);
	// Compute rhs = Q^T * rhs, assuming that lhs has already been factorized.
	// The result of factorization would have stored Q in a packed form in lhs_.
	if (cusolverDnDormqr(cusolver_handle_,
	CUBLAS_SIDE_LEFT,
	CUBLAS_OP_T,
	num_rows_,
	1,
	num_cols_,
	lhs_.data(),
	num_rows_,
	tau_.data(),
	rhs_.data(),
	num_rows_,
	reinterpret_cast<double*>(device_workspace_.data()),
	device_workspace_.size(),
	error_.data()) != CUSOLVER_STATUS_SUCCESS) {
	*message = "cuSolverDN::cusolverDnDormqr failed.";
	return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
	}
	int error = 0;
	error_.CopyToHost(&error, 1);
	if (error < 0) {
	LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
	<< "Please report it."
	<< "cuSolverDN::cusolverDnDormqr fatal error. "
	<< "Argument: " << -error << " is invalid.";
	}
	// Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step
	// replaced rhs by (Q^T * rhs), this is just x = R \ rhs.
	if (cublasDtrsv(cublas_handle_,
	CUBLAS_FILL_MODE_UPPER,
	CUBLAS_OP_N,
	CUBLAS_DIAG_NON_UNIT,
	num_cols_,
	lhs_.data(),
	num_rows_,
	rhs_.data(),
	1) != CUBLAS_STATUS_SUCCESS) {
	*message = "cuBLAS::cublasDtrsv failed.";
	return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
	}
	if (cudaDeviceSynchronize() != cudaSuccess \|\|
	cudaStreamSynchronize(stream_) != cudaSuccess) {
	*message = "Cuda device synchronization failed.";
	return LinearSolverTerminationType::LINEAR_SOLVER_FATAL_ERROR;
	}
	rhs_.CopyToHost(solution, num_cols_);
	*message = "Success";
	return LinearSolverTerminationType::LINEAR_SOLVER_SUCCESS;
	}

	std::unique_ptr<CUDADenseQR> CUDADenseQR::Create(
	const LinearSolver::Options& options) {
	if (options.dense_linear_algebra_library_type != CUDA) {
	// The user called the wrong factory method.
	return nullptr;
	}
	auto cuda_dense_qr =
	std::unique_ptr<CUDADenseQR>(new CUDADenseQR());
	std::string cuda_error;
	if (cuda_dense_qr->Init(options.context, &cuda_error)) {
	return cuda_dense_qr;
	}
	// Initialization failed, destroy the object (done automatically) and return a
	// nullptr.
	LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error;
	return nullptr;
	}

	CUDADenseQR::CUDADenseQR() = default;

	#endif // CERES_NO_CUDA


	} // namespace internal
	} // namespace ceres