| // Ceres Solver - A fast non-linear least squares minimizer | 
 | // Copyright 2022 Google Inc. All rights reserved. | 
 | // http://ceres-solver.org/ | 
 | // | 
 | // Redistribution and use in source and binary forms, with or without | 
 | // modification, are permitted provided that the following conditions are met: | 
 | // | 
 | // * Redistributions of source code must retain the above copyright notice, | 
 | //   this list of conditions and the following disclaimer. | 
 | // * Redistributions in binary form must reproduce the above copyright notice, | 
 | //   this list of conditions and the following disclaimer in the documentation | 
 | //   and/or other materials provided with the distribution. | 
 | // * Neither the name of Google Inc. nor the names of its contributors may be | 
 | //   used to endorse or promote products derived from this software without | 
 | //   specific prior written permission. | 
 | // | 
 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
 | // POSSIBILITY OF SUCH DAMAGE. | 
 | // | 
 | // Author: sameeragarwal@google.com (Sameer Agarwal) | 
 |  | 
 | #include "ceres/dense_qr.h" | 
 |  | 
 | #include <algorithm> | 
 | #include <memory> | 
 | #include <string> | 
 |  | 
 | #ifndef CERES_NO_CUDA | 
 | #include "ceres/context_impl.h" | 
 | #include "cublas_v2.h" | 
 | #include "cusolverDn.h" | 
 | #endif  // CERES_NO_CUDA | 
 |  | 
 | #ifndef CERES_NO_LAPACK | 
 |  | 
 | // LAPACK routines for solving a linear least squares problem using QR | 
 | // factorization. This is done in three stages: | 
 | // | 
 | // A * x     = b | 
 | // Q * R * x = b               (dgeqrf) | 
 | //     R * x = Q' * b          (dormqr) | 
 | //         x = R^{-1} * Q'* b  (dtrtrs) | 
 |  | 
 | // clang-format off | 
 |  | 
 | // Compute the QR factorization of a. | 
 | // | 
 | // a is an m x n column major matrix (Denoted by "A" in the above description) | 
 | // lda is the leading dimension of a. lda >= max(1, num_rows) | 
 | // tau is an array of size min(m,n). It contains the scalar factors of the | 
 | // elementary reflectors. | 
 | // work is an array of size max(1,lwork). On exit, if info=0, work[0] contains | 
 | // the optimal size of work. | 
 | // | 
 | // if lwork >= 1 it is the size of work. If lwork = -1, then a workspace query is assumed. | 
 | // dgeqrf computes the optimal size of the work array and returns it as work[0]. | 
 | // | 
 | // info = 0, successful exit. | 
 | // info < 0, if info = -i, then the i^th argument had illegal value. | 
 | extern "C" void dgeqrf_(const int* m, const int* n, double* a, const int* lda, | 
 |                         double* tau, double* work, const int* lwork, int* info); | 
 |  | 
 | // Apply Q or Q' to b. | 
 | // | 
 | // b is a m times n column major matrix. | 
 | // size = 'L' applies Q or Q' on the left, size = 'R' applies Q or Q' on the right. | 
 | // trans = 'N', applies Q, trans = 'T', applies Q'. | 
 | // k is the number of elementary reflectors whose product defines the matrix Q. | 
 | // If size = 'L', m >= k >= 0 and if side = 'R', n >= k >= 0. | 
 | // a is an lda x k column major matrix containing the reflectors as returned by dgeqrf. | 
 | // ldb is the leading dimension of b. | 
 | // work is an array of size max(1, lwork) | 
 | // lwork if positive is the size of work. If lwork = -1, then a | 
 | // workspace query is assumed. | 
 | // | 
 | // info = 0, successful exit. | 
 | // info < 0, if info = -i, then the i^th argument had illegal value. | 
 | extern "C" void dormqr_(const char* side, const char* trans, const int* m, | 
 |                         const int* n ,const int* k, double* a, const int* lda, | 
 |                         double* tau, double* b, const int* ldb, double* work, | 
 |                         const int* lwork, int* info); | 
 |  | 
 | // Solve a triangular system of the form A * x = b | 
 | // | 
 | // uplo = 'U', A is upper triangular. uplo = 'L' is lower triangular. | 
 | // trans = 'N', 'T', 'C' specifies the form  - A, A^T, A^H. | 
 | // DIAG = 'N', A is not unit triangular. 'U' is unit triangular. | 
 | // n is the order of the matrix A. | 
 | // nrhs number of columns of b. | 
 | // a is a column major lda x n. | 
 | // b is a column major matrix of ldb x nrhs | 
 | // | 
 | // info = 0 successful. | 
 | //      = -i < 0 i^th argument is an illegal value. | 
 | //      = i > 0, i^th diagonal element of A is zero. | 
 | extern "C" void dtrtrs_(const char* uplo, const char* trans, const char* diag, | 
 |                         const int* n, const int* nrhs, double* a, const int* lda, | 
 |                         double* b, const int* ldb, int* info); | 
 | // clang-format on | 
 |  | 
 | #endif | 
 |  | 
 | namespace ceres::internal { | 
 |  | 
 | DenseQR::~DenseQR() = default; | 
 |  | 
 | std::unique_ptr<DenseQR> DenseQR::Create(const LinearSolver::Options& options) { | 
 |   std::unique_ptr<DenseQR> dense_qr; | 
 |  | 
 |   switch (options.dense_linear_algebra_library_type) { | 
 |     case EIGEN: | 
 |       dense_qr = std::make_unique<EigenDenseQR>(); | 
 |       break; | 
 |  | 
 |     case LAPACK: | 
 | #ifndef CERES_NO_LAPACK | 
 |       dense_qr = std::make_unique<LAPACKDenseQR>(); | 
 |       break; | 
 | #else | 
 |       LOG(FATAL) << "Ceres was compiled without support for LAPACK."; | 
 | #endif | 
 |  | 
 |     case CUDA: | 
 | #ifndef CERES_NO_CUDA | 
 |       dense_qr = CUDADenseQR::Create(options); | 
 |       break; | 
 | #else | 
 |       LOG(FATAL) << "Ceres was compiled without support for CUDA."; | 
 | #endif | 
 |  | 
 |     default: | 
 |       LOG(FATAL) << "Unknown dense linear algebra library type : " | 
 |                  << DenseLinearAlgebraLibraryTypeToString( | 
 |                         options.dense_linear_algebra_library_type); | 
 |   } | 
 |   return dense_qr; | 
 | } | 
 |  | 
 | LinearSolverTerminationType DenseQR::FactorAndSolve(int num_rows, | 
 |                                                     int num_cols, | 
 |                                                     double* lhs, | 
 |                                                     const double* rhs, | 
 |                                                     double* solution, | 
 |                                                     std::string* message) { | 
 |   LinearSolverTerminationType termination_type = | 
 |       Factorize(num_rows, num_cols, lhs, message); | 
 |   if (termination_type == LinearSolverTerminationType::SUCCESS) { | 
 |     termination_type = Solve(rhs, solution, message); | 
 |   } | 
 |   return termination_type; | 
 | } | 
 |  | 
 | LinearSolverTerminationType EigenDenseQR::Factorize(int num_rows, | 
 |                                                     int num_cols, | 
 |                                                     double* lhs, | 
 |                                                     std::string* message) { | 
 |   Eigen::Map<ColMajorMatrix> m(lhs, num_rows, num_cols); | 
 |   qr_ = std::make_unique<QRType>(m); | 
 |   *message = "Success."; | 
 |   return LinearSolverTerminationType::SUCCESS; | 
 | } | 
 |  | 
 | LinearSolverTerminationType EigenDenseQR::Solve(const double* rhs, | 
 |                                                 double* solution, | 
 |                                                 std::string* message) { | 
 |   VectorRef(solution, qr_->cols()) = | 
 |       qr_->solve(ConstVectorRef(rhs, qr_->rows())); | 
 |   *message = "Success."; | 
 |   return LinearSolverTerminationType::SUCCESS; | 
 | } | 
 |  | 
 | #ifndef CERES_NO_LAPACK | 
 | LinearSolverTerminationType LAPACKDenseQR::Factorize(int num_rows, | 
 |                                                      int num_cols, | 
 |                                                      double* lhs, | 
 |                                                      std::string* message) { | 
 |   int lwork = -1; | 
 |   double work_size; | 
 |   int info = 0; | 
 |  | 
 |   // Compute the size of the temporary workspace needed to compute the QR | 
 |   // factorization in the dgeqrf call below. | 
 |   dgeqrf_(&num_rows, | 
 |           &num_cols, | 
 |           lhs_, | 
 |           &num_rows, | 
 |           tau_.data(), | 
 |           &work_size, | 
 |           &lwork, | 
 |           &info); | 
 |   if (info < 0) { | 
 |     LOG(FATAL) << "Congratulations, you found a bug in Ceres." | 
 |                << "Please report it." | 
 |                << "LAPACK::dgels fatal error." | 
 |                << "Argument: " << -info << " is invalid."; | 
 |   } | 
 |  | 
 |   lhs_ = lhs; | 
 |   num_rows_ = num_rows; | 
 |   num_cols_ = num_cols; | 
 |  | 
 |   lwork = static_cast<int>(work_size); | 
 |  | 
 |   if (work_.size() < lwork) { | 
 |     work_.resize(lwork); | 
 |   } | 
 |   if (tau_.size() < num_cols) { | 
 |     tau_.resize(num_cols); | 
 |   } | 
 |  | 
 |   if (q_transpose_rhs_.size() < num_rows) { | 
 |     q_transpose_rhs_.resize(num_rows); | 
 |   } | 
 |  | 
 |   // Factorize the lhs_ using the workspace that we just constructed above. | 
 |   dgeqrf_(&num_rows, | 
 |           &num_cols, | 
 |           lhs_, | 
 |           &num_rows, | 
 |           tau_.data(), | 
 |           work_.data(), | 
 |           &lwork, | 
 |           &info); | 
 |  | 
 |   if (info < 0) { | 
 |     LOG(FATAL) << "Congratulations, you found a bug in Ceres." | 
 |                << "Please report it. dgeqrf fatal error." | 
 |                << "Argument: " << -info << " is invalid."; | 
 |   } | 
 |  | 
 |   termination_type_ = LinearSolverTerminationType::SUCCESS; | 
 |   *message = "Success."; | 
 |   return termination_type_; | 
 | } | 
 |  | 
 | LinearSolverTerminationType LAPACKDenseQR::Solve(const double* rhs, | 
 |                                                  double* solution, | 
 |                                                  std::string* message) { | 
 |   if (termination_type_ != LinearSolverTerminationType::SUCCESS) { | 
 |     *message = "QR factorization failed and solve called."; | 
 |     return termination_type_; | 
 |   } | 
 |  | 
 |   std::copy_n(rhs, num_rows_, q_transpose_rhs_.data()); | 
 |  | 
 |   const char side = 'L'; | 
 |   char trans = 'T'; | 
 |   const int num_c_cols = 1; | 
 |   const int lwork = work_.size(); | 
 |   int info = 0; | 
 |   dormqr_(&side, | 
 |           &trans, | 
 |           &num_rows_, | 
 |           &num_c_cols, | 
 |           &num_cols_, | 
 |           lhs_, | 
 |           &num_rows_, | 
 |           tau_.data(), | 
 |           q_transpose_rhs_.data(), | 
 |           &num_rows_, | 
 |           work_.data(), | 
 |           &lwork, | 
 |           &info); | 
 |   if (info < 0) { | 
 |     LOG(FATAL) << "Congratulations, you found a bug in Ceres." | 
 |                << "Please report it. dormr fatal error." | 
 |                << "Argument: " << -info << " is invalid."; | 
 |   } | 
 |  | 
 |   const char uplo = 'U'; | 
 |   trans = 'N'; | 
 |   const char diag = 'N'; | 
 |   dtrtrs_(&uplo, | 
 |           &trans, | 
 |           &diag, | 
 |           &num_cols_, | 
 |           &num_c_cols, | 
 |           lhs_, | 
 |           &num_rows_, | 
 |           q_transpose_rhs_.data(), | 
 |           &num_rows_, | 
 |           &info); | 
 |  | 
 |   if (info < 0) { | 
 |     LOG(FATAL) << "Congratulations, you found a bug in Ceres." | 
 |                << "Please report it. dormr fatal error." | 
 |                << "Argument: " << -info << " is invalid."; | 
 |   } else if (info > 0) { | 
 |     *message = | 
 |         "QR factorization failure. The factorization is not full rank. R has " | 
 |         "zeros on the diagonal."; | 
 |     termination_type_ = LinearSolverTerminationType::FAILURE; | 
 |   } else { | 
 |     std::copy_n(q_transpose_rhs_.data(), num_cols_, solution); | 
 |     termination_type_ = LinearSolverTerminationType::SUCCESS; | 
 |   } | 
 |  | 
 |   return termination_type_; | 
 | } | 
 |  | 
 | #endif  // CERES_NO_LAPACK | 
 |  | 
 | #ifndef CERES_NO_CUDA | 
 |  | 
 | bool CUDADenseQR::Init(ContextImpl* context, std::string* message) { | 
 |   if (!context->InitCuda(message)) { | 
 |     return false; | 
 |   } | 
 |   cublas_handle_ = context->cublas_handle_; | 
 |   cusolver_handle_ = context->cusolver_handle_; | 
 |   stream_ = context->stream_; | 
 |   error_.Reserve(1); | 
 |   *message = "CUDADenseQR::Init Success."; | 
 |   return true; | 
 | } | 
 |  | 
 | LinearSolverTerminationType CUDADenseQR::Factorize(int num_rows, | 
 |                                                    int num_cols, | 
 |                                                    double* lhs, | 
 |                                                    std::string* message) { | 
 |   factorize_result_ = LinearSolverTerminationType::FATAL_ERROR; | 
 |   lhs_.Reserve(num_rows * num_cols); | 
 |   tau_.Reserve(std::min(num_rows, num_cols)); | 
 |   num_rows_ = num_rows; | 
 |   num_cols_ = num_cols; | 
 |   lhs_.CopyFromCpu(lhs, num_rows * num_cols, stream_); | 
 |   int device_workspace_size = 0; | 
 |   if (cusolverDnDgeqrf_bufferSize(cusolver_handle_, | 
 |                                   num_rows, | 
 |                                   num_cols, | 
 |                                   lhs_.data(), | 
 |                                   num_rows, | 
 |                                   &device_workspace_size) != | 
 |       CUSOLVER_STATUS_SUCCESS) { | 
 |     *message = "cuSolverDN::cusolverDnDgeqrf_bufferSize failed."; | 
 |     return LinearSolverTerminationType::FATAL_ERROR; | 
 |   } | 
 |   device_workspace_.Reserve(device_workspace_size); | 
 |   if (cusolverDnDgeqrf(cusolver_handle_, | 
 |                        num_rows, | 
 |                        num_cols, | 
 |                        lhs_.data(), | 
 |                        num_rows, | 
 |                        tau_.data(), | 
 |                        reinterpret_cast<double*>(device_workspace_.data()), | 
 |                        device_workspace_.size(), | 
 |                        error_.data()) != CUSOLVER_STATUS_SUCCESS) { | 
 |     *message = "cuSolverDN::cusolverDnDgeqrf failed."; | 
 |     return LinearSolverTerminationType::FATAL_ERROR; | 
 |   } | 
 |   if (cudaDeviceSynchronize() != cudaSuccess || | 
 |       cudaStreamSynchronize(stream_) != cudaSuccess) { | 
 |     *message = "Cuda device synchronization failed."; | 
 |     return LinearSolverTerminationType::FATAL_ERROR; | 
 |   } | 
 |   int error = 0; | 
 |   error_.CopyToCpu(&error, 1); | 
 |   if (error < 0) { | 
 |     LOG(FATAL) << "Congratulations, you found a bug in Ceres - " | 
 |                << "please report it. " | 
 |                << "cuSolverDN::cusolverDnDgeqrf fatal error. " | 
 |                << "Argument: " << -error << " is invalid."; | 
 |     // The following line is unreachable, but return failure just to be | 
 |     // pedantic, since the compiler does not know that. | 
 |     return LinearSolverTerminationType::FATAL_ERROR; | 
 |   } | 
 |  | 
 |   *message = "Success"; | 
 |   factorize_result_ = LinearSolverTerminationType::SUCCESS; | 
 |   return LinearSolverTerminationType::SUCCESS; | 
 | } | 
 |  | 
 | LinearSolverTerminationType CUDADenseQR::Solve(const double* rhs, | 
 |                                                double* solution, | 
 |                                                std::string* message) { | 
 |   if (factorize_result_ != LinearSolverTerminationType::SUCCESS) { | 
 |     *message = "Factorize did not complete successfully previously."; | 
 |     return factorize_result_; | 
 |   } | 
 |   rhs_.CopyFromCpu(rhs, num_rows_, stream_); | 
 |   int device_workspace_size = 0; | 
 |   if (cusolverDnDormqr_bufferSize(cusolver_handle_, | 
 |                                   CUBLAS_SIDE_LEFT, | 
 |                                   CUBLAS_OP_T, | 
 |                                   num_rows_, | 
 |                                   1, | 
 |                                   num_cols_, | 
 |                                   lhs_.data(), | 
 |                                   num_rows_, | 
 |                                   tau_.data(), | 
 |                                   rhs_.data(), | 
 |                                   num_rows_, | 
 |                                   &device_workspace_size) != | 
 |       CUSOLVER_STATUS_SUCCESS) { | 
 |     *message = "cuSolverDN::cusolverDnDormqr_bufferSize failed."; | 
 |     return LinearSolverTerminationType::FATAL_ERROR; | 
 |   } | 
 |   device_workspace_.Reserve(device_workspace_size); | 
 |   // Compute rhs = Q^T * rhs, assuming that lhs has already been factorized. | 
 |   // The result of factorization would have stored Q in a packed form in lhs_. | 
 |   if (cusolverDnDormqr(cusolver_handle_, | 
 |                        CUBLAS_SIDE_LEFT, | 
 |                        CUBLAS_OP_T, | 
 |                        num_rows_, | 
 |                        1, | 
 |                        num_cols_, | 
 |                        lhs_.data(), | 
 |                        num_rows_, | 
 |                        tau_.data(), | 
 |                        rhs_.data(), | 
 |                        num_rows_, | 
 |                        reinterpret_cast<double*>(device_workspace_.data()), | 
 |                        device_workspace_.size(), | 
 |                        error_.data()) != CUSOLVER_STATUS_SUCCESS) { | 
 |     *message = "cuSolverDN::cusolverDnDormqr failed."; | 
 |     return LinearSolverTerminationType::FATAL_ERROR; | 
 |   } | 
 |   int error = 0; | 
 |   error_.CopyToCpu(&error, 1); | 
 |   if (error < 0) { | 
 |     LOG(FATAL) << "Congratulations, you found a bug in Ceres. " | 
 |                << "Please report it." | 
 |                << "cuSolverDN::cusolverDnDormqr fatal error. " | 
 |                << "Argument: " << -error << " is invalid."; | 
 |   } | 
 |   // Compute the solution vector as x = R \ (Q^T * rhs). Since the previous step | 
 |   // replaced rhs by (Q^T * rhs), this is just x = R \ rhs. | 
 |   if (cublasDtrsv(cublas_handle_, | 
 |                   CUBLAS_FILL_MODE_UPPER, | 
 |                   CUBLAS_OP_N, | 
 |                   CUBLAS_DIAG_NON_UNIT, | 
 |                   num_cols_, | 
 |                   lhs_.data(), | 
 |                   num_rows_, | 
 |                   rhs_.data(), | 
 |                   1) != CUBLAS_STATUS_SUCCESS) { | 
 |     *message = "cuBLAS::cublasDtrsv failed."; | 
 |     return LinearSolverTerminationType::FATAL_ERROR; | 
 |   } | 
 |   if (cudaDeviceSynchronize() != cudaSuccess || | 
 |       cudaStreamSynchronize(stream_) != cudaSuccess) { | 
 |     *message = "Cuda device synchronization failed."; | 
 |     return LinearSolverTerminationType::FATAL_ERROR; | 
 |   } | 
 |   rhs_.CopyToCpu(solution, num_cols_); | 
 |   *message = "Success"; | 
 |   return LinearSolverTerminationType::SUCCESS; | 
 | } | 
 |  | 
 | std::unique_ptr<CUDADenseQR> CUDADenseQR::Create( | 
 |     const LinearSolver::Options& options) { | 
 |   if (options.dense_linear_algebra_library_type != CUDA) { | 
 |     // The user called the wrong factory method. | 
 |     return nullptr; | 
 |   } | 
 |   auto cuda_dense_qr = std::unique_ptr<CUDADenseQR>(new CUDADenseQR()); | 
 |   std::string cuda_error; | 
 |   if (cuda_dense_qr->Init(options.context, &cuda_error)) { | 
 |     return cuda_dense_qr; | 
 |   } | 
 |   // Initialization failed, destroy the object (done automatically) and return a | 
 |   // nullptr. | 
 |   LOG(ERROR) << "CUDADenseQR::Init failed: " << cuda_error; | 
 |   return nullptr; | 
 | } | 
 |  | 
 | CUDADenseQR::CUDADenseQR() = default; | 
 |  | 
 | #endif  // CERES_NO_CUDA | 
 |  | 
 | }  // namespace ceres::internal |