|  | // Ceres Solver - A fast non-linear least squares minimizer | 
|  | // Copyright 2022 Google Inc. All rights reserved. | 
|  | // http://ceres-solver.org/ | 
|  | // | 
|  | // Redistribution and use in source and binary forms, with or without | 
|  | // modification, are permitted provided that the following conditions are met: | 
|  | // | 
|  | // * Redistributions of source code must retain the above copyright notice, | 
|  | //   this list of conditions and the following disclaimer. | 
|  | // * Redistributions in binary form must reproduce the above copyright notice, | 
|  | //   this list of conditions and the following disclaimer in the documentation | 
|  | //   and/or other materials provided with the distribution. | 
|  | // * Neither the name of Google Inc. nor the names of its contributors may be | 
|  | //   used to endorse or promote products derived from this software without | 
|  | //   specific prior written permission. | 
|  | // | 
|  | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
|  | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|  | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|  | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
|  | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
|  | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
|  | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
|  | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
|  | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
|  | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
|  | // POSSIBILITY OF SUCH DAMAGE. | 
|  | // | 
|  | // Author: sameeragarwal@google.com (Sameer Agarwal) | 
|  |  | 
|  | #ifndef CERES_INTERNAL_DENSE_CHOLESKY_H_ | 
|  | #define CERES_INTERNAL_DENSE_CHOLESKY_H_ | 
|  |  | 
|  | // This include must come before any #ifndef check on Ceres compile options. | 
|  | // clang-format off | 
|  | #include "ceres/internal/config.h" | 
|  | // clang-format on | 
|  |  | 
|  | #include <memory> | 
|  | #include <vector> | 
|  |  | 
|  | #include "Eigen/Dense" | 
|  | #include "ceres/context_impl.h" | 
|  | #include "ceres/cuda_buffer.h" | 
|  | #include "ceres/linear_solver.h" | 
|  | #include "glog/logging.h" | 
|  | #ifndef CERES_NO_CUDA | 
|  | #include "ceres/context_impl.h" | 
|  | #include "cuda_runtime.h" | 
|  | #include "cusolverDn.h" | 
|  | #endif  // CERES_NO_CUDA | 
|  |  | 
|  | namespace ceres::internal { | 
|  |  | 
|  | // An interface that abstracts away the internal details of various dense linear | 
|  | // algebra libraries and offers a simple API for solving dense symmetric | 
|  | // positive definite linear systems using a Cholesky factorization. | 
|  | class CERES_NO_EXPORT DenseCholesky { | 
|  | public: | 
|  | static std::unique_ptr<DenseCholesky> Create( | 
|  | const LinearSolver::Options& options); | 
|  |  | 
|  | virtual ~DenseCholesky(); | 
|  |  | 
|  | // Computes the Cholesky factorization of the given matrix. | 
|  | // | 
|  | // The input matrix lhs is assumed to be a column-major num_cols x num_cols | 
|  | // matrix, that is symmetric positive definite with its lower triangular part | 
|  | // containing the left hand side of the linear system being solved. | 
|  | // | 
|  | // The input matrix lhs may be modified by the implementation to store the | 
|  | // factorization, irrespective of whether the factorization succeeds or not. | 
|  | // As a result it is the user's responsibility to ensure that lhs is valid | 
|  | // when Solve is called. | 
|  | virtual LinearSolverTerminationType Factorize(int num_cols, | 
|  | double* lhs, | 
|  | std::string* message) = 0; | 
|  |  | 
|  | // Computes the solution to the equation | 
|  | // | 
|  | // lhs * solution = rhs | 
|  | // | 
|  | // Calling Solve without calling Factorize is undefined behaviour. It is the | 
|  | // user's responsibility to ensure that the input matrix lhs passed to | 
|  | // Factorize has not been freed/modified when Solve is called. | 
|  | virtual LinearSolverTerminationType Solve(const double* rhs, | 
|  | double* solution, | 
|  | std::string* message) = 0; | 
|  |  | 
|  | // Convenience method which combines a call to Factorize and Solve. Solve is | 
|  | // only called if Factorize returns LinearSolverTerminationType::SUCCESS. | 
|  | // | 
|  | // The input matrix lhs may be modified by the implementation to store the | 
|  | // factorization, irrespective of whether the method succeeds or not. It is | 
|  | // the user's responsibility to ensure that lhs is valid if and when Solve is | 
|  | // called again after this call. | 
|  | LinearSolverTerminationType FactorAndSolve(int num_cols, | 
|  | double* lhs, | 
|  | const double* rhs, | 
|  | double* solution, | 
|  | std::string* message); | 
|  | }; | 
|  |  | 
|  | class CERES_NO_EXPORT EigenDenseCholesky final : public DenseCholesky { | 
|  | public: | 
|  | LinearSolverTerminationType Factorize(int num_cols, | 
|  | double* lhs, | 
|  | std::string* message) override; | 
|  | LinearSolverTerminationType Solve(const double* rhs, | 
|  | double* solution, | 
|  | std::string* message) override; | 
|  |  | 
|  | private: | 
|  | using LLTType = Eigen::LLT<Eigen::Ref<Eigen::MatrixXd>, Eigen::Lower>; | 
|  | std::unique_ptr<LLTType> llt_; | 
|  | }; | 
|  |  | 
|  | class CERES_NO_EXPORT FloatEigenDenseCholesky final : public DenseCholesky { | 
|  | public: | 
|  | LinearSolverTerminationType Factorize(int num_cols, | 
|  | double* lhs, | 
|  | std::string* message) override; | 
|  | LinearSolverTerminationType Solve(const double* rhs, | 
|  | double* solution, | 
|  | std::string* message) override; | 
|  |  | 
|  | private: | 
|  | Eigen::MatrixXf lhs_; | 
|  | Eigen::VectorXf rhs_; | 
|  | Eigen::VectorXf solution_; | 
|  | using LLTType = Eigen::LLT<Eigen::MatrixXf, Eigen::Lower>; | 
|  | std::unique_ptr<LLTType> llt_; | 
|  | }; | 
|  |  | 
|  | #ifndef CERES_NO_LAPACK | 
|  | class CERES_NO_EXPORT LAPACKDenseCholesky final : public DenseCholesky { | 
|  | public: | 
|  | LinearSolverTerminationType Factorize(int num_cols, | 
|  | double* lhs, | 
|  | std::string* message) override; | 
|  | LinearSolverTerminationType Solve(const double* rhs, | 
|  | double* solution, | 
|  | std::string* message) override; | 
|  |  | 
|  | private: | 
|  | double* lhs_ = nullptr; | 
|  | int num_cols_ = -1; | 
|  | LinearSolverTerminationType termination_type_ = | 
|  | LinearSolverTerminationType::FATAL_ERROR; | 
|  | }; | 
|  |  | 
|  | class CERES_NO_EXPORT FloatLAPACKDenseCholesky final : public DenseCholesky { | 
|  | public: | 
|  | LinearSolverTerminationType Factorize(int num_cols, | 
|  | double* lhs, | 
|  | std::string* message) override; | 
|  | LinearSolverTerminationType Solve(const double* rhs, | 
|  | double* solution, | 
|  | std::string* message) override; | 
|  |  | 
|  | private: | 
|  | Eigen::MatrixXf lhs_; | 
|  | Eigen::VectorXf rhs_and_solution_; | 
|  | int num_cols_ = -1; | 
|  | LinearSolverTerminationType termination_type_ = | 
|  | LinearSolverTerminationType::FATAL_ERROR; | 
|  | }; | 
|  | #endif  // CERES_NO_LAPACK | 
|  |  | 
|  | class DenseIterativeRefiner; | 
|  |  | 
|  | // Computes an initial solution using the given instance of | 
|  | // DenseCholesky, and then refines it using the DenseIterativeRefiner. | 
|  | class CERES_NO_EXPORT RefinedDenseCholesky final : public DenseCholesky { | 
|  | public: | 
|  | RefinedDenseCholesky( | 
|  | std::unique_ptr<DenseCholesky> dense_cholesky, | 
|  | std::unique_ptr<DenseIterativeRefiner> iterative_refiner); | 
|  | ~RefinedDenseCholesky() override; | 
|  |  | 
|  | LinearSolverTerminationType Factorize(int num_cols, | 
|  | double* lhs, | 
|  | std::string* message) override; | 
|  | LinearSolverTerminationType Solve(const double* rhs, | 
|  | double* solution, | 
|  | std::string* message) override; | 
|  |  | 
|  | private: | 
|  | std::unique_ptr<DenseCholesky> dense_cholesky_; | 
|  | std::unique_ptr<DenseIterativeRefiner> iterative_refiner_; | 
|  | double* lhs_ = nullptr; | 
|  | int num_cols_; | 
|  | }; | 
|  |  | 
|  | #ifndef CERES_NO_CUDA | 
|  | // CUDA implementation of DenseCholesky using the cuSolverDN library using the | 
|  | // 32-bit legacy interface for maximum compatibility. | 
|  | class CERES_NO_EXPORT CUDADenseCholesky final : public DenseCholesky { | 
|  | public: | 
|  | static std::unique_ptr<CUDADenseCholesky> Create( | 
|  | const LinearSolver::Options& options); | 
|  | CUDADenseCholesky(const CUDADenseCholesky&) = delete; | 
|  | CUDADenseCholesky& operator=(const CUDADenseCholesky&) = delete; | 
|  | LinearSolverTerminationType Factorize(int num_cols, | 
|  | double* lhs, | 
|  | std::string* message) override; | 
|  | LinearSolverTerminationType Solve(const double* rhs, | 
|  | double* solution, | 
|  | std::string* message) override; | 
|  |  | 
|  | private: | 
|  | explicit CUDADenseCholesky(ContextImpl* context); | 
|  |  | 
|  | ContextImpl* context_ = nullptr; | 
|  | // Number of columns in the A matrix, to be cached between calls to *Factorize | 
|  | // and *Solve. | 
|  | size_t num_cols_ = 0; | 
|  | // GPU memory allocated for the A matrix (lhs matrix). | 
|  | CudaBuffer<double> lhs_; | 
|  | // GPU memory allocated for the B matrix (rhs vector). | 
|  | CudaBuffer<double> rhs_; | 
|  | // Scratch space for cuSOLVER on the GPU. | 
|  | CudaBuffer<double> device_workspace_; | 
|  | // Required for error handling with cuSOLVER. | 
|  | CudaBuffer<int> error_; | 
|  | // Cache the result of Factorize to ensure that when Solve is called, the | 
|  | // factorization of lhs is valid. | 
|  | LinearSolverTerminationType factorize_result_ = | 
|  | LinearSolverTerminationType::FATAL_ERROR; | 
|  | }; | 
|  |  | 
|  | // A mixed-precision iterative refinement dense Cholesky solver using FP32 CUDA | 
|  | // Dense Cholesky for inner iterations, and FP64 outer refinements. | 
|  | // This class implements a modified version of the  "Classical iterative | 
|  | // refinement" (Algorithm 4.1) from the following paper: | 
|  | // Haidar, Azzam, Harun Bayraktar, Stanimire Tomov, Jack Dongarra, and Nicholas | 
|  | // J. Higham. "Mixed-precision iterative refinement using tensor cores on GPUs | 
|  | // to accelerate solution of linear systems." Proceedings of the Royal Society A | 
|  | // 476, no. 2243 (2020): 20200110. | 
|  | // | 
|  | // The three key modifications from Algorithm 4.1 in the paper are: | 
|  | // 1. We use Cholesky factorization instead of LU factorization since our A is | 
|  | //    symmetric positive definite. | 
|  | // 2. During the solution update, the up-cast and accumulation is performed in | 
|  | //    one step with a custom kernel. | 
|  | class CERES_NO_EXPORT CUDADenseCholeskyMixedPrecision final | 
|  | : public DenseCholesky { | 
|  | public: | 
|  | static std::unique_ptr<CUDADenseCholeskyMixedPrecision> Create( | 
|  | const LinearSolver::Options& options); | 
|  | CUDADenseCholeskyMixedPrecision(const CUDADenseCholeskyMixedPrecision&) = | 
|  | delete; | 
|  | CUDADenseCholeskyMixedPrecision& operator=( | 
|  | const CUDADenseCholeskyMixedPrecision&) = delete; | 
|  | LinearSolverTerminationType Factorize(int num_cols, | 
|  | double* lhs, | 
|  | std::string* message) override; | 
|  | LinearSolverTerminationType Solve(const double* rhs, | 
|  | double* solution, | 
|  | std::string* message) override; | 
|  |  | 
|  | private: | 
|  | CUDADenseCholeskyMixedPrecision(ContextImpl* context, | 
|  | int max_num_refinement_iterations); | 
|  |  | 
|  | // Helper function to wrap Cuda boilerplate needed to call Spotrf. | 
|  | LinearSolverTerminationType CudaCholeskyFactorize(std::string* message); | 
|  | // Helper function to wrap Cuda boilerplate needed to call Spotrs. | 
|  | LinearSolverTerminationType CudaCholeskySolve(std::string* message); | 
|  | // Picks up the cuSolverDN and cuStream handles from the context in the | 
|  | // options, and the number of refinement iterations from the options. If | 
|  | // the context is unable to initialize CUDA, returns false with a | 
|  | // human-readable message indicating the reason. | 
|  | bool Init(const LinearSolver::Options& options, std::string* message); | 
|  |  | 
|  | ContextImpl* context_ = nullptr; | 
|  | // Number of columns in the A matrix, to be cached between calls to *Factorize | 
|  | // and *Solve. | 
|  | size_t num_cols_ = 0; | 
|  | CudaBuffer<double> lhs_fp64_; | 
|  | CudaBuffer<double> rhs_fp64_; | 
|  | CudaBuffer<float> lhs_fp32_; | 
|  | // Scratch space for cuSOLVER on the GPU. | 
|  | CudaBuffer<float> device_workspace_; | 
|  | // Required for error handling with cuSOLVER. | 
|  | CudaBuffer<int> error_; | 
|  |  | 
|  | // Solution to lhs * x = rhs. | 
|  | CudaBuffer<double> x_fp64_; | 
|  | // Incremental correction to x. | 
|  | CudaBuffer<float> correction_fp32_; | 
|  | // Residual to iterative refinement. | 
|  | CudaBuffer<float> residual_fp32_; | 
|  | CudaBuffer<double> residual_fp64_; | 
|  |  | 
|  | // Number of inner refinement iterations to perform. | 
|  | int max_num_refinement_iterations_ = 0; | 
|  | // Cache the result of Factorize to ensure that when Solve is called, the | 
|  | // factorization of lhs is valid. | 
|  | LinearSolverTerminationType factorize_result_ = | 
|  | LinearSolverTerminationType::FATAL_ERROR; | 
|  | }; | 
|  |  | 
|  | #endif  // CERES_NO_CUDA | 
|  |  | 
|  | }  // namespace ceres::internal | 
|  |  | 
|  | #endif  // CERES_INTERNAL_DENSE_CHOLESKY_H_ |