| // Ceres Solver - A fast non-linear least squares minimizer |
| // Copyright 2022 Google Inc. All rights reserved. |
| // http://ceres-solver.org/ |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // * Redistributions of source code must retain the above copyright notice, |
| // this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above copyright notice, |
| // this list of conditions and the following disclaimer in the documentation |
| // and/or other materials provided with the distribution. |
| // * Neither the name of Google Inc. nor the names of its contributors may be |
| // used to endorse or promote products derived from this software without |
| // specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| // POSSIBILITY OF SUCH DAMAGE. |
| // |
| // Author: sameeragarwal@google.com (Sameer Agarwal) |
| |
| #ifndef CERES_INTERNAL_DENSE_CHOLESKY_H_ |
| #define CERES_INTERNAL_DENSE_CHOLESKY_H_ |
| |
| // This include must come before any #ifndef check on Ceres compile options. |
| // clang-format off |
| #include "ceres/internal/config.h" |
| // clang-format on |
| |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| #include "Eigen/Dense" |
| #include "ceres/context_impl.h" |
| #include "ceres/cuda_buffer.h" |
| #include "ceres/linear_solver.h" |
| #include "glog/logging.h" |
| #ifndef CERES_NO_CUDA |
| #include "ceres/context_impl.h" |
| #include "cuda_runtime.h" |
| #include "cusolverDn.h" |
| #endif // CERES_NO_CUDA |
| |
| namespace ceres::internal { |
| |
| // An interface that abstracts away the internal details of various dense linear |
| // algebra libraries and offers a simple API for solving dense symmetric |
| // positive definite linear systems using a Cholesky factorization. |
// The interface is split into Factorize() and Solve(), with FactorAndSolve()
// as a convenience wrapper that performs both steps.
| class CERES_NO_EXPORT DenseCholesky { |
| public: |
// Factory returning an owning pointer to a DenseCholesky built from options.
// Which concrete implementation is chosen is decided by the out-of-line
// definition, which is not part of this header.
| static std::unique_ptr<DenseCholesky> Create( |
| const LinearSolver::Options& options); |
| |
// Virtual so that an implementation can be destroyed through a DenseCholesky
// pointer, such as the one returned by Create().
| virtual ~DenseCholesky(); |
| |
| // Computes the Cholesky factorization of the given matrix. |
| // |
| // The input matrix lhs is assumed to be a column-major num_cols x num_cols |
| // matrix, that is symmetric positive definite with its lower triangular part |
| // containing the left hand side of the linear system being solved. |
| // |
| // The input matrix lhs may be modified by the implementation to store the |
| // factorization, irrespective of whether the factorization succeeds or not. |
| // As a result it is the user's responsibility to ensure that lhs is valid |
| // when Solve is called. |
//
// Parameters:
//   num_cols - number of columns (and rows) of lhs.
//   lhs      - storage of the matrix described above; may be overwritten.
//   message  - output string. NOTE(review): presumably filled with a
//              human-readable description when the call does not succeed;
//              confirm against the implementations.
// Returns a LinearSolverTerminationType; FactorAndSolve only goes on to call
// Solve when this is LinearSolverTerminationType::SUCCESS.
| virtual LinearSolverTerminationType Factorize(int num_cols, |
| double* lhs, |
| std::string* message) = 0; |
| |
| // Computes the solution to the equation |
| // |
| // lhs * solution = rhs |
| // |
| // Calling Solve without calling Factorize is undefined behaviour. It is the |
| // user's responsibility to ensure that the input matrix lhs passed to |
| // Factorize has not been freed/modified when Solve is called. |
//
// Parameters:
//   rhs      - right hand side of the system (read only).
//   solution - output buffer for the computed solution.
//   message  - output string, as for Factorize.
// NOTE(review): rhs and solution presumably hold num_cols entries each, with
// num_cols as passed to Factorize; this header does not state it -- confirm.
| virtual LinearSolverTerminationType Solve(const double* rhs, |
| double* solution, |
| std::string* message) = 0; |
| |
| // Convenience method which combines a call to Factorize and Solve. Solve is |
| // only called if Factorize returns LinearSolverTerminationType::SUCCESS. |
| // |
| // The input matrix lhs may be modified by the implementation to store the |
| // factorization, irrespective of whether the method succeeds or not. It is |
| // the user's responsibility to ensure that lhs is valid if and when Solve is |
| // called again after this call. |
//
// Declared non-virtual: implementations customise behaviour through Factorize
// and Solve only. The parameters have the same meaning as in those methods.
| LinearSolverTerminationType FactorAndSolve(int num_cols, |
| double* lhs, |
| const double* rhs, |
| double* solution, |
| std::string* message); |
| }; |
| |
// DenseCholesky implementation declared in terms of Eigen's LLT (Cholesky)
// decomposition on double precision matrices.
| class CERES_NO_EXPORT EigenDenseCholesky final : public DenseCholesky { |
| public: |
// See DenseCholesky::Factorize for the contract.
| LinearSolverTerminationType Factorize(int num_cols, |
| double* lhs, |
| std::string* message) override; |
// See DenseCholesky::Solve for the contract.
| LinearSolverTerminationType Solve(const double* rhs, |
| double* solution, |
| std::string* message) override; |
| |
| private: |
// LLT declared over an Eigen::Ref instead of an owning matrix, using the
// lower triangle (Eigen::Lower). Per Eigen's documentation on in-place
// decompositions, such an LLT factorizes inside the storage it is constructed
// with. NOTE(review): presumably that storage is the lhs buffer handed to
// Factorize, which would match the lifetime warning on
// DenseCholesky::Factorize -- confirm in the definition.
| using LLTType = Eigen::LLT<Eigen::Ref<Eigen::MatrixXd>, Eigen::Lower>; |
// Decomposition object, held by pointer and therefore null until assigned.
// NOTE(review): presumably (re)created by Factorize and read by Solve.
| std::unique_ptr<LLTType> llt_; |
| }; |
| |
// DenseCholesky implementation whose internal state is single precision
// (Eigen::MatrixXf / Eigen::VectorXf) although the interface takes and returns
// doubles. NOTE(review): presumably the inputs are cast down to float for the
// factorization and solve, and the result is cast back to double; the
// conversions are in the definitions, which are not shown in this header.
| class CERES_NO_EXPORT FloatEigenDenseCholesky final : public DenseCholesky { |
| public: |
// See DenseCholesky::Factorize for the contract.
| LinearSolverTerminationType Factorize(int num_cols, |
| double* lhs, |
| std::string* message) override; |
// See DenseCholesky::Solve for the contract.
| LinearSolverTerminationType Solve(const double* rhs, |
| double* solution, |
| std::string* message) override; |
| |
| private: |
// Float-typed storage owned by this object. NOTE(review): judging by the
// names, copies of the left hand side, right hand side and solution.
| Eigen::MatrixXf lhs_; |
| Eigen::VectorXf rhs_; |
| Eigen::VectorXf solution_; |
// Unlike EigenDenseCholesky, the LLT here is declared over an owning
// Eigen::MatrixXf rather than an Eigen::Ref.
| using LLTType = Eigen::LLT<Eigen::MatrixXf, Eigen::Lower>; |
// Decomposition object, held by pointer and therefore null until assigned.
| std::unique_ptr<LLTType> llt_; |
| }; |
| |
| #ifndef CERES_NO_LAPACK |
// DenseCholesky implementation that is only declared when CERES_NO_LAPACK is
// not defined. NOTE(review): judging by the name it calls into LAPACK; the
// calls themselves are in the definitions, not in this header.
| class CERES_NO_EXPORT LAPACKDenseCholesky final : public DenseCholesky { |
| public: |
// See DenseCholesky::Factorize for the contract.
| LinearSolverTerminationType Factorize(int num_cols, |
| double* lhs, |
| std::string* message) override; |
// See DenseCholesky::Solve for the contract.
| LinearSolverTerminationType Solve(const double* rhs, |
| double* solution, |
| std::string* message) override; |
| |
| private: |
// Raw pointer, null until assigned. NOTE(review): presumably the caller's lhs
// buffer remembered by Factorize for use in Solve, which is why
// DenseCholesky::Factorize asks callers to keep lhs valid -- confirm.
| double* lhs_ = nullptr; |
// -1 until assigned. NOTE(review): presumably the num_cols passed to
// Factorize.
| int num_cols_ = -1; |
// Starts out as FATAL_ERROR. NOTE(review): presumably the cached outcome of
// the most recent Factorize call, consulted by Solve -- confirm.
| LinearSolverTerminationType termination_type_ = |
| LinearSolverTerminationType::FATAL_ERROR; |
| }; |
| |
// Variant of LAPACKDenseCholesky whose internal state is single precision
// (Eigen::MatrixXf / Eigen::VectorXf) although the interface takes and returns
// doubles. NOTE(review): presumably inputs are cast down to float and the
// solution is cast back to double in the definitions -- confirm.
| class CERES_NO_EXPORT FloatLAPACKDenseCholesky final : public DenseCholesky { |
| public: |
// See DenseCholesky::Factorize for the contract.
| LinearSolverTerminationType Factorize(int num_cols, |
| double* lhs, |
| std::string* message) override; |
// See DenseCholesky::Solve for the contract.
| LinearSolverTerminationType Solve(const double* rhs, |
| double* solution, |
| std::string* message) override; |
| |
| private: |
// Float-typed matrix owned by this object. NOTE(review): presumably a float
// copy of the lhs passed to Factorize.
| Eigen::MatrixXf lhs_; |
// NOTE(review): the name suggests a single float buffer that first holds the
// right hand side and afterwards the solution -- confirm.
| Eigen::VectorXf rhs_and_solution_; |
// -1 until assigned.
| int num_cols_ = -1; |
// Starts out as FATAL_ERROR. NOTE(review): presumably the cached outcome of
// the most recent Factorize call.
| LinearSolverTerminationType termination_type_ = |
| LinearSolverTerminationType::FATAL_ERROR; |
| }; |
| #endif // CERES_NO_LAPACK |
| |
// Forward declaration only: this header refers to DenseIterativeRefiner solely
// through std::unique_ptr, and the constructor and destructor of
// RefinedDenseCholesky are declared here but defined out of line.
| class DenseIterativeRefiner; |
| |
| // Computes an initial solution using the given instance of |
| // DenseCholesky, and then refines it using the DenseIterativeRefiner. |
| class CERES_NO_EXPORT RefinedDenseCholesky final : public DenseCholesky { |
| public: |
// Both arguments are passed as std::unique_ptr by value, i.e. ownership of the
// wrapped solver and of the refiner is handed over to this object.
| RefinedDenseCholesky( |
| std::unique_ptr<DenseCholesky> dense_cholesky, |
| std::unique_ptr<DenseIterativeRefiner> iterative_refiner); |
| ~RefinedDenseCholesky() override; |
| |
// See DenseCholesky::Factorize for the contract.
| LinearSolverTerminationType Factorize(int num_cols, |
| double* lhs, |
| std::string* message) override; |
// See DenseCholesky::Solve for the contract.
| LinearSolverTerminationType Solve(const double* rhs, |
| double* solution, |
| std::string* message) override; |
| |
| private: |
// Solver producing the initial solution, and the refiner applied to it.
| std::unique_ptr<DenseCholesky> dense_cholesky_; |
| std::unique_ptr<DenseIterativeRefiner> iterative_refiner_; |
// Raw pointer, null until assigned. NOTE(review): presumably the caller's lhs
// buffer remembered by Factorize so that Solve can refine against it.
| double* lhs_ = nullptr; |
// Initialised to -1, as in the LAPACK-based classes, so the member never holds
// an indeterminate value regardless of when the out-of-line code assigns it.
| int num_cols_ = -1; |
| }; |
| |
| #ifndef CERES_NO_CUDA |
| // CUDA implementation of DenseCholesky built on the cuSolverDN library, using |
| // its 32-bit legacy interface for maximum compatibility. |
| class CERES_NO_EXPORT CUDADenseCholesky final : public DenseCholesky { |
| public: |
// Factory. The constructor is private, so this is the only way for code
// outside the class to obtain an instance. NOTE(review): whether the returned
// pointer can be null (e.g. when CUDA is unavailable) is not visible in this
// header -- confirm in the definition.
| static std::unique_ptr<CUDADenseCholesky> Create( |
| const LinearSolver::Options& options); |
// Not copyable.
| CUDADenseCholesky(const CUDADenseCholesky&) = delete; |
| CUDADenseCholesky& operator=(const CUDADenseCholesky&) = delete; |
// See DenseCholesky::Factorize for the contract.
| LinearSolverTerminationType Factorize(int num_cols, |
| double* lhs, |
| std::string* message) override; |
// See DenseCholesky::Solve for the contract.
| LinearSolverTerminationType Solve(const double* rhs, |
| double* solution, |
| std::string* message) override; |
| |
| private: |
// Private and explicit: instances are created through Create().
| explicit CUDADenseCholesky(ContextImpl* context); |
| |
// Raw pointer to the context given to the constructor. NOTE(review):
// presumably not owned by this object -- confirm.
| ContextImpl* context_ = nullptr; |
| // Number of columns in the A matrix, to be cached between calls to *Factorize |
| // and *Solve. |
// NOTE(review): stored as size_t although Factorize receives num_cols as int.
| size_t num_cols_ = 0; |
| // GPU memory allocated for the A matrix (lhs matrix). |
| CudaBuffer<double> lhs_; |
| // GPU memory allocated for the B matrix (rhs vector). |
| CudaBuffer<double> rhs_; |
| // Scratch space for cuSOLVER on the GPU. |
| CudaBuffer<double> device_workspace_; |
| // Required for error handling with cuSOLVER. |
| CudaBuffer<int> error_; |
| // Cache the result of Factorize to ensure that when Solve is called, the |
| // factorization of lhs is valid. |
| LinearSolverTerminationType factorize_result_ = |
| LinearSolverTerminationType::FATAL_ERROR; |
| }; |
| |
| // A mixed-precision iterative refinement dense Cholesky solver using FP32 CUDA |
| // Dense Cholesky for inner iterations, and FP64 outer refinements. |
| // This class implements a modified version of the "Classical iterative |
| // refinement" (Algorithm 4.1) from the following paper: |
| // Haidar, Azzam, Harun Bayraktar, Stanimire Tomov, Jack Dongarra, and Nicholas |
| // J. Higham. "Mixed-precision iterative refinement using tensor cores on GPUs |
| // to accelerate solution of linear systems." Proceedings of the Royal Society A |
| // 476, no. 2243 (2020): 20200110. |
| // |
| // The two key modifications from Algorithm 4.1 in the paper are: |
| // 1. We use Cholesky factorization instead of LU factorization since our A is |
| // symmetric positive definite. |
| // 2. During the solution update, the up-cast and accumulation is performed in |
| // one step with a custom kernel. |
| class CERES_NO_EXPORT CUDADenseCholeskyMixedPrecision final |
| : public DenseCholesky { |
| public: |
// Factory. The constructor is private, so this is the only way for code
// outside the class to obtain an instance. NOTE(review): whether the returned
// pointer can be null is not visible in this header -- confirm.
| static std::unique_ptr<CUDADenseCholeskyMixedPrecision> Create( |
| const LinearSolver::Options& options); |
// Not copyable.
| CUDADenseCholeskyMixedPrecision(const CUDADenseCholeskyMixedPrecision&) = |
| delete; |
| CUDADenseCholeskyMixedPrecision& operator=( |
| const CUDADenseCholeskyMixedPrecision&) = delete; |
// See DenseCholesky::Factorize for the contract.
| LinearSolverTerminationType Factorize(int num_cols, |
| double* lhs, |
| std::string* message) override; |
// See DenseCholesky::Solve for the contract.
| LinearSolverTerminationType Solve(const double* rhs, |
| double* solution, |
| std::string* message) override; |
| |
| private: |
// Private: instances are created through Create(). Receives the context and
// the refinement iteration limit directly.
| CUDADenseCholeskyMixedPrecision(ContextImpl* context, |
| int max_num_refinement_iterations); |
| |
| // Helper function to wrap Cuda boilerplate needed to call Spotrf. |
| LinearSolverTerminationType CudaCholeskyFactorize(std::string* message); |
| // Helper function to wrap Cuda boilerplate needed to call Spotrs. |
| LinearSolverTerminationType CudaCholeskySolve(std::string* message); |
| // Picks up the cuSolverDN and cuStream handles from the context in the |
| // options, and the number of refinement iterations from the options. If |
| // the context is unable to initialize CUDA, returns false with a |
| // human-readable message indicating the reason. |
// NOTE(review): the constructor above already takes the context and the
// iteration count as arguments, whereas this comment describes reading them
// from the options -- verify that Init is still defined and used.
| bool Init(const LinearSolver::Options& options, std::string* message); |
| |
// Raw pointer to the context given to the constructor. NOTE(review):
// presumably not owned by this object -- confirm.
| ContextImpl* context_ = nullptr; |
| // Number of columns in the A matrix, to be cached between calls to *Factorize |
| // and *Solve. |
// NOTE(review): stored as size_t although Factorize receives num_cols as int.
| size_t num_cols_ = 0; |
// NOTE(review): judging by the names and element types, presumably GPU buffers
// holding lhs and rhs in FP64 and a down-cast FP32 copy of lhs -- confirm.
| CudaBuffer<double> lhs_fp64_; |
| CudaBuffer<double> rhs_fp64_; |
| CudaBuffer<float> lhs_fp32_; |
| // Scratch space for cuSOLVER on the GPU. |
| CudaBuffer<float> device_workspace_; |
| // Required for error handling with cuSOLVER. |
| CudaBuffer<int> error_; |
| |
| // Solution to lhs * x = rhs. |
| CudaBuffer<double> x_fp64_; |
| // Incremental correction to x. |
| CudaBuffer<float> correction_fp32_; |
| // Residual to iterative refinement. |
// Declared in both precisions (FP32 and FP64).
| CudaBuffer<float> residual_fp32_; |
| CudaBuffer<double> residual_fp64_; |
| |
| // Number of inner refinement iterations to perform. |
| int max_num_refinement_iterations_ = 0; |
| // Cache the result of Factorize to ensure that when Solve is called, the |
| // factorization of lhs is valid. |
| LinearSolverTerminationType factorize_result_ = |
| LinearSolverTerminationType::FATAL_ERROR; |
| }; |
| |
| #endif // CERES_NO_CUDA |
| |
| } // namespace ceres::internal |
| |
| #endif // CERES_INTERNAL_DENSE_CHOLESKY_H_ |