internal/ceres/cuda_sparse_cholesky.cc - ceres-solver - Git at Google

 // Ceres Solver - A fast non-linear least squares minimizer
 // Copyright 2024 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // * Redistributions of source code must retain the above copyright notice,
 //   this list of conditions and the following disclaimer.
 // * Redistributions in binary form must reproduce the above copyright notice,
 //   this list of conditions and the following disclaimer in the documentation
 //   and/or other materials provided with the distribution.
 // * Neither the name of Google Inc. nor the names of its contributors may be
 //   used to endorse or promote products derived from this software without
 //   specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Author: markshachkov@gmail.com (Mark Shachkov)

 #include "ceres/cuda_sparse_cholesky.h"

 #ifndef CERES_NO_CUDSS

 #include <cstddef>
 #include <iostream>
 #include <memory>
 #include <string>
 #include <type_traits>

 #include "Eigen/Core"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "ceres/compressed_row_sparse_matrix.h"
 #include "ceres/cuda_buffer.h"
 #include "ceres/linear_solver.h"
 #include "cudss.h"

 namespace ceres::internal {

 inline std::string cuDSSStatusToString(cudssStatus_t status) {
   switch (status) {
     case CUDSS_STATUS_SUCCESS:
       return "CUDSS_STATUS_SUCCESS";
     case CUDSS_STATUS_NOT_INITIALIZED:
       return "CUDSS_STATUS_NOT_INITIALIZED";
     case CUDSS_STATUS_ALLOC_FAILED:
       return "CUDSS_STATUS_ALLOC_FAILED";
     case CUDSS_STATUS_INVALID_VALUE:
       return "CUDSS_STATUS_INVALID_VALUE";
     case CUDSS_STATUS_NOT_SUPPORTED:
       return "CUDSS_STATUS_NOT_SUPPORTED";
     case CUDSS_STATUS_EXECUTION_FAILED:
       return "CUDSS_STATUS_EXECUTION_FAILED";
     case CUDSS_STATUS_INTERNAL_ERROR:
       return "CUDSS_STATUS_INTERNAL_ERROR";
     default:
       return "unknown cuDSS status: " + std::to_string(status);
   }
 }

 #define CUDSS_STATUS_CHECK(IN)                                     \
   if (cudssStatus_t status = IN; status != CUDSS_STATUS_SUCCESS) { \
     CHECK(false) << "Got error: " << cuDSSStatusToString(status);  \
   }

 #define CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(IN, additional_message)    \
   if (cudssStatus_t status = IN; status != CUDSS_STATUS_SUCCESS) {       \
     *message = std::string(additional_message) +                         \
                " Got error: " + cuDSSStatusToString(status);             \
     return factorize_result_ = LinearSolverTerminationType::FATAL_ERROR; \
   }

 #define CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS(IN)                 \
   if (cudssStatus_t status = IN; status != CUDSS_STATUS_SUCCESS) { \
     return status;                                                 \
   }

 class CERES_NO_EXPORT CuDSSMatrixBase {
  public:
   CuDSSMatrixBase() = default;
   CuDSSMatrixBase(const CuDSSMatrixBase&) = delete;
   CuDSSMatrixBase(CuDSSMatrixBase&&) = delete;
   CuDSSMatrixBase& operator=(const CuDSSMatrixBase&) = delete;
   CuDSSMatrixBase& operator=(CuDSSMatrixBase&&) = delete;
   ~CuDSSMatrixBase() { CUDSS_STATUS_CHECK(Free()); }

   cudssStatus_t Free() noexcept {
     if (matrix_) {
       const auto status = cudssMatrixDestroy(matrix_);
       matrix_ = nullptr;
       return status;
     }

     return CUDSS_STATUS_SUCCESS;
   }

   cudssMatrix_t Get() const noexcept { return matrix_; }

  protected:
   cudssMatrix_t matrix_{nullptr};
 };

 class CERES_NO_EXPORT CuDSSMatrixCSR : public CuDSSMatrixBase {
  public:
   cudssStatus_t Reset(int64_t num_rows,
                       int64_t num_cols,
                       int64_t num_nonzeros,
                       void* rows_start,
                       void* rows_end,
                       void* cols,
                       void* values,
                       cudaDataType_t index_type,
                       cudaDataType_t value_type,
                       cudssMatrixType_t matrix_type,
                       cudssMatrixViewType_t matrix_storage_type,
                       cudssIndexBase_t index_base) {
     CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS(Free());

     return cudssMatrixCreateCsr(&matrix_,
                                 num_rows,
                                 num_cols,
                                 num_nonzeros,
                                 rows_start,
                                 rows_end,
                                 cols,
                                 values,
                                 index_type,
                                 value_type,
                                 matrix_type,
                                 matrix_storage_type,
                                 index_base);
   }
 };

 class CERES_NO_EXPORT CuDSSMatrixDense : public CuDSSMatrixBase {
  public:
   cudssStatus_t Reset(int64_t num_rows,
                       int64_t num_cols,
                       int64_t leading_dimension_size,
                       void* values,
                       cudaDataType_t value_type,
                       cudssLayout_t layout) {
     CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS(Free());

     return cudssMatrixCreateDn(&matrix_,
                                num_rows,
                                num_cols,
                                leading_dimension_size,
                                values,
                                value_type,
                                layout);
   }
 };

 struct CudssContext {
   CudssContext(cudssHandle_t cudss_handle) : cudss_handle_(cudss_handle) {
     CUDSS_STATUS_CHECK(cudssConfigCreate(&solver_config_));
     CUDSS_STATUS_CHECK(cudssDataCreate(cudss_handle_, &solver_data_));
   }
   CudssContext(const CudssContext&) = delete;
   CudssContext(CudssContext&&) = delete;
   CudssContext& operator=(const CudssContext&) = delete;
   CudssContext& operator=(CudssContext&&) = delete;
   ~CudssContext() {
     CUDSS_STATUS_CHECK(cudssDataDestroy(cudss_handle_, solver_data_));
     CUDSS_STATUS_CHECK(cudssConfigDestroy(solver_config_));
   }

   cudssHandle_t cudss_handle_{nullptr};
   cudssConfig_t solver_config_{nullptr};
   cudssData_t solver_data_{nullptr};
 };

 template <typename Scalar>
 class CERES_NO_EXPORT CudaSparseCholeskyImpl final : public SparseCholesky {
  public:
   static_assert(std::is_same_v<Scalar, float> || std::is_same_v<Scalar, double>,
                 "Scalar type is unsupported by cuDSS");
   static constexpr cudaDataType_t kCuDSSScalar =
       std::is_same_v<Scalar, float> ? CUDA_R_32F : CUDA_R_64F;

   CudaSparseCholeskyImpl(ContextImpl* context)
       : context_(context),
         lhs_cols_d_(context_),
         lhs_rows_d_(context_),
         lhs_values_d_(context_),
         rhs_d_(context_),
         x_d_(context_) {}
   CudaSparseCholeskyImpl(const CudaSparseCholeskyImpl&) = delete;
   CudaSparseCholeskyImpl(CudaSparseCholeskyImpl&&) = delete;
   CudaSparseCholeskyImpl& operator=(const CudaSparseCholeskyImpl&) = delete;
   CudaSparseCholeskyImpl& operator=(CudaSparseCholeskyImpl&&) = delete;
   ~CudaSparseCholeskyImpl() = default;

   CompressedRowSparseMatrix::StorageType StorageType() const {
     return CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR;
   }

   LinearSolverTerminationType Factorize(CompressedRowSparseMatrix* lhs,
                                         std::string* message) {
     if (lhs->num_rows() != lhs->num_cols()) {
       *message = "lhs matrix must be square";
       return factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
     }
     if (lhs->storage_type() != StorageType()) {
       *message = "lhs matrix must be lower triangular";
       return factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
     }

     // If, after previous attempt to factorize, cudssDataGet(CUDSS_DATA_INFO)
     // returned a numerical error, such error will be preserved by cuDSS 0.3.0
     // and returned by cudssDataGet(CUDSS_DATA_INFO) even after correctly
     // factorizeable matrix is provided. Such behaviour forces us to reset a
     // cudssData_t object (managed by CudssContext) and to loose a result of
     // anylyze stage, thus we have to perform analyze one more time.
     // TODO: do not re-perform analyze in case of failed factorization numerics
     if (analyze_result_ != LinearSolverTerminationType::SUCCESS ||
         factorize_result_ != LinearSolverTerminationType::SUCCESS) {
       analyze_result_ = Analyze(lhs, message);
       if (analyze_result_ != LinearSolverTerminationType::SUCCESS) {
         return analyze_result_;
       }
     }
     CHECK_NE(cudss_context_.get(), nullptr);

     ConvertAndCopyToDevice(lhs->values(), lhs_values_h_.data(), lhs_values_d_);

     CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
         cudssExecute(context_->cudss_handle_,
                      CUDSS_PHASE_FACTORIZATION,
                      cudss_context_->solver_config_,
                      cudss_context_->solver_data_,
                      cudss_lhs_.Get(),
                      cudss_x_.Get(),
                      cudss_rhs_.Get()),
         "cudssExecute with CUDSS_PHASE_FACTORIZATION failed");

     int cudss_data_info;
     CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
         GetCudssDataInfo(cudss_data_info),
         "cudssDataGet with CUDSS_DATA_INFO failed");
     const auto factorization_status =
         static_cast<cudssStatus_t>(cudss_data_info);

     if (factorization_status == CUDSS_STATUS_SUCCESS) {
       return factorize_result_ = LinearSolverTerminationType::SUCCESS;
     }

     if (cudss_data_info > 0) {
       return factorize_result_ = LinearSolverTerminationType::FAILURE;
     }

     return factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
   }

   LinearSolverTerminationType Solve(const double* rhs,
                                     double* solution,
                                     std::string* message) {
     CHECK_NE(cudss_context_.get(), nullptr);

     if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
       *message = "Factorize did not complete successfully previously.";
       return factorize_result_;
     }

     ConvertAndCopyToDevice(rhs, rhs_h_.data(), rhs_d_);

     CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
         cudssExecute(context_->cudss_handle_,
                      CUDSS_PHASE_SOLVE,
                      cudss_context_->solver_config_,
                      cudss_context_->solver_data_,
                      cudss_lhs_.Get(),
                      cudss_x_.Get(),
                      cudss_rhs_.Get()),
         "cudssExecute with CUDSS_PHASE_SOLVE failed");

     ConvertAndCopyToHost(x_d_, x_h_.data(), solution);

     int cudss_data_info;
     CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
         GetCudssDataInfo(cudss_data_info),
         "cudssDataGet with CUDSS_DATA_INFO failed");
     const auto solve_status = static_cast<cudssStatus_t>(cudss_data_info);

     if (solve_status != CUDSS_STATUS_SUCCESS) {
       return LinearSolverTerminationType::FAILURE;
     }

     return LinearSolverTerminationType::SUCCESS;
   }

  private:
   cudssStatus_t GetCudssDataInfo(int& cudss_data_info) {
     CHECK_NE(cudss_context_.get(), nullptr);
     std::size_t size_written = 0;
     CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS(
         cudssDataGet(context_->cudss_handle_,
                      cudss_context_->solver_data_,
                      CUDSS_DATA_INFO,
                      &cudss_data_info,
                      sizeof(cudss_data_info),
                      &size_written));
     // TODO: enable following check after cudssDataGet will be fixed
     // CHECK_EQ(size_written, sizeof(cudss_data_info));

     return CUDSS_STATUS_SUCCESS;
   }

   LinearSolverTerminationType Analyze(const CompressedRowSparseMatrix* lhs,
                                       std::string* message) {
     if (auto status = SetupCudssMatrices(lhs, message);
         status != LinearSolverTerminationType::SUCCESS) {
       return status;
     }

     lhs_rows_d_.CopyFromCpu(lhs->rows(), lhs->num_rows() + 1);
     lhs_cols_d_.CopyFromCpu(lhs->cols(), lhs->num_nonzeros());

     // Analyze and factorization results are stored in cudssData_t (managed by
     // CudssContext). Given that cuDSS 0.3.0 does not reset it's error state in
     // case of failed numerics at factorization stage, we have to reset
     // cudssData_t and to recompute an analyze stage while trying to factorize a
     // rescaled matrix with the same structure.
     // TODO: move creation of CudssContext to ctor of CudaSparseCholeskyImpl
     cudss_context_ = std::make_unique<CudssContext>(context_->cudss_handle_);

     CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
         cudssExecute(context_->cudss_handle_,
                      CUDSS_PHASE_ANALYSIS,
                      cudss_context_->solver_config_,
                      cudss_context_->solver_data_,
                      cudss_lhs_.Get(),
                      cudss_x_.Get(),
                      cudss_rhs_.Get()),
         "cudssExecute with CUDSS_PHASE_ANALYSIS failed");

     return LinearSolverTerminationType::SUCCESS;
   }

   // Resize buffers and setup cuDSS structs that describe the type and storage
   // configuration of linear system operands.
   LinearSolverTerminationType SetupCudssMatrices(
       const CompressedRowSparseMatrix* lhs, std::string* message) {
     const auto num_rows = lhs->num_rows();
     const auto num_nonzeros = lhs->num_nonzeros();

     if constexpr (std::is_same_v<Scalar, float>) {
       lhs_values_h_.Reserve(num_nonzeros);
       rhs_h_.Reserve(num_rows);
       x_h_.Reserve(num_rows);
     }

     lhs_rows_d_.Reserve(num_rows + 1);
     lhs_cols_d_.Reserve(num_nonzeros);
     lhs_values_d_.Reserve(num_nonzeros);
     rhs_d_.Reserve(num_rows);
     x_d_.Reserve(num_rows);

     static constexpr auto kFailedToCreateCuDSSMatrix =
         "cudssMatrixCreate() call failed";
     CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(cudss_lhs_.Reset(num_rows,
                                                            num_rows,
                                                            num_nonzeros,
                                                            lhs_rows_d_.data(),
                                                            nullptr,
                                                            lhs_cols_d_.data(),
                                                            lhs_values_d_.data(),
                                                            CUDA_R_32I,
                                                            kCuDSSScalar,
                                                            CUDSS_MTYPE_SPD,
                                                            CUDSS_MVIEW_LOWER,
                                                            CUDSS_BASE_ZERO),
                                           kFailedToCreateCuDSSMatrix);

     CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
         cudss_rhs_.Reset(num_rows,
                          1,
                          num_rows,
                          rhs_d_.data(),
                          kCuDSSScalar,
                          CUDSS_LAYOUT_COL_MAJOR),
         kFailedToCreateCuDSSMatrix);

     CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
         cudss_x_.Reset(num_rows,
                        1,
                        num_rows,
                        x_d_.data(),
                        kCuDSSScalar,
                        CUDSS_LAYOUT_COL_MAJOR),
         kFailedToCreateCuDSSMatrix);

     return LinearSolverTerminationType::SUCCESS;
   }

   template <typename S, typename D>
   void Convert(const S* source, D* destination, size_t size) {
     Eigen::Map<Eigen::Matrix<D, Eigen::Dynamic, 1>>(destination, size) =
         Eigen::Map<const Eigen::Matrix<S, Eigen::Dynamic, 1>>(source, size)
             .template cast<D>();
   }

   void ConvertAndCopyToDevice(const double* source,
                               Scalar* intermediate,
                               CudaBuffer<Scalar>& destination) {
     const auto size = destination.size();
     if constexpr (std::is_same_v<Scalar, double>) {
       destination.CopyFromCpu(source, size);
     } else {
       Convert(source, intermediate, size);
       destination.CopyFromCpu(intermediate, size);
     }
   }

   void ConvertAndCopyToHost(const CudaBuffer<Scalar>& source,
                             Scalar* intermediate,
                             double* destination) {
     const auto size = source.size();
     if constexpr (std::is_same_v<Scalar, double>) {
       source.CopyToCpu(destination, source.size());
     } else {
       source.CopyToCpu(intermediate, source.size());
       Convert(intermediate, destination, size);
     }
   }

   ContextImpl* context_{nullptr};
   std::unique_ptr<CudssContext> cudss_context_;
   CuDSSMatrixCSR cudss_lhs_;
   CuDSSMatrixDense cudss_rhs_;
   CuDSSMatrixDense cudss_x_;

   CudaPinnedHostBuffer<Scalar> lhs_values_h_;
   CudaPinnedHostBuffer<Scalar> rhs_h_;
   CudaPinnedHostBuffer<Scalar> x_h_;
   CudaBuffer<int> lhs_rows_d_;
   CudaBuffer<int> lhs_cols_d_;
   CudaBuffer<Scalar> lhs_values_d_;
   CudaBuffer<Scalar> rhs_d_;
   CudaBuffer<Scalar> x_d_;

   LinearSolverTerminationType analyze_result_ =
       LinearSolverTerminationType::FATAL_ERROR;
   LinearSolverTerminationType factorize_result_ =
       LinearSolverTerminationType::FATAL_ERROR;
 };

 template <typename Scalar>
 std::unique_ptr<SparseCholesky> CudaSparseCholesky<Scalar>::Create(
     ContextImpl* context, const OrderingType ordering_type) {
   if (ordering_type == OrderingType::NESDIS) {
     LOG(FATAL)
         << "Congratulations you have found a bug in Ceres Solver. Please "
            "report it to the Ceres Solver developers.";
     return nullptr;
   }

   if (context == nullptr || !context->IsCudaInitialized()) {
     LOG(FATAL) << "CudaSparseCholesky requires CUDA context to be initialized";
     return nullptr;
   }

   return std::make_unique<CudaSparseCholeskyImpl<Scalar>>(context);
 }

 template class CudaSparseCholesky<float>;
 template class CudaSparseCholesky<double>;

 }  // namespace ceres::internal

 #undef CUDSS_STATUS_CHECK
 #undef CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR
 #undef CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS

 #endif  // CERES_NO_CUDSS
	// Ceres Solver - A fast non-linear least squares minimizer
	// Copyright 2024 Google Inc. All rights reserved.
	// http://ceres-solver.org/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice,
	// this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	// * Neither the name of Google Inc. nor the names of its contributors may be
	// used to endorse or promote products derived from this software without
	// specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Author: markshachkov@gmail.com (Mark Shachkov)

	#include "ceres/cuda_sparse_cholesky.h"

	#ifndef CERES_NO_CUDSS

	#include <cstddef>
	#include <iostream>
	#include <memory>
	#include <string>
	#include <type_traits>

	#include "Eigen/Core"
	#include "absl/log/check.h"
	#include "absl/log/log.h"
	#include "ceres/compressed_row_sparse_matrix.h"
	#include "ceres/cuda_buffer.h"
	#include "ceres/linear_solver.h"
	#include "cudss.h"

	namespace ceres::internal {

	inline std::string cuDSSStatusToString(cudssStatus_t status) {
	switch (status) {
	case CUDSS_STATUS_SUCCESS:
	return "CUDSS_STATUS_SUCCESS";
	case CUDSS_STATUS_NOT_INITIALIZED:
	return "CUDSS_STATUS_NOT_INITIALIZED";
	case CUDSS_STATUS_ALLOC_FAILED:
	return "CUDSS_STATUS_ALLOC_FAILED";
	case CUDSS_STATUS_INVALID_VALUE:
	return "CUDSS_STATUS_INVALID_VALUE";
	case CUDSS_STATUS_NOT_SUPPORTED:
	return "CUDSS_STATUS_NOT_SUPPORTED";
	case CUDSS_STATUS_EXECUTION_FAILED:
	return "CUDSS_STATUS_EXECUTION_FAILED";
	case CUDSS_STATUS_INTERNAL_ERROR:
	return "CUDSS_STATUS_INTERNAL_ERROR";
	default:
	return "unknown cuDSS status: " + std::to_string(status);
	}
	}

	#define CUDSS_STATUS_CHECK(IN) \
	if (cudssStatus_t status = IN; status != CUDSS_STATUS_SUCCESS) { \
	CHECK(false) << "Got error: " << cuDSSStatusToString(status); \
	}

	#define CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(IN, additional_message) \
	if (cudssStatus_t status = IN; status != CUDSS_STATUS_SUCCESS) { \
	*message = std::string(additional_message) + \
	" Got error: " + cuDSSStatusToString(status); \
	return factorize_result_ = LinearSolverTerminationType::FATAL_ERROR; \
	}

	#define CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS(IN) \
	if (cudssStatus_t status = IN; status != CUDSS_STATUS_SUCCESS) { \
	return status; \
	}

	class CERES_NO_EXPORT CuDSSMatrixBase {
	public:
	CuDSSMatrixBase() = default;
	CuDSSMatrixBase(const CuDSSMatrixBase&) = delete;
	CuDSSMatrixBase(CuDSSMatrixBase&&) = delete;
	CuDSSMatrixBase& operator=(const CuDSSMatrixBase&) = delete;
	CuDSSMatrixBase& operator=(CuDSSMatrixBase&&) = delete;
	~CuDSSMatrixBase() { CUDSS_STATUS_CHECK(Free()); }

	cudssStatus_t Free() noexcept {
	if (matrix_) {
	const auto status = cudssMatrixDestroy(matrix_);
	matrix_ = nullptr;
	return status;
	}

	return CUDSS_STATUS_SUCCESS;
	}

	cudssMatrix_t Get() const noexcept { return matrix_; }

	protected:
	cudssMatrix_t matrix_{nullptr};
	};

	class CERES_NO_EXPORT CuDSSMatrixCSR : public CuDSSMatrixBase {
	public:
	cudssStatus_t Reset(int64_t num_rows,
	int64_t num_cols,
	int64_t num_nonzeros,
	void* rows_start,
	void* rows_end,
	void* cols,
	void* values,
	cudaDataType_t index_type,
	cudaDataType_t value_type,
	cudssMatrixType_t matrix_type,
	cudssMatrixViewType_t matrix_storage_type,
	cudssIndexBase_t index_base) {
	CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS(Free());

	return cudssMatrixCreateCsr(&matrix_,
	num_rows,
	num_cols,
	num_nonzeros,
	rows_start,
	rows_end,
	cols,
	values,
	index_type,
	value_type,
	matrix_type,
	matrix_storage_type,
	index_base);
	}
	};

	class CERES_NO_EXPORT CuDSSMatrixDense : public CuDSSMatrixBase {
	public:
	cudssStatus_t Reset(int64_t num_rows,
	int64_t num_cols,
	int64_t leading_dimension_size,
	void* values,
	cudaDataType_t value_type,
	cudssLayout_t layout) {
	CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS(Free());

	return cudssMatrixCreateDn(&matrix_,
	num_rows,
	num_cols,
	leading_dimension_size,
	values,
	value_type,
	layout);
	}
	};

	struct CudssContext {
	CudssContext(cudssHandle_t cudss_handle) : cudss_handle_(cudss_handle) {
	CUDSS_STATUS_CHECK(cudssConfigCreate(&solver_config_));
	CUDSS_STATUS_CHECK(cudssDataCreate(cudss_handle_, &solver_data_));
	}
	CudssContext(const CudssContext&) = delete;
	CudssContext(CudssContext&&) = delete;
	CudssContext& operator=(const CudssContext&) = delete;
	CudssContext& operator=(CudssContext&&) = delete;
	~CudssContext() {
	CUDSS_STATUS_CHECK(cudssDataDestroy(cudss_handle_, solver_data_));
	CUDSS_STATUS_CHECK(cudssConfigDestroy(solver_config_));
	}

	cudssHandle_t cudss_handle_{nullptr};
	cudssConfig_t solver_config_{nullptr};
	cudssData_t solver_data_{nullptr};
	};

	template <typename Scalar>
	class CERES_NO_EXPORT CudaSparseCholeskyImpl final : public SparseCholesky {
	public:
	static_assert(std::is_same_v<Scalar, float> \|\| std::is_same_v<Scalar, double>,
	"Scalar type is unsupported by cuDSS");
	static constexpr cudaDataType_t kCuDSSScalar =
	std::is_same_v<Scalar, float> ? CUDA_R_32F : CUDA_R_64F;

	CudaSparseCholeskyImpl(ContextImpl* context)
	: context_(context),
	lhs_cols_d_(context_),
	lhs_rows_d_(context_),
	lhs_values_d_(context_),
	rhs_d_(context_),
	x_d_(context_) {}
	CudaSparseCholeskyImpl(const CudaSparseCholeskyImpl&) = delete;
	CudaSparseCholeskyImpl(CudaSparseCholeskyImpl&&) = delete;
	CudaSparseCholeskyImpl& operator=(const CudaSparseCholeskyImpl&) = delete;
	CudaSparseCholeskyImpl& operator=(CudaSparseCholeskyImpl&&) = delete;
	~CudaSparseCholeskyImpl() = default;

	CompressedRowSparseMatrix::StorageType StorageType() const {
	return CompressedRowSparseMatrix::StorageType::LOWER_TRIANGULAR;
	}

	LinearSolverTerminationType Factorize(CompressedRowSparseMatrix* lhs,
	std::string* message) {
	if (lhs->num_rows() != lhs->num_cols()) {
	*message = "lhs matrix must be square";
	return factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
	}
	if (lhs->storage_type() != StorageType()) {
	*message = "lhs matrix must be lower triangular";
	return factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
	}

	// If, after previous attempt to factorize, cudssDataGet(CUDSS_DATA_INFO)
	// returned a numerical error, such error will be preserved by cuDSS 0.3.0
	// and returned by cudssDataGet(CUDSS_DATA_INFO) even after correctly
	// factorizeable matrix is provided. Such behaviour forces us to reset a
	// cudssData_t object (managed by CudssContext) and to loose a result of
	// anylyze stage, thus we have to perform analyze one more time.
	// TODO: do not re-perform analyze in case of failed factorization numerics
	if (analyze_result_ != LinearSolverTerminationType::SUCCESS \|\|
	factorize_result_ != LinearSolverTerminationType::SUCCESS) {
	analyze_result_ = Analyze(lhs, message);
	if (analyze_result_ != LinearSolverTerminationType::SUCCESS) {
	return analyze_result_;
	}
	}
	CHECK_NE(cudss_context_.get(), nullptr);

	ConvertAndCopyToDevice(lhs->values(), lhs_values_h_.data(), lhs_values_d_);

	CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
	cudssExecute(context_->cudss_handle_,
	CUDSS_PHASE_FACTORIZATION,
	cudss_context_->solver_config_,
	cudss_context_->solver_data_,
	cudss_lhs_.Get(),
	cudss_x_.Get(),
	cudss_rhs_.Get()),
	"cudssExecute with CUDSS_PHASE_FACTORIZATION failed");

	int cudss_data_info;
	CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
	GetCudssDataInfo(cudss_data_info),
	"cudssDataGet with CUDSS_DATA_INFO failed");
	const auto factorization_status =
	static_cast<cudssStatus_t>(cudss_data_info);

	if (factorization_status == CUDSS_STATUS_SUCCESS) {
	return factorize_result_ = LinearSolverTerminationType::SUCCESS;
	}

	if (cudss_data_info > 0) {
	return factorize_result_ = LinearSolverTerminationType::FAILURE;
	}

	return factorize_result_ = LinearSolverTerminationType::FATAL_ERROR;
	}

	LinearSolverTerminationType Solve(const double* rhs,
	double* solution,
	std::string* message) {
	CHECK_NE(cudss_context_.get(), nullptr);

	if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
	*message = "Factorize did not complete successfully previously.";
	return factorize_result_;
	}

	ConvertAndCopyToDevice(rhs, rhs_h_.data(), rhs_d_);

	CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
	cudssExecute(context_->cudss_handle_,
	CUDSS_PHASE_SOLVE,
	cudss_context_->solver_config_,
	cudss_context_->solver_data_,
	cudss_lhs_.Get(),
	cudss_x_.Get(),
	cudss_rhs_.Get()),
	"cudssExecute with CUDSS_PHASE_SOLVE failed");

	ConvertAndCopyToHost(x_d_, x_h_.data(), solution);

	int cudss_data_info;
	CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
	GetCudssDataInfo(cudss_data_info),
	"cudssDataGet with CUDSS_DATA_INFO failed");
	const auto solve_status = static_cast<cudssStatus_t>(cudss_data_info);

	if (solve_status != CUDSS_STATUS_SUCCESS) {
	return LinearSolverTerminationType::FAILURE;
	}

	return LinearSolverTerminationType::SUCCESS;
	}

	private:
	cudssStatus_t GetCudssDataInfo(int& cudss_data_info) {
	CHECK_NE(cudss_context_.get(), nullptr);
	std::size_t size_written = 0;
	CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS(
	cudssDataGet(context_->cudss_handle_,
	cudss_context_->solver_data_,
	CUDSS_DATA_INFO,
	&cudss_data_info,
	sizeof(cudss_data_info),
	&size_written));
	// TODO: enable following check after cudssDataGet will be fixed
	// CHECK_EQ(size_written, sizeof(cudss_data_info));

	return CUDSS_STATUS_SUCCESS;
	}

	LinearSolverTerminationType Analyze(const CompressedRowSparseMatrix* lhs,
	std::string* message) {
	if (auto status = SetupCudssMatrices(lhs, message);
	status != LinearSolverTerminationType::SUCCESS) {
	return status;
	}

	lhs_rows_d_.CopyFromCpu(lhs->rows(), lhs->num_rows() + 1);
	lhs_cols_d_.CopyFromCpu(lhs->cols(), lhs->num_nonzeros());

	// Analyze and factorization results are stored in cudssData_t (managed by
	// CudssContext). Given that cuDSS 0.3.0 does not reset it's error state in
	// case of failed numerics at factorization stage, we have to reset
	// cudssData_t and to recompute an analyze stage while trying to factorize a
	// rescaled matrix with the same structure.
	// TODO: move creation of CudssContext to ctor of CudaSparseCholeskyImpl
	cudss_context_ = std::make_unique<CudssContext>(context_->cudss_handle_);

	CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
	cudssExecute(context_->cudss_handle_,
	CUDSS_PHASE_ANALYSIS,
	cudss_context_->solver_config_,
	cudss_context_->solver_data_,
	cudss_lhs_.Get(),
	cudss_x_.Get(),
	cudss_rhs_.Get()),
	"cudssExecute with CUDSS_PHASE_ANALYSIS failed");

	return LinearSolverTerminationType::SUCCESS;
	}

	// Resize buffers and setup cuDSS structs that describe the type and storage
	// configuration of linear system operands.
	LinearSolverTerminationType SetupCudssMatrices(
	const CompressedRowSparseMatrix* lhs, std::string* message) {
	const auto num_rows = lhs->num_rows();
	const auto num_nonzeros = lhs->num_nonzeros();

	if constexpr (std::is_same_v<Scalar, float>) {
	lhs_values_h_.Reserve(num_nonzeros);
	rhs_h_.Reserve(num_rows);
	x_h_.Reserve(num_rows);
	}

	lhs_rows_d_.Reserve(num_rows + 1);
	lhs_cols_d_.Reserve(num_nonzeros);
	lhs_values_d_.Reserve(num_nonzeros);
	rhs_d_.Reserve(num_rows);
	x_d_.Reserve(num_rows);

	static constexpr auto kFailedToCreateCuDSSMatrix =
	"cudssMatrixCreate() call failed";
	CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(cudss_lhs_.Reset(num_rows,
	num_rows,
	num_nonzeros,
	lhs_rows_d_.data(),
	nullptr,
	lhs_cols_d_.data(),
	lhs_values_d_.data(),
	CUDA_R_32I,
	kCuDSSScalar,
	CUDSS_MTYPE_SPD,
	CUDSS_MVIEW_LOWER,
	CUDSS_BASE_ZERO),
	kFailedToCreateCuDSSMatrix);

	CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
	cudss_rhs_.Reset(num_rows,
	1,
	num_rows,
	rhs_d_.data(),
	kCuDSSScalar,
	CUDSS_LAYOUT_COL_MAJOR),
	kFailedToCreateCuDSSMatrix);

	CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR(
	cudss_x_.Reset(num_rows,
	1,
	num_rows,
	x_d_.data(),
	kCuDSSScalar,
	CUDSS_LAYOUT_COL_MAJOR),
	kFailedToCreateCuDSSMatrix);

	return LinearSolverTerminationType::SUCCESS;
	}

	template <typename S, typename D>
	void Convert(const S* source, D* destination, size_t size) {
	Eigen::Map<Eigen::Matrix<D, Eigen::Dynamic, 1>>(destination, size) =
	Eigen::Map<const Eigen::Matrix<S, Eigen::Dynamic, 1>>(source, size)
	.template cast<D>();
	}

	void ConvertAndCopyToDevice(const double* source,
	Scalar* intermediate,
	CudaBuffer<Scalar>& destination) {
	const auto size = destination.size();
	if constexpr (std::is_same_v<Scalar, double>) {
	destination.CopyFromCpu(source, size);
	} else {
	Convert(source, intermediate, size);
	destination.CopyFromCpu(intermediate, size);
	}
	}

	void ConvertAndCopyToHost(const CudaBuffer<Scalar>& source,
	Scalar* intermediate,
	double* destination) {
	const auto size = source.size();
	if constexpr (std::is_same_v<Scalar, double>) {
	source.CopyToCpu(destination, source.size());
	} else {
	source.CopyToCpu(intermediate, source.size());
	Convert(intermediate, destination, size);
	}
	}

	ContextImpl* context_{nullptr};
	std::unique_ptr<CudssContext> cudss_context_;
	CuDSSMatrixCSR cudss_lhs_;
	CuDSSMatrixDense cudss_rhs_;
	CuDSSMatrixDense cudss_x_;

	CudaPinnedHostBuffer<Scalar> lhs_values_h_;
	CudaPinnedHostBuffer<Scalar> rhs_h_;
	CudaPinnedHostBuffer<Scalar> x_h_;
	CudaBuffer<int> lhs_rows_d_;
	CudaBuffer<int> lhs_cols_d_;
	CudaBuffer<Scalar> lhs_values_d_;
	CudaBuffer<Scalar> rhs_d_;
	CudaBuffer<Scalar> x_d_;

	LinearSolverTerminationType analyze_result_ =
	LinearSolverTerminationType::FATAL_ERROR;
	LinearSolverTerminationType factorize_result_ =
	LinearSolverTerminationType::FATAL_ERROR;
	};

	template <typename Scalar>
	std::unique_ptr<SparseCholesky> CudaSparseCholesky<Scalar>::Create(
	ContextImpl* context, const OrderingType ordering_type) {
	if (ordering_type == OrderingType::NESDIS) {
	LOG(FATAL)
	<< "Congratulations you have found a bug in Ceres Solver. Please "
	"report it to the Ceres Solver developers.";
	return nullptr;
	}

	if (context == nullptr \|\| !context->IsCudaInitialized()) {
	LOG(FATAL) << "CudaSparseCholesky requires CUDA context to be initialized";
	return nullptr;
	}

	return std::make_unique<CudaSparseCholeskyImpl<Scalar>>(context);
	}

	template class CudaSparseCholesky<float>;
	template class CudaSparseCholesky<double>;

	} // namespace ceres::internal

	#undef CUDSS_STATUS_CHECK
	#undef CUDSS_STATUS_OK_OR_RETURN_FATAL_ERROR
	#undef CUDSS_STATUS_OK_OR_RETURN_CUDSS_STATUS

	#endif // CERES_NO_CUDSS