internal/ceres/cgnr_solver.cc - ceres-solver - Git at Google

 // Ceres Solver - A fast non-linear least squares minimizer
 // Copyright 2023 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // * Redistributions of source code must retain the above copyright notice,
 //   this list of conditions and the following disclaimer.
 // * Redistributions in binary form must reproduce the above copyright notice,
 //   this list of conditions and the following disclaimer in the documentation
 //   and/or other materials provided with the distribution.
 // * Neither the name of Google Inc. nor the names of its contributors may be
 //   used to endorse or promote products derived from this software without
 //   specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Author: keir@google.com (Keir Mierle)

 #include "ceres/cgnr_solver.h"

 #include <memory>
 #include <utility>

 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "ceres/block_jacobi_preconditioner.h"
 #include "ceres/conjugate_gradients_solver.h"
 #include "ceres/cuda_sparse_matrix.h"
 #include "ceres/cuda_vector.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/linear_solver.h"
 #include "ceres/subset_preconditioner.h"
 #include "ceres/wall_time.h"

 namespace ceres::internal {

 // A linear operator which takes a matrix A and a diagonal vector D and
 // performs products of the form
 //
 //   (A^T A + D^T D)x
 //
 // This is used to implement iterative general sparse linear solving with
 // conjugate gradients, where A is the Jacobian and D is a regularizing
 // parameter. A brief proof that D^T D is the correct regularizer:
 //
 // Given a regularized least squares problem:
 //
 //   min  ||Ax - b||^2 + ||Dx||^2
 //    x
 //
 // First expand into matrix notation:
 //
 //   (Ax - b)^T (Ax - b) + xD^TDx
 //
 // Then multiply out to get:
 //
 //   = xA^TAx - 2b^T Ax + b^Tb + xD^TDx
 //
 // Take the derivative:
 //
 //   0 = 2A^TAx - 2A^T b + 2 D^TDx
 //   0 = A^TAx - A^T b + D^TDx
 //   0 = (A^TA + D^TD)x - A^T b
 //
 // Thus, the symmetric system we need to solve for CGNR is
 //
 //   Sx = z
 //
 // with S = A^TA + D^TD
 //  and z = A^T b
 //
 // Note: This class is not thread safe, since it uses some temporary storage.
 class CERES_NO_EXPORT CgnrLinearOperator final
     : public ConjugateGradientsLinearOperator<Vector> {
  public:
   CgnrLinearOperator(const LinearOperator& A,
                      const double* D,
                      ContextImpl* context,
                      int num_threads)
       : A_(A),
         D_(D),
         z_(Vector::Zero(A.num_rows())),
         context_(context),
         num_threads_(num_threads) {}

   void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final {
     // z = Ax
     // y = y + Atz
     z_.setZero();
     A_.RightMultiplyAndAccumulate(x, z_, context_, num_threads_);
     A_.LeftMultiplyAndAccumulate(z_, y, context_, num_threads_);

     // y = y + DtDx
     if (D_ != nullptr) {
       int n = A_.num_cols();
       ParallelAssign(
           context_,
           num_threads_,
           y,
           y.array() + ConstVectorRef(D_, n).array().square() * x.array());
     }
   }

  private:
   const LinearOperator& A_;
   const double* D_;
   Vector z_;

   ContextImpl* context_;
   int num_threads_;
 };

 CgnrSolver::CgnrSolver(LinearSolver::Options options)
     : options_(std::move(options)) {
   if (options_.preconditioner_type != JACOBI &&
       options_.preconditioner_type != IDENTITY &&
       options_.preconditioner_type != SUBSET) {
     LOG(FATAL)
         << "Preconditioner = "
         << PreconditionerTypeToString(options_.preconditioner_type) << ". "
         << "Congratulations, you found a bug in Ceres. Please report it.";
   }
 }

 CgnrSolver::~CgnrSolver() {
   for (int i = 0; i < 4; ++i) {
     if (scratch_[i]) {
       delete scratch_[i];
       scratch_[i] = nullptr;
     }
   }
 }

 LinearSolver::Summary CgnrSolver::SolveImpl(
     BlockSparseMatrix* A,
     const double* b,
     const LinearSolver::PerSolveOptions& per_solve_options,
     double* x) {
   EventLogger event_logger("CgnrSolver::Solve");
   if (!preconditioner_) {
     Preconditioner::Options preconditioner_options;
     preconditioner_options.type = options_.preconditioner_type;
     preconditioner_options.subset_preconditioner_start_row_block =
         options_.subset_preconditioner_start_row_block;
     preconditioner_options.sparse_linear_algebra_library_type =
         options_.sparse_linear_algebra_library_type;
     preconditioner_options.ordering_type = options_.ordering_type;
     preconditioner_options.num_threads = options_.num_threads;
     preconditioner_options.context = options_.context;

     if (options_.preconditioner_type == JACOBI) {
       preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(
           preconditioner_options, *A);
     } else if (options_.preconditioner_type == SUBSET) {
       preconditioner_ =
           std::make_unique<SubsetPreconditioner>(preconditioner_options, *A);
     } else {
       preconditioner_ = std::make_unique<IdentityPreconditioner>(A->num_cols());
     }
   }
   preconditioner_->Update(*A, per_solve_options.D);

   ConjugateGradientsSolverOptions cg_options;
   cg_options.min_num_iterations = options_.min_num_iterations;
   cg_options.max_num_iterations = options_.max_num_iterations;
   cg_options.residual_reset_period = options_.residual_reset_period;
   cg_options.q_tolerance = per_solve_options.q_tolerance;
   cg_options.r_tolerance = per_solve_options.r_tolerance;
   cg_options.context = options_.context;
   cg_options.num_threads = options_.num_threads;

   // lhs = AtA + DtD
   CgnrLinearOperator lhs(
       *A, per_solve_options.D, options_.context, options_.num_threads);
   // rhs = Atb.
   Vector rhs(A->num_cols());
   rhs.setZero();
   A->LeftMultiplyAndAccumulate(
       b, rhs.data(), options_.context, options_.num_threads);

   cg_solution_ = Vector::Zero(A->num_cols());
   for (int i = 0; i < 4; ++i) {
     if (scratch_[i] == nullptr) {
       scratch_[i] = new Vector(A->num_cols());
     }
   }
   event_logger.AddEvent("Setup");

   LinearOperatorAdapter preconditioner(*preconditioner_);
   auto summary = ConjugateGradientsSolver(
       cg_options, lhs, rhs, preconditioner, scratch_, cg_solution_);
   VectorRef(x, A->num_cols()) = cg_solution_;
   event_logger.AddEvent("Solve");
   return summary;
 }

 #ifndef CERES_NO_CUDA

 // A linear operator which takes a matrix A and a diagonal vector D and
 // performs products of the form
 //
 //   (A^T A + D^T D)x
 //
 // This is used to implement iterative general sparse linear solving with
 // conjugate gradients, where A is the Jacobian and D is a regularizing
 // parameter. A brief proof is included in cgnr_linear_operator.h.
 class CERES_NO_EXPORT CudaCgnrLinearOperator final
     : public ConjugateGradientsLinearOperator<CudaVector> {
  public:
   CudaCgnrLinearOperator(CudaSparseMatrix& A,
                          const CudaVector& D,
                          CudaVector* z)
       : A_(A), D_(D), z_(z) {}

   void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
     // z = Ax
     z_->SetZero();
     A_.RightMultiplyAndAccumulate(x, z_);

     // y = y + Atz
     //   = y + AtAx
     A_.LeftMultiplyAndAccumulate(*z_, &y);

     // y = y + DtDx
     y.DtDxpy(D_, x);
   }

  private:
   CudaSparseMatrix& A_;
   const CudaVector& D_;
   CudaVector* z_ = nullptr;
 };

 class CERES_NO_EXPORT CudaIdentityPreconditioner final
     : public CudaPreconditioner {
  public:
   void Update(const CompressedRowSparseMatrix& A, const double* D) final {}
   void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
     y.Axpby(1.0, x, 1.0);
   }
 };

 // This class wraps the existing CPU Jacobi preconditioner, caches the structure
 // of the block diagonal, and for each CGNR solve updates the values on the CPU
 // and then copies them over to the GPU.
 class CERES_NO_EXPORT CudaJacobiPreconditioner final
     : public CudaPreconditioner {
  public:
   explicit CudaJacobiPreconditioner(Preconditioner::Options options,
                                     const CompressedRowSparseMatrix& A)
       : options_(std::move(options)),
         cpu_preconditioner_(options_, A),
         m_(options_.context, cpu_preconditioner_.matrix()) {}
   ~CudaJacobiPreconditioner() = default;

   void Update(const CompressedRowSparseMatrix& A, const double* D) final {
     cpu_preconditioner_.Update(A, D);
     m_.CopyValuesFromCpu(cpu_preconditioner_.matrix());
   }

   void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
     m_.RightMultiplyAndAccumulate(x, &y);
   }

  private:
   Preconditioner::Options options_;
   BlockCRSJacobiPreconditioner cpu_preconditioner_;
   CudaSparseMatrix m_;
 };

 CudaCgnrSolver::CudaCgnrSolver(LinearSolver::Options options)
     : options_(std::move(options)) {}

 CudaCgnrSolver::~CudaCgnrSolver() {
   for (int i = 0; i < 4; ++i) {
     if (scratch_[i]) {
       delete scratch_[i];
       scratch_[i] = nullptr;
     }
   }
 }

 std::unique_ptr<CudaCgnrSolver> CudaCgnrSolver::Create(
     LinearSolver::Options options, std::string* error) {
   CHECK(error != nullptr);
   if (options.preconditioner_type != IDENTITY &&
       options.preconditioner_type != JACOBI) {
     *error =
         "CudaCgnrSolver does not support preconditioner type " +
         std::string(PreconditionerTypeToString(options.preconditioner_type)) +
         ". ";
     return nullptr;
   }
   CHECK(options.context->IsCudaInitialized())
       << "CudaCgnrSolver requires CUDA initialization.";
   auto solver = std::make_unique<CudaCgnrSolver>(options);
   return solver;
 }

 void CudaCgnrSolver::CpuToGpuTransfer(const CompressedRowSparseMatrix& A,
                                       const double* b,
                                       const double* D) {
   if (A_ == nullptr) {
     // Assume structure is not cached, do an initialization and structural copy.
     A_ = std::make_unique<CudaSparseMatrix>(options_.context, A);
     b_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
     x_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
     Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
     Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
     D_ = std::make_unique<CudaVector>(options_.context, A.num_cols());

     Preconditioner::Options preconditioner_options;
     preconditioner_options.type = options_.preconditioner_type;
     preconditioner_options.subset_preconditioner_start_row_block =
         options_.subset_preconditioner_start_row_block;
     preconditioner_options.sparse_linear_algebra_library_type =
         options_.sparse_linear_algebra_library_type;
     preconditioner_options.ordering_type = options_.ordering_type;
     preconditioner_options.num_threads = options_.num_threads;
     preconditioner_options.context = options_.context;

     if (options_.preconditioner_type == JACOBI) {
       preconditioner_ =
           std::make_unique<CudaJacobiPreconditioner>(preconditioner_options, A);
     } else {
       preconditioner_ = std::make_unique<CudaIdentityPreconditioner>();
     }
     for (int i = 0; i < 4; ++i) {
       scratch_[i] = new CudaVector(options_.context, A.num_cols());
     }
   } else {
     // Assume structure is cached, do a value copy.
     A_->CopyValuesFromCpu(A);
   }
   b_->CopyFromCpu(ConstVectorRef(b, A.num_rows()));
   D_->CopyFromCpu(ConstVectorRef(D, A.num_cols()));
 }

 LinearSolver::Summary CudaCgnrSolver::SolveImpl(
     CompressedRowSparseMatrix* A,
     const double* b,
     const LinearSolver::PerSolveOptions& per_solve_options,
     double* x) {
   EventLogger event_logger("CudaCgnrSolver::Solve");
   LinearSolver::Summary summary;
   summary.num_iterations = 0;
   summary.termination_type = LinearSolverTerminationType::FATAL_ERROR;

   CpuToGpuTransfer(*A, b, per_solve_options.D);
   event_logger.AddEvent("CPU to GPU Transfer");
   preconditioner_->Update(*A, per_solve_options.D);
   event_logger.AddEvent("Preconditioner Update");

   // Form z = Atb.
   Atb_->SetZero();
   A_->LeftMultiplyAndAccumulate(*b_, Atb_.get());

   // Solve (AtA + DtD)x = z (= Atb).
   x_->SetZero();
   CudaCgnrLinearOperator lhs(*A_, *D_, Ax_.get());

   event_logger.AddEvent("Setup");

   ConjugateGradientsSolverOptions cg_options;
   cg_options.min_num_iterations = options_.min_num_iterations;
   cg_options.max_num_iterations = options_.max_num_iterations;
   cg_options.residual_reset_period = options_.residual_reset_period;
   cg_options.q_tolerance = per_solve_options.q_tolerance;
   cg_options.r_tolerance = per_solve_options.r_tolerance;

   summary = ConjugateGradientsSolver(
       cg_options, lhs, *Atb_, *preconditioner_, scratch_, *x_);
   x_->CopyTo(x);
   event_logger.AddEvent("Solve");
   return summary;
 }

 #endif  // CERES_NO_CUDA

 }  // namespace ceres::internal
	// Ceres Solver - A fast non-linear least squares minimizer
	// Copyright 2023 Google Inc. All rights reserved.
	// http://ceres-solver.org/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice,
	// this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	// * Neither the name of Google Inc. nor the names of its contributors may be
	// used to endorse or promote products derived from this software without
	// specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Author: keir@google.com (Keir Mierle)

	#include "ceres/cgnr_solver.h"

	#include <memory>
	#include <utility>

	#include "absl/log/check.h"
	#include "absl/log/log.h"
	#include "ceres/block_jacobi_preconditioner.h"
	#include "ceres/conjugate_gradients_solver.h"
	#include "ceres/cuda_sparse_matrix.h"
	#include "ceres/cuda_vector.h"
	#include "ceres/internal/eigen.h"
	#include "ceres/linear_solver.h"
	#include "ceres/subset_preconditioner.h"
	#include "ceres/wall_time.h"

	namespace ceres::internal {

	// A linear operator which takes a matrix A and a diagonal vector D and
	// performs products of the form
	//
	// (A^T A + D^T D)x
	//
	// This is used to implement iterative general sparse linear solving with
	// conjugate gradients, where A is the Jacobian and D is a regularizing
	// parameter. A brief proof that D^T D is the correct regularizer:
	//
	// Given a regularized least squares problem:
	//
	// min \|\|Ax - b\|\|^2 + \|\|Dx\|\|^2
	// x
	//
	// First expand into matrix notation:
	//
	// (Ax - b)^T (Ax - b) + xD^TDx
	//
	// Then multiply out to get:
	//
	// = xA^TAx - 2b^T Ax + b^Tb + xD^TDx
	//
	// Take the derivative:
	//
	// 0 = 2A^TAx - 2A^T b + 2 D^TDx
	// 0 = A^TAx - A^T b + D^TDx
	// 0 = (A^TA + D^TD)x - A^T b
	//
	// Thus, the symmetric system we need to solve for CGNR is
	//
	// Sx = z
	//
	// with S = A^TA + D^TD
	// and z = A^T b
	//
	// Note: This class is not thread safe, since it uses some temporary storage.
	class CERES_NO_EXPORT CgnrLinearOperator final
	: public ConjugateGradientsLinearOperator<Vector> {
	public:
	CgnrLinearOperator(const LinearOperator& A,
	const double* D,
	ContextImpl* context,
	int num_threads)
	: A_(A),
	D_(D),
	z_(Vector::Zero(A.num_rows())),
	context_(context),
	num_threads_(num_threads) {}

	void RightMultiplyAndAccumulate(const Vector& x, Vector& y) final {
	// z = Ax
	// y = y + Atz
	z_.setZero();
	A_.RightMultiplyAndAccumulate(x, z_, context_, num_threads_);
	A_.LeftMultiplyAndAccumulate(z_, y, context_, num_threads_);

	// y = y + DtDx
	if (D_ != nullptr) {
	int n = A_.num_cols();
	ParallelAssign(
	context_,
	num_threads_,
	y,
	y.array() + ConstVectorRef(D_, n).array().square() * x.array());
	}
	}

	private:
	const LinearOperator& A_;
	const double* D_;
	Vector z_;

	ContextImpl* context_;
	int num_threads_;
	};

	CgnrSolver::CgnrSolver(LinearSolver::Options options)
	: options_(std::move(options)) {
	if (options_.preconditioner_type != JACOBI &&
	options_.preconditioner_type != IDENTITY &&
	options_.preconditioner_type != SUBSET) {
	LOG(FATAL)
	<< "Preconditioner = "
	<< PreconditionerTypeToString(options_.preconditioner_type) << ". "
	<< "Congratulations, you found a bug in Ceres. Please report it.";
	}
	}

	CgnrSolver::~CgnrSolver() {
	for (int i = 0; i < 4; ++i) {
	if (scratch_[i]) {
	delete scratch_[i];
	scratch_[i] = nullptr;
	}
	}
	}

	LinearSolver::Summary CgnrSolver::SolveImpl(
	BlockSparseMatrix* A,
	const double* b,
	const LinearSolver::PerSolveOptions& per_solve_options,
	double* x) {
	EventLogger event_logger("CgnrSolver::Solve");
	if (!preconditioner_) {
	Preconditioner::Options preconditioner_options;
	preconditioner_options.type = options_.preconditioner_type;
	preconditioner_options.subset_preconditioner_start_row_block =
	options_.subset_preconditioner_start_row_block;
	preconditioner_options.sparse_linear_algebra_library_type =
	options_.sparse_linear_algebra_library_type;
	preconditioner_options.ordering_type = options_.ordering_type;
	preconditioner_options.num_threads = options_.num_threads;
	preconditioner_options.context = options_.context;

	if (options_.preconditioner_type == JACOBI) {
	preconditioner_ = std::make_unique<BlockSparseJacobiPreconditioner>(
	preconditioner_options, *A);
	} else if (options_.preconditioner_type == SUBSET) {
	preconditioner_ =
	std::make_unique<SubsetPreconditioner>(preconditioner_options, *A);
	} else {
	preconditioner_ = std::make_unique<IdentityPreconditioner>(A->num_cols());
	}
	}
	preconditioner_->Update(*A, per_solve_options.D);

	ConjugateGradientsSolverOptions cg_options;
	cg_options.min_num_iterations = options_.min_num_iterations;
	cg_options.max_num_iterations = options_.max_num_iterations;
	cg_options.residual_reset_period = options_.residual_reset_period;
	cg_options.q_tolerance = per_solve_options.q_tolerance;
	cg_options.r_tolerance = per_solve_options.r_tolerance;
	cg_options.context = options_.context;
	cg_options.num_threads = options_.num_threads;

	// lhs = AtA + DtD
	CgnrLinearOperator lhs(
	*A, per_solve_options.D, options_.context, options_.num_threads);
	// rhs = Atb.
	Vector rhs(A->num_cols());
	rhs.setZero();
	A->LeftMultiplyAndAccumulate(
	b, rhs.data(), options_.context, options_.num_threads);

	cg_solution_ = Vector::Zero(A->num_cols());
	for (int i = 0; i < 4; ++i) {
	if (scratch_[i] == nullptr) {
	scratch_[i] = new Vector(A->num_cols());
	}
	}
	event_logger.AddEvent("Setup");

	LinearOperatorAdapter preconditioner(*preconditioner_);
	auto summary = ConjugateGradientsSolver(
	cg_options, lhs, rhs, preconditioner, scratch_, cg_solution_);
	VectorRef(x, A->num_cols()) = cg_solution_;
	event_logger.AddEvent("Solve");
	return summary;
	}

	#ifndef CERES_NO_CUDA

	// A linear operator which takes a matrix A and a diagonal vector D and
	// performs products of the form
	//
	// (A^T A + D^T D)x
	//
	// This is used to implement iterative general sparse linear solving with
	// conjugate gradients, where A is the Jacobian and D is a regularizing
	// parameter. A brief proof is included in cgnr_linear_operator.h.
	class CERES_NO_EXPORT CudaCgnrLinearOperator final
	: public ConjugateGradientsLinearOperator<CudaVector> {
	public:
	CudaCgnrLinearOperator(CudaSparseMatrix& A,
	const CudaVector& D,
	CudaVector* z)
	: A_(A), D_(D), z_(z) {}

	void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
	// z = Ax
	z_->SetZero();
	A_.RightMultiplyAndAccumulate(x, z_);

	// y = y + Atz
	// = y + AtAx
	A_.LeftMultiplyAndAccumulate(*z_, &y);

	// y = y + DtDx
	y.DtDxpy(D_, x);
	}

	private:
	CudaSparseMatrix& A_;
	const CudaVector& D_;
	CudaVector* z_ = nullptr;
	};

	class CERES_NO_EXPORT CudaIdentityPreconditioner final
	: public CudaPreconditioner {
	public:
	void Update(const CompressedRowSparseMatrix& A, const double* D) final {}
	void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
	y.Axpby(1.0, x, 1.0);
	}
	};

	// This class wraps the existing CPU Jacobi preconditioner, caches the structure
	// of the block diagonal, and for each CGNR solve updates the values on the CPU
	// and then copies them over to the GPU.
	class CERES_NO_EXPORT CudaJacobiPreconditioner final
	: public CudaPreconditioner {
	public:
	explicit CudaJacobiPreconditioner(Preconditioner::Options options,
	const CompressedRowSparseMatrix& A)
	: options_(std::move(options)),
	cpu_preconditioner_(options_, A),
	m_(options_.context, cpu_preconditioner_.matrix()) {}
	~CudaJacobiPreconditioner() = default;

	void Update(const CompressedRowSparseMatrix& A, const double* D) final {
	cpu_preconditioner_.Update(A, D);
	m_.CopyValuesFromCpu(cpu_preconditioner_.matrix());
	}

	void RightMultiplyAndAccumulate(const CudaVector& x, CudaVector& y) final {
	m_.RightMultiplyAndAccumulate(x, &y);
	}

	private:
	Preconditioner::Options options_;
	BlockCRSJacobiPreconditioner cpu_preconditioner_;
	CudaSparseMatrix m_;
	};

	CudaCgnrSolver::CudaCgnrSolver(LinearSolver::Options options)
	: options_(std::move(options)) {}

	CudaCgnrSolver::~CudaCgnrSolver() {
	for (int i = 0; i < 4; ++i) {
	if (scratch_[i]) {
	delete scratch_[i];
	scratch_[i] = nullptr;
	}
	}
	}

	std::unique_ptr<CudaCgnrSolver> CudaCgnrSolver::Create(
	LinearSolver::Options options, std::string* error) {
	CHECK(error != nullptr);
	if (options.preconditioner_type != IDENTITY &&
	options.preconditioner_type != JACOBI) {
	*error =
	"CudaCgnrSolver does not support preconditioner type " +
	std::string(PreconditionerTypeToString(options.preconditioner_type)) +
	". ";
	return nullptr;
	}
	CHECK(options.context->IsCudaInitialized())
	<< "CudaCgnrSolver requires CUDA initialization.";
	auto solver = std::make_unique<CudaCgnrSolver>(options);
	return solver;
	}

	void CudaCgnrSolver::CpuToGpuTransfer(const CompressedRowSparseMatrix& A,
	const double* b,
	const double* D) {
	if (A_ == nullptr) {
	// Assume structure is not cached, do an initialization and structural copy.
	A_ = std::make_unique<CudaSparseMatrix>(options_.context, A);
	b_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
	x_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
	Atb_ = std::make_unique<CudaVector>(options_.context, A.num_cols());
	Ax_ = std::make_unique<CudaVector>(options_.context, A.num_rows());
	D_ = std::make_unique<CudaVector>(options_.context, A.num_cols());

	Preconditioner::Options preconditioner_options;
	preconditioner_options.type = options_.preconditioner_type;
	preconditioner_options.subset_preconditioner_start_row_block =
	options_.subset_preconditioner_start_row_block;
	preconditioner_options.sparse_linear_algebra_library_type =
	options_.sparse_linear_algebra_library_type;
	preconditioner_options.ordering_type = options_.ordering_type;
	preconditioner_options.num_threads = options_.num_threads;
	preconditioner_options.context = options_.context;

	if (options_.preconditioner_type == JACOBI) {
	preconditioner_ =
	std::make_unique<CudaJacobiPreconditioner>(preconditioner_options, A);
	} else {
	preconditioner_ = std::make_unique<CudaIdentityPreconditioner>();
	}
	for (int i = 0; i < 4; ++i) {
	scratch_[i] = new CudaVector(options_.context, A.num_cols());
	}
	} else {
	// Assume structure is cached, do a value copy.
	A_->CopyValuesFromCpu(A);
	}
	b_->CopyFromCpu(ConstVectorRef(b, A.num_rows()));
	D_->CopyFromCpu(ConstVectorRef(D, A.num_cols()));
	}

	LinearSolver::Summary CudaCgnrSolver::SolveImpl(
	CompressedRowSparseMatrix* A,
	const double* b,
	const LinearSolver::PerSolveOptions& per_solve_options,
	double* x) {
	EventLogger event_logger("CudaCgnrSolver::Solve");
	LinearSolver::Summary summary;
	summary.num_iterations = 0;
	summary.termination_type = LinearSolverTerminationType::FATAL_ERROR;

	CpuToGpuTransfer(*A, b, per_solve_options.D);
	event_logger.AddEvent("CPU to GPU Transfer");
	preconditioner_->Update(*A, per_solve_options.D);
	event_logger.AddEvent("Preconditioner Update");

	// Form z = Atb.
	Atb_->SetZero();
	A_->LeftMultiplyAndAccumulate(*b_, Atb_.get());

	// Solve (AtA + DtD)x = z (= Atb).
	x_->SetZero();
	CudaCgnrLinearOperator lhs(A_, D_, Ax_.get());

	event_logger.AddEvent("Setup");

	ConjugateGradientsSolverOptions cg_options;
	cg_options.min_num_iterations = options_.min_num_iterations;
	cg_options.max_num_iterations = options_.max_num_iterations;
	cg_options.residual_reset_period = options_.residual_reset_period;
	cg_options.q_tolerance = per_solve_options.q_tolerance;
	cg_options.r_tolerance = per_solve_options.r_tolerance;

	summary = ConjugateGradientsSolver(
	cg_options, lhs, Atb_, preconditioner_, scratch_, *x_);
	x_->CopyTo(x);
	event_logger.AddEvent("Solve");
	return summary;
	}

	#endif // CERES_NO_CUDA

	} // namespace ceres::internal