Mixed-precision Iterative Refinement Cholesky With CUDA

* Created a new class CUDADenseCholeskyMixedPrecision, which performs
  Cholesky factorization and solves in single (fp32) precision and
  optionally applies iterative refinement in double (fp64) precision
  (a CPU sketch of the refinement loop is included below).
* Added CUDA kernels for the mixed-precision solve operations.
* Added more detailed timing information to the FullReport about Schur
  elimination, reduced system solves, and back-substitution.
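
For reference, the refinement scheme implemented by
CUDADenseCholeskyMixedPrecision::Solve (see the dense_cholesky.cc hunk in the
patch below) can be sketched on the CPU with Eigen as follows. The function
name and the use of Eigen here are purely illustrative; the actual class runs
the same loop on the GPU using cuSOLVER, cuBLAS, and the custom kernels added
in this change.

  #include <Eigen/Dense>

  // Solve A x = b by factorizing a single-precision copy of A once, then
  // refining the double-precision solution using the fp64 residual.
  Eigen::VectorXd MixedPrecisionCholeskySolve(const Eigen::MatrixXd& A,
                                              const Eigen::VectorXd& b,
                                              int max_refinement_iterations) {
    // [fp32] Factorize a single-precision copy of A (done once in Factorize).
    const Eigen::MatrixXf A_fp32 = A.cast<float>();
    const Eigen::LLT<Eigen::MatrixXf> llt(A_fp32);
    Eigen::VectorXd x = Eigen::VectorXd::Zero(b.size());
    Eigen::VectorXd r = b;  // residual = b - A * x, with x = 0.
    for (int i = 0; i <= max_refinement_iterations; ++i) {
      // [fp32] Solve for the correction using the cached factorization.
      const Eigen::VectorXf c = llt.solve(r.cast<float>());
      // [fp64] Accumulate the correction into the solution (CudaDsxpy).
      x += c.cast<double>();
      // [fp64] Recompute the residual (cublasDsymv in the CUDA path).
      r = b - A * x;
    }
    return x;
  }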

Some test performance numbers follow. Linear solver times are in seconds,
accumulated over the solve, with the number of calls in parentheses.
All tests were performed on an Ubuntu 20.04 desktop with an
Intel Core i9-9940X CPU and Nvidia Quadro RTX 6000 GPU.

Tests were launched as:
./bin/bundle_adjuster --input (problem_file) \
    --num_iterations 20 \
    --num_threads 28 \
    --linear_solver dense_schur \
    --dense_linear_algebra_library (cuda|lapack) \
    [--mixed_precision_solves]
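
The same configuration through the Solver::Options API is roughly as follows
(the refinement iteration count is an illustrative value, not necessarily what
bundle_adjuster uses):

  ceres::Solver::Options options;
  options.linear_solver_type = ceres::DENSE_SCHUR;
  options.dense_linear_algebra_library_type = ceres::CUDA;
  options.use_mixed_precision_solves = true;
  options.max_num_refinement_iterations = 3;  // Illustrative value.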

==================================================
problem-21-11315-pre.txt
==================================================

--------------------------------------------------
Cuda Mixed Precision
--------------------------------------------------
Cost:
Initial                          4.413239e+06
Final                            3.037864e+04
Change                           4.382861e+06
  Linear solver                      0.250703 (14)
  ├ Schur eliminate                  0.234025 (14)
  ├ Reduced solve                    0.006643 (14)
  └ Backsubstitute                   0.006598 (12)

--------------------------------------------------
Cuda
--------------------------------------------------
Cost:
Initial                          4.413239e+06
Final                            3.037864e+04
Change                           4.382861e+06
  Linear solver                      0.257517 (12)
  ├ Schur eliminate                  0.233518 (12)
  ├ Reduced solve                    0.010621 (12)
  └ Backsubstitute                   0.007124 (12)

--------------------------------------------------
Lapack (OpenBLAS)
--------------------------------------------------
Cost:
Initial                          4.413239e+06
Final                            3.037864e+04
Change                           4.382861e+06
  Linear solver                      0.332349 (12)
  ├ Schur eliminate                  0.274748 (12)
  ├ Reduced solve                    0.015966 (12)
  └ Backsubstitute                   0.034192 (12)

==================================================
problem-257-65132-pre.txt
==================================================

--------------------------------------------------
Cuda Mixed Precision
--------------------------------------------------
Cost:
Initial                          2.456242e+07
Final                            9.677593e+04
Change                           2.446565e+07
  Linear solver                      1.332367 (20)
  ├ Schur eliminate                  1.021365 (20)
  ├ Reduced solve                    0.195472 (20)
  └ Backsubstitute                   0.075582 (20)

--------------------------------------------------
Cuda
--------------------------------------------------
Cost:
Initial                          2.456242e+07
Final                            9.677547e+04
Change                           2.446565e+07
  Linear solver                      1.810176 (20)
  ├ Schur eliminate                  1.012862 (20)
  ├ Reduced solve                    0.678704 (20)
  └ Backsubstitute                   0.083925 (20)

--------------------------------------------------
Lapack (OpenBLAS)
--------------------------------------------------
Cost:
Initial                          2.456242e+07
Final                            9.677547e+04
Change                           2.446565e+07
  Linear solver                      2.376273 (20)
  ├ Schur eliminate                  0.987613 (20)
  ├ Reduced solve                    1.043873 (20)
  └ Backsubstitute                   0.310402 (20)

==================================================
problem-744-543562-pre.txt
==================================================

--------------------------------------------------
Cuda Mixed Precision
--------------------------------------------------
Cost:
Initial                          1.434881e+08
Final                            1.546895e+06
Change                           1.419412e+08
  Linear solver                     27.010088 (20)
  ├ Schur eliminate                 24.362433 (20)
  ├ Reduced solve                    1.428542 (20)
  └ Backsubstitute                   0.814266 (20)

--------------------------------------------------
Cuda
--------------------------------------------------
Cost:
Initial                          1.434881e+08
Final                            1.546895e+06
Change                           1.419412e+08
  Linear solver                     32.342513 (20)
  ├ Schur eliminate                 24.638819 (20)
  ├ Reduced solve                    6.492090 (20)
  └ Backsubstitute                   0.802184 (20)

--------------------------------------------------
Lapack (OpenBLAS)
--------------------------------------------------
Cost:
Initial                          1.434881e+08
Final                            1.546895e+06
Change                           1.419412e+08
  Linear solver                     34.152224 (20)
  ├ Schur eliminate                 24.183723 (20)
  ├ Reduced solve                    8.784413 (20)
  └ Backsubstitute                   0.795044 (20)

Change-Id: I178887e776d8f4a1e8abb99bbc205bf8c278bf79
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d6094c..82014a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -145,7 +145,9 @@
   option(ENABLE_BITCODE
     "Enable bitcode for iOS builds (disables inline optimizations for Eigen)." OFF)
 endif()
-option(CUDA "Enable use of CUDA linear algebra solvers." ON)
+# We can't have an option named 'CUDA' since that is a reserved word in CMake
+# -- it names a language definition.
+option(USE_CUDA "Enable use of CUDA linear algebra solvers." ON)
 option(LAPACK "Enable use of LAPACK directly within Ceres." ON)
 # Template specializations for the Schur complement based solvers. If
 # compile time, binary size or compiler performance is an issue, you
@@ -228,23 +230,23 @@
   endif (EIGENSPARSE)
 endif (Eigen3_FOUND)
 
-if (CUDA)
+if (USE_CUDA)
   find_package(CUDA QUIET)
   if (CUDA_FOUND)
     message("-- Found CUDA version ${CUDA_VERSION}: "
         "${CUDA_LIBRARIES};"
         "${CUDA_cusolver_LIBRARY};"
         "${CUDA_cusparse_LIBRARY}")
+    enable_language(CUDA)
   else (CUDA_FOUND)
     message("-- Did not find CUDA library, disabling CUDA support.")
     update_cache_variable(CUDA OFF)
     list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA)
   endif (CUDA_FOUND)
-else (CUDA)
+else (USE_CUDA)
   message("-- Building without CUDA.")
   list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA)
-endif (CUDA)
-
+endif (USE_CUDA)
 if (LAPACK)
   find_package(LAPACK QUIET)
   if (LAPACK_FOUND)
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 5e6b0a7..d6bae75 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -118,11 +118,12 @@
 
 - `CUDA <https://developer.nvidia.com/cuda-toolkit>`_ If you have an
   NVIDIA GPU then Ceres Solver can use it accelerate the solution of
-  the Gauss-Newton linear systems using ``CUDA``. Currently this
-  support is limited to using the dense linear solvers that ship with
-  ``CUDA``. As a result GPU acceleration can be used to speed up
+  the Gauss-Newton linear systems using the CMake flag ``USE_CUDA``.
+  Currently this support is limited to using the dense linear solvers that ship
+  with ``CUDA``. As a result GPU acceleration can be used to speed up
   ``DENSE_QR``, ``DENSE_NORMAL_CHOLESKY`` and
-  ``DENSE_SCHUR``. **Optional**.
+  ``DENSE_SCHUR``. This also enables ``CUDA`` mixed precision solves
+  for ``DENSE_NORMAL_CHOLESKY`` and ``DENSE_SCHUR``. **Optional**.
 
 .. _section-linux:
 
diff --git a/include/ceres/solver.h b/include/ceres/solver.h
index 097276b..3b15318 100644
--- a/include/ceres/solver.h
+++ b/include/ceres/solver.h
@@ -573,6 +573,8 @@
 
     // TODO(sameeragarwal): Further expand the documentation for the
     // following two options.
+    // TODO(joydeepbiswas): Update the documentation for the mixed precision
+    // option with CUDA.
 
     // NOTE1: EXPERIMENTAL FEATURE, UNDER DEVELOPMENT, USE AT YOUR OWN RISK.
     //
@@ -996,6 +998,8 @@
         SPARSE_NORMAL_CHOLESKY;
 #endif
 
+    bool mixed_precision_solves_used = false;
+
     LinearSolverOrderingType linear_solver_ordering_type;
 
     // Size of the elimination groups given by the user as hints to
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 4e5008b..ae22a9d 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -157,6 +157,10 @@
         COMMAND ${CUDA_TOOLKIT_ROOT_DIR}/bin/cuda-memcheck --leak-check full
             $<TARGET_FILE:cuda_dense_cholesky_test>)
   endif (BUILD_TESTING)
+  set_source_files_properties(ceres_cuda_kernels.cu PROPERTIES LANGUAGE CUDA)
+  add_library(ceres_cuda_kernels ceres_cuda_kernels.cu)
+  target_compile_features(ceres_cuda_kernels PRIVATE cxx_std_14)
+  list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ceres_cuda_kernels)
 endif (CUDA_FOUND)
 
 if (LAPACK_FOUND)
@@ -387,7 +391,7 @@
   list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS
     ${AccelerateSparse_INCLUDE_DIRS})
 endif()
-if (CUDA)
+if (USE_CUDA)
   list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS
     ${CUDA_INCLUDE_DIRS})
 endif()
@@ -403,6 +407,12 @@
 generate_export_header(ceres EXPORT_FILE_NAME
   ${Ceres_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR}/ceres/internal/export.h)
 
+install(TARGETS ceres_cuda_kernels
+        EXPORT  CeresExport
+        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
 install(TARGETS ceres
         EXPORT  CeresExport
         RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
diff --git a/internal/ceres/ceres_cuda_kernels.cu b/internal/ceres/ceres_cuda_kernels.cu
new file mode 100644
index 0000000..2fd76b5
--- /dev/null
+++ b/internal/ceres/ceres_cuda_kernels.cu
@@ -0,0 +1,109 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include "cuda_runtime.h"
+
+namespace ceres::internal {
+
+// A block size of 256, as the CUDA Toolkit documentation notes, "although
+// arbitrary in this case, is a common choice". It is determined by the warp
+// size, max block size, and multiprocessor sizes of recent GPUs. For complex
+// kernels with significant register usage and unusual memory patterns, the
+// occupancy calculator API might provide better performance. See "Occupancy
+// Calculator" under the CUDA toolkit documentation.
+constexpr int kCudaBlockSize = 256;
+
+template<typename SrcType, typename DstType>
+__global__ void TypeConversionKernel(const SrcType* __restrict__ input,
+                                     DstType* __restrict__ output,
+                                     const int size) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    output[i] = static_cast<DstType>(input[i]);
+  }
+}
+
+void CudaFP64ToFP32(const double* input,
+                    float* output,
+                    const int size,
+                    cudaStream_t stream) {
+  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+  TypeConversionKernel<double, float>
+      <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
+}
+
+void CudaFP32ToFP64(const float* input,
+                    double* output,
+                    const int size,
+                    cudaStream_t stream) {
+  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+  TypeConversionKernel<float, double>
+      <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
+}
+
+template<typename T>
+__global__ void SetZeroKernel(T* __restrict__ output, const int size) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    output[i] = T(0.0);
+  }
+}
+
+void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) {
+  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+  SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
+}
+
+void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
+  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+  SetZeroKernel<double><<<num_blocks, kCudaBlockSize, 0, stream>>>(
+      output, size);
+}
+
+template <typename SrcType, typename DstType>
+__global__ void XPlusEqualsYKernel(DstType* __restrict__ x,
+                                   const SrcType* __restrict__ y,
+                                   const int size) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    x[i] = x[i] + DstType(y[i]);
+  }
+}
+
+void CudaDsxpy(double* x,
+               float* y,
+               const int size,
+               cudaStream_t stream) {
+  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+  XPlusEqualsYKernel<float, double>
+      <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
+}
+
+}  // namespace ceres::internal
\ No newline at end of file
diff --git a/internal/ceres/ceres_cuda_kernels.h b/internal/ceres/ceres_cuda_kernels.h
new file mode 100644
index 0000000..989399f
--- /dev/null
+++ b/internal/ceres/ceres_cuda_kernels.h
@@ -0,0 +1,66 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "cuda_runtime.h"
+
+namespace ceres::internal {
+
+// Convert an array of double (FP64) values to float (FP32). Both arrays must
+// already be on GPU memory.
+void CudaFP64ToFP32(const double* input,
+                    float* output,
+                    const int size,
+                    cudaStream_t stream);
+
+// Convert an array of float (FP32) values to double (FP64). Both arrays must
+// already be on GPU memory.
+void CudaFP32ToFP64(const float* input,
+                    double* output,
+                    const int size,
+                    cudaStream_t stream);
+
+// Set all elements of the array to the FP32 value 0. The array must be in GPU
+// memory.
+void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream);
+
+// Set all elements of the array to the FP64 value 0. The array must be in GPU
+// memory.
+void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream);
+
+// Compute x = x + double(y). Input array is float (FP32), output array is
+// double (FP64). Both arrays must already be on GPU memory.
+void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream);
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
\ No newline at end of file
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index a1cf784..8868e1a 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -81,6 +81,14 @@
              cudaSuccess);
   }
 
+  // Perform an asynchronous copy from GPU memory using the stream provided.
+  void CopyFromGpuAsync(const T* data, const size_t size, cudaStream_t stream) {
+    Reserve(size);
+    CHECK_EQ(cudaMemcpyAsync(
+        data_, data, size * sizeof(T), cudaMemcpyDeviceToDevice, stream),
+            cudaSuccess);
+  }
+
   // Copy data from the GPU to CPU memory. This is necessarily synchronous since
   // any potential GPU kernels that may be writing to the buffer must finish
   // before the transfer happens.
@@ -95,6 +103,7 @@
   }
 
   T* data() { return data_; }
+  const T* data() const { return data_; }
   size_t size() const { return size_; }
 
  private:
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
index 13dc34b..b9acc99 100644
--- a/internal/ceres/cuda_dense_cholesky_test.cc
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -165,8 +165,140 @@
     summary.termination_type = dense_cholesky->FactorAndSolve(
         kNumCols, lhs.data(), rhs.data(), x_computed.data(), &summary.message);
     ASSERT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
+    static const double kEpsilon = std::numeric_limits<double>::epsilon() * 2e5;
     ASSERT_NEAR(
-        (x_computed - x_expected).norm() / x_expected.norm(), 0.0, 1e-10);
+        (x_computed - x_expected).norm() / x_expected.norm(), 0.0, kEpsilon);
+  }
+}
+
+TEST(CUDADenseCholeskyMixedPrecision, InvalidOptionsOnCreate) {
+  {
+    // Did not ask for CUDA, and did not ask for mixed precision.
+    LinearSolver::Options options;
+    auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
+    ASSERT_EQ(solver, nullptr);
+  }
+  {
+    // Asked for CUDA, but did not ask for mixed precision.
+    LinearSolver::Options options;
+    options.dense_linear_algebra_library_type = ceres::CUDA;
+    auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
+    ASSERT_EQ(solver, nullptr);
+  }
+}
+
+// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
+TEST(CUDADenseCholeskyMixedPrecision, Cholesky4x4Matrix1Step) {
+  Eigen::Matrix4d A;
+  // clang-format off
+  // A common Cholesky decomposition test matrix, see:
+  // https://en.wikipedia.org/w/index.php?title=Cholesky_decomposition&oldid=1080607368#Example
+  A <<  4,  12, -16, 0,
+       12,  37, -43, 0,
+      -16, -43,  98, 0,
+        0,   0,   0, 1;
+  // clang-format on
+
+  const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+  LinearSolver::Options options;
+  options.max_num_refinement_iterations = 0;
+  ContextImpl context;
+  options.context = &context;
+  options.dense_linear_algebra_library_type = CUDA;
+  options.use_mixed_precision_solves = true;
+  auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
+  ASSERT_NE(solver, nullptr);
+  std::string error_string;
+  ASSERT_EQ(solver->Factorize(A.cols(), A.data(), &error_string),
+            LinearSolverTerminationType::SUCCESS);
+  Eigen::Vector4d x = Eigen::Vector4d::Zero();
+  ASSERT_EQ(solver->Solve(b.data(), x.data(), &error_string),
+            LinearSolverTerminationType::SUCCESS);
+  // A single step of the mixed precision solver will be equivalent to solving
+  // in low precision (FP32). Hence the tolerance is defined w.r.t. FP32 epsilon
+  // instead of FP64 epsilon.
+  static const double kEpsilon = std::numeric_limits<float>::epsilon() * 15.0f;
+  EXPECT_NEAR(x(0), 113.75 / 3.0, kEpsilon);
+  EXPECT_NEAR(x(1), -31.0 / 3.0, kEpsilon);
+  EXPECT_NEAR(x(2), 5.0 / 3.0, kEpsilon);
+  EXPECT_NEAR(x(3), 1.0000, kEpsilon);
+}
+
+// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
+TEST(CUDADenseCholeskyMixedPrecision, Cholesky4x4Matrix4Steps) {
+  Eigen::Matrix4d A;
+  // clang-format off
+  A <<  4,  12, -16, 0,
+       12,  37, -43, 0,
+      -16, -43,  98, 0,
+        0,   0,   0, 1;
+  // clang-format on
+
+  const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+  LinearSolver::Options options;
+  options.max_num_refinement_iterations = 3;
+  ContextImpl context;
+  options.context = &context;
+  options.dense_linear_algebra_library_type = CUDA;
+  options.use_mixed_precision_solves = true;
+  auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
+  ASSERT_NE(solver, nullptr);
+  std::string error_string;
+  ASSERT_EQ(solver->Factorize(A.cols(), A.data(), &error_string),
+            LinearSolverTerminationType::SUCCESS);
+  Eigen::Vector4d x = Eigen::Vector4d::Zero();
+  ASSERT_EQ(solver->Solve(b.data(), x.data(), &error_string),
+            LinearSolverTerminationType::SUCCESS);
+  // The error does not reduce beyond four iterations, and stagnates at this
+  // level of precision.
+  static const double kEpsilon = std::numeric_limits<double>::epsilon() * 3e2;
+  EXPECT_NEAR(x(0), 113.75 / 3.0, kEpsilon);
+  EXPECT_NEAR(x(1), -31.0 / 3.0, kEpsilon);
+  EXPECT_NEAR(x(2), 5.0 / 3.0, kEpsilon);
+  EXPECT_NEAR(x(3), 1.0000, kEpsilon);
+}
+
+
+
+TEST(CUDADenseCholeskyMixedPrecision, Randomized1600x1600Tests) {
+  const int kNumCols = 1600;
+  using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
+  using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+  using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+
+  LinearSolver::Options options;
+  ContextImpl context;
+  options.context = &context;
+  options.dense_linear_algebra_library_type = ceres::CUDA;
+  options.use_mixed_precision_solves = true;
+  options.max_num_refinement_iterations = 20;
+  std::unique_ptr<CUDADenseCholeskyMixedPrecision> dense_cholesky =
+      CUDADenseCholeskyMixedPrecision::Create(options);
+
+  const int kNumTrials = 20;
+  for (int i = 0; i < kNumTrials; ++i) {
+    LhsType lhs = LhsType::Random(kNumCols, kNumCols);
+    lhs = lhs.transpose() * lhs;
+    lhs += 1e-3 * LhsType::Identity(kNumCols, kNumCols);
+    SolutionType x_expected = SolutionType::Random(kNumCols);
+    RhsType rhs = lhs * x_expected;
+    SolutionType x_computed = SolutionType::Zero(kNumCols);
+    // Sanity check the random matrix sizes.
+    EXPECT_EQ(lhs.rows(), kNumCols);
+    EXPECT_EQ(lhs.cols(), kNumCols);
+    EXPECT_EQ(rhs.rows(), kNumCols);
+    EXPECT_EQ(rhs.cols(), 1);
+    EXPECT_EQ(x_expected.rows(), kNumCols);
+    EXPECT_EQ(x_expected.cols(), 1);
+    EXPECT_EQ(x_computed.rows(), kNumCols);
+    EXPECT_EQ(x_computed.cols(), 1);
+    LinearSolver::Summary summary;
+    summary.termination_type = dense_cholesky->FactorAndSolve(
+        kNumCols, lhs.data(), rhs.data(), x_computed.data(), &summary.message);
+    ASSERT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
+    static const double kEpsilon = std::numeric_limits<double>::epsilon() * 1e6;
+    ASSERT_NEAR(
+        (x_computed - x_expected).norm() / x_expected.norm(), 0.0, kEpsilon);
   }
 }
 
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index a10f311..e2c5036 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -38,6 +38,7 @@
 #include "ceres/internal/config.h"
 
 #ifndef CERES_NO_CUDA
+#include "ceres/ceres_cuda_kernels.h"
 #include "ceres/context_impl.h"
 #include "cuda_runtime.h"
 #include "cusolverDn.h"
@@ -68,12 +69,16 @@
   std::unique_ptr<DenseCholesky> dense_cholesky;
 
   switch (options.dense_linear_algebra_library_type) {
-    case EIGEN:
+    case EIGEN: {
+      // Eigen mixed precision solver not yet implemented.
+      if (options.use_mixed_precision_solves) return nullptr;
       dense_cholesky = std::make_unique<EigenDenseCholesky>();
-      break;
+    } break;
 
     case LAPACK:
 #ifndef CERES_NO_LAPACK
+      // LAPACK mixed precision solver not yet implemented.
+      if (options.use_mixed_precision_solves) return nullptr;
       dense_cholesky = std::make_unique<LAPACKDenseCholesky>();
       break;
 #else
@@ -82,7 +87,11 @@
 
     case CUDA:
 #ifndef CERES_NO_CUDA
-      dense_cholesky = CUDADenseCholesky::Create(options);
+      if (options.use_mixed_precision_solves) {
+        dense_cholesky = CUDADenseCholeskyMixedPrecision::Create(options);
+      } else {
+        dense_cholesky = CUDADenseCholesky::Create(options);
+      }
       break;
 #else
       LOG(FATAL) << "Ceres was compiled without support for CUDA.";
@@ -320,6 +329,210 @@
   return nullptr;
 }
 
+std::unique_ptr<CUDADenseCholeskyMixedPrecision>
+CUDADenseCholeskyMixedPrecision::Create(
+    const LinearSolver::Options& options) {
+  if (options.dense_linear_algebra_library_type != CUDA ||
+      !options.use_mixed_precision_solves) {
+    // The user called the wrong factory method.
+    return nullptr;
+  }
+  auto solver = std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
+      new CUDADenseCholeskyMixedPrecision());
+  std::string cuda_error;
+  if (solver->Init(options, &cuda_error)) {
+    return solver;
+  }
+  LOG(ERROR) << "CUDADenseCholeskyMixedPrecision::Init failed: " << cuda_error;
+  return nullptr;
+}
+
+bool CUDADenseCholeskyMixedPrecision::Init(
+    const LinearSolver::Options& options, std::string* message) {
+  if (!options.context->InitCUDA(message)) {
+    return false;
+  }
+  cusolver_handle_ = options.context->cusolver_handle_;
+  cublas_handle_ = options.context->cublas_handle_;
+  stream_ = options.context->stream_;
+  error_.Reserve(1);
+  max_num_refinement_iterations_ = options.max_num_refinement_iterations;
+  *message = "CUDADenseCholeskyMixedPrecision::Init Success.";
+  return true;
+}
+
+LinearSolverTerminationType
+CUDADenseCholeskyMixedPrecision::CudaCholeskyFactorize(std::string* message) {
+  int device_workspace_size = 0;
+  if (cusolverDnSpotrf_bufferSize(cusolver_handle_,
+                                  CUBLAS_FILL_MODE_LOWER,
+                                  num_cols_,
+                                  lhs_fp32_.data(),
+                                  num_cols_,
+                                  &device_workspace_size) !=
+      CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnSpotrf_bufferSize failed.";
+    return LinearSolverTerminationType::FATAL_ERROR;
+  }
+  device_workspace_.Reserve(device_workspace_size);
+  if (cusolverDnSpotrf(cusolver_handle_,
+                       CUBLAS_FILL_MODE_LOWER,
+                       num_cols_,
+                       lhs_fp32_.data(),
+                       num_cols_,
+                       device_workspace_.data(),
+                       device_workspace_.size(),
+                       error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnSpotrf failed.";
+    return LinearSolverTerminationType::FATAL_ERROR;
+  }
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_) != cudaSuccess) {
+    *message = "Cuda device synchronization failed.";
+    return LinearSolverTerminationType::FATAL_ERROR;
+  }
+  int error = 0;
+  error_.CopyToHost(&error, 1);
+  if (error < 0) {
+    LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
+               << "please report it. "
+               << "cuSolverDN::cusolverDnSpotrf fatal error. "
+               << "Argument: " << -error << " is invalid.";
+    // The following line is unreachable, but return failure just to be
+    // pedantic, since the compiler does not know that.
+    return LinearSolverTerminationType::FATAL_ERROR;
+  }
+  if (error > 0) {
+    *message = StringPrintf(
+        "cuSolverDN::cusolverDnSpotrf numerical failure. "
+        "The leading minor of order %d is not positive definite.",
+        error);
+    factorize_result_ = LinearSolverTerminationType::FAILURE;
+    return LinearSolverTerminationType::FAILURE;
+  }
+  *message = "Success";
+  return LinearSolverTerminationType::SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::CudaCholeskySolve(
+    std::string* message) {
+  CHECK_EQ(cudaMemcpyAsync(correction_fp32_.data(),
+                           residual_fp32_.data(),
+                           num_cols_ * sizeof(float),
+                           cudaMemcpyDeviceToDevice,
+                           stream_),
+           cudaSuccess);
+  if (cusolverDnSpotrs(cusolver_handle_,
+                       CUBLAS_FILL_MODE_LOWER,
+                       num_cols_,
+                       1,
+                       lhs_fp32_.data(),
+                       num_cols_,
+                       correction_fp32_.data(),
+                       num_cols_,
+                       error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+    *message = "cuSolverDN::cusolverDnDpotrs failed.";
+    return LinearSolverTerminationType::FATAL_ERROR;
+  }
+  if (cudaDeviceSynchronize() != cudaSuccess ||
+      cudaStreamSynchronize(stream_) != cudaSuccess) {
+    *message = "Cuda device synchronization failed.";
+    return LinearSolverTerminationType::FATAL_ERROR;
+  }
+  int error = 0;
+  error_.CopyToHost(&error, 1);
+  if (error != 0) {
+    LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
+               << "Please report it."
+               << "cuSolverDN::cusolverDnDpotrs fatal error. "
+               << "Argument: " << -error << " is invalid.";
+  }
+  *message = "Success";
+  return LinearSolverTerminationType::SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Factorize(
+    int num_cols,
+    double* lhs,
+    std::string* message) {
+  num_cols_ = num_cols;
+
+  // Copy fp64 version of lhs to GPU.
+  lhs_fp64_.Reserve(num_cols * num_cols);
+  lhs_fp64_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_);
+
+  // Create an fp32 copy of lhs, lhs_fp32.
+  lhs_fp32_.Reserve(num_cols * num_cols);
+  CudaFP64ToFP32(
+      lhs_fp64_.data(), lhs_fp32_.data(), num_cols * num_cols, stream_);
+
+  // Factorize lhs_fp32.
+  factorize_result_ = CudaCholeskyFactorize(message);
+  return factorize_result_;
+}
+
+LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Solve(
+    const double* rhs,
+    double* solution,
+    std::string* message) {
+  // If factorization failed, return failure.
+  if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
+    *message = "Factorize did not complete successfully previously.";
+    return factorize_result_;
+  }
+
+  // Reserve memory for all arrays.
+  rhs_fp64_.Reserve(num_cols_);
+  x_fp64_.Reserve(num_cols_);
+  correction_fp32_.Reserve(num_cols_);
+  residual_fp32_.Reserve(num_cols_);
+  residual_fp64_.Reserve(num_cols_);
+
+  // Initialize x = 0.
+  CudaSetZeroFP64(x_fp64_.data(), num_cols_, stream_);
+
+  // Initialize residual = rhs.
+  rhs_fp64_.CopyToGpuAsync(rhs, num_cols_, stream_);
+  residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_);
+
+  for (int i = 0; i <= max_num_refinement_iterations_; ++i) {
+    // Cast residual from fp64 to fp32.
+    CudaFP64ToFP32(
+        residual_fp64_.data(), residual_fp32_.data(), num_cols_, stream_);
+    // [fp32] c = lhs^-1 * residual.
+    auto result = CudaCholeskySolve(message);
+    if (result != LinearSolverTerminationType::SUCCESS) {
+      return result;
+    }
+    // [fp64] x += c.
+    CudaDsxpy(
+        x_fp64_.data(), correction_fp32_.data(), num_cols_, stream_);
+    if (i < max_num_refinement_iterations_) {
+      // [fp64] residual = rhs - lhs * x
+      // This is done in two steps:
+      // 1. [fp64] residual = rhs
+      residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_);
+      // 2. [fp64] residual = residual - lhs * x
+      double alpha = -1.0;
+      double beta = 1.0;
+      cublasDsymv(cublas_handle_,
+                  CUBLAS_FILL_MODE_LOWER,
+                  num_cols_,
+                  &alpha,
+                  lhs_fp64_.data(),
+                  num_cols_,
+                  x_fp64_.data(),
+                  1,
+                  &beta,
+                  residual_fp64_.data(),
+                  1);
+    }
+  }
+  x_fp64_.CopyToHost(solution, num_cols_);
+  *message = "Success.";
+  return LinearSolverTerminationType::SUCCESS;
+}
+
 #endif  // CERES_NO_CUDA
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index cc8642c..0593875 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -176,6 +176,78 @@
       LinearSolverTerminationType::FATAL_ERROR;
 };
 
+// A mixed-precision iterative refinement dense Cholesky solver using FP32 CUDA
+// Dense Cholesky for inner iterations, and FP64 outer refinements.
+// This class implements a modified version of the "Classical iterative
+// refinement" (Algorithm 4.1) from the following paper:
+// Haidar, Azzam, Harun Bayraktar, Stanimire Tomov, Jack Dongarra, and Nicholas
+// J. Higham. "Mixed-precision iterative refinement using tensor cores on GPUs
+// to accelerate solution of linear systems." Proceedings of the Royal Society A
+// 476, no. 2243 (2020): 20200110.
+//
+// The two key modifications from Algorithm 4.1 in the paper are:
+// 1. We use Cholesky factorization instead of LU factorization since our A is
+//    symmetric positive definite.
+// 2. During the solution update, the up-cast and accumulation are performed
+//    in one step with a custom kernel.
+class CERES_NO_EXPORT CUDADenseCholeskyMixedPrecision final
+    : public DenseCholesky {
+ public:
+  static std::unique_ptr<CUDADenseCholeskyMixedPrecision> Create(
+      const LinearSolver::Options& options);
+  CUDADenseCholeskyMixedPrecision(
+      const CUDADenseCholeskyMixedPrecision&) = delete;
+  CUDADenseCholeskyMixedPrecision& operator=(
+      const CUDADenseCholeskyMixedPrecision&) = delete;
+  LinearSolverTerminationType Factorize(int num_cols,
+                                        double* lhs,
+                                        std::string* message) override;
+  LinearSolverTerminationType Solve(const double* rhs,
+                                    double* solution,
+                                    std::string* message) override;
+
+ private:
+  CUDADenseCholeskyMixedPrecision() = default;
+  // Helper function to wrap Cuda boilerplate needed to call Spotrf.
+  LinearSolverTerminationType CudaCholeskyFactorize(std::string* message);
+  // Helper function to wrap Cuda boilerplate needed to call Spotrs.
+  LinearSolverTerminationType CudaCholeskySolve(std::string* message);
+  // Picks up the cuSolverDN, cuBLAS, and CUDA stream handles from the
+  // context in the options, and the maximum number of refinement iterations
+  // from the options. If the context is unable to initialize CUDA, returns
+  // false with a human-readable message indicating the reason.
+  bool Init(const LinearSolver::Options& options, std::string* message);
+
+  cusolverDnHandle_t cusolver_handle_ = nullptr;
+  cublasHandle_t cublas_handle_ = nullptr;
+  cudaStream_t stream_ = nullptr;
+  // Number of columns in the A matrix, to be cached between calls to *Factorize
+  // and *Solve.
+  size_t num_cols_ = 0;
+  CudaBuffer<double> lhs_fp64_;
+  CudaBuffer<double> rhs_fp64_;
+  CudaBuffer<float> lhs_fp32_;
+  // Scratch space for cuSOLVER on the GPU.
+  CudaBuffer<float> device_workspace_;
+  // Required for error handling with cuSOLVER.
+  CudaBuffer<int> error_;
+
+  // Solution to lhs * x = rhs.
+  CudaBuffer<double> x_fp64_;
+  // Incremental correction to x.
+  CudaBuffer<float> correction_fp32_;
+  // Residual used in iterative refinement.
+  CudaBuffer<float> residual_fp32_;
+  CudaBuffer<double> residual_fp64_;
+
+  // Maximum number of refinement iterations to perform.
+  int max_num_refinement_iterations_ = 0;
+  // Cache the result of Factorize to ensure that when Solve is called, the
+  // factorization of lhs is valid.
+  LinearSolverTerminationType factorize_result_ =
+      LinearSolverTerminationType::FATAL_ERROR;
+};
+
 #endif  // CERES_NO_CUDA
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc
index 7866d5c..a425343 100644
--- a/internal/ceres/dense_cholesky_test.cc
+++ b/internal/ceres/dense_cholesky_test.cc
@@ -45,12 +45,20 @@
 
 namespace ceres::internal {
 
-using Param = DenseLinearAlgebraLibraryType;
+
+using Param =
+    ::testing::tuple<DenseLinearAlgebraLibraryType, bool>;
+constexpr bool kMixedPrecision = true;
+constexpr bool kFullPrecision = false;
 
 namespace {
 
 std::string ParamInfoToString(testing::TestParamInfo<Param> info) {
-  return DenseLinearAlgebraLibraryTypeToString(info.param);
+  Param param = info.param;
+  std::stringstream ss;
+  ss << DenseLinearAlgebraLibraryTypeToString(::testing::get<0>(param)) << "_"
+     << (::testing::get<1>(param) ? "MixedPrecision" : "FullPrecision");
+  return ss.str();
 }
 }  // namespace
 
@@ -67,14 +75,18 @@
   LinearSolver::Options options;
   ContextImpl context;
   options.context = &context;
-  options.dense_linear_algebra_library_type = GetParam();
+  options.dense_linear_algebra_library_type = ::testing::get<0>(GetParam());
+  options.use_mixed_precision_solves = ::testing::get<1>(GetParam());
+  const int kNumRefinementSteps = 4;
+  if (options.use_mixed_precision_solves) {
+    options.max_num_refinement_iterations = kNumRefinementSteps;
+  }
   std::unique_ptr<DenseCholesky> dense_cholesky =
       DenseCholesky::Create(options);
 
   const int kNumTrials = 10;
   const int kMinNumCols = 1;
   const int kMaxNumCols = 10;
-
   for (int num_cols = kMinNumCols; num_cols < kMaxNumCols; ++num_cols) {
     for (int trial = 0; trial < kNumTrials; ++trial) {
       const MatrixType a = MatrixType::Random(num_cols, num_cols);
@@ -97,24 +109,70 @@
   }
 }
 
-namespace {
-
-// NOTE: preprocessor directives in a macro are not standard conforming
-decltype(auto) MakeValues() {
-  return ::testing::Values(EIGEN
+INSTANTIATE_TEST_SUITE_P(
+    EigenCholesky,
+    DenseCholeskyTest,
+    ::testing::Combine(::testing::Values(EIGEN),
+                       ::testing::Values(kFullPrecision)),
+    ParamInfoToString);
 #ifndef CERES_NO_LAPACK
-                           ,
-                           LAPACK
+INSTANTIATE_TEST_SUITE_P(
+    LapackCholesky,
+    DenseCholeskyTest,
+    ::testing::Combine(::testing::Values(LAPACK),
+                       ::testing::Values(kFullPrecision)),
+    ParamInfoToString);
 #endif
 #ifndef CERES_NO_CUDA
-                           ,
-                           CUDA
+INSTANTIATE_TEST_SUITE_P(
+    CudaCholesky,
+    DenseCholeskyTest,
+    ::testing::Combine(::testing::Values(CUDA),
+                       ::testing::Values(kMixedPrecision,
+                                         kFullPrecision)),
+    ParamInfoToString);
 #endif
-  );
+
+TEST(DenseCholesky, ValidMixedPrecisionOptions) {
+#ifndef CERES_NO_CUDA
+  {
+    // Dense Cholesky with CUDA: okay, supported.
+    ContextImpl context;
+    LinearSolver::Options options;
+    options.dense_linear_algebra_library_type = CUDA;
+    options.use_mixed_precision_solves = true;
+    options.context = &context;
+    std::unique_ptr<DenseCholesky> dense_cholesky =
+        DenseCholesky::Create(options);
+    EXPECT_NE(dense_cholesky, nullptr);
+  }
+#endif
 }
 
-}  // namespace
+TEST(DenseCholesky, InvalidMixedPrecisionOptions) {
+  {
+    // Dense Cholesky with Eigen: not supported
+    ContextImpl context;
+    LinearSolver::Options options;
+    options.dense_linear_algebra_library_type = EIGEN;
+    options.use_mixed_precision_solves = true;
+    options.context = &context;
+    std::unique_ptr<DenseCholesky> dense_cholesky =
+        DenseCholesky::Create(options);
+    EXPECT_EQ(dense_cholesky, nullptr);
+  }
 
-INSTANTIATE_TEST_SUITE_P(_, DenseCholeskyTest, MakeValues(), ParamInfoToString);
+  {
+    // Dense Cholesky with Lapack: not supported
+    ContextImpl context;
+    LinearSolver::Options options;
+    options.dense_linear_algebra_library_type = LAPACK;
+    options.use_mixed_precision_solves = true;
+    options.context = &context;
+    std::unique_ptr<DenseCholesky> dense_cholesky =
+        DenseCholesky::Create(options);
+    EXPECT_EQ(dense_cholesky, nullptr);
+  }
+}
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/solver.cc b/internal/ceres/solver.cc
index 9503843..93551f7 100644
--- a/internal/ceres/solver.cc
+++ b/internal/ceres/solver.cc
@@ -118,6 +118,48 @@
            internal::EigenSparse::IsNestedDissectionAvailable()));
 }
 
+bool MixedPrecisionOptionIsValid(const Solver::Options& options,
+                                 string* error) {
+  if (options.use_mixed_precision_solves) {
+    if ((options.linear_solver_type == DENSE_NORMAL_CHOLESKY ||
+        options.linear_solver_type == DENSE_SCHUR) &&
+        options.dense_linear_algebra_library_type == CUDA) {
+      // Mixed precision with CUDA and dense Cholesky variant: okay.
+      return true;
+    }
+    if ((options.linear_solver_type == SPARSE_NORMAL_CHOLESKY ||
+        options.linear_solver_type == SPARSE_SCHUR) &&
+        (options.sparse_linear_algebra_library_type == EIGEN_SPARSE ||
+        options.sparse_linear_algebra_library_type == ACCELERATE_SPARSE)) {
+      // Mixed precision with any Eigen or Accelerate Cholesky variant: okay.
+      return true;
+    }
+    // No other mixed precision variants are supported.
+    if (options.linear_solver_type == DENSE_NORMAL_CHOLESKY ||
+        options.linear_solver_type == DENSE_SCHUR) {
+      *error = StringPrintf(
+          "use_mixed_precision_solves with %s is only supported with "
+          "CUDA as the dense_linear_algebra_library_type.",
+          LinearSolverTypeToString(options.linear_solver_type));
+      return false;
+    }
+    if ((options.linear_solver_type == SPARSE_NORMAL_CHOLESKY ||
+        options.linear_solver_type == SPARSE_SCHUR) &&
+        options.sparse_linear_algebra_library_type == SUITE_SPARSE) {
+      *error = StringPrintf(
+          "use_mixed_precision_solves with %s is not supported with "
+          "SUITE_SPARSE as the sparse_linear_algebra_library_type.",
+          LinearSolverTypeToString(options.linear_solver_type));
+      return false;
+    }
+    *error = StringPrintf(
+          "use_mixed_precision_solves with %s is not supported.",
+          LinearSolverTypeToString(options.linear_solver_type));
+    return false;
+  }
+  return true;
+}
+
 bool TrustRegionOptionsAreValid(const Solver::Options& options, string* error) {
   OPTION_GT(initial_trust_region_radius, 0.0);
   OPTION_GT(min_trust_region_radius, 0.0);
@@ -228,6 +270,10 @@
     }
   }
 
+  if (!MixedPrecisionOptionIsValid(options, error)) {
+    return false;
+  }
+
   if (options.trust_region_strategy_type == DOGLEG) {
     if (options.linear_solver_type == ITERATIVE_SCHUR ||
         options.linear_solver_type == CGNR) {
@@ -372,7 +418,7 @@
                                  &(summary->inner_iteration_ordering_given));
 
   // clang-format off
-  summary->dense_linear_algebra_library_type  = options.dense_linear_algebra_library_type;  
+  summary->dense_linear_algebra_library_type  = options.dense_linear_algebra_library_type;
   summary->dogleg_type                        = options.dogleg_type;
   summary->inner_iteration_time_in_seconds    = 0.0;
   summary->num_line_search_steps              = 0;
@@ -381,18 +427,18 @@
   summary->line_search_polynomial_minimization_time_in_seconds = 0.0;
   summary->line_search_total_time_in_seconds  = 0.0;
   summary->inner_iterations_given             = options.use_inner_iterations;
-  summary->line_search_direction_type         = options.line_search_direction_type;       
-  summary->line_search_interpolation_type     = options.line_search_interpolation_type;   
+  summary->line_search_direction_type         = options.line_search_direction_type;
+  summary->line_search_interpolation_type     = options.line_search_interpolation_type;
   summary->line_search_type                   = options.line_search_type;
   summary->linear_solver_type_given           = options.linear_solver_type;
   summary->max_lbfgs_rank                     = options.max_lbfgs_rank;
   summary->minimizer_type                     = options.minimizer_type;
-  summary->nonlinear_conjugate_gradient_type  = options.nonlinear_conjugate_gradient_type; 
+  summary->nonlinear_conjugate_gradient_type  = options.nonlinear_conjugate_gradient_type;
   summary->num_threads_given                  = options.num_threads;
   summary->preconditioner_type_given          = options.preconditioner_type;
   summary->sparse_linear_algebra_library_type = options.sparse_linear_algebra_library_type;
-  summary->linear_solver_ordering_type        = options.linear_solver_ordering_type;       
-  summary->trust_region_strategy_type         = options.trust_region_strategy_type;        
+  summary->linear_solver_ordering_type        = options.linear_solver_ordering_type;
+  summary->trust_region_strategy_type         = options.trust_region_strategy_type;
   summary->visibility_clustering_type         = options.visibility_clustering_type;
   // clang-format on
 }
@@ -408,8 +454,9 @@
                                  &(summary->inner_iteration_ordering_used));
 
   // clang-format off
-  summary->inner_iterations_used          = pp.inner_iteration_minimizer != nullptr;   
+  summary->inner_iterations_used          = pp.inner_iteration_minimizer != nullptr;
   summary->linear_solver_type_used        = pp.linear_solver_options.type;
+  summary->mixed_precision_solves_used    = pp.options.use_mixed_precision_solves;
   summary->num_threads_used               = pp.options.num_threads;
   summary->preconditioner_type_used       = pp.options.preconditioner_type;
   // clang-format on
@@ -699,10 +746,13 @@
     if (linear_solver_type_used == DENSE_NORMAL_CHOLESKY ||
         linear_solver_type_used == DENSE_SCHUR ||
         linear_solver_type_used == DENSE_QR) {
+      const char* mixed_precision_suffix =
+          (mixed_precision_solves_used ? "(Mixed Precision)" : "");
       StringAppendF(&report,
-                    "\nDense linear algebra library  %15s\n",
+                    "\nDense linear algebra library  %15s %s\n",
                     DenseLinearAlgebraLibraryTypeToString(
-                        dense_linear_algebra_library_type));
+                        dense_linear_algebra_library_type),
+                    mixed_precision_suffix);
     }
 
     StringAppendF(&report,
@@ -723,12 +773,15 @@
         (linear_solver_type_used == ITERATIVE_SCHUR &&
          (preconditioner_type_used == CLUSTER_JACOBI ||
           preconditioner_type_used == CLUSTER_TRIDIAGONAL))) {
+      const char* mixed_precision_suffix =
+          (mixed_precision_solves_used ? "(Mixed Precision)" : "");
       StringAppendF(
           &report,
-          "\nSparse linear algebra library %15s + %s\n",
+          "\nSparse linear algebra library %15s + %s %s\n",
           SparseLinearAlgebraLibraryTypeToString(
               sparse_linear_algebra_library_type),
-          LinearSolverOrderingTypeToString(linear_solver_ordering_type));
+          LinearSolverOrderingTypeToString(linear_solver_ordering_type),
+          mixed_precision_suffix);
     }
 
     StringAppendF(&report, "\n");