Mixed-precision Iterative Refinement Cholesky With CUDA
* Created a new class CUDADenseCholeskyMixedPrecision, which performs
Cholesky factorization and solving in single (fp32) precision, and
optionally performs iterative refinement.
* Added CUDA kernels for mixed-precision solve operations
* Added more detailed timing information to the FullReport about Schur
elimination, reduced system solves, and back-substitution.
Some test performance numbers follow.
All tests were performed on an Ubuntu 20.04 desktop with an
Intel Core i9-9940X CPU and Nvidia Quadro RTX 6000 GPU.
Tests were launched as:
./bin/bundle_adjuster --input (problem_file) \
--num_iterations 20
--num_threads 28
--linear_solver dense_schur
--dense_linear_algebra_library (cuda|lapack)
[--mixed_precision_solves]
==================================================
problem-21-11315-pre.txt
==================================================
--------------------------------------------------
Cuda Mixed Precision
--------------------------------------------------
Cost:
Initial 4.413239e+06
Final 3.037864e+04
Change 4.382861e+06
Linear solver 0.250703 (14)
├ Schur eliminate 0.234025 (14)
├ Reduced solve 0.006643 (14)
└ Backsubstitute 0.006598 (12)
--------------------------------------------------
Cuda
--------------------------------------------------
Cost:
Initial 4.413239e+06
Final 3.037864e+04
Change 4.382861e+06
Linear solver 0.257517 (12)
├ Schur eliminate 0.233518 (12)
├ Reduced solve 0.010621 (12)
└ Backsubstitute 0.007124 (12)
--------------------------------------------------
Lapack (OpenBLAS)
--------------------------------------------------
Cost:
Initial 4.413239e+06
Final 3.037864e+04
Change 4.382861e+06
Linear solver 0.332349 (12)
├ Schur eliminate 0.274748 (12)
├ Reduced solve 0.015966 (12)
└ Backsubstitute 0.034192 (12)
==================================================
problem-257-65132-pre.txt
==================================================
--------------------------------------------------
Cuda Mixed Precision
--------------------------------------------------
Cost:
Initial 2.456242e+07
Final 9.677593e+04
Change 2.446565e+07
Linear solver 1.332367 (20)
├ Schur eliminate 1.021365 (20)
├ Reduced solve 0.195472 (20)
└ Backsubstitute 0.075582 (20)
--------------------------------------------------
Cuda
--------------------------------------------------
Cost:
Initial 2.456242e+07
Final 9.677547e+04
Change 2.446565e+07
Linear solver 1.810176 (20)
├ Schur eliminate 1.012862 (20)
├ Reduced solve 0.678704 (20)
└ Backsubstitute 0.083925 (20)
--------------------------------------------------
Lapack (OpenBLAS)
--------------------------------------------------
Cost:
Initial 2.456242e+07
Final 9.677547e+04
Change 2.446565e+07
Linear solver 2.376273 (20)
├ Schur eliminate 0.987613 (20)
├ Reduced solve 1.043873 (20)
└ Backsubstitute 0.310402 (20)
==================================================
problem-744-543562-pre.txt
==================================================
--------------------------------------------------
Cuda Mixed Precision
--------------------------------------------------
Cost:
Initial 1.434881e+08
Final 1.546895e+06
Change 1.419412e+08
Linear solver 27.010088 (20)
├ Schur eliminate 24.362433 (20)
├ Reduced solve 1.428542 (20)
└ Backsubstitute 0.814266 (20)
--------------------------------------------------
Cuda
--------------------------------------------------
Cost:
Initial 1.434881e+08
Final 1.546895e+06
Change 1.419412e+08
Linear solver 32.342513 (20)
├ Schur eliminate 24.638819 (20)
├ Reduced solve 6.492090 (20)
└ Backsubstitute 0.802184 (20)
--------------------------------------------------
Lapack (OpenBLAS)
--------------------------------------------------
Cost:
Initial 1.434881e+08
Final 1.546895e+06
Change 1.419412e+08
Linear solver 34.152224 (20)
├ Schur eliminate 24.183723 (20)
├ Reduced solve 8.784413 (20)
└ Backsubstitute 0.795044 (20)
Change-Id: I178887e776d8f4a1e8abb99bbc205bf8c278bf79
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d6094c..82014a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -145,7 +145,9 @@
option(ENABLE_BITCODE
"Enable bitcode for iOS builds (disables inline optimizations for Eigen)." OFF)
endif()
-option(CUDA "Enable use of CUDA linear algebra solvers." ON)
+# We can't have an option called 'CUDA' since that is a reserved word -- a
+# language definition.
+option(USE_CUDA "Enable use of CUDA linear algebra solvers." ON)
option(LAPACK "Enable use of LAPACK directly within Ceres." ON)
# Template specializations for the Schur complement based solvers. If
# compile time, binary size or compiler performance is an issue, you
@@ -228,23 +230,23 @@
endif (EIGENSPARSE)
endif (Eigen3_FOUND)
-if (CUDA)
+if (USE_CUDA)
find_package(CUDA QUIET)
if (CUDA_FOUND)
message("-- Found CUDA version ${CUDA_VERSION}: "
"${CUDA_LIBRARIES};"
"${CUDA_cusolver_LIBRARY};"
"${CUDA_cusparse_LIBRARY}")
+ enable_language(CUDA)
else (CUDA_FOUND)
message("-- Did not find CUDA library, disabling CUDA support.")
update_cache_variable(CUDA OFF)
list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA)
endif (CUDA_FOUND)
-else (CUDA)
+else (USE_CUDA)
message("-- Building without CUDA.")
list(APPEND CERES_COMPILE_OPTIONS CERES_NO_CUDA)
-endif (CUDA)
-
+endif (USE_CUDA)
if (LAPACK)
find_package(LAPACK QUIET)
if (LAPACK_FOUND)
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 5e6b0a7..d6bae75 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -118,11 +118,12 @@
- `CUDA <https://developer.nvidia.com/cuda-toolkit>`_ If you have an
NVIDIA GPU then Ceres Solver can use it accelerate the solution of
- the Gauss-Newton linear systems using ``CUDA``. Currently this
- support is limited to using the dense linear solvers that ship with
- ``CUDA``. As a result GPU acceleration can be used to speed up
+ the Gauss-Newton linear systems using the CMake flag ``USE_CUDA``.
+ Currently this support is limited to using the dense linear solvers that ship
+ with ``CUDA``. As a result GPU acceleration can be used to speed up
``DENSE_QR``, ``DENSE_NORMAL_CHOLESKY`` and
- ``DENSE_SCHUR``. **Optional**.
+ ``DENSE_SCHUR``. This also enables ``CUDA`` mixed precision solves
+ for ``DENSE_NORMAL_CHOLESKY`` and ``DENSE_SCHUR``. **Optional**.
.. _section-linux:
diff --git a/include/ceres/solver.h b/include/ceres/solver.h
index 097276b..3b15318 100644
--- a/include/ceres/solver.h
+++ b/include/ceres/solver.h
@@ -573,6 +573,8 @@
// TODO(sameeragarwal): Further expand the documentation for the
// following two options.
+ // TODO(joydeepbiswas): Update the documentation for the mixed precision
+ // option with CUDA.
// NOTE1: EXPERIMENTAL FEATURE, UNDER DEVELOPMENT, USE AT YOUR OWN RISK.
//
@@ -996,6 +998,8 @@
SPARSE_NORMAL_CHOLESKY;
#endif
+ bool mixed_precision_solves_used = false;
+
LinearSolverOrderingType linear_solver_ordering_type;
// Size of the elimination groups given by the user as hints to
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 4e5008b..ae22a9d 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -157,6 +157,10 @@
COMMAND ${CUDA_TOOLKIT_ROOT_DIR}/bin/cuda-memcheck --leak-check full
$<TARGET_FILE:cuda_dense_cholesky_test>)
endif (BUILD_TESTING)
+ set_source_files_properties(ceres_cuda_kernels.cu PROPERTIES LANGUAGE CUDA)
+ add_library(ceres_cuda_kernels ceres_cuda_kernels.cu)
+ target_compile_features(ceres_cuda_kernels PRIVATE cxx_std_14)
+ list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ceres_cuda_kernels)
endif (CUDA_FOUND)
if (LAPACK_FOUND)
@@ -387,7 +391,7 @@
list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS
${AccelerateSparse_INCLUDE_DIRS})
endif()
-if (CUDA)
+if (USE_CUDA)
list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES_INCLUDE_DIRS
${CUDA_INCLUDE_DIRS})
endif()
@@ -403,6 +407,12 @@
generate_export_header(ceres EXPORT_FILE_NAME
${Ceres_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR}/ceres/internal/export.h)
+install(TARGETS ceres_cuda_kernels
+ EXPORT CeresExport
+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
install(TARGETS ceres
EXPORT CeresExport
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
diff --git a/internal/ceres/ceres_cuda_kernels.cu b/internal/ceres/ceres_cuda_kernels.cu
new file mode 100644
index 0000000..2fd76b5
--- /dev/null
+++ b/internal/ceres/ceres_cuda_kernels.cu
@@ -0,0 +1,109 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include "cuda_runtime.h"
+
+namespace ceres::internal {
+
+// As the CUDA Toolkit documentation says, "although arbitrary in this case, is
+// a common choice". This is determined by the warp size, max block size, and
+// multiprocessor sizes of recent GPUs. For complex kernels with significant
+// register usage and unusual memory patterns, the occupancy calculator API
+// might provide better performance. See "Occupancy Calculator" under the CUDA
+// toolkit documentation.
+constexpr int kCudaBlockSize = 256;
+
+template<typename SrcType, typename DstType>
+__global__ void TypeConversionKernel(const SrcType* __restrict__ input,
+ DstType* __restrict__ output,
+ const int size) {
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < size) {
+ output[i] = static_cast<DstType>(input[i]);
+ }
+}
+
+void CudaFP64ToFP32(const double* input,
+ float* output,
+ const int size,
+ cudaStream_t stream) {
+ const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ TypeConversionKernel<double, float>
+ <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
+}
+
+void CudaFP32ToFP64(const float* input,
+ double* output,
+ const int size,
+ cudaStream_t stream) {
+ const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ TypeConversionKernel<float, double>
+ <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
+}
+
+template<typename T>
+__global__ void SetZeroKernel(T* __restrict__ output, const int size) {
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < size) {
+ output[i] = T(0.0);
+ }
+}
+
+void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) {
+ const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
+}
+
+void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
+ const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ SetZeroKernel<double><<<num_blocks, kCudaBlockSize, 0, stream>>>(
+ output, size);
+}
+
+template <typename SrcType, typename DstType>
+__global__ void XPlusEqualsYKernel(DstType* __restrict__ x,
+ const SrcType* __restrict__ y,
+ const int size) {
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < size) {
+ x[i] = x[i] + DstType(y[i]);
+ }
+}
+
+void CudaDsxpy(double* x,
+ float* y,
+ const int size,
+ cudaStream_t stream) {
+ const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ XPlusEqualsYKernel<float, double>
+ <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
+}
+
+} // namespace ceres_cuda_kernels
\ No newline at end of file
diff --git a/internal/ceres/ceres_cuda_kernels.h b/internal/ceres/ceres_cuda_kernels.h
new file mode 100644
index 0000000..989399f
--- /dev/null
+++ b/internal/ceres/ceres_cuda_kernels.h
@@ -0,0 +1,66 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "cuda_runtime.h"
+
+namespace ceres::internal {
+
+// Convert an array of double (FP64) values to float (FP32). Both arrays must
+// already be on GPU memory.
+void CudaFP64ToFP32(const double* input,
+ float* output,
+ const int size,
+ cudaStream_t stream);
+
+// Convert an array of float (FP32) values to double (FP64). Both arrays must
+// already be on GPU memory.
+void CudaFP32ToFP64(const float* input,
+ double* output,
+ const int size,
+ cudaStream_t stream);
+
+// Set all elements of the array to the FP32 value 0. The array must be in GPU
+// memory.
+void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream);
+
+// Set all elements of the array to the FP64 value 0. The array must be in GPU
+// memory.
+void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream);
+
+// Compute x = x + double(y). Input array is float (FP32), output array is
+// double (FP64). Both arrays must already be on GPU memory.
+void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream);
+} // namespace ceres_cuda_kernels
+
+#endif // CERES_NO_CUDA
\ No newline at end of file
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index a1cf784..8868e1a 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -81,6 +81,14 @@
cudaSuccess);
}
+ // Perform an asynchronous copy from GPU memory using the stream provided.
+ void CopyFromGpuAsync(const T* data, const size_t size, cudaStream_t stream) {
+ Reserve(size);
+ CHECK_EQ(cudaMemcpyAsync(
+ data_, data, size * sizeof(T), cudaMemcpyDeviceToDevice, stream),
+ cudaSuccess);
+ }
+
// Copy data from the GPU to CPU memory. This is necessarily synchronous since
// any potential GPU kernels that may be writing to the buffer must finish
// before the transfer happens.
@@ -95,6 +103,7 @@
}
T* data() { return data_; }
+ const T* data() const { return data_; }
size_t size() const { return size_; }
private:
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
index 13dc34b..b9acc99 100644
--- a/internal/ceres/cuda_dense_cholesky_test.cc
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -165,8 +165,140 @@
summary.termination_type = dense_cholesky->FactorAndSolve(
kNumCols, lhs.data(), rhs.data(), x_computed.data(), &summary.message);
ASSERT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
+ static const double kEpsilon = std::numeric_limits<double>::epsilon() * 2e5;
ASSERT_NEAR(
- (x_computed - x_expected).norm() / x_expected.norm(), 0.0, 1e-10);
+ (x_computed - x_expected).norm() / x_expected.norm(), 0.0, kEpsilon);
+ }
+}
+
+TEST(CUDADenseCholeskyMixedPrecision, InvalidOptionsOnCreate) {
+ {
+ // Did not ask for CUDA, and did not ask for mixed precision.
+ LinearSolver::Options options;
+ auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
+ ASSERT_EQ(solver, nullptr);
+ }
+ {
+ // Asked for CUDA, but did not ask for mixed precision.
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = ceres::CUDA;
+ auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
+ ASSERT_EQ(solver, nullptr);
+ }
+}
+
+// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
+TEST(CUDADenseCholeskyMixedPrecision, Cholesky4x4Matrix1Step) {
+ Eigen::Matrix4d A;
+ // clang-format off
+ // A common test Cholesky decomposition test matrix, see :
+ // https://en.wikipedia.org/w/index.php?title=Cholesky_decomposition&oldid=1080607368#Example
+ A << 4, 12, -16, 0,
+ 12, 37, -43, 0,
+ -16, -43, 98, 0,
+ 0, 0, 0, 1;
+ // clang-format on
+
+ const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+ LinearSolver::Options options;
+ options.max_num_refinement_iterations = 0;
+ ContextImpl context;
+ options.context = &context;
+ options.dense_linear_algebra_library_type = CUDA;
+ options.use_mixed_precision_solves = true;
+ auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
+ ASSERT_NE(solver, nullptr);
+ std::string error_string;
+ ASSERT_EQ(solver->Factorize(A.cols(), A.data(), &error_string),
+ LinearSolverTerminationType::SUCCESS);
+ Eigen::Vector4d x = Eigen::Vector4d::Zero();
+ ASSERT_EQ(solver->Solve(b.data(), x.data(), &error_string),
+ LinearSolverTerminationType::SUCCESS);
+ // A single step of the mixed precision solver will be equivalent to solving
+ // in low precision (FP32). Hence the tolerance is defined w.r.t. FP32 epsilon
+ // instead of FP64 epsilon.
+ static const double kEpsilon = std::numeric_limits<float>::epsilon() * 15.0f;
+ EXPECT_NEAR(x(0), 113.75 / 3.0, kEpsilon);
+ EXPECT_NEAR(x(1), -31.0 / 3.0, kEpsilon);
+ EXPECT_NEAR(x(2), 5.0 / 3.0, kEpsilon);
+ EXPECT_NEAR(x(3), 1.0000, kEpsilon);
+}
+
+// Tests the CUDA Cholesky solver with a simple 4x4 matrix.
+TEST(CUDADenseCholeskyMixedPrecision, Cholesky4x4Matrix4Steps) {
+ Eigen::Matrix4d A;
+ // clang-format off
+ A << 4, 12, -16, 0,
+ 12, 37, -43, 0,
+ -16, -43, 98, 0,
+ 0, 0, 0, 1;
+ // clang-format on
+
+ const Eigen::Vector4d b = Eigen::Vector4d::Ones();
+ LinearSolver::Options options;
+ options.max_num_refinement_iterations = 3;
+ ContextImpl context;
+ options.context = &context;
+ options.dense_linear_algebra_library_type = CUDA;
+ options.use_mixed_precision_solves = true;
+ auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
+ ASSERT_NE(solver, nullptr);
+ std::string error_string;
+ ASSERT_EQ(solver->Factorize(A.cols(), A.data(), &error_string),
+ LinearSolverTerminationType::SUCCESS);
+ Eigen::Vector4d x = Eigen::Vector4d::Zero();
+ ASSERT_EQ(solver->Solve(b.data(), x.data(), &error_string),
+ LinearSolverTerminationType::SUCCESS);
+ // The error does not reduce beyond four iterations, and stagnates at this
+ // level of precision.
+ static const double kEpsilon = std::numeric_limits<double>::epsilon() * 3e2;
+ EXPECT_NEAR(x(0), 113.75 / 3.0, kEpsilon);
+ EXPECT_NEAR(x(1), -31.0 / 3.0, kEpsilon);
+ EXPECT_NEAR(x(2), 5.0 / 3.0, kEpsilon);
+ EXPECT_NEAR(x(3), 1.0000, kEpsilon);
+}
+
+
+
+TEST(CUDADenseCholeskyMixedPrecision, Randomized1600x1600Tests) {
+ const int kNumCols = 1600;
+ using LhsType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>;
+ using RhsType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+ using SolutionType = Eigen::Matrix<double, Eigen::Dynamic, 1>;
+
+ LinearSolver::Options options;
+ ContextImpl context;
+ options.context = &context;
+ options.dense_linear_algebra_library_type = ceres::CUDA;
+ options.use_mixed_precision_solves = true;
+ options.max_num_refinement_iterations = 20;
+ std::unique_ptr<CUDADenseCholeskyMixedPrecision> dense_cholesky =
+ CUDADenseCholeskyMixedPrecision::Create(options);
+
+ const int kNumTrials = 20;
+ for (int i = 0; i < kNumTrials; ++i) {
+ LhsType lhs = LhsType::Random(kNumCols, kNumCols);
+ lhs = lhs.transpose() * lhs;
+ lhs += 1e-3 * LhsType::Identity(kNumCols, kNumCols);
+ SolutionType x_expected = SolutionType::Random(kNumCols);
+ RhsType rhs = lhs * x_expected;
+ SolutionType x_computed = SolutionType::Zero(kNumCols);
+ // Sanity check the random matrix sizes.
+ EXPECT_EQ(lhs.rows(), kNumCols);
+ EXPECT_EQ(lhs.cols(), kNumCols);
+ EXPECT_EQ(rhs.rows(), kNumCols);
+ EXPECT_EQ(rhs.cols(), 1);
+ EXPECT_EQ(x_expected.rows(), kNumCols);
+ EXPECT_EQ(x_expected.cols(), 1);
+ EXPECT_EQ(x_computed.rows(), kNumCols);
+ EXPECT_EQ(x_computed.cols(), 1);
+ LinearSolver::Summary summary;
+ summary.termination_type = dense_cholesky->FactorAndSolve(
+ kNumCols, lhs.data(), rhs.data(), x_computed.data(), &summary.message);
+ ASSERT_EQ(summary.termination_type, LinearSolverTerminationType::SUCCESS);
+ static const double kEpsilon = std::numeric_limits<double>::epsilon() * 1e6;
+ ASSERT_NEAR(
+ (x_computed - x_expected).norm() / x_expected.norm(), 0.0, kEpsilon);
}
}
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index a10f311..e2c5036 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -38,6 +38,7 @@
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
+#include "ceres/ceres_cuda_kernels.h"
#include "ceres/context_impl.h"
#include "cuda_runtime.h"
#include "cusolverDn.h"
@@ -68,12 +69,16 @@
std::unique_ptr<DenseCholesky> dense_cholesky;
switch (options.dense_linear_algebra_library_type) {
- case EIGEN:
+ case EIGEN: {
+ // Eigen mixed precision solver not yet implemented.
+ if (options.use_mixed_precision_solves) return nullptr;
dense_cholesky = std::make_unique<EigenDenseCholesky>();
- break;
+ } break;
case LAPACK:
#ifndef CERES_NO_LAPACK
+ // LAPACK mixed precision solver not yet implemented.
+ if (options.use_mixed_precision_solves) return nullptr;
dense_cholesky = std::make_unique<LAPACKDenseCholesky>();
break;
#else
@@ -82,7 +87,11 @@
case CUDA:
#ifndef CERES_NO_CUDA
- dense_cholesky = CUDADenseCholesky::Create(options);
+ if (options.use_mixed_precision_solves) {
+ dense_cholesky = CUDADenseCholeskyMixedPrecision::Create(options);
+ } else {
+ dense_cholesky = CUDADenseCholesky::Create(options);
+ }
break;
#else
LOG(FATAL) << "Ceres was compiled without support for CUDA.";
@@ -320,6 +329,210 @@
return nullptr;
}
+std::unique_ptr<CUDADenseCholeskyMixedPrecision>
+ CUDADenseCholeskyMixedPrecision::Create(
+ const LinearSolver::Options& options) {
+ if (options.dense_linear_algebra_library_type != CUDA ||
+ !options.use_mixed_precision_solves) {
+ // The user called the wrong factory method.
+ return nullptr;
+ }
+ auto solver = std::unique_ptr<CUDADenseCholeskyMixedPrecision>(
+ new CUDADenseCholeskyMixedPrecision());
+ std::string cuda_error;
+ if (solver->Init(options, &cuda_error)) {
+ return solver;
+ }
+ LOG(ERROR) << "CUDADenseCholeskyMixedPrecision::Init failed: " << cuda_error;
+ return nullptr;
+}
+
+bool CUDADenseCholeskyMixedPrecision::Init(
+ const LinearSolver::Options& options, std::string* message) {
+ if (!options.context->InitCUDA(message)) {
+ return false;
+ }
+ cusolver_handle_ = options.context->cusolver_handle_;
+ cublas_handle_ = options.context->cublas_handle_;
+ stream_ = options.context->stream_;
+ error_.Reserve(1);
+ max_num_refinement_iterations_ = options.max_num_refinement_iterations;
+ *message = "CUDADenseCholeskyMixedPrecision::Init Success.";
+ return true;
+}
+
+LinearSolverTerminationType
+CUDADenseCholeskyMixedPrecision::CudaCholeskyFactorize(std::string* message) {
+ int device_workspace_size = 0;
+ if (cusolverDnSpotrf_bufferSize(cusolver_handle_,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols_,
+ lhs_fp32_.data(),
+ num_cols_,
+ &device_workspace_size) !=
+ CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnSpotrf_bufferSize failed.";
+ return LinearSolverTerminationType::FATAL_ERROR;
+ }
+ device_workspace_.Reserve(device_workspace_size);
+ if (cusolverDnSpotrf(cusolver_handle_,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols_,
+ lhs_fp32_.data(),
+ num_cols_,
+ device_workspace_.data(),
+ device_workspace_.size(),
+ error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnSpotrf failed.";
+ return LinearSolverTerminationType::FATAL_ERROR;
+ }
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_) != cudaSuccess) {
+ *message = "Cuda device synchronization failed.";
+ return LinearSolverTerminationType::FATAL_ERROR;
+ }
+ int error = 0;
+ error_.CopyToHost(&error, 1);
+ if (error < 0) {
+ LOG(FATAL) << "Congratulations, you found a bug in Ceres - "
+ << "please report it. "
+ << "cuSolverDN::cusolverDnSpotrf fatal error. "
+ << "Argument: " << -error << " is invalid.";
+ // The following line is unreachable, but return failure just to be
+ // pedantic, since the compiler does not know that.
+ return LinearSolverTerminationType::FATAL_ERROR;
+ }
+ if (error > 0) {
+ *message = StringPrintf(
+ "cuSolverDN::cusolverDnSpotrf numerical failure. "
+ "The leading minor of order %d is not positive definite.",
+ error);
+ factorize_result_ = LinearSolverTerminationType::FAILURE;
+ return LinearSolverTerminationType::FAILURE;
+ }
+ *message = "Success";
+ return LinearSolverTerminationType::SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::CudaCholeskySolve(
+ std::string* message) {
+ CHECK_EQ(cudaMemcpyAsync(correction_fp32_.data(),
+ residual_fp32_.data(),
+ num_cols_ * sizeof(float),
+ cudaMemcpyDeviceToDevice,
+ stream_),
+ cudaSuccess);
+ if (cusolverDnSpotrs(cusolver_handle_,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols_,
+ 1,
+ lhs_fp32_.data(),
+ num_cols_,
+ correction_fp32_.data(),
+ num_cols_,
+ error_.data()) != CUSOLVER_STATUS_SUCCESS) {
+ *message = "cuSolverDN::cusolverDnDpotrs failed.";
+ return LinearSolverTerminationType::FATAL_ERROR;
+ }
+ if (cudaDeviceSynchronize() != cudaSuccess ||
+ cudaStreamSynchronize(stream_) != cudaSuccess) {
+ *message = "Cuda device synchronization failed.";
+ return LinearSolverTerminationType::FATAL_ERROR;
+ }
+ int error = 0;
+ error_.CopyToHost(&error, 1);
+ if (error != 0) {
+ LOG(FATAL) << "Congratulations, you found a bug in Ceres. "
+ << "Please report it."
+ << "cuSolverDN::cusolverDnDpotrs fatal error. "
+ << "Argument: " << -error << " is invalid.";
+ }
+ *message = "Success";
+ return LinearSolverTerminationType::SUCCESS;
+}
+
+LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Factorize(
+ int num_cols,
+ double* lhs,
+ std::string* message) {
+ num_cols_ = num_cols;
+
+ // Copy fp64 version of lhs to GPU.
+ lhs_fp64_.Reserve(num_cols * num_cols);
+ lhs_fp64_.CopyToGpuAsync(lhs, num_cols * num_cols, stream_);
+
+ // Create an fp32 copy of lhs, lhs_fp32.
+ lhs_fp32_.Reserve(num_cols * num_cols);
+ CudaFP64ToFP32(
+ lhs_fp64_.data(), lhs_fp32_.data(), num_cols * num_cols, stream_);
+
+ // Factorize lhs_fp32.
+ factorize_result_ = CudaCholeskyFactorize(message);
+ return factorize_result_;
+}
+
+LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Solve(
+ const double* rhs,
+ double* solution,
+ std::string* message) {
+ // If factorization failed, return failure.
+ if (factorize_result_ != LinearSolverTerminationType::SUCCESS) {
+ *message = "Factorize did not complete successfully previously.";
+ return factorize_result_;
+ }
+
+ // Reserve memory for all arrays.
+ rhs_fp64_.Reserve(num_cols_);
+ x_fp64_.Reserve(num_cols_);
+ correction_fp32_.Reserve(num_cols_);
+ residual_fp32_.Reserve(num_cols_);
+ residual_fp64_.Reserve(num_cols_);
+
+ // Initialize x = 0.
+ CudaSetZeroFP64(x_fp64_.data(), num_cols_, stream_);
+
+ // Initialize residual = rhs.
+ rhs_fp64_.CopyToGpuAsync(rhs, num_cols_, stream_);
+ residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_);
+
+ for (int i = 0; i <= max_num_refinement_iterations_; ++i) {
+ // Cast residual from fp64 to fp32.
+ CudaFP64ToFP32(
+ residual_fp64_.data(), residual_fp32_.data(), num_cols_, stream_);
+ // [fp32] c = lhs^-1 * residual.
+ auto result = CudaCholeskySolve(message);
+ if (result != LinearSolverTerminationType::SUCCESS) {
+ return result;
+ }
+ // [fp64] x += c.
+ CudaDsxpy(
+ x_fp64_.data(), correction_fp32_.data(), num_cols_, stream_);
+ if (i < max_num_refinement_iterations_) {
+ // [fp64] residual = rhs - lhs * x
+ // This is done in two steps:
+ // 1. [fp64] residual = rhs
+ residual_fp64_.CopyFromGpuAsync(rhs_fp64_.data(), num_cols_, stream_);
+ // 2. [fp64] residual = residual - lhs * x
+ double alpha = -1.0;
+ double beta = 1.0;
+ cublasDsymv(cublas_handle_,
+ CUBLAS_FILL_MODE_LOWER,
+ num_cols_,
+ &alpha,
+ lhs_fp64_.data(),
+ num_cols_,
+ x_fp64_.data(),
+ 1,
+ &beta,
+ residual_fp64_.data(),
+ 1);
+ }
+ }
+ x_fp64_.CopyToHost(solution, num_cols_);
+ *message = "Success.";
+ return LinearSolverTerminationType::SUCCESS;
+}
+
#endif // CERES_NO_CUDA
} // namespace ceres::internal
diff --git a/internal/ceres/dense_cholesky.h b/internal/ceres/dense_cholesky.h
index cc8642c..0593875 100644
--- a/internal/ceres/dense_cholesky.h
+++ b/internal/ceres/dense_cholesky.h
@@ -176,6 +176,78 @@
LinearSolverTerminationType::FATAL_ERROR;
};
+// A mixed-precision iterative refinement dense Cholesky solver using FP32 CUDA
+// Dense Cholesky for inner iterations, and FP64 outer refinements.
+// This class implements a modified version of the "Classical iterative
+// refinement" (Algorithm 4.1) from the following paper:
+// Haidar, Azzam, Harun Bayraktar, Stanimire Tomov, Jack Dongarra, and Nicholas
+// J. Higham. "Mixed-precision iterative refinement using tensor cores on GPUs
+// to accelerate solution of linear systems." Proceedings of the Royal Society A
+// 476, no. 2243 (2020): 20200110.
+//
+// The three key modifications from Algorithm 4.1 in the paper are:
+// 1. We use Cholesky factorization instead of LU factorization since our A is
+// symmetric positive definite.
+// 2. During the solution update, the up-cast and accumulation is performed in
+// one step with a custom kernel.
+class CERES_NO_EXPORT CUDADenseCholeskyMixedPrecision final :
+ public DenseCholesky {
+ public:
+ static std::unique_ptr<CUDADenseCholeskyMixedPrecision> Create(
+ const LinearSolver::Options& options);
+ CUDADenseCholeskyMixedPrecision(
+ const CUDADenseCholeskyMixedPrecision&) = delete;
+ CUDADenseCholeskyMixedPrecision& operator=(
+ const CUDADenseCholeskyMixedPrecision&) = delete;
+ LinearSolverTerminationType Factorize(int num_cols,
+ double* lhs,
+ std::string* message) override;
+ LinearSolverTerminationType Solve(const double* rhs,
+ double* solution,
+ std::string* message) override;
+
+ private:
+ CUDADenseCholeskyMixedPrecision() = default;
+ // Helper function to wrap Cuda boilerplate needed to call Spotrf.
+ LinearSolverTerminationType CudaCholeskyFactorize(std::string* message);
+ // Helper function to wrap Cuda boilerplate needed to call Spotrs.
+ LinearSolverTerminationType CudaCholeskySolve(std::string* message);
+ // Picks up the cuSolverDN and cuStream handles from the context in the
+ // options, and the number of refinement iterations from the options. If
+ // the context is unable to initialize CUDA, returns false with a
+ // human-readable message indicating the reason.
+ bool Init(const LinearSolver::Options& options, std::string* message);
+
+ cusolverDnHandle_t cusolver_handle_ = nullptr;
+ cublasHandle_t cublas_handle_ = nullptr;
+ cudaStream_t stream_ = nullptr;
+ // Number of columns in the A matrix, to be cached between calls to *Factorize
+ // and *Solve.
+ size_t num_cols_ = 0;
+ CudaBuffer<double> lhs_fp64_;
+ CudaBuffer<double> rhs_fp64_;
+ CudaBuffer<float> lhs_fp32_;
+ // Scratch space for cuSOLVER on the GPU.
+ CudaBuffer<float> device_workspace_;
+ // Required for error handling with cuSOLVER.
+ CudaBuffer<int> error_;
+
+ // Solution to lhs * x = rhs.
+ CudaBuffer<double> x_fp64_;
+ // Incremental correction to x.
+ CudaBuffer<float> correction_fp32_;
+ // Residual to iterative refinement.
+ CudaBuffer<float> residual_fp32_;
+ CudaBuffer<double> residual_fp64_;
+
+ // Number of inner refinement iterations to perform.
+ int max_num_refinement_iterations_ = 0;
+ // Cache the result of Factorize to ensure that when Solve is called, the
+ // factorization of lhs is valid.
+ LinearSolverTerminationType factorize_result_ =
+ LinearSolverTerminationType::FATAL_ERROR;
+};
+
#endif // CERES_NO_CUDA
} // namespace ceres::internal
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc
index 7866d5c..a425343 100644
--- a/internal/ceres/dense_cholesky_test.cc
+++ b/internal/ceres/dense_cholesky_test.cc
@@ -45,12 +45,20 @@
namespace ceres::internal {
-using Param = DenseLinearAlgebraLibraryType;
+
+using Param =
+ ::testing::tuple<DenseLinearAlgebraLibraryType, bool>;
+constexpr bool kMixedPrecision = true;
+constexpr bool kFullPrecision = false;
namespace {
std::string ParamInfoToString(testing::TestParamInfo<Param> info) {
- return DenseLinearAlgebraLibraryTypeToString(info.param);
+ Param param = info.param;
+ std::stringstream ss;
+ ss << DenseLinearAlgebraLibraryTypeToString(::testing::get<0>(param)) << "_"
+ << (::testing::get<1>(param) ? "MixedPrecision" : "FullPrecision");
+ return ss.str();
}
} // namespace
@@ -67,14 +75,18 @@
LinearSolver::Options options;
ContextImpl context;
options.context = &context;
- options.dense_linear_algebra_library_type = GetParam();
+ options.dense_linear_algebra_library_type = ::testing::get<0>(GetParam());
+ options.use_mixed_precision_solves = ::testing::get<1>(GetParam());
+ const int kNumRefinementSteps = 4;
+ if (options.use_mixed_precision_solves) {
+ options.max_num_refinement_iterations = kNumRefinementSteps;
+ }
std::unique_ptr<DenseCholesky> dense_cholesky =
DenseCholesky::Create(options);
const int kNumTrials = 10;
const int kMinNumCols = 1;
const int kMaxNumCols = 10;
-
for (int num_cols = kMinNumCols; num_cols < kMaxNumCols; ++num_cols) {
for (int trial = 0; trial < kNumTrials; ++trial) {
const MatrixType a = MatrixType::Random(num_cols, num_cols);
@@ -97,24 +109,70 @@
}
}
-namespace {
-
-// NOTE: preprocessor directives in a macro are not standard conforming
-decltype(auto) MakeValues() {
- return ::testing::Values(EIGEN
+INSTANTIATE_TEST_SUITE_P(
+ EigenCholesky,
+ DenseCholeskyTest,
+ ::testing::Combine(::testing::Values(EIGEN),
+ ::testing::Values(kFullPrecision)),
+ ParamInfoToString);
#ifndef CERES_NO_LAPACK
- ,
- LAPACK
+INSTANTIATE_TEST_SUITE_P(
+ LapackCholesky,
+ DenseCholeskyTest,
+ ::testing::Combine(::testing::Values(LAPACK),
+ ::testing::Values(kFullPrecision)),
+ ParamInfoToString);
#endif
#ifndef CERES_NO_CUDA
- ,
- CUDA
+INSTANTIATE_TEST_SUITE_P(
+ CudaCholesky,
+ DenseCholeskyTest,
+ ::testing::Combine(::testing::Values(CUDA),
+ ::testing::Values(kMixedPrecision,
+ kFullPrecision)),
+ ParamInfoToString);
#endif
- );
+
+TEST(DenseCholesky, ValidMixedPrecisionOptions) {
+#ifndef CERES_NO_CUDA
+ {
+ // Dense Cholesky with CUDA: okay, supported.
+ ContextImpl context;
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = CUDA;
+ options.use_mixed_precision_solves = true;
+ options.context = &context;
+ std::unique_ptr<DenseCholesky> dense_cholesky =
+ DenseCholesky::Create(options);
+ EXPECT_NE(dense_cholesky, nullptr);
+ }
+#endif
}
-} // namespace
+TEST(DenseCholesky, InvalidMixedPrecisionOptions) {
+ {
+ // Dense Cholesky with Eigen: not supported
+ ContextImpl context;
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = EIGEN;
+ options.use_mixed_precision_solves = true;
+ options.context = &context;
+ std::unique_ptr<DenseCholesky> dense_cholesky =
+ DenseCholesky::Create(options);
+ EXPECT_EQ(dense_cholesky, nullptr);
+ }
-INSTANTIATE_TEST_SUITE_P(_, DenseCholeskyTest, MakeValues(), ParamInfoToString);
+ {
+ // Dense Cholesky with Lapack: not supported
+ ContextImpl context;
+ LinearSolver::Options options;
+ options.dense_linear_algebra_library_type = LAPACK;
+ options.use_mixed_precision_solves = true;
+ options.context = &context;
+ std::unique_ptr<DenseCholesky> dense_cholesky =
+ DenseCholesky::Create(options);
+ EXPECT_EQ(dense_cholesky, nullptr);
+ }
+}
} // namespace ceres::internal
diff --git a/internal/ceres/solver.cc b/internal/ceres/solver.cc
index 9503843..93551f7 100644
--- a/internal/ceres/solver.cc
+++ b/internal/ceres/solver.cc
@@ -118,6 +118,48 @@
internal::EigenSparse::IsNestedDissectionAvailable()));
}
+bool MixedPrecisionOptionIsValid(const Solver::Options& options,
+ string* error) {
+ if (options.use_mixed_precision_solves) {
+ if ((options.linear_solver_type == DENSE_NORMAL_CHOLESKY ||
+ options.linear_solver_type == DENSE_SCHUR) &&
+ options.dense_linear_algebra_library_type == CUDA) {
+ // Mixed precision with CUDA and dense Cholesky variant: okay.
+ return true;
+ }
+ if ((options.linear_solver_type == SPARSE_NORMAL_CHOLESKY ||
+ options.linear_solver_type == SPARSE_SCHUR) &&
+ (options.sparse_linear_algebra_library_type == EIGEN_SPARSE ||
+ options.sparse_linear_algebra_library_type == ACCELERATE_SPARSE)) {
+ // Mixed precision with any Eigen or Accelerate Cholesky variant: okay.
+ return true;
+ }
+ // No other mixed precision variants are supported.
+ if (options.linear_solver_type == DENSE_NORMAL_CHOLESKY ||
+ options.linear_solver_type == DENSE_SCHUR) {
+ *error = StringPrintf(
+ "use_mixed_precision_solves with %s is only supported with "
+ "CUDA as the dense_linear_algebra_library_type.",
+ LinearSolverTypeToString(options.linear_solver_type));
+ return false;
+ }
+ if ((options.linear_solver_type == SPARSE_NORMAL_CHOLESKY ||
+ options.linear_solver_type == SPARSE_SCHUR) &&
+ options.sparse_linear_algebra_library_type == SUITE_SPARSE) {
+ *error = StringPrintf(
+ "use_mixed_precision_solves with %s is not supported with "
+ "SUITE_SPARSE as the sparse_linear_algebra_library_type.",
+ LinearSolverTypeToString(options.linear_solver_type));
+ return false;
+ }
+ *error = StringPrintf(
+ "use_mixed_precision_solves with %s is not supported.",
+ LinearSolverTypeToString(options.linear_solver_type));
+ return false;
+ }
+ return true;
+}
+
bool TrustRegionOptionsAreValid(const Solver::Options& options, string* error) {
OPTION_GT(initial_trust_region_radius, 0.0);
OPTION_GT(min_trust_region_radius, 0.0);
@@ -228,6 +270,10 @@
}
}
+ if (!MixedPrecisionOptionIsValid(options, error)) {
+ return false;
+ }
+
if (options.trust_region_strategy_type == DOGLEG) {
if (options.linear_solver_type == ITERATIVE_SCHUR ||
options.linear_solver_type == CGNR) {
@@ -372,7 +418,7 @@
&(summary->inner_iteration_ordering_given));
// clang-format off
- summary->dense_linear_algebra_library_type = options.dense_linear_algebra_library_type;
+ summary->dense_linear_algebra_library_type = options.dense_linear_algebra_library_type;
summary->dogleg_type = options.dogleg_type;
summary->inner_iteration_time_in_seconds = 0.0;
summary->num_line_search_steps = 0;
@@ -381,18 +427,18 @@
summary->line_search_polynomial_minimization_time_in_seconds = 0.0;
summary->line_search_total_time_in_seconds = 0.0;
summary->inner_iterations_given = options.use_inner_iterations;
- summary->line_search_direction_type = options.line_search_direction_type;
- summary->line_search_interpolation_type = options.line_search_interpolation_type;
+ summary->line_search_direction_type = options.line_search_direction_type;
+ summary->line_search_interpolation_type = options.line_search_interpolation_type;
summary->line_search_type = options.line_search_type;
summary->linear_solver_type_given = options.linear_solver_type;
summary->max_lbfgs_rank = options.max_lbfgs_rank;
summary->minimizer_type = options.minimizer_type;
- summary->nonlinear_conjugate_gradient_type = options.nonlinear_conjugate_gradient_type;
+ summary->nonlinear_conjugate_gradient_type = options.nonlinear_conjugate_gradient_type;
summary->num_threads_given = options.num_threads;
summary->preconditioner_type_given = options.preconditioner_type;
summary->sparse_linear_algebra_library_type = options.sparse_linear_algebra_library_type;
- summary->linear_solver_ordering_type = options.linear_solver_ordering_type;
- summary->trust_region_strategy_type = options.trust_region_strategy_type;
+ summary->linear_solver_ordering_type = options.linear_solver_ordering_type;
+ summary->trust_region_strategy_type = options.trust_region_strategy_type;
summary->visibility_clustering_type = options.visibility_clustering_type;
// clang-format on
}
@@ -408,8 +454,9 @@
&(summary->inner_iteration_ordering_used));
// clang-format off
- summary->inner_iterations_used = pp.inner_iteration_minimizer != nullptr;
+ summary->inner_iterations_used = pp.inner_iteration_minimizer != nullptr;
summary->linear_solver_type_used = pp.linear_solver_options.type;
+ summary->mixed_precision_solves_used = pp.options.use_mixed_precision_solves;
summary->num_threads_used = pp.options.num_threads;
summary->preconditioner_type_used = pp.options.preconditioner_type;
// clang-format on
@@ -699,10 +746,13 @@
if (linear_solver_type_used == DENSE_NORMAL_CHOLESKY ||
linear_solver_type_used == DENSE_SCHUR ||
linear_solver_type_used == DENSE_QR) {
+ const char* mixed_precision_suffix =
+ (mixed_precision_solves_used ? "(Mixed Precision)" : "");
StringAppendF(&report,
- "\nDense linear algebra library %15s\n",
+ "\nDense linear algebra library %15s %s\n",
DenseLinearAlgebraLibraryTypeToString(
- dense_linear_algebra_library_type));
+ dense_linear_algebra_library_type),
+ mixed_precision_suffix);
}
StringAppendF(&report,
@@ -723,12 +773,15 @@
(linear_solver_type_used == ITERATIVE_SCHUR &&
(preconditioner_type_used == CLUSTER_JACOBI ||
preconditioner_type_used == CLUSTER_TRIDIAGONAL))) {
+ const char* mixed_precision_suffix =
+ (mixed_precision_solves_used ? "(Mixed Precision)" : "");
StringAppendF(
&report,
- "\nSparse linear algebra library %15s + %s\n",
+ "\nSparse linear algebra library %15s + %s %s\n",
SparseLinearAlgebraLibraryTypeToString(
sparse_linear_algebra_library_type),
- LinearSolverOrderingTypeToString(linear_solver_ordering_type));
+ LinearSolverOrderingTypeToString(linear_solver_ordering_type),
+ mixed_precision_suffix);
}
StringAppendF(&report, "\n");