Add CUDA GPU and Runtime Detection
* Added GPU device and CUDA compute capability identification.
* Added GpuMemoryAvailable() to aid downstream optimizations based on
  GPU memory availability; a brief usage sketch follows below.
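
For reference, a minimal usage sketch (not part of the patch) of the new
ContextImpl queries; the helper name and the 2 GiB threshold are
illustrative assumptions:

  #include <cstddef>
  #include <string>
  #include "ceres/context_impl.h"
  #include "glog/logging.h"

  // Sketch: decide whether a hypothetical memory-hungry GPU optimization
  // can be enabled, using the queries added in this change.
  bool EnableGpuOptimization(ceres::internal::ContextImpl* context) {
    std::string message;
    if (!context->InitCuda(&message)) {
      LOG(WARNING) << "CUDA unavailable: " << message;
      return false;  // Fall back to the CPU path.
    }
    // Logs device name, compute capability, CUDA version, memory, etc.
    VLOG(3) << context->CudaConfigAsString();
    // Gate the optimization on free GPU memory (threshold is arbitrary).
    constexpr size_t kTwoGiB = size_t{2} << 30;
    return context->GpuMemoryAvailable() >= kTwoGiB;
  }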
Change-Id: I326dc1e4b7a6a7f5571b7e5479eb9aa300ad1075
diff --git a/internal/ceres/cgnr_solver.cc b/internal/ceres/cgnr_solver.cc
index 64e018d..5374f51 100644
--- a/internal/ceres/cgnr_solver.cc
+++ b/internal/ceres/cgnr_solver.cc
@@ -253,7 +253,7 @@
". ";
return nullptr;
}
- CHECK(options.context->IsCUDAInitialized())
+ CHECK(options.context->IsCudaInitialized())
<< "CudaCgnrSolver requires CUDA initialization.";
auto solver = std::make_unique<CudaCgnrSolver>(options);
return solver;
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc
index 9f6cc25..a46e760 100644
--- a/internal/ceres/context_impl.cc
+++ b/internal/ceres/context_impl.cc
@@ -33,6 +33,7 @@
#include <string>
#include "ceres/internal/config.h"
+#include "ceres/stringprintf.h"
#include "ceres/wall_time.h"
#ifndef CERES_NO_CUDA
@@ -66,32 +67,84 @@
is_cuda_initialized_ = false;
}
-bool ContextImpl::InitCUDA(std::string* message) {
+std::string ContextImpl::CudaConfigAsString() const {
+ return ceres::internal::StringPrintf(
+ "======================= CUDA Device Properties ======================\n"
+ "Cuda version : %d.%d\n"
+ "Device ID : %d\n"
+ "Device name : %s\n"
+ "Total GPU memory : %6.f MiB\n"
+ "GPU memory available : %6.f MiB\n"
+ "Compute capability : %d.%d\n"
+ "Warp size : %d\n"
+ "Max threads per block: %d\n"
+ "Max threads per dim : %d %d %d\n"
+ "Max grid size : %d %d %d\n"
+ "Multiprocessor count : %d\n"
+ "====================================================================",
+ cuda_version_major_,
+ cuda_version_minor_,
+ gpu_device_id_in_use_,
+ gpu_device_properties_.name,
+ gpu_device_properties_.totalGlobalMem / 1024.0 / 1024.0,
+ GpuMemoryAvailable() / 1024.0 / 1024.0,
+ gpu_device_properties_.major,
+ gpu_device_properties_.minor,
+ gpu_device_properties_.warpSize,
+ gpu_device_properties_.maxThreadsPerBlock,
+ gpu_device_properties_.maxThreadsDim[0],
+ gpu_device_properties_.maxThreadsDim[1],
+ gpu_device_properties_.maxThreadsDim[2],
+ gpu_device_properties_.maxGridSize[0],
+ gpu_device_properties_.maxGridSize[1],
+ gpu_device_properties_.maxGridSize[2],
+ gpu_device_properties_.multiProcessorCount);
+}
+
+size_t ContextImpl::GpuMemoryAvailable() const {
+ size_t free, total;
+ cudaMemGetInfo(&free, &total);
+ return free;
+}
+
+bool ContextImpl::InitCuda(std::string* message) {
if (is_cuda_initialized_) {
return true;
}
+ CHECK_EQ(cudaGetDevice(&gpu_device_id_in_use_), cudaSuccess);
+ int cuda_version;
+ CHECK_EQ(cudaRuntimeGetVersion(&cuda_version), cudaSuccess);
+ cuda_version_major_ = cuda_version / 1000;
+ cuda_version_minor_ = (cuda_version % 1000) / 10;
+ CHECK_EQ(cudaGetDeviceProperties(&gpu_device_properties_,
+ gpu_device_id_in_use_), cudaSuccess);
+ VLOG(3) << "\n" << CudaConfigAsString();
EventLogger event_logger("InitCuda");
if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
- *message = "cuBLAS::cublasCreate failed.";
+ *message = "CUDA initialization failed because "
+ "cuBLAS::cublasCreate failed.";
cublas_handle_ = nullptr;
return false;
}
event_logger.AddEvent("cublasCreate");
if (cusolverDnCreate(&cusolver_handle_) != CUSOLVER_STATUS_SUCCESS) {
- *message = "cuSolverDN::cusolverDnCreate failed.";
+ *message = "CUDA initialization failed because "
+ "cuSolverDN::cusolverDnCreate failed.";
TearDown();
return false;
}
event_logger.AddEvent("cusolverDnCreate");
if (cusparseCreate(&cusparse_handle_) != CUSPARSE_STATUS_SUCCESS) {
- *message = "cuSPARSE::cusparseCreate failed.";
+ *message = "CUDA initialization failed because "
+ "cuSPARSE::cusparseCreate failed.";
TearDown();
return false;
}
event_logger.AddEvent("cusparseCreate");
if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) !=
cudaSuccess) {
- *message = "CUDA::cudaStreamCreateWithFlags failed.";
+ *message = "CUDA initialization failed because "
+ "CUDA::cudaStreamCreateWithFlags failed.";
TearDown();
return false;
}
@@ -100,7 +153,7 @@
CUSOLVER_STATUS_SUCCESS ||
cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS ||
cusparseSetStream(cusparse_handle_, stream_) != CUSPARSE_STATUS_SUCCESS) {
- *message = "CUDA [Solver|BLAS|Sparse] SetStream failed.";
+ *message = "CUDA initialization failed because SetStream failed.";
TearDown();
return false;
}
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index 4bd18e1..9eb59eb 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -72,19 +72,39 @@
#endif // CERES_USE_CXX_THREADS
#ifndef CERES_NO_CUDA
- // Initializes the cuSolverDN context, creates an asynchronous stream, and
- // associates the stream with cuSolverDN. Returns true iff initialization was
- // successful, else it returns false and a human-readable error message is
- // returned.
- bool InitCUDA(std::string* message);
+ // Note on Ceres' use of CUDA Devices on multi-GPU systems:
+ // 1. On a multi-GPU system, if nothing special is done, the "default" CUDA
+ // device will be used, which is device 0.
+ // 2. If the user masks out GPUs using the CUDA_VISIBLE_DEVICES environment
+ //    variable, Ceres will still use the device visible to the program as
+ //    device 0, which is the first GPU listed in the environment variable.
+ // 3. If the user explicitly selects a GPU in the host process before calling
+ // Ceres, Ceres will use that GPU.
+
+ // Initializes cuBLAS, cuSOLVER, and cuSPARSE contexts, creates an
+ // asynchronous CUDA stream, and associates the stream with the contexts.
+ // Returns true iff initialization was successful; otherwise returns false
+ // and stores a human-readable error message in *message.
+ bool InitCuda(std::string* message);
void TearDown();
- inline bool IsCUDAInitialized() const { return is_cuda_initialized_; }
+ inline bool IsCudaInitialized() const { return is_cuda_initialized_; }
+ // Returns a human-readable string describing the capabilities of the current
+ // CUDA device. CudaConfigAsString can only be called after InitCuda has been
+ // called.
+ std::string CudaConfigAsString() const;
+ // Returns the number of bytes of available global memory on the current CUDA
+ // device. If it is called before InitCuda, it returns 0.
+ size_t GpuMemoryAvailable() const;
cusolverDnHandle_t cusolver_handle_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr;
cudaStream_t stream_ = nullptr;
cusparseHandle_t cusparse_handle_ = nullptr;
bool is_cuda_initialized_ = false;
+ int gpu_device_id_in_use_ = -1;
+ cudaDeviceProp gpu_device_properties_;
+ int cuda_version_major_ = 0;
+ int cuda_version_minor_ = 0;
#endif // CERES_NO_CUDA
};
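
As a sketch of the device-selection note above (not part of the patch): a
host process can pick a GPU via the CUDA runtime before Ceres initializes
CUDA, and InitCuda will pick up and report that selection. The choice of
device index and the error handling are illustrative:

  #include <string>
  #include <cuda_runtime.h>
  #include "ceres/context_impl.h"
  #include "glog/logging.h"

  // Sketch: explicitly select the second visible GPU, if present, before
  // Ceres initializes its CUDA handles.
  void InitCeresOnSecondGpu(ceres::internal::ContextImpl* context) {
    int device_count = 0;
    CHECK_EQ(cudaGetDeviceCount(&device_count), cudaSuccess);
    if (device_count > 1) {
      CHECK_EQ(cudaSetDevice(1), cudaSuccess);
    }
    std::string message;
    // InitCuda queries the current device via cudaGetDevice and reports it
    // through CudaConfigAsString at VLOG(3).
    CHECK(context->InitCuda(&message)) << message;
  }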
diff --git a/internal/ceres/cuda_dense_cholesky_test.cc b/internal/ceres/cuda_dense_cholesky_test.cc
index 1483923..4c5742e 100644
--- a/internal/ceres/cuda_dense_cholesky_test.cc
+++ b/internal/ceres/cuda_dense_cholesky_test.cc
@@ -45,7 +45,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
EXPECT_EQ(dense_cuda_solver, nullptr);
}
@@ -65,7 +65,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -94,7 +94,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -116,7 +116,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -131,7 +131,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseCholesky::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -150,7 +150,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
std::unique_ptr<DenseCholesky> dense_cholesky =
CUDADenseCholesky::Create(options);
@@ -189,7 +189,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
ASSERT_EQ(solver, nullptr);
}
@@ -199,7 +199,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
ASSERT_EQ(solver, nullptr);
@@ -224,7 +224,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
options.use_mixed_precision_solves = true;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
@@ -261,7 +261,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
options.use_mixed_precision_solves = true;
auto solver = CUDADenseCholeskyMixedPrecision::Create(options);
@@ -291,7 +291,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
options.use_mixed_precision_solves = true;
options.max_num_refinement_iterations = 20;
diff --git a/internal/ceres/cuda_dense_qr_test.cc b/internal/ceres/cuda_dense_qr_test.cc
index cc0dbd6..798f12a 100644
--- a/internal/ceres/cuda_dense_qr_test.cc
+++ b/internal/ceres/cuda_dense_qr_test.cc
@@ -44,7 +44,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
auto dense_cuda_solver = CUDADenseQR::Create(options);
EXPECT_EQ(dense_cuda_solver, nullptr);
}
@@ -63,7 +63,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseQR::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -97,7 +97,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseQR::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -121,7 +121,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = CUDA;
auto dense_cuda_solver = CUDADenseQR::Create(options);
ASSERT_NE(dense_cuda_solver, nullptr);
@@ -141,7 +141,7 @@
ContextImpl context;
options.context = &context;
std::string error;
- EXPECT_TRUE(context.InitCUDA(&error)) << error;
+ EXPECT_TRUE(context.InitCuda(&error)) << error;
options.dense_linear_algebra_library_type = ceres::CUDA;
std::unique_ptr<DenseQR> dense_qr = CUDADenseQR::Create(options);
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index e366112..1d8d0b0 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -61,7 +61,7 @@
CudaSparseMatrix::CudaSparseMatrix(
ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) {
DCHECK_NE(context, nullptr);
- CHECK(context->IsCUDAInitialized());
+ CHECK(context->IsCudaInitialized());
context_ = context;
num_rows_ = crs_matrix.num_rows();
num_cols_ = crs_matrix.num_cols();
diff --git a/internal/ceres/cuda_sparse_matrix_test.cc b/internal/ceres/cuda_sparse_matrix_test.cc
index ae76b8f..a12ad40 100644
--- a/internal/ceres/cuda_sparse_matrix_test.cc
+++ b/internal/ceres/cuda_sparse_matrix_test.cc
@@ -51,8 +51,8 @@
protected:
void SetUp() final {
std::string message;
- CHECK(context_.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context_.InitCuda(&message))
+ << "InitCuda() failed because: " << message;
std::unique_ptr<LinearLeastSquaresProblem> problem =
CreateLinearLeastSquaresProblemFromId(2);
CHECK(problem != nullptr);
@@ -121,7 +121,7 @@
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
auto A1_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A1);
CudaSparseMatrix A_gpu(&context, *A1_crs);
CudaVector b_gpu(&context, A1.num_cols());
@@ -159,7 +159,7 @@
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
CudaSparseMatrix A_gpu(&context, *A_crs);
CudaVector b_gpu(&context, A.num_cols());
@@ -189,7 +189,7 @@
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
CudaSparseMatrix A_gpu(&context, *A_crs);
CudaVector b_gpu(&context, A.num_rows());
@@ -244,7 +244,7 @@
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
CudaSparseMatrix A_gpu(&context, *A_crs);
CudaVector b_gpu(&context, N);
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index 7bac13a..46e6cb2 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -52,7 +52,7 @@
CudaVector::CudaVector(ContextImpl* context, int size) {
DCHECK_NE(context, nullptr);
- CHECK(context->IsCUDAInitialized());
+ CHECK(context->IsCudaInitialized());
context_ = context;
Resize(size);
}
diff --git a/internal/ceres/cuda_vector_test.cc b/internal/ceres/cuda_vector_test.cc
index 84193c0..db1fec5 100644
--- a/internal/ceres/cuda_vector_test.cc
+++ b/internal/ceres/cuda_vector_test.cc
@@ -45,7 +45,7 @@
TEST(CudaVector, Creation) {
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x(&context, 1000);
EXPECT_EQ(x.num_rows(), 1000);
EXPECT_NE(x.data().data(), nullptr);
@@ -56,7 +56,7 @@
x << 1, 2, 3;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector y(&context, 10);
y.CopyFromCpu(x);
EXPECT_EQ(y.num_rows(), 3);
@@ -72,7 +72,7 @@
x << 1, 2, 3;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 3);
x_gpu.CopyFromCpu(x);
@@ -94,7 +94,7 @@
y << 100, 10, 1;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
CudaVector y_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
@@ -109,7 +109,7 @@
x << 1, 2, 3;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
@@ -127,7 +127,7 @@
x << 1, 1, 1, 1;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
@@ -145,7 +145,7 @@
TEST(CudaVector, Resize) {
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
EXPECT_EQ(x_gpu.num_rows(), 10);
x_gpu.Resize(4);
@@ -159,7 +159,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -180,7 +180,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -201,7 +201,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -222,7 +222,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -241,7 +241,7 @@
x << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -262,7 +262,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
CudaVector z_gpu(&context, 4);
@@ -284,7 +284,7 @@
x << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector z_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -305,7 +305,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
CudaVector y_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
@@ -326,7 +326,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -345,7 +345,7 @@
x << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
@@ -366,7 +366,7 @@
D << 4, 3, 2, 1;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
CudaVector D_gpu(&context, 4);
@@ -387,7 +387,7 @@
x << 1, 2, 3, 4;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
CudaVector x_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index a3e578f..6d328ab 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -348,7 +348,7 @@
#ifndef CERES_NO_CUDA
bool CUDADenseCholesky::Init(ContextImpl* context, std::string* message) {
- CHECK(context->IsCUDAInitialized())
+ CHECK(context->IsCudaInitialized())
<< "CUDADenseCholesky requires CUDA initialization.";
cusolver_handle_ = context->cusolver_handle_;
stream_ = context->stream_;
@@ -490,7 +490,7 @@
bool CUDADenseCholeskyMixedPrecision::Init(const LinearSolver::Options& options,
std::string* message) {
- CHECK(options.context->IsCUDAInitialized())
+ CHECK(options.context->IsCudaInitialized())
<< "CUDADenseCholeskyMixedPrecision requires CUDA initialization.";
cusolver_handle_ = options.context->cusolver_handle_;
cublas_handle_ = options.context->cublas_handle_;
diff --git a/internal/ceres/dense_cholesky_test.cc b/internal/ceres/dense_cholesky_test.cc
index 2c4ca10..bb8a467 100644
--- a/internal/ceres/dense_cholesky_test.cc
+++ b/internal/ceres/dense_cholesky_test.cc
@@ -78,7 +78,7 @@
#ifndef CERES_NO_CUDA
options.context = &context;
std::string error;
- CHECK(context.InitCUDA(&error)) << error;
+ CHECK(context.InitCuda(&error)) << error;
#endif // CERES_NO_CUDA
options.dense_linear_algebra_library_type = ::testing::get<0>(GetParam());
options.use_mixed_precision_solves = ::testing::get<1>(GetParam());
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index fb3c228..22727f8 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -312,7 +312,7 @@
#ifndef CERES_NO_CUDA
bool CUDADenseQR::Init(ContextImpl* context, std::string* message) {
- if (!context->InitCUDA(message)) {
+ if (!context->InitCuda(message)) {
return false;
}
cublas_handle_ = context->cublas_handle_;
diff --git a/internal/ceres/dense_qr_test.cc b/internal/ceres/dense_qr_test.cc
index c10dba5..8698c90 100644
--- a/internal/ceres/dense_qr_test.cc
+++ b/internal/ceres/dense_qr_test.cc
@@ -70,7 +70,7 @@
#ifndef CERES_NO_CUDA
options.context = &context;
std::string error;
- CHECK(context.InitCUDA(&error)) << error;
+ CHECK(context.InitCuda(&error)) << error;
#endif // CERES_NO_CUDA
options.dense_linear_algebra_library_type = GetParam();
const double kEpsilon = std::numeric_limits<double>::epsilon() * 1.5e4;
diff --git a/internal/ceres/solver.cc b/internal/ceres/solver.cc
index ea18913..0656d34 100644
--- a/internal/ceres/solver.cc
+++ b/internal/ceres/solver.cc
@@ -730,7 +730,7 @@
#ifndef CERES_NO_CUDA
if (IsCudaRequired(options)) {
- if (!problem_impl->context()->InitCUDA(&summary->message)) {
+ if (!problem_impl->context()->InitCuda(&summary->message)) {
LOG(ERROR) << "Terminating: " << summary->message;
return;
}
diff --git a/internal/ceres/sparse_linear_operator_benchmark.cc b/internal/ceres/sparse_linear_operator_benchmark.cc
index a08911e..4969c0c 100644
--- a/internal/ceres/sparse_linear_operator_benchmark.cc
+++ b/internal/ceres/sparse_linear_operator_benchmark.cc
@@ -170,7 +170,7 @@
FLAGS_num_residuals_per_camera);
ContextImpl context;
std::string message;
- context.InitCUDA(&message);
+ context.InitCuda(&message);
CompressedRowSparseMatrix jacobian_crs(
jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros());
jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
@@ -205,7 +205,7 @@
FLAGS_num_residuals_per_camera);
ContextImpl context;
std::string message;
- context.InitCUDA(&message);
+ context.InitCuda(&message);
CompressedRowSparseMatrix jacobian_crs(
jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros());
jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);