Minor CUDA cleanup.

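1. Rename ceres_cuda_kernels.cu -> cuda_kernels.cu.cc and
   ceres_cuda_kernels.h -> cuda_kernels.h, and update all includes.
2. Add the missing include guard to cuda_kernels.h.
3. Fix an errant closing namespace comment in cuda_kernels.cu.cc.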

Change-Id: I81a5bfbe3c795ff0b3ef41c3bcaa037b99d5b254
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index dae16b5..947003d 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -156,8 +156,8 @@
         COMMAND ${CUDA_TOOLKIT_ROOT_DIR}/bin/cuda-memcheck --leak-check full
             $<TARGET_FILE:cuda_dense_cholesky_test>)
   endif (BUILD_TESTING AND GFLAGS)
-  set_source_files_properties(ceres_cuda_kernels.cu PROPERTIES LANGUAGE CUDA)
-  add_library(ceres_cuda_kernels ceres_cuda_kernels.cu)
+  set_source_files_properties(cuda_kernels.cu.cc PROPERTIES LANGUAGE CUDA)
+  add_library(ceres_cuda_kernels cuda_kernels.cu.cc)
   target_compile_features(ceres_cuda_kernels PRIVATE cxx_std_14)
   list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ceres_cuda_kernels)
 endif (USE_CUDA)
diff --git a/internal/ceres/ceres_cuda_kernels.cu b/internal/ceres/cuda_kernels.cu.cc
similarity index 91%
rename from internal/ceres/ceres_cuda_kernels.cu
rename to internal/ceres/cuda_kernels.cu.cc
index cd045e3..2a1e755 100644
--- a/internal/ceres/ceres_cuda_kernels.cu
+++ b/internal/ceres/cuda_kernels.cu.cc
@@ -40,7 +40,7 @@
 // toolkit documentation.
 constexpr int kCudaBlockSize = 256;
 
-template<typename SrcType, typename DstType>
+template <typename SrcType, typename DstType>
 __global__ void TypeConversionKernel(const SrcType* __restrict__ input,
                                      DstType* __restrict__ output,
                                      const int size) {
@@ -68,7 +68,7 @@
       <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
 }
 
-template<typename T>
+template <typename T>
 __global__ void SetZeroKernel(T* __restrict__ output, const int size) {
   const int i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < size) {
@@ -83,24 +83,21 @@
 
 void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
   const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
-  SetZeroKernel<double><<<num_blocks, kCudaBlockSize, 0, stream>>>(
-      output, size);
+  SetZeroKernel<double>
+      <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
 }
 
 template <typename SrcType, typename DstType>
 __global__ void XPlusEqualsYKernel(DstType* __restrict__ x,
                                    const SrcType* __restrict__ y,
-                                  const int size) {
+                                   const int size) {
   const int i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < size) {
     x[i] = x[i] + DstType(y[i]);
   }
 }
 
-void CudaDsxpy(double* x,
-                float* y,
-                const int size,
-                cudaStream_t stream) {
+void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) {
   const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
   XPlusEqualsYKernel<float, double>
       <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
@@ -122,8 +119,7 @@
                 const int size,
                 cudaStream_t stream) {
   const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
-  CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(
-      y, D, x, size);
+  CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size);
 }
 
-} // namespace ceres_cuda_kernels
\ No newline at end of file
+}  // namespace ceres::internal
diff --git a/internal/ceres/ceres_cuda_kernels.h b/internal/ceres/cuda_kernels.h
similarity index 96%
rename from internal/ceres/ceres_cuda_kernels.h
rename to internal/ceres/cuda_kernels.h
index 91675e8..d347c20 100644
--- a/internal/ceres/ceres_cuda_kernels.h
+++ b/internal/ceres/cuda_kernels.h
@@ -28,6 +28,9 @@
 //
 // Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
 
+#ifndef CERES_INTERNAL_CUDA_KERNELS_H_
+#define CERES_INTERNAL_CUDA_KERNELS_H_
+
 #include "ceres/internal/config.h"
 
 #ifndef CERES_NO_CUDA
@@ -72,3 +75,5 @@
 }  // namespace ceres::internal
 
 #endif  // CERES_NO_CUDA
+
+#endif  // CERES_INTERNAL_CUDA_KERNELS_H_
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc
index 83b922d..053b442 100644
--- a/internal/ceres/cuda_kernels_test.cc
+++ b/internal/ceres/cuda_kernels_test.cc
@@ -28,13 +28,14 @@
 //
 // Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
 
+#include "ceres/cuda_kernels.h"
+
 #include <math.h>
 
 #include <limits>
 #include <string>
 #include <vector>
 
-#include "ceres/ceres_cuda_kernels.h"
 #include "ceres/cuda_buffer.h"
 #include "ceres/internal/config.h"
 #include "ceres/internal/eigen.h"
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index da64981..e366112 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -51,8 +51,8 @@
 
 #ifndef CERES_NO_CUDA
 
-#include "ceres/ceres_cuda_kernels.h"
 #include "ceres/cuda_buffer.h"
+#include "ceres/cuda_kernels.h"
 #include "ceres/cuda_vector.h"
 #include "cusparse.h"
 
@@ -146,4 +146,4 @@
 
 }  // namespace ceres::internal
 
-#endif  // CERES_NO_CUDA
\ No newline at end of file
+#endif  // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index 7debeba..7bac13a 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -43,8 +43,8 @@
 
 #ifndef CERES_NO_CUDA
 
-#include "ceres/ceres_cuda_kernels.h"
 #include "ceres/cuda_buffer.h"
+#include "ceres/cuda_kernels.h"
 #include "ceres/cuda_vector.h"
 #include "cublas_v2.h"
 
@@ -178,4 +178,4 @@
 
 }  // namespace ceres::internal
 
-#endif  // CERES_NO_CUDA
\ No newline at end of file
+#endif  // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h
index e7c4b81..985eac0 100644
--- a/internal/ceres/cuda_vector.h
+++ b/internal/ceres/cuda_vector.h
@@ -49,8 +49,8 @@
 
 #ifndef CERES_NO_CUDA
 
-#include "ceres/ceres_cuda_kernels.h"
 #include "ceres/cuda_buffer.h"
+#include "ceres/cuda_kernels.h"
 #include "ceres/internal/eigen.h"
 #include "cublas_v2.h"
 #include "cusparse.h"
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index af42e73..a3e578f 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -40,8 +40,8 @@
 #include "ceres/iterative_refiner.h"
 
 #ifndef CERES_NO_CUDA
-#include "ceres/ceres_cuda_kernels.h"
 #include "ceres/context_impl.h"
+#include "ceres/cuda_kernels.h"
 #include "cuda_runtime.h"
 #include "cusolverDn.h"
 #endif  // CERES_NO_CUDA
diff --git a/internal/ceres/dense_qr.cc b/internal/ceres/dense_qr.cc
index 775073d..fb3c228 100644
--- a/internal/ceres/dense_qr.cc
+++ b/internal/ceres/dense_qr.cc
@@ -33,6 +33,7 @@
 #include <algorithm>
 #include <memory>
 #include <string>
+
 #ifndef CERES_NO_CUDA
 #include "ceres/context_impl.h"
 #include "cublas_v2.h"