Move stream-ordered memory allocations out of ContextImpl
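
CudaMalloc, CudaFree and CudaAllocate move from ContextImpl into the
anonymous namespace of cuda_kernels_bsm_to_crs.cu.cc, where they are
used. FillCRSStructure and FillCRSStructurePartitioned now take a
bool memory_pools_supported instead of a ContextImpl*, so the kernel
sources no longer depend on context_impl.h.

Callers are expected to pass the device capability flag, which
ContextImpl keeps in gpu_device_properties_. Illustrative sketch only
(device_id is a placeholder name):

  cudaDeviceProp properties;
  cudaGetDeviceProperties(&properties, device_id);
  const bool memory_pools_supported = properties.memoryPoolsSupported != 0;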

Change-Id: Ief116e4e77c7579612b99cf552f3d8fc54c1d42a
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc
index c39e098..c085644 100644
--- a/internal/ceres/context_impl.cc
+++ b/internal/ceres/context_impl.cc
@@ -171,40 +171,6 @@
   is_cuda_initialized_ = true;
   return true;
 }
-
-void* ContextImpl::CudaMalloc(size_t size, cudaStream_t stream) const {
-  void* data = nullptr;
-  // Stream-ordered alloaction API is available since CUDA 11.4, but might be
-  // not implemented by particular device
-#if CUDART_VERSION < 11040
-#warning \
-    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
-  CHECK_EQ(cudaSuccess, cudaMalloc(&data, size));
-#else
-  if (gpu_device_properties_.memoryPoolsSupported) {
-    CHECK_EQ(cudaSuccess, cudaMallocAsync(&data, size, stream));
-  } else {
-    CHECK_EQ(cudaSuccess, cudaMalloc(&data, size));
-  }
-#endif
-  return data;
-}
-
-void ContextImpl::CudaFree(void* data, cudaStream_t stream) const {
-  // Stream-ordered alloaction API is available since CUDA 11.4, but might be
-  // not implemented by particular device
-#if CUDART_VERSION < 11040
-#warning \
-    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
-  CHECK_EQ(cudaSuccess, cudaFree(data));
-#else
-  if (gpu_device_properties_.memoryPoolsSupported) {
-    CHECK_EQ(cudaSuccess, cudaFreeAsync(data, stream));
-  } else {
-    CHECK_EQ(cudaSuccess, cudaFree(data));
-  }
-#endif
-}
 #endif  // CERES_NO_CUDA
 
 ContextImpl::~ContextImpl() {
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index 6bdfd0f..508fd06 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -119,20 +119,6 @@
   // Returns the number of bytes of available global memory on the current CUDA
   // device. If it is called before InitCuda, it returns 0.
   size_t GpuMemoryAvailable() const;
-  // Allocate memory stream-synchronously, if cudaMallocAsync API is available
-  // in CUDA Toolkit and particular device supports it. Otherwise
-  // device-synchronous allocation is used
-  void* CudaMalloc(size_t size, cudaStream_t stream) const;
-  template <typename T>
-  T* CudaAllocate(size_t num_elements, cudaStream_t stream) const {
-    T* data = static_cast<T*>(CudaMalloc(num_elements * sizeof(T), stream));
-    return data;
-  }
-  // Free memory previously allocated by CudaMalloc. If cudaMallocAsync is
-  // supported by both CUDA Toolkit used for compilation and device used at
-  // runtime - deallocation will be performed stream-synchronously. Otherwise
-  // device-synchronous free is used.
-  void CudaFree(void* data, cudaStream_t stream) const;
 
   cusolverDnHandle_t cusolver_handle_ = nullptr;
   cublasHandle_t cublas_handle_ = nullptr;
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
index 8dc68ca..05b52e7 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -35,7 +35,6 @@
 #include <thrust/scan.h>
 
 #include "ceres/block_structure.h"
-#include "ceres/context_impl.h"
 #include "ceres/cuda_kernels_utils.h"
 
 namespace ceres {
@@ -51,6 +50,50 @@
   return thrust::cuda::par_nosync.on(stream);
 #endif
 }
+
+void* CudaMalloc(size_t size,
+                 cudaStream_t stream,
+                 bool memory_pools_supported) {
+  void* data = nullptr;
+  // The stream-ordered allocation API is available since CUDA 11.4, but may
+  // not be implemented by a particular device.
+#if CUDART_VERSION < 11040
+#warning \
+    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
+  cudaMalloc(&data, size);
+#else
+  if (memory_pools_supported) {
+    cudaMallocAsync(&data, size, stream);
+  } else {
+    cudaMalloc(&data, size);
+  }
+#endif
+  return data;
+}
+
+void CudaFree(void* data, cudaStream_t stream, bool memory_pools_supported) {
+  // The stream-ordered allocation API is available since CUDA 11.4, but may
+  // not be implemented by a particular device.
+#if CUDART_VERSION < 11040
+#warning \
+    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
+  cudaFree(data);
+#else
+  if (memory_pools_supported) {
+    cudaFreeAsync(data, stream);
+  } else {
+    cudaFree(data);
+  }
+#endif
+}
+template <typename T>
+T* CudaAllocate(size_t num_elements,
+                cudaStream_t stream,
+                bool memory_pools_supported) {
+  T* data = static_cast<T*>(
+      CudaMalloc(num_elements * sizeof(T), stream, memory_pools_supported));
+  return data;
+}
 }  // namespace
 
 // Fill row block id and nnz for each row using block-sparse structure
@@ -188,10 +231,11 @@
                       int* rows,
                       int* cols,
                       cudaStream_t stream,
-                      ContextImpl* context) {
+                      bool memory_pools_supported) {
   // Set number of non-zeros per row in rows array and row to row-block map in
   // row_block_ids array
-  int* row_block_ids = context->CudaAllocate<int>(num_rows, stream);
+  int* row_block_ids =
+      CudaAllocate<int>(num_rows, stream, memory_pools_supported);
   const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
   RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
       num_row_blocks,
@@ -223,7 +267,7 @@
       nullptr,
       rows,
       cols);
-  context->CudaFree(row_block_ids, stream);
+  CudaFree(row_block_ids, stream, memory_pools_supported);
 }
 
 void FillCRSStructurePartitioned(const int num_row_blocks,
@@ -240,10 +284,11 @@
                                  int* rows_f,
                                  int* cols_f,
                                  cudaStream_t stream,
-                                 ContextImpl* context) {
+                                 bool memory_pools_supported) {
   // Set number of non-zeros per row in rows array and row to row-block map in
   // row_block_ids array
-  int* row_block_ids = context->CudaAllocate<int>(num_rows, stream);
+  int* row_block_ids =
+      CudaAllocate<int>(num_rows, stream, memory_pools_supported);
   const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
   RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
       num_row_blocks,
@@ -281,7 +326,7 @@
       cols_e,
       rows_f,
       cols_f);
-  context->CudaFree(row_block_ids, stream);
+  CudaFree(row_block_ids, stream, memory_pools_supported);
 }
 
 template <typename T, typename Predicate>
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h
index ddfecfc..27f4a25 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.h
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -41,7 +41,6 @@
 namespace internal {
 struct Block;
 struct Cell;
-class ContextImpl;
 
 // Compute structure of CRS matrix using block-sparse structure.
 // Arrays corresponding to CRS matrix are to be allocated by caller
@@ -54,7 +53,7 @@
                       int* rows,
                       int* cols,
                       cudaStream_t stream,
-                      ContextImpl* context);
+                      bool memory_pools_supported);
 
 // Compute structure of partitioned CRS matrix using block-sparse structure.
 // Arrays corresponding to CRS matrices are to be allocated by caller
@@ -72,7 +71,7 @@
                                  int* rows_f,
                                  int* cols_f,
                                  cudaStream_t stream,
-                                 ContextImpl* context);
+                                 bool memory_pools_supported);
 
 // Permute segment of values from block-sparse matrix with sequential layout to
 // CRS order. Segment starts at block_sparse_offset and has length of num_values