Runtime check for cudaMallocAsync support

Change-Id: Ia0e347d99b005d805ff2351cdb8918cc1331fc24
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc
index 0ac7385..c39e098 100644
--- a/internal/ceres/context_impl.cc
+++ b/internal/ceres/context_impl.cc
@@ -72,17 +72,18 @@
 std::string ContextImpl::CudaConfigAsString() const {
   return ceres::internal::StringPrintf(
       "======================= CUDA Device Properties ======================\n"
-      "Cuda version         : %d.%d\n"
-      "Device ID            : %d\n"
-      "Device name          : %s\n"
-      "Total GPU memory     : %6.f MiB\n"
-      "GPU memory available : %6.f MiB\n"
-      "Compute capability   : %d.%d\n"
-      "Warp size            : %d\n"
-      "Max threads per block: %d\n"
-      "Max threads per dim  : %d %d %d\n"
-      "Max grid size        : %d %d %d\n"
-      "Multiprocessor count : %d\n"
+      "Cuda version              : %d.%d\n"
+      "Device ID                 : %d\n"
+      "Device name               : %s\n"
+      "Total GPU memory          : %6.f MiB\n"
+      "GPU memory available      : %6.f MiB\n"
+      "Compute capability        : %d.%d\n"
+      "Warp size                 : %d\n"
+      "Max threads per block     : %d\n"
+      "Max threads per dim       : %d %d %d\n"
+      "Max grid size             : %d %d %d\n"
+      "Multiprocessor count      : %d\n"
+      "cudaMallocAsync supported : %s\n"
       "====================================================================",
       cuda_version_major_,
       cuda_version_minor_,
@@ -100,7 +101,8 @@
       gpu_device_properties_.maxGridSize[0],
       gpu_device_properties_.maxGridSize[1],
       gpu_device_properties_.maxGridSize[2],
-      gpu_device_properties_.multiProcessorCount);
+      gpu_device_properties_.multiProcessorCount,
+      gpu_device_properties_.memoryPoolsSupported ? "Yes" : "No");
 }
 
 size_t ContextImpl::GpuMemoryAvailable() const {
@@ -169,6 +171,40 @@
   is_cuda_initialized_ = true;
   return true;
 }
+
+void* ContextImpl::CudaMalloc(size_t size, cudaStream_t stream) const {
+  void* data = nullptr;
+  // Stream-ordered allocation API is available since CUDA 11.4, but might
+  // not be implemented by a particular device
+#if CUDART_VERSION < 11040
+#warning \
+    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
+  CHECK_EQ(cudaSuccess, cudaMalloc(&data, size));
+#else
+  if (gpu_device_properties_.memoryPoolsSupported) {
+    CHECK_EQ(cudaSuccess, cudaMallocAsync(&data, size, stream));
+  } else {
+    CHECK_EQ(cudaSuccess, cudaMalloc(&data, size));
+  }
+#endif
+  return data;
+}
+
+void ContextImpl::CudaFree(void* data, cudaStream_t stream) const {
+  // Stream-ordered allocation API is available since CUDA 11.4, but might
+  // not be implemented by a particular device
+#if CUDART_VERSION < 11040
+#warning \
+    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
+  CHECK_EQ(cudaSuccess, cudaFree(data));
+#else
+  if (gpu_device_properties_.memoryPoolsSupported) {
+    CHECK_EQ(cudaSuccess, cudaFreeAsync(data, stream));
+  } else {
+    CHECK_EQ(cudaSuccess, cudaFree(data));
+  }
+#endif
+}
 #endif  // CERES_NO_CUDA
 
 ContextImpl::~ContextImpl() {
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index 508fd06..6bdfd0f 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -119,6 +119,20 @@
   // Returns the number of bytes of available global memory on the current CUDA
   // device. If it is called before InitCuda, it returns 0.
   size_t GpuMemoryAvailable() const;
+  // Allocates memory in a stream-ordered fashion if the cudaMallocAsync API
+  // is available in the CUDA Toolkit and the device supports it. Otherwise a
+  // device-synchronous allocation (cudaMalloc) is used.
+  void* CudaMalloc(size_t size, cudaStream_t stream) const;
+  template <typename T>
+  T* CudaAllocate(size_t num_elements, cudaStream_t stream) const {
+    T* data = static_cast<T*>(CudaMalloc(num_elements * sizeof(T), stream));
+    return data;
+  }
+  // Frees memory previously allocated by CudaMalloc. If cudaMallocAsync is
+  // supported by both the CUDA Toolkit used for compilation and the device
+  // used at runtime, deallocation is performed in a stream-ordered fashion
+  // (cudaFreeAsync). Otherwise a device-synchronous cudaFree is used.
+  void CudaFree(void* data, cudaStream_t stream) const;
 
   cusolverDnHandle_t cusolver_handle_ = nullptr;
   cublasHandle_t cublas_handle_ = nullptr;
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc
index f0da35f..c370d22 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -51,7 +51,8 @@
                    block_structure_->col_blocks(),
                    rows.data(),
                    cols.data(),
-                   context->DefaultStream());
+                   context->DefaultStream(),
+                   context);
   is_crs_compatible_ = block_structure_->IsCrsCompatible();
   // if matrix is crs-compatible - we can drop block-structure and don't need
   // streamed_buffer_
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
index c7b4aa4..8dc68ca 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -35,6 +35,7 @@
 #include <thrust/scan.h>
 
 #include "ceres/block_structure.h"
+#include "ceres/context_impl.h"
 #include "ceres/cuda_kernels_utils.h"
 
 namespace ceres {
@@ -50,31 +51,6 @@
   return thrust::cuda::par_nosync.on(stream);
 #endif
 }
-// Allocate temporary memory on gpu, avoiding synchronization if possible
-template <typename T>
-T* AllocateTemporaryMemory(size_t num_elements, cudaStream_t stream) {
-  T* data;
-  // Stream-ordered alloactions are available since CUDA 11.4
-#if CUDART_VERSION < 11040
-#warning \
-    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit"
-  cudaMalloc(&data, sizeof(T) * num_elements);
-#else
-  cudaMallocAsync(&data, sizeof(T) * num_elements, stream);
-#endif
-  return data;
-}
-
-void FreeTemporaryMemory(void* data, cudaStream_t stream) {
-  // Stream-ordered alloactions are available since CUDA 11.4
-#if CUDART_VERSION < 11040
-#warning \
-    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit"
-  cudaFree(data);
-#else
-  cudaFreeAsync(data, stream);
-#endif
-}
 }  // namespace
 
 // Fill row block id and nnz for each row using block-sparse structure
@@ -211,10 +187,11 @@
                       const Block* col_blocks,
                       int* rows,
                       int* cols,
-                      cudaStream_t stream) {
+                      cudaStream_t stream,
+                      ContextImpl* context) {
   // Set number of non-zeros per row in rows array and row to row-block map in
   // row_block_ids array
-  int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
+  int* row_block_ids = context->CudaAllocate<int>(num_rows, stream);
   const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
   RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
       num_row_blocks,
@@ -246,7 +223,7 @@
       nullptr,
       rows,
       cols);
-  FreeTemporaryMemory(row_block_ids, stream);
+  context->CudaFree(row_block_ids, stream);
 }
 
 void FillCRSStructurePartitioned(const int num_row_blocks,
@@ -262,10 +239,11 @@
                                  int* cols_e,
                                  int* rows_f,
                                  int* cols_f,
-                                 cudaStream_t stream) {
+                                 cudaStream_t stream,
+                                 ContextImpl* context) {
   // Set number of non-zeros per row in rows array and row to row-block map in
   // row_block_ids array
-  int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
+  int* row_block_ids = context->CudaAllocate<int>(num_rows, stream);
   const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
   RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
       num_row_blocks,
@@ -303,7 +281,7 @@
       cols_e,
       rows_f,
       cols_f);
-  FreeTemporaryMemory(row_block_ids, stream);
+  context->CudaFree(row_block_ids, stream);
 }
 
 template <typename T, typename Predicate>
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h
index 62a2477..ddfecfc 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.h
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -41,6 +41,7 @@
 namespace internal {
 struct Block;
 struct Cell;
+class ContextImpl;
 
 // Compute structure of CRS matrix using block-sparse structure.
 // Arrays corresponding to CRS matrix are to be allocated by caller
@@ -52,7 +53,8 @@
                       const Block* col_blocks,
                       int* rows,
                       int* cols,
-                      cudaStream_t stream);
+                      cudaStream_t stream,
+                      ContextImpl* context);
 
 // Compute structure of partitioned CRS matrix using block-sparse structure.
 // Arrays corresponding to CRS matrices are to be allocated by caller
@@ -69,7 +71,8 @@
                                  int* cols_e,
                                  int* rows_f,
                                  int* cols_f,
-                                 cudaStream_t stream);
+                                 cudaStream_t stream,
+                                 ContextImpl* context);
 
 // Permute segment of values from block-sparse matrix with sequential layout to
 // CRS order. Segment starts at block_sparse_offset and has length of num_values
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
index 9153544..550e6d0 100644
--- a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
@@ -79,7 +79,8 @@
                               cols_e.data(),
                               rows_f.data(),
                               cols_f.data(),
-                              context->DefaultStream());
+                              context->DefaultStream(),
+                              context);
   f_is_crs_compatible_ = block_structure_->IsCrsCompatible();
   if (f_is_crs_compatible_) {
     block_structure_ = nullptr;