Runtime check for cudaMallocAsync support
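
Stream-ordered allocations were previously selected purely at compile
time: cuda_kernels_bsm_to_crs.cu.cc called cudaMallocAsync whenever the
CUDA Toolkit was 11.4+, even on devices that do not implement memory
pools. This change moves the helpers into ContextImpl::CudaMalloc /
ContextImpl::CudaFree, which additionally check
cudaDeviceProp::memoryPoolsSupported at runtime and fall back to
cudaMalloc / cudaFree on devices without memory-pool support. The
return status of the allocation calls is now CHECK-ed, and the device
capability is reported by CudaConfigAsString.

A minimal usage sketch (hypothetical caller, not part of this change):
temporary device memory is requested and released on the same stream,
so no device-wide synchronization is introduced when the device
supports memory pools.

    // Assumes ceres/internal/context_impl.h and the CUDA runtime headers.
    void ScratchExample(ceres::internal::ContextImpl* context,
                        cudaStream_t stream,
                        size_t num_values) {
      // Stream-ordered allocation when supported, cudaMalloc otherwise.
      double* scratch = context->CudaAllocate<double>(num_values, stream);
      // ... enqueue kernels that use scratch on the same stream ...
      context->CudaFree(scratch, stream);
    }
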
Change-Id: Ia0e347d99b005d805ff2351cdb8918cc1331fc24
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc
index 0ac7385..c39e098 100644
--- a/internal/ceres/context_impl.cc
+++ b/internal/ceres/context_impl.cc
@@ -72,17 +72,18 @@
std::string ContextImpl::CudaConfigAsString() const {
return ceres::internal::StringPrintf(
"======================= CUDA Device Properties ======================\n"
- "Cuda version : %d.%d\n"
- "Device ID : %d\n"
- "Device name : %s\n"
- "Total GPU memory : %6.f MiB\n"
- "GPU memory available : %6.f MiB\n"
- "Compute capability : %d.%d\n"
- "Warp size : %d\n"
- "Max threads per block: %d\n"
- "Max threads per dim : %d %d %d\n"
- "Max grid size : %d %d %d\n"
- "Multiprocessor count : %d\n"
+ "Cuda version : %d.%d\n"
+ "Device ID : %d\n"
+ "Device name : %s\n"
+ "Total GPU memory : %6.f MiB\n"
+ "GPU memory available : %6.f MiB\n"
+ "Compute capability : %d.%d\n"
+ "Warp size : %d\n"
+ "Max threads per block : %d\n"
+ "Max threads per dim : %d %d %d\n"
+ "Max grid size : %d %d %d\n"
+ "Multiprocessor count : %d\n"
+ "cudaMallocAsync supported : %s\n"
"====================================================================",
cuda_version_major_,
cuda_version_minor_,
@@ -100,7 +101,8 @@
gpu_device_properties_.maxGridSize[0],
gpu_device_properties_.maxGridSize[1],
gpu_device_properties_.maxGridSize[2],
- gpu_device_properties_.multiProcessorCount);
+ gpu_device_properties_.multiProcessorCount,
+ gpu_device_properties_.memoryPoolsSupported ? "Yes" : "No");
}
size_t ContextImpl::GpuMemoryAvailable() const {
@@ -169,6 +171,40 @@
is_cuda_initialized_ = true;
return true;
}
+
+void* ContextImpl::CudaMalloc(size_t size, cudaStream_t stream) const {
+ void* data = nullptr;
+ // The stream-ordered allocation API is available since CUDA 11.4, but might
+ // not be implemented by a particular device.
+#if CUDART_VERSION < 11040
+#warning \
+ "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
+ CHECK_EQ(cudaSuccess, cudaMalloc(&data, size));
+#else
+ if (gpu_device_properties_.memoryPoolsSupported) {
+ CHECK_EQ(cudaSuccess, cudaMallocAsync(&data, size, stream));
+ } else {
+ CHECK_EQ(cudaSuccess, cudaMalloc(&data, size));
+ }
+#endif
+ return data;
+}
+
+void ContextImpl::CudaFree(void* data, cudaStream_t stream) const {
+ // The stream-ordered allocation API is available since CUDA 11.4, but might
+ // not be implemented by a particular device.
+#if CUDART_VERSION < 11040
+#warning \
+ "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
+ CHECK_EQ(cudaSuccess, cudaFree(data));
+#else
+ if (gpu_device_properties_.memoryPoolsSupported) {
+ CHECK_EQ(cudaSuccess, cudaFreeAsync(data, stream));
+ } else {
+ CHECK_EQ(cudaSuccess, cudaFree(data));
+ }
+#endif
+}
#endif // CERES_NO_CUDA
ContextImpl::~ContextImpl() {
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index 508fd06..6bdfd0f 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -119,6 +119,20 @@
// Returns the number of bytes of available global memory on the current CUDA
// device. If it is called before InitCuda, it returns 0.
size_t GpuMemoryAvailable() const;
+ // Allocates memory using stream-ordered allocation, if the cudaMallocAsync
+ // API is available in the CUDA Toolkit and the device supports it. Otherwise
+ // a device-synchronous allocation is used.
+ void* CudaMalloc(size_t size, cudaStream_t stream) const;
+ template <typename T>
+ T* CudaAllocate(size_t num_elements, cudaStream_t stream) const {
+ T* data = static_cast<T*>(CudaMalloc(num_elements * sizeof(T), stream));
+ return data;
+ }
+ // Frees memory previously allocated by CudaMalloc. If cudaMallocAsync is
+ // supported by both the CUDA Toolkit used for compilation and the device
+ // used at runtime, deallocation is performed stream-ordered. Otherwise a
+ // device-synchronous free is used.
+ void CudaFree(void* data, cudaStream_t stream) const;
cusolverDnHandle_t cusolver_handle_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr;
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc
index f0da35f..c370d22 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -51,7 +51,8 @@
block_structure_->col_blocks(),
rows.data(),
cols.data(),
- context->DefaultStream());
+ context->DefaultStream(),
+ context);
is_crs_compatible_ = block_structure_->IsCrsCompatible();
// if matrix is crs-compatible - we can drop block-structure and don't need
// streamed_buffer_
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
index c7b4aa4..8dc68ca 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -35,6 +35,7 @@
#include <thrust/scan.h>
#include "ceres/block_structure.h"
+#include "ceres/context_impl.h"
#include "ceres/cuda_kernels_utils.h"
namespace ceres {
@@ -50,31 +51,6 @@
return thrust::cuda::par_nosync.on(stream);
#endif
}
-// Allocate temporary memory on gpu, avoiding synchronization if possible
-template <typename T>
-T* AllocateTemporaryMemory(size_t num_elements, cudaStream_t stream) {
- T* data;
- // Stream-ordered alloactions are available since CUDA 11.4
-#if CUDART_VERSION < 11040
-#warning \
- "Stream-ordered allocations are unavailable, consider updating CUDA toolkit"
- cudaMalloc(&data, sizeof(T) * num_elements);
-#else
- cudaMallocAsync(&data, sizeof(T) * num_elements, stream);
-#endif
- return data;
-}
-
-void FreeTemporaryMemory(void* data, cudaStream_t stream) {
- // Stream-ordered alloactions are available since CUDA 11.4
-#if CUDART_VERSION < 11040
-#warning \
- "Stream-ordered allocations are unavailable, consider updating CUDA toolkit"
- cudaFree(data);
-#else
- cudaFreeAsync(data, stream);
-#endif
-}
} // namespace
// Fill row block id and nnz for each row using block-sparse structure
@@ -211,10 +187,11 @@
const Block* col_blocks,
int* rows,
int* cols,
- cudaStream_t stream) {
+ cudaStream_t stream,
+ ContextImpl* context) {
// Set number of non-zeros per row in rows array and row to row-block map in
// row_block_ids array
- int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
+ int* row_block_ids = context->CudaAllocate<int>(num_rows, stream);
const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
num_row_blocks,
@@ -246,7 +223,7 @@
nullptr,
rows,
cols);
- FreeTemporaryMemory(row_block_ids, stream);
+ context->CudaFree(row_block_ids, stream);
}
void FillCRSStructurePartitioned(const int num_row_blocks,
@@ -262,10 +239,11 @@
int* cols_e,
int* rows_f,
int* cols_f,
- cudaStream_t stream) {
+ cudaStream_t stream,
+ ContextImpl* context) {
// Set number of non-zeros per row in rows array and row to row-block map in
// row_block_ids array
- int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
+ int* row_block_ids = context->CudaAllocate<int>(num_rows, stream);
const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
num_row_blocks,
@@ -303,7 +281,7 @@
cols_e,
rows_f,
cols_f);
- FreeTemporaryMemory(row_block_ids, stream);
+ context->CudaFree(row_block_ids, stream);
}
template <typename T, typename Predicate>
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h
index 62a2477..ddfecfc 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.h
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -41,6 +41,7 @@
namespace internal {
struct Block;
struct Cell;
+class ContextImpl;
// Compute structure of CRS matrix using block-sparse structure.
// Arrays corresponding to CRS matrix are to be allocated by caller
@@ -52,7 +53,8 @@
const Block* col_blocks,
int* rows,
int* cols,
- cudaStream_t stream);
+ cudaStream_t stream,
+ ContextImpl* context);
// Compute structure of partitioned CRS matrix using block-sparse structure.
// Arrays corresponding to CRS matrices are to be allocated by caller
@@ -69,7 +71,8 @@
int* cols_e,
int* rows_f,
int* cols_f,
- cudaStream_t stream);
+ cudaStream_t stream,
+ ContextImpl* context);
// Permute segment of values from block-sparse matrix with sequential layout to
// CRS order. Segment starts at block_sparse_offset and has length of num_values
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
index 9153544..550e6d0 100644
--- a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
@@ -79,7 +79,8 @@
cols_e.data(),
rows_f.data(),
cols_f.data(),
- context->DefaultStream());
+ context->DefaultStream(),
+ context);
f_is_crs_compatible_ = block_structure_->IsCrsCompatible();
if (f_is_crs_compatible_) {
block_structure_ = nullptr;