Runtime check for cudaMallocAsync support

Change-Id: Ia0e347d99b005d805ff2351cdb8918cc1331fc24
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc index 0ac7385..c39e098 100644 --- a/internal/ceres/context_impl.cc +++ b/internal/ceres/context_impl.cc
@@ -72,17 +72,18 @@ std::string ContextImpl::CudaConfigAsString() const { return ceres::internal::StringPrintf( "======================= CUDA Device Properties ======================\n" - "Cuda version : %d.%d\n" - "Device ID : %d\n" - "Device name : %s\n" - "Total GPU memory : %6.f MiB\n" - "GPU memory available : %6.f MiB\n" - "Compute capability : %d.%d\n" - "Warp size : %d\n" - "Max threads per block: %d\n" - "Max threads per dim : %d %d %d\n" - "Max grid size : %d %d %d\n" - "Multiprocessor count : %d\n" + "Cuda version : %d.%d\n" + "Device ID : %d\n" + "Device name : %s\n" + "Total GPU memory : %6.f MiB\n" + "GPU memory available : %6.f MiB\n" + "Compute capability : %d.%d\n" + "Warp size : %d\n" + "Max threads per block : %d\n" + "Max threads per dim : %d %d %d\n" + "Max grid size : %d %d %d\n" + "Multiprocessor count : %d\n" + "cudaMallocAsync supported : %s\n" "====================================================================", cuda_version_major_, cuda_version_minor_, @@ -100,7 +101,8 @@ gpu_device_properties_.maxGridSize[0], gpu_device_properties_.maxGridSize[1], gpu_device_properties_.maxGridSize[2], - gpu_device_properties_.multiProcessorCount); + gpu_device_properties_.multiProcessorCount, + gpu_device_properties_.memoryPoolsSupported ? "Yes" : "No");
} size_t ContextImpl::GpuMemoryAvailable() const { @@ -169,6 +171,40 @@ is_cuda_initialized_ = true; return true; } + +void* ContextImpl::CudaMalloc(size_t size, cudaStream_t stream) const { + void* data = nullptr; + // Stream-ordered allocation API is available since CUDA 11.4, but might + // not be implemented by a particular device +#if CUDART_VERSION < 11040 +#warning \ + "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+" + CHECK_EQ(cudaSuccess, cudaMalloc(&data, size)); +#else + if (gpu_device_properties_.memoryPoolsSupported) { + CHECK_EQ(cudaSuccess, cudaMallocAsync(&data, size, stream)); + } else { + CHECK_EQ(cudaSuccess, cudaMalloc(&data, size)); + } +#endif + return data; +} + +void ContextImpl::CudaFree(void* data, cudaStream_t stream) const { + // Stream-ordered allocation API is available since CUDA 11.4, but might + // not be implemented by a particular device +#if CUDART_VERSION < 11040 +#warning \ + "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+" + CHECK_EQ(cudaSuccess, cudaFree(data)); +#else + if (gpu_device_properties_.memoryPoolsSupported) { + CHECK_EQ(cudaSuccess, cudaFreeAsync(data, stream)); + } else { + CHECK_EQ(cudaSuccess, cudaFree(data)); + } +#endif +} #endif // CERES_NO_CUDA ContextImpl::~ContextImpl() {
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h index 508fd06..6bdfd0f 100644 --- a/internal/ceres/context_impl.h +++ b/internal/ceres/context_impl.h
@@ -119,6 +119,20 @@ // Returns the number of bytes of available global memory on the current CUDA // device. If it is called before InitCuda, it returns 0. size_t GpuMemoryAvailable() const; + // Allocates memory stream-synchronously if the cudaMallocAsync API is + // available in the CUDA Toolkit and the particular device supports it; + // otherwise a device-synchronous allocation is used + void* CudaMalloc(size_t size, cudaStream_t stream) const; + template <typename T> + T* CudaAllocate(size_t num_elements, cudaStream_t stream) const { + T* data = static_cast<T*>(CudaMalloc(num_elements * sizeof(T), stream)); + return data; + } + // Frees memory previously allocated by CudaMalloc. If cudaMallocAsync is + // supported by both the CUDA Toolkit used for compilation and the device + // used at runtime, deallocation is performed stream-synchronously; + // otherwise a device-synchronous free is used. + void CudaFree(void* data, cudaStream_t stream) const; cusolverDnHandle_t cusolver_handle_ = nullptr; cublasHandle_t cublas_handle_ = nullptr;
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc index f0da35f..c370d22 100644 --- a/internal/ceres/cuda_block_sparse_crs_view.cc +++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -51,7 +51,8 @@ block_structure_->col_blocks(), rows.data(), cols.data(), - context->DefaultStream()); + context->DefaultStream(), + context); is_crs_compatible_ = block_structure_->IsCrsCompatible(); // if matrix is crs-compatible - we can drop block-structure and don't need // streamed_buffer_
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc index c7b4aa4..8dc68ca 100644 --- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc +++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -35,6 +35,7 @@ #include <thrust/scan.h> #include "ceres/block_structure.h" +#include "ceres/context_impl.h" #include "ceres/cuda_kernels_utils.h" namespace ceres { @@ -50,31 +51,6 @@ return thrust::cuda::par_nosync.on(stream); #endif } -// Allocate temporary memory on gpu, avoiding synchronization if possible -template <typename T> -T* AllocateTemporaryMemory(size_t num_elements, cudaStream_t stream) { - T* data; - // Stream-ordered alloactions are available since CUDA 11.4 -#if CUDART_VERSION < 11040 -#warning \ - "Stream-ordered allocations are unavailable, consider updating CUDA toolkit" - cudaMalloc(&data, sizeof(T) * num_elements); -#else - cudaMallocAsync(&data, sizeof(T) * num_elements, stream); -#endif - return data; -} - -void FreeTemporaryMemory(void* data, cudaStream_t stream) { - // Stream-ordered alloactions are available since CUDA 11.4 -#if CUDART_VERSION < 11040 -#warning \ - "Stream-ordered allocations are unavailable, consider updating CUDA toolkit" - cudaFree(data); -#else - cudaFreeAsync(data, stream); -#endif -} } // namespace // Fill row block id and nnz for each row using block-sparse structure @@ -211,10 +187,11 @@ const Block* col_blocks, int* rows, int* cols, - cudaStream_t stream) { + cudaStream_t stream, + ContextImpl* context) { // Set number of non-zeros per row in rows array and row to row-block map in // row_block_ids array - int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream); + int* row_block_ids = context->CudaAllocate<int>(num_rows, stream); const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( num_row_blocks, @@ -246,7 +223,7 @@ nullptr, rows, cols); - FreeTemporaryMemory(row_block_ids, stream); + context->CudaFree(row_block_ids, stream); } void FillCRSStructurePartitioned(const int num_row_blocks, @@ -262,10 +239,11 @@ int* cols_e, int* rows_f, int* cols_f, - cudaStream_t stream) { + cudaStream_t stream, + ContextImpl* context) {
// Set number of non-zeros per row in rows array and row to row-block map in // row_block_ids array - int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream); + int* row_block_ids = context->CudaAllocate<int>(num_rows, stream); const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( num_row_blocks, @@ -303,7 +281,7 @@ cols_e, rows_f, cols_f); - FreeTemporaryMemory(row_block_ids, stream); + context->CudaFree(row_block_ids, stream); } template <typename T, typename Predicate>
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h index 62a2477..ddfecfc 100644 --- a/internal/ceres/cuda_kernels_bsm_to_crs.h +++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -41,6 +41,7 @@ namespace internal { struct Block; struct Cell; +class ContextImpl; // Compute structure of CRS matrix using block-sparse structure. // Arrays corresponding to CRS matrix are to be allocated by caller @@ -52,7 +53,8 @@ const Block* col_blocks, int* rows, int* cols, - cudaStream_t stream); + cudaStream_t stream, + ContextImpl* context); // Compute structure of partitioned CRS matrix using block-sparse structure. // Arrays corresponding to CRS matrices are to be allocated by caller @@ -69,7 +71,8 @@ int* cols_e, int* rows_f, int* cols_f, - cudaStream_t stream); + cudaStream_t stream, + ContextImpl* context); // Permute segment of values from block-sparse matrix with sequential layout to // CRS order. Segment starts at block_sparse_offset and has length of num_values
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc index 9153544..550e6d0 100644 --- a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc +++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
@@ -79,7 +79,8 @@ cols_e.data(), rows_f.data(), cols_f.data(), - context->DefaultStream()); + context->DefaultStream(), + context); f_is_crs_compatible_ = block_structure_->IsCrsCompatible(); if (f_is_crs_compatible_) { block_structure_ = nullptr;