Move stream-ordered memory allocations Change-Id: Ief116e4e77c7579612b99cf552f3d8fc54c1d42a
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc index c39e098..c085644 100644 --- a/internal/ceres/context_impl.cc +++ b/internal/ceres/context_impl.cc
@@ -171,40 +171,6 @@ is_cuda_initialized_ = true; return true; } - -void* ContextImpl::CudaMalloc(size_t size, cudaStream_t stream) const { - void* data = nullptr; - // Stream-ordered alloaction API is available since CUDA 11.4, but might be - // not implemented by particular device -#if CUDART_VERSION < 11040 -#warning \ - "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+" - CHECK_EQ(cudaSuccess, cudaMalloc(&data, size)); -#else - if (gpu_device_properties_.memoryPoolsSupported) { - CHECK_EQ(cudaSuccess, cudaMallocAsync(&data, size, stream)); - } else { - CHECK_EQ(cudaSuccess, cudaMalloc(&data, size)); - } -#endif - return data; -} - -void ContextImpl::CudaFree(void* data, cudaStream_t stream) const { - // Stream-ordered alloaction API is available since CUDA 11.4, but might be - // not implemented by particular device -#if CUDART_VERSION < 11040 -#warning \ - "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+" - CHECK_EQ(cudaSuccess, cudaFree(data)); -#else - if (gpu_device_properties_.memoryPoolsSupported) { - CHECK_EQ(cudaSuccess, cudaFreeAsync(data, stream)); - } else { - CHECK_EQ(cudaSuccess, cudaFree(data)); - } -#endif -} #endif // CERES_NO_CUDA ContextImpl::~ContextImpl() {
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h index 6bdfd0f..508fd06 100644 --- a/internal/ceres/context_impl.h +++ b/internal/ceres/context_impl.h
@@ -119,20 +119,6 @@ // Returns the number of bytes of available global memory on the current CUDA // device. If it is called before InitCuda, it returns 0. size_t GpuMemoryAvailable() const; - // Allocate memory stream-synchronously, if cudaMallocAsync API is available - // in CUDA Toolkit and particular device supports it. Otherwise - // device-synchronous allocation is used - void* CudaMalloc(size_t size, cudaStream_t stream) const; - template <typename T> - T* CudaAllocate(size_t num_elements, cudaStream_t stream) const { - T* data = static_cast<T*>(CudaMalloc(num_elements * sizeof(T), stream)); - return data; - } - // Free memory previously allocated by CudaMalloc. If cudaMallocAsync is - // supported by both CUDA Toolkit used for compilation and device used at - // runtime - deallocation will be performed stream-synchronously. Otherwise - // device-synchronous free is used. - void CudaFree(void* data, cudaStream_t stream) const; cusolverDnHandle_t cusolver_handle_ = nullptr; cublasHandle_t cublas_handle_ = nullptr;
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc index 8dc68ca..05b52e7 100644 --- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc +++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -35,7 +35,6 @@ #include <thrust/scan.h> #include "ceres/block_structure.h" -#include "ceres/context_impl.h" #include "ceres/cuda_kernels_utils.h" namespace ceres { @@ -51,6 +50,50 @@ return thrust::cuda::par_nosync.on(stream); #endif } + +void* CudaMalloc(size_t size, + cudaStream_t stream, + bool memory_pools_supported) { + void* data = nullptr; + // Stream-ordered allocation API is available since CUDA 11.4, but might be + // not implemented by particular device +#if CUDART_VERSION < 11040 +#warning \ + "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+" + cudaMalloc(&data, size); +#else + if (memory_pools_supported) { + cudaMallocAsync(&data, size, stream); + } else { + cudaMalloc(&data, size); + } +#endif + return data; +} + +void CudaFree(void* data, cudaStream_t stream, bool memory_pools_supported) { + // Stream-ordered allocation API is available since CUDA 11.4, but might be + // not implemented by particular device +#if CUDART_VERSION < 11040 +#warning \ + "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+" + cudaFree(data); +#else + if (memory_pools_supported) { + cudaFreeAsync(data, stream); + } else { + cudaFree(data); + } +#endif +} +template <typename T> +T* CudaAllocate(size_t num_elements, + cudaStream_t stream, + bool memory_pools_supported) { + T* data = static_cast<T*>( + CudaMalloc(num_elements * sizeof(T), stream, memory_pools_supported)); + return data; +} } // namespace // Fill row block id and nnz for each row using block-sparse structure @@ -188,10 +231,11 @@ int* rows, int* cols, cudaStream_t stream, - ContextImpl* context) { + bool memory_pools_supported) { // Set number of non-zeros per row in rows array and row to row-block map in // row_block_ids array - int* row_block_ids = context->CudaAllocate<int>(num_rows, stream); + int* row_block_ids = + CudaAllocate<int>(num_rows, stream, memory_pools_supported); const int num_blocks_blockwise = 
NumBlocksInGrid(num_row_blocks + 1); RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( num_row_blocks, @@ -223,7 +267,7 @@ nullptr, rows, cols); - context->CudaFree(row_block_ids, stream); + CudaFree(row_block_ids, stream, memory_pools_supported); } void FillCRSStructurePartitioned(const int num_row_blocks, @@ -240,10 +284,11 @@ int* rows_f, int* cols_f, cudaStream_t stream, - ContextImpl* context) { + bool memory_pools_supported) { // Set number of non-zeros per row in rows array and row to row-block map in // row_block_ids array - int* row_block_ids = context->CudaAllocate<int>(num_rows, stream); + int* row_block_ids = + CudaAllocate<int>(num_rows, stream, memory_pools_supported); const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( num_row_blocks, @@ -281,7 +326,7 @@ cols_e, rows_f, cols_f); - context->CudaFree(row_block_ids, stream); + CudaFree(row_block_ids, stream, memory_pools_supported); } template <typename T, typename Predicate>
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h index ddfecfc..27f4a25 100644 --- a/internal/ceres/cuda_kernels_bsm_to_crs.h +++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -41,7 +41,6 @@ namespace internal { struct Block; struct Cell; -class ContextImpl; // Compute structure of CRS matrix using block-sparse structure. // Arrays corresponding to CRS matrix are to be allocated by caller @@ -54,7 +53,7 @@ int* rows, int* cols, cudaStream_t stream, - ContextImpl* context); + bool memory_pools_supported); // Compute structure of partitioned CRS matrix using block-sparse structure. // Arrays corresponding to CRS matrices are to be allocated by caller @@ -72,7 +71,7 @@ int* rows_f, int* cols_f, cudaStream_t stream, - ContextImpl* context); + bool memory_pools_supported); // Permute segment of values from block-sparse matrix with sequential layout to // CRS order. Segment starts at block_sparse_offset and has length of num_values