Improve support of older CUDA toolkit versions

- Provides fall-back for older versions of CUDA toolkit
- Using older versions of CUDA toolkit might result in over-synchronization

Change-Id: I545e6625d2342be30cb759b90bda379e555d7370
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
index 8be3553..a1ea6af 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -40,6 +40,43 @@
 namespace ceres {
 namespace internal {
 
+namespace {
+inline auto ThrustCudaStreamExecutionPolicy(cudaStream_t stream) {
+  // par_nosync execution policy was added in Thrust 1.16
+  // https://github.com/NVIDIA/thrust/blob/main/CHANGELOG.md#thrust-1160
+#if THRUST_VERSION < 101700
+  return thrust::cuda::par.on(stream);
+#else
+  return thrust::cuda::par_nosync.on(stream);
+#endif
+}
+// Allocate temporary memory on gpu, avoiding synchronization if possible
+template <typename T>
+T* AllocateTemporaryMemory(size_t num_elements, cudaStream_t stream) {
+  T* data;
+  // Stream-ordered allocations are available since CUDA 11.4
+#if CUDART_VERSION < 11040
+#warning \
+    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit"
+  cudaMalloc(&data, sizeof(T) * num_elements);
+#else
+  cudaMallocAsync(&data, sizeof(T) * num_elements, stream);
+#endif
+  return data;
+}
+
+void FreeTemporaryMemory(void* data, cudaStream_t stream) {
+  // Stream-ordered allocations are available since CUDA 11.4
+#if CUDART_VERSION < 11040
+#warning \
+    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit"
+  cudaFree(data);
+#else
+  cudaFreeAsync(data, stream);
+#endif
+}
+}  // namespace
+
 // Fill row block id and nnz for each row using block-sparse structure
 // represented by a set of flat arrays.
 // Inputs:
@@ -144,8 +181,7 @@ cudaStream_t stream) {
 
   // Set number of non-zeros per row in rows array and row to row-block map in
   // row_block_ids array
-  int* row_block_ids;
-  cudaMallocAsync(&row_block_ids, sizeof(int) * num_rows, stream);
+  int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
   const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
   RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
       num_row_blocks,
@@ -157,7 +193,7 @@
       row_block_ids);
   // Finalize row-index array of CRS strucure by computing prefix sum
   thrust::inclusive_scan(
-      thrust::cuda::par_nosync.on(stream), rows, rows + num_rows + 1, rows);
+      ThrustCudaStreamExecutionPolicy(stream), rows, rows + num_rows + 1, rows);
 
   // Fill cols array of CRS structure and permutation from block-sparse to CRS
   const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
@@ -172,7 +208,7 @@
       row_block_ids,
       rows,
       cols);
-  cudaFreeAsync(row_block_ids, stream);
+  FreeTemporaryMemory(row_block_ids, stream);
 }
 
 template <typename T, typename Predicate>