Move stream-ordered memory allocations
Change-Id: Ief116e4e77c7579612b99cf552f3d8fc54c1d42a
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc
index c39e098..c085644 100644
--- a/internal/ceres/context_impl.cc
+++ b/internal/ceres/context_impl.cc
@@ -171,40 +171,6 @@
is_cuda_initialized_ = true;
return true;
}
-
-void* ContextImpl::CudaMalloc(size_t size, cudaStream_t stream) const {
- void* data = nullptr;
- // Stream-ordered alloaction API is available since CUDA 11.4, but might be
- // not implemented by particular device
-#if CUDART_VERSION < 11040
-#warning \
- "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
- CHECK_EQ(cudaSuccess, cudaMalloc(&data, size));
-#else
- if (gpu_device_properties_.memoryPoolsSupported) {
- CHECK_EQ(cudaSuccess, cudaMallocAsync(&data, size, stream));
- } else {
- CHECK_EQ(cudaSuccess, cudaMalloc(&data, size));
- }
-#endif
- return data;
-}
-
-void ContextImpl::CudaFree(void* data, cudaStream_t stream) const {
- // Stream-ordered alloaction API is available since CUDA 11.4, but might be
- // not implemented by particular device
-#if CUDART_VERSION < 11040
-#warning \
- "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
- CHECK_EQ(cudaSuccess, cudaFree(data));
-#else
- if (gpu_device_properties_.memoryPoolsSupported) {
- CHECK_EQ(cudaSuccess, cudaFreeAsync(data, stream));
- } else {
- CHECK_EQ(cudaSuccess, cudaFree(data));
- }
-#endif
-}
#endif // CERES_NO_CUDA
ContextImpl::~ContextImpl() {
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index 6bdfd0f..508fd06 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -119,20 +119,6 @@
// Returns the number of bytes of available global memory on the current CUDA
// device. If it is called before InitCuda, it returns 0.
size_t GpuMemoryAvailable() const;
- // Allocate memory stream-synchronously, if cudaMallocAsync API is available
- // in CUDA Toolkit and particular device supports it. Otherwise
- // device-synchronous allocation is used
- void* CudaMalloc(size_t size, cudaStream_t stream) const;
- template <typename T>
- T* CudaAllocate(size_t num_elements, cudaStream_t stream) const {
- T* data = static_cast<T*>(CudaMalloc(num_elements * sizeof(T), stream));
- return data;
- }
- // Free memory previously allocated by CudaMalloc. If cudaMallocAsync is
- // supported by both CUDA Toolkit used for compilation and device used at
- // runtime - deallocation will be performed stream-synchronously. Otherwise
- // device-synchronous free is used.
- void CudaFree(void* data, cudaStream_t stream) const;
cusolverDnHandle_t cusolver_handle_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr;
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
index 8dc68ca..05b52e7 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -35,7 +35,6 @@
#include <thrust/scan.h>
#include "ceres/block_structure.h"
-#include "ceres/context_impl.h"
#include "ceres/cuda_kernels_utils.h"
namespace ceres {
@@ -51,6 +50,50 @@
return thrust::cuda::par_nosync.on(stream);
#endif
}
+
+void* CudaMalloc(size_t size,
+ cudaStream_t stream,
+ bool memory_pools_supported) {
+ void* data = nullptr;
+ // Stream-ordered allocation API is available since CUDA 11.4, but might
+ // not be implemented by a particular device
+#if CUDART_VERSION < 11040
+#warning \
+ "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
+ cudaMalloc(&data, size);
+#else
+ if (memory_pools_supported) {
+ cudaMallocAsync(&data, size, stream);
+ } else {
+ cudaMalloc(&data, size);
+ }
+#endif
+ return data;
+}
+
+void CudaFree(void* data, cudaStream_t stream, bool memory_pools_supported) {
+ // Stream-ordered allocation API is available since CUDA 11.4, but might
+ // not be implemented by a particular device
+#if CUDART_VERSION < 11040
+#warning \
+ "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.4+"
+ cudaFree(data);
+#else
+ if (memory_pools_supported) {
+ cudaFreeAsync(data, stream);
+ } else {
+ cudaFree(data);
+ }
+#endif
+}
+template <typename T>
+T* CudaAllocate(size_t num_elements,
+ cudaStream_t stream,
+ bool memory_pools_supported) {
+ T* data = static_cast<T*>(
+ CudaMalloc(num_elements * sizeof(T), stream, memory_pools_supported));
+ return data;
+}
} // namespace
// Fill row block id and nnz for each row using block-sparse structure
@@ -188,10 +231,11 @@
int* rows,
int* cols,
cudaStream_t stream,
- ContextImpl* context) {
+ bool memory_pools_supported) {
// Set number of non-zeros per row in rows array and row to row-block map in
// row_block_ids array
- int* row_block_ids = context->CudaAllocate<int>(num_rows, stream);
+ int* row_block_ids =
+ CudaAllocate<int>(num_rows, stream, memory_pools_supported);
const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
num_row_blocks,
@@ -223,7 +267,7 @@
nullptr,
rows,
cols);
- context->CudaFree(row_block_ids, stream);
+ CudaFree(row_block_ids, stream, memory_pools_supported);
}
void FillCRSStructurePartitioned(const int num_row_blocks,
@@ -240,10 +284,11 @@
int* rows_f,
int* cols_f,
cudaStream_t stream,
- ContextImpl* context) {
+ bool memory_pools_supported) {
// Set number of non-zeros per row in rows array and row to row-block map in
// row_block_ids array
- int* row_block_ids = context->CudaAllocate<int>(num_rows, stream);
+ int* row_block_ids =
+ CudaAllocate<int>(num_rows, stream, memory_pools_supported);
const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
num_row_blocks,
@@ -281,7 +326,7 @@
cols_e,
rows_f,
cols_f);
- context->CudaFree(row_block_ids, stream);
+ CudaFree(row_block_ids, stream, memory_pools_supported);
}
template <typename T, typename Predicate>
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h
index ddfecfc..27f4a25 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.h
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -41,7 +41,6 @@
namespace internal {
struct Block;
struct Cell;
-class ContextImpl;
// Compute structure of CRS matrix using block-sparse structure.
// Arrays corresponding to CRS matrix are to be allocated by caller
@@ -54,7 +53,7 @@
int* rows,
int* cols,
cudaStream_t stream,
- ContextImpl* context);
+ bool memory_pools_supported);
// Compute structure of partitioned CRS matrix using block-sparse structure.
// Arrays corresponding to CRS matrices are to be allocated by caller
@@ -72,7 +71,7 @@
int* rows_f,
int* cols_f,
cudaStream_t stream,
- ContextImpl* context);
+ bool memory_pools_supported);
// Permute segment of values from block-sparse matrix with sequential layout to
// CRS order. Segment starts at block_sparse_offset and has length of num_values