Improve support of older CUDA toolkit versions

 - Provides fall-back for older versions of CUDA toolkit
 - Using older versions of CUDA toolkit might result in
   over-synchronization

Change-Id: I545e6625d2342be30cb759b90bda379e555d7370
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
index 8be3553..a1ea6af 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -40,6 +40,43 @@
 namespace ceres {
 namespace internal {
 
+namespace {
+inline auto ThrustCudaStreamExecutionPolicy(cudaStream_t stream) {
+  // par_nosync execution policy was added in Thrust 1.16 (THRUST_VERSION 101600); NOTE(review): the < 101700 guard below also disables it for Thrust 1.16 -- confirm intent or relax the guard to 101600
+  // https://github.com/NVIDIA/thrust/blob/main/CHANGELOG.md#thrust-1160
+#if THRUST_VERSION < 101700
+  return thrust::cuda::par.on(stream);
+#else
+  return thrust::cuda::par_nosync.on(stream);
+#endif
+}
+// Allocate temporary memory on GPU, avoiding synchronization if possible
+template <typename T>
+T* AllocateTemporaryMemory(size_t num_elements, cudaStream_t stream) {
+  T* data;
+  // Stream-ordered allocations are available since CUDA 11.4
+#if CUDART_VERSION < 11040
+#warning \
+    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit"
+  cudaMalloc(&data, sizeof(T) * num_elements);
+#else
+  cudaMallocAsync(&data, sizeof(T) * num_elements, stream);
+#endif
+  return data;
+}
+
+void FreeTemporaryMemory(void* data, cudaStream_t stream) {
+  // Stream-ordered allocations are available since CUDA 11.4
+#if CUDART_VERSION < 11040
+#warning \
+    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit"
+  cudaFree(data);
+#else
+  cudaFreeAsync(data, stream);
+#endif
+}
+}  // namespace
+
 // Fill row block id and nnz for each row using block-sparse structure
 // represented by a set of flat arrays.
 // Inputs:
@@ -144,8 +181,7 @@
                       cudaStream_t stream) {
   // Set number of non-zeros per row in rows array and row to row-block map in
   // row_block_ids array
-  int* row_block_ids;
-  cudaMallocAsync(&row_block_ids, sizeof(int) * num_rows, stream);
+  int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
   const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
   RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
       num_row_blocks,
@@ -157,7 +193,7 @@
       row_block_ids);
   // Finalize row-index array of CRS strucure by computing prefix sum
   thrust::inclusive_scan(
-      thrust::cuda::par_nosync.on(stream), rows, rows + num_rows + 1, rows);
+      ThrustCudaStreamExecutionPolicy(stream), rows, rows + num_rows + 1, rows);
 
   // Fill cols array of CRS structure and permutation from block-sparse to CRS
   const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
@@ -172,7 +208,7 @@
                                            row_block_ids,
                                            rows,
                                            cols);
-  cudaFreeAsync(row_block_ids, stream);
+  FreeTemporaryMemory(row_block_ids, stream);
 }
 
 template <typename T, typename Predicate>