| // Ceres Solver - A fast non-linear least squares minimizer | 
 | // Copyright 2023 Google Inc. All rights reserved. | 
 | // http://ceres-solver.org/ | 
 | // | 
 | // Redistribution and use in source and binary forms, with or without | 
 | // modification, are permitted provided that the following conditions are met: | 
 | // | 
 | // * Redistributions of source code must retain the above copyright notice, | 
 | //   this list of conditions and the following disclaimer. | 
 | // * Redistributions in binary form must reproduce the above copyright notice, | 
 | //   this list of conditions and the following disclaimer in the documentation | 
 | //   and/or other materials provided with the distribution. | 
 | // * Neither the name of Google Inc. nor the names of its contributors may be | 
 | //   used to endorse or promote products derived from this software without | 
 | //   specific prior written permission. | 
 | // | 
 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
 | // POSSIBILITY OF SUCH DAMAGE. | 
 | // | 
 | // Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) | 
 |  | 
 | #include "ceres/cuda_kernels_bsm_to_crs.h" | 
 |  | 
 | #include <cuda_runtime.h> | 
 | #include <thrust/execution_policy.h> | 
 | #include <thrust/scan.h> | 
 |  | 
 | #include "ceres/block_structure.h" | 
 | #include "ceres/cuda_kernels_utils.h" | 
 |  | 
 | namespace ceres { | 
 | namespace internal { | 
 |  | 
 | namespace { | 
 | inline auto ThrustCudaStreamExecutionPolicy(cudaStream_t stream) { | 
  // The par_nosync execution policy was added in Thrust 1.16
  // (https://github.com/NVIDIA/thrust/blob/main/CHANGELOG.md#thrust-1160);
  // the guard below conservatively enables it only for Thrust 1.17+.
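  // Note: unlike par, par_nosync does not synchronize the stream after each
  // algorithm; this is safe here because all subsequent operations are
  // issued on the same stream.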
 | #if THRUST_VERSION < 101700 | 
 |   return thrust::cuda::par.on(stream); | 
 | #else | 
 |   return thrust::cuda::par_nosync.on(stream); | 
 | #endif | 
 | } | 
 |  | 
 | void* CudaMalloc(size_t size, | 
 |                  cudaStream_t stream, | 
 |                  bool memory_pools_supported) { | 
 |   void* data = nullptr; | 
  // The stream-ordered allocation API is available since CUDA 11.2, but
  // might not be implemented by a particular device
 | #if CUDART_VERSION < 11020 | 
 | #warning \ | 
 |     "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.2+" | 
 |   cudaMalloc(&data, size); | 
 | #else | 
 |   if (memory_pools_supported) { | 
 |     cudaMallocAsync(&data, size, stream); | 
 |   } else { | 
 |     cudaMalloc(&data, size); | 
 |   } | 
 | #endif | 
 |   return data; | 
 | } | 
 |  | 
 | void CudaFree(void* data, cudaStream_t stream, bool memory_pools_supported) { | 
  // The stream-ordered allocation API is available since CUDA 11.2, but
  // might not be implemented by a particular device
 | #if CUDART_VERSION < 11020 | 
 | #warning \ | 
 |     "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.2+" | 
  cudaFree(data);
 | #else | 
 |   if (memory_pools_supported) { | 
 |     cudaFreeAsync(data, stream); | 
 |   } else { | 
 |     cudaFree(data); | 
 |   } | 
 | #endif | 
 | } | 

template <typename T>
 | T* CudaAllocate(size_t num_elements, | 
 |                 cudaStream_t stream, | 
 |                 bool memory_pools_supported) { | 
 |   T* data = static_cast<T*>( | 
 |       CudaMalloc(num_elements * sizeof(T), stream, memory_pools_supported)); | 
 |   return data; | 
 | } | 
 | }  // namespace | 
 |  | 
 | // Fill row block id and nnz for each row using block-sparse structure | 
 | // represented by a set of flat arrays. | 
 | // Inputs: | 
 | // - num_row_blocks: number of row-blocks in block-sparse structure | 
 | // - first_cell_in_row_block: index of the first cell of the row-block; size: | 
 | // num_row_blocks + 1 | 
 | // - cells: cells of block-sparse structure as a continuous array | 
 | // - row_blocks: row blocks of block-sparse structure stored sequentially | 
 | // - col_blocks: column blocks of block-sparse structure stored sequentially | 
 | // Outputs: | 
// - rows_f (and rows_e in the partitioned case): rows[i + 1] will contain
// the number of non-zeros in the i-th row, and rows[0] will be set to 0;
// rows are filled with a shift by one element in order to obtain the
// row-index array of a CRS matrix with an inclusive scan afterwards
// - row_block_ids: row_block_ids[i] will be set to the index of the
// row-block that contains the i-th row.
// Computation is performed row-block-wise.
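//
// Example (illustrative): for a matrix whose rows contain {2, 3, 1}
// non-zeros, this kernel writes rows = {0, 2, 3, 1}; the inclusive scan
// performed afterwards in FillCRSStructure turns this into the CRS
// row-index array {0, 2, 5, 6}.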
 | template <bool partitioned = false> | 
 | __global__ void RowBlockIdAndNNZ( | 
 |     const int num_row_blocks, | 
 |     const int num_col_blocks_e, | 
 |     const int num_row_blocks_e, | 
 |     const int* __restrict__ first_cell_in_row_block, | 
 |     const Cell* __restrict__ cells, | 
 |     const Block* __restrict__ row_blocks, | 
 |     const Block* __restrict__ col_blocks, | 
 |     int* __restrict__ rows_e, | 
 |     int* __restrict__ rows_f, | 
 |     int* __restrict__ row_block_ids) { | 
 |   const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x; | 
 |   if (row_block_id > num_row_blocks) { | 
 |     // No synchronization is performed in this kernel, thus it is safe to return | 
 |     return; | 
 |   } | 
 |   if (row_block_id == num_row_blocks) { | 
 |     // one extra thread sets the first element | 
 |     rows_f[0] = 0; | 
 |     if constexpr (partitioned) { | 
 |       rows_e[0] = 0; | 
 |     } | 
 |     return; | 
 |   } | 
 |   const auto& row_block = row_blocks[row_block_id]; | 
 |   auto first_cell = cells + first_cell_in_row_block[row_block_id]; | 
 |   const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; | 
 |   [[maybe_unused]] int row_nnz_e = 0; | 
 |   if (partitioned && row_block_id < num_row_blocks_e) { | 
 |     // First cell is a cell from E | 
 |     row_nnz_e = col_blocks[first_cell->block_id].size; | 
 |     ++first_cell; | 
 |   } | 
 |   int row_nnz_f = 0; | 
 |   for (auto cell = first_cell; cell < last_cell; ++cell) { | 
 |     row_nnz_f += col_blocks[cell->block_id].size; | 
 |   } | 
 |   const int first_row = row_block.position; | 
 |   const int last_row = first_row + row_block.size; | 
 |   for (int i = first_row; i < last_row; ++i) { | 
 |     if constexpr (partitioned) { | 
 |       rows_e[i + 1] = row_nnz_e; | 
 |     } | 
 |     rows_f[i + 1] = row_nnz_f; | 
 |     row_block_ids[i] = row_block_id; | 
 |   } | 
 | } | 
 |  | 
 | // Row-wise creation of CRS structure | 
 | // Inputs: | 
 | // - num_rows: number of rows in matrix | 
 | // - first_cell_in_row_block: index of the first cell of the row-block; size: | 
 | // num_row_blocks + 1 | 
 | // - cells: cells of block-sparse structure as a continuous array | 
 | // - row_blocks: row blocks of block-sparse structure stored sequentially | 
 | // - col_blocks: column blocks of block-sparse structure stored sequentially | 
 | // - row_block_ids: index of row-block that corresponds to row | 
 | // - rows: row-index array of CRS structure | 
 | // Outputs: | 
 | // - cols: column-index array of CRS structure | 
// Computation is performed row-wise.
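//
// Example (illustrative): with num_cols_e = 4, an F column block at position
// 6 with size 2 produces CRS column indices {2, 3} in cols_f, since columns
// of the F sub-matrix are numbered relative to its first column
// (column_idx = col_block.position - num_cols_e).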
 | template <bool partitioned> | 
 | __global__ void ComputeColumns(const int num_rows, | 
 |                                const int num_row_blocks_e, | 
 |                                const int num_col_blocks_e, | 
 |                                const int* __restrict__ first_cell_in_row_block, | 
 |                                const Cell* __restrict__ cells, | 
 |                                const Block* __restrict__ row_blocks, | 
 |                                const Block* __restrict__ col_blocks, | 
 |                                const int* __restrict__ row_block_ids, | 
 |                                const int* __restrict__ rows_e, | 
 |                                int* __restrict__ cols_e, | 
 |                                const int* __restrict__ rows_f, | 
 |                                int* __restrict__ cols_f) { | 
 |   const int row = blockIdx.x * blockDim.x + threadIdx.x; | 
 |   if (row >= num_rows) { | 
 |     // No synchronization is performed in this kernel, thus it is safe to return | 
 |     return; | 
 |   } | 
 |   const int row_block_id = row_block_ids[row]; | 
  // Range of cells of the row-block that contains this row
 |   auto first_cell = cells + first_cell_in_row_block[row_block_id]; | 
 |   const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; | 
 |   const int num_cols_e = col_blocks[num_col_blocks_e].position; | 
  // For each cell of the row-block, only the current row is filled
 |   if (partitioned && row_block_id < num_row_blocks_e) { | 
    // The first cell is a cell from E
 |     const auto& col_block = col_blocks[first_cell->block_id]; | 
 |     const int col_block_size = col_block.size; | 
 |     int column_idx = col_block.position; | 
 |     int crs_position_e = rows_e[row]; | 
    // Fill column indices for this row's elements of the current cell
 |     for (int i = 0; i < col_block_size; ++i, ++crs_position_e) { | 
 |       cols_e[crs_position_e] = column_idx++; | 
 |     } | 
 |     ++first_cell; | 
 |   } | 
 |   int crs_position_f = rows_f[row]; | 
 |   for (auto cell = first_cell; cell < last_cell; ++cell) { | 
 |     const auto& col_block = col_blocks[cell->block_id]; | 
 |     const int col_block_size = col_block.size; | 
 |     int column_idx = col_block.position - num_cols_e; | 
    // Fill column indices for this row's elements of the current cell
 |     for (int i = 0; i < col_block_size; ++i, ++crs_position_f) { | 
 |       cols_f[crs_position_f] = column_idx++; | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | void FillCRSStructure(const int num_row_blocks, | 
 |                       const int num_rows, | 
 |                       const int* first_cell_in_row_block, | 
 |                       const Cell* cells, | 
 |                       const Block* row_blocks, | 
 |                       const Block* col_blocks, | 
 |                       int* rows, | 
 |                       int* cols, | 
 |                       cudaStream_t stream, | 
 |                       bool memory_pools_supported) { | 
  // Set the number of non-zeros per row in the rows array and the
  // row-to-row-block map in the row_block_ids array
 |   int* row_block_ids = | 
 |       CudaAllocate<int>(num_rows, stream, memory_pools_supported); | 
 |   const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); | 
 |   RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( | 
 |       num_row_blocks, | 
 |       0, | 
 |       0, | 
 |       first_cell_in_row_block, | 
 |       cells, | 
 |       row_blocks, | 
 |       col_blocks, | 
 |       nullptr, | 
 |       rows, | 
 |       row_block_ids); | 
  // Finalize the CRS row-index array by computing a prefix sum
 |   thrust::inclusive_scan( | 
 |       ThrustCudaStreamExecutionPolicy(stream), rows, rows + num_rows + 1, rows); | 
 |  | 
 |   // Fill cols array of CRS structure | 
 |   const int num_blocks_rowwise = NumBlocksInGrid(num_rows); | 
 |   ComputeColumns<false><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>( | 
 |       num_rows, | 
 |       0, | 
 |       0, | 
 |       first_cell_in_row_block, | 
 |       cells, | 
 |       row_blocks, | 
 |       col_blocks, | 
 |       row_block_ids, | 
 |       nullptr, | 
 |       nullptr, | 
 |       rows, | 
 |       cols); | 
 |   CudaFree(row_block_ids, stream, memory_pools_supported); | 
 | } | 
 |  | 
 | void FillCRSStructurePartitioned(const int num_row_blocks, | 
 |                                  const int num_rows, | 
 |                                  const int num_row_blocks_e, | 
 |                                  const int num_col_blocks_e, | 
 |                                  const int num_nonzeros_e, | 
 |                                  const int* first_cell_in_row_block, | 
 |                                  const Cell* cells, | 
 |                                  const Block* row_blocks, | 
 |                                  const Block* col_blocks, | 
 |                                  int* rows_e, | 
 |                                  int* cols_e, | 
 |                                  int* rows_f, | 
 |                                  int* cols_f, | 
 |                                  cudaStream_t stream, | 
 |                                  bool memory_pools_supported) { | 
  // Set the number of non-zeros per row in the rows array and the
  // row-to-row-block map in the row_block_ids array
 |   int* row_block_ids = | 
 |       CudaAllocate<int>(num_rows, stream, memory_pools_supported); | 
 |   const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); | 
 |   RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( | 
 |       num_row_blocks, | 
 |       num_col_blocks_e, | 
 |       num_row_blocks_e, | 
 |       first_cell_in_row_block, | 
 |       cells, | 
 |       row_blocks, | 
 |       col_blocks, | 
 |       rows_e, | 
 |       rows_f, | 
 |       row_block_ids); | 
  // Finalize the CRS row-index arrays by computing prefix sums
 |   thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream), | 
 |                          rows_e, | 
 |                          rows_e + num_rows + 1, | 
 |                          rows_e); | 
 |   thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream), | 
 |                          rows_f, | 
 |                          rows_f + num_rows + 1, | 
 |                          rows_f); | 
 |  | 
 |   // Fill cols array of CRS structure | 
 |   const int num_blocks_rowwise = NumBlocksInGrid(num_rows); | 
 |   ComputeColumns<true><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>( | 
 |       num_rows, | 
 |       num_row_blocks_e, | 
 |       num_col_blocks_e, | 
 |       first_cell_in_row_block, | 
 |       cells, | 
 |       row_blocks, | 
 |       col_blocks, | 
 |       row_block_ids, | 
 |       rows_e, | 
 |       cols_e, | 
 |       rows_f, | 
 |       cols_f); | 
 |   CudaFree(row_block_ids, stream, memory_pools_supported); | 
 | } | 
 |  | 
 | template <typename T, typename Predicate> | 
 | __device__ int PartitionPoint(const T* data, | 
 |                               int first, | 
 |                               int last, | 
 |                               Predicate&& predicate) { | 
 |   if (!predicate(data[first])) { | 
 |     return first; | 
 |   } | 
 |   while (last - first > 1) { | 
 |     const auto midpoint = first + (last - first) / 2; | 
 |     if (predicate(data[midpoint])) { | 
 |       first = midpoint; | 
 |     } else { | 
 |       last = midpoint; | 
 |     } | 
 |   } | 
 |   return last; | 
 | } | 
 |  | 
// Element-wise reordering of block-sparse values
// - first_cell_in_row_block: index of the first cell of the row-block;
// size: num_row_blocks + 1
// - block_sparse_values: segment of block-sparse values starting at
// block_sparse_offset and containing num_values values
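//
// Example (illustrative): a value at block-sparse position p inside a
// row_block_size x col_block_size cell starting at cell->position lies in
// row (p - cell->position) / col_block_size and column
// (p - cell->position) % col_block_size of that cell, since cell values are
// stored row-major; the kernel recovers these indices to compute the
// destination offset in crs_values.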
 | template <bool partitioned> | 
 | __global__ void PermuteToCrsKernel( | 
 |     const int block_sparse_offset, | 
 |     const int num_values, | 
 |     const int num_row_blocks, | 
 |     const int num_row_blocks_e, | 
 |     const int* __restrict__ first_cell_in_row_block, | 
 |     const int* __restrict__ value_offset_row_block_f, | 
 |     const Cell* __restrict__ cells, | 
 |     const Block* __restrict__ row_blocks, | 
 |     const Block* __restrict__ col_blocks, | 
 |     const int* __restrict__ crs_rows, | 
 |     const double* __restrict__ block_sparse_values, | 
 |     double* __restrict__ crs_values) { | 
 |   const int value_id = blockIdx.x * blockDim.x + threadIdx.x; | 
 |   if (value_id >= num_values) { | 
 |     return; | 
 |   } | 
 |   const int block_sparse_value_id = value_id + block_sparse_offset; | 
 |   // Find the corresponding row-block with a binary search | 
 |   const int row_block_id = | 
 |       (partitioned | 
 |            ? PartitionPoint(value_offset_row_block_f, | 
 |                             0, | 
 |                             num_row_blocks, | 
 |                             [block_sparse_value_id] __device__( | 
 |                                 const int row_block_offset) { | 
 |                               return row_block_offset <= block_sparse_value_id; | 
 |                             }) | 
 |            : PartitionPoint(first_cell_in_row_block, | 
 |                             0, | 
 |                             num_row_blocks, | 
 |                             [cells, block_sparse_value_id] __device__( | 
 |                                 const int row_block_offset) { | 
 |                               return cells[row_block_offset].position <= | 
 |                                      block_sparse_value_id; | 
 |                             })) - | 
 |       1; | 
 |   // Find cell and calculate offset within the row with a linear scan | 
 |   const auto& row_block = row_blocks[row_block_id]; | 
 |   auto first_cell = cells + first_cell_in_row_block[row_block_id]; | 
 |   const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; | 
 |   const int row_block_size = row_block.size; | 
 |   int num_cols_before = 0; | 
 |   if (partitioned && row_block_id < num_row_blocks_e) { | 
 |     ++first_cell; | 
 |   } | 
 |   for (const Cell* cell = first_cell; cell < last_cell; ++cell) { | 
 |     const auto& col_block = col_blocks[cell->block_id]; | 
 |     const int col_block_size = col_block.size; | 
 |     const int cell_size = row_block_size * col_block_size; | 
 |     if (cell->position + cell_size > block_sparse_value_id) { | 
 |       const int pos_in_cell = block_sparse_value_id - cell->position; | 
 |       const int row_in_cell = pos_in_cell / col_block_size; | 
 |       const int col_in_cell = pos_in_cell % col_block_size; | 
 |       const int row = row_in_cell + row_block.position; | 
 |       crs_values[crs_rows[row] + num_cols_before + col_in_cell] = | 
 |           block_sparse_values[value_id]; | 
 |       break; | 
 |     } | 
 |     num_cols_before += col_block_size; | 
 |   } | 
 | } | 
 |  | 
 | void PermuteToCRS(const int block_sparse_offset, | 
 |                   const int num_values, | 
 |                   const int num_row_blocks, | 
 |                   const int* first_cell_in_row_block, | 
 |                   const Cell* cells, | 
 |                   const Block* row_blocks, | 
 |                   const Block* col_blocks, | 
 |                   const int* crs_rows, | 
 |                   const double* block_sparse_values, | 
 |                   double* crs_values, | 
 |                   cudaStream_t stream) { | 
 |   const int num_blocks_valuewise = NumBlocksInGrid(num_values); | 
 |   PermuteToCrsKernel<false> | 
 |       <<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>( | 
 |           block_sparse_offset, | 
 |           num_values, | 
 |           num_row_blocks, | 
 |           0, | 
 |           first_cell_in_row_block, | 
 |           nullptr, | 
 |           cells, | 
 |           row_blocks, | 
 |           col_blocks, | 
 |           crs_rows, | 
 |           block_sparse_values, | 
 |           crs_values); | 
 | } | 
 |  | 
 | void PermuteToCRSPartitionedF(const int block_sparse_offset, | 
 |                               const int num_values, | 
 |                               const int num_row_blocks, | 
 |                               const int num_row_blocks_e, | 
 |                               const int* first_cell_in_row_block, | 
 |                               const int* value_offset_row_block_f, | 
 |                               const Cell* cells, | 
 |                               const Block* row_blocks, | 
 |                               const Block* col_blocks, | 
 |                               const int* crs_rows, | 
 |                               const double* block_sparse_values, | 
 |                               double* crs_values, | 
 |                               cudaStream_t stream) { | 
 |   const int num_blocks_valuewise = NumBlocksInGrid(num_values); | 
 |   PermuteToCrsKernel<true><<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>( | 
 |       block_sparse_offset, | 
 |       num_values, | 
 |       num_row_blocks, | 
 |       num_row_blocks_e, | 
 |       first_cell_in_row_block, | 
 |       value_offset_row_block_f, | 
 |       cells, | 
 |       row_blocks, | 
 |       col_blocks, | 
 |       crs_rows, | 
 |       block_sparse_values, | 
 |       crs_values); | 
 | } | 
 |  | 
 | }  // namespace internal | 
 | }  // namespace ceres |