|  | // Ceres Solver - A fast non-linear least squares minimizer | 
|  | // Copyright 2023 Google Inc. All rights reserved. | 
|  | // http://ceres-solver.org/ | 
|  | // | 
|  | // Redistribution and use in source and binary forms, with or without | 
|  | // modification, are permitted provided that the following conditions are met: | 
|  | // | 
|  | // * Redistributions of source code must retain the above copyright notice, | 
|  | //   this list of conditions and the following disclaimer. | 
|  | // * Redistributions in binary form must reproduce the above copyright notice, | 
|  | //   this list of conditions and the following disclaimer in the documentation | 
|  | //   and/or other materials provided with the distribution. | 
|  | // * Neither the name of Google Inc. nor the names of its contributors may be | 
|  | //   used to endorse or promote products derived from this software without | 
|  | //   specific prior written permission. | 
|  | // | 
|  | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
|  | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|  | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|  | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
|  | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
|  | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
|  | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
|  | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
|  | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
|  | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
|  | // POSSIBILITY OF SUCH DAMAGE. | 
|  | // | 
|  | // Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) | 
|  |  | 
|  | #include "ceres/cuda_kernels_bsm_to_crs.h" | 
|  |  | 
|  | #include <cuda_runtime.h> | 
|  | #include <thrust/execution_policy.h> | 
|  | #include <thrust/scan.h> | 
|  |  | 
|  | #include "ceres/block_structure.h" | 
|  | #include "ceres/cuda_kernels_utils.h" | 
|  |  | 
|  | namespace ceres { | 
|  | namespace internal { | 
|  |  | 
|  | namespace { | 
|  | inline auto ThrustCudaStreamExecutionPolicy(cudaStream_t stream) { | 
|  | // par_nosync execution policy was added in Thrust 1.16 | 
|  | // https://github.com/NVIDIA/thrust/blob/main/CHANGELOG.md#thrust-1160 | 
|  | #if THRUST_VERSION < 101700 | 
|  | return thrust::cuda::par.on(stream); | 
|  | #else | 
|  | return thrust::cuda::par_nosync.on(stream); | 
|  | #endif | 
|  | } | 
|  |  | 
|  | void* CudaMalloc(size_t size, | 
|  | cudaStream_t stream, | 
|  | bool memory_pools_supported) { | 
|  | void* data = nullptr; | 
|  | // Stream-ordered alloaction API is available since CUDA 11.2, but might be | 
|  | // not implemented by particular device | 
|  | #if CUDART_VERSION < 11020 | 
|  | #warning \ | 
|  | "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.2+" | 
|  | cudaMalloc(&data, size); | 
|  | #else | 
|  | if (memory_pools_supported) { | 
|  | cudaMallocAsync(&data, size, stream); | 
|  | } else { | 
|  | cudaMalloc(&data, size); | 
|  | } | 
|  | #endif | 
|  | return data; | 
|  | } | 
|  |  | 
|  | void CudaFree(void* data, cudaStream_t stream, bool memory_pools_supported) { | 
|  | // Stream-ordered alloaction API is available since CUDA 11.2, but might be | 
|  | // not implemented by particular device | 
|  | #if CUDART_VERSION < 11020 | 
|  | #warning \ | 
|  | "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.2+" | 
|  | cudaSuccess, cudaFree(data); | 
|  | #else | 
|  | if (memory_pools_supported) { | 
|  | cudaFreeAsync(data, stream); | 
|  | } else { | 
|  | cudaFree(data); | 
|  | } | 
|  | #endif | 
|  | } | 
|  | template <typename T> | 
|  | T* CudaAllocate(size_t num_elements, | 
|  | cudaStream_t stream, | 
|  | bool memory_pools_supported) { | 
|  | T* data = static_cast<T*>( | 
|  | CudaMalloc(num_elements * sizeof(T), stream, memory_pools_supported)); | 
|  | return data; | 
|  | } | 
|  | }  // namespace | 
|  |  | 
|  | // Fill row block id and nnz for each row using block-sparse structure | 
|  | // represented by a set of flat arrays. | 
|  | // Inputs: | 
|  | // - num_row_blocks: number of row-blocks in block-sparse structure | 
|  | // - first_cell_in_row_block: index of the first cell of the row-block; size: | 
|  | // num_row_blocks + 1 | 
|  | // - cells: cells of block-sparse structure as a continuous array | 
|  | // - row_blocks: row blocks of block-sparse structure stored sequentially | 
|  | // - col_blocks: column blocks of block-sparse structure stored sequentially | 
|  | // Outputs: | 
|  | // - rows: rows[i + 1] will contain number of non-zeros in i-th row, rows[0] | 
|  | // will be set to 0; rows are filled with a shift by one element in order | 
|  | // to obtain row-index array of CRS matrix with a inclusive scan afterwards | 
|  | // - row_block_ids: row_block_ids[i] will be set to index of row-block that | 
|  | // contains i-th row. | 
|  | // Computation is perform row-block-wise | 
|  | template <bool partitioned = false> | 
|  | __global__ void RowBlockIdAndNNZ( | 
|  | const int num_row_blocks, | 
|  | const int num_col_blocks_e, | 
|  | const int num_row_blocks_e, | 
|  | const int* __restrict__ first_cell_in_row_block, | 
|  | const Cell* __restrict__ cells, | 
|  | const Block* __restrict__ row_blocks, | 
|  | const Block* __restrict__ col_blocks, | 
|  | int* __restrict__ rows_e, | 
|  | int* __restrict__ rows_f, | 
|  | int* __restrict__ row_block_ids) { | 
|  | const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x; | 
|  | if (row_block_id > num_row_blocks) { | 
|  | // No synchronization is performed in this kernel, thus it is safe to return | 
|  | return; | 
|  | } | 
|  | if (row_block_id == num_row_blocks) { | 
|  | // one extra thread sets the first element | 
|  | rows_f[0] = 0; | 
|  | if constexpr (partitioned) { | 
|  | rows_e[0] = 0; | 
|  | } | 
|  | return; | 
|  | } | 
|  | const auto& row_block = row_blocks[row_block_id]; | 
|  | auto first_cell = cells + first_cell_in_row_block[row_block_id]; | 
|  | const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; | 
|  | [[maybe_unused]] int row_nnz_e = 0; | 
|  | if (partitioned && row_block_id < num_row_blocks_e) { | 
|  | // First cell is a cell from E | 
|  | row_nnz_e = col_blocks[first_cell->block_id].size; | 
|  | ++first_cell; | 
|  | } | 
|  | int row_nnz_f = 0; | 
|  | for (auto cell = first_cell; cell < last_cell; ++cell) { | 
|  | row_nnz_f += col_blocks[cell->block_id].size; | 
|  | } | 
|  | const int first_row = row_block.position; | 
|  | const int last_row = first_row + row_block.size; | 
|  | for (int i = first_row; i < last_row; ++i) { | 
|  | if constexpr (partitioned) { | 
|  | rows_e[i + 1] = row_nnz_e; | 
|  | } | 
|  | rows_f[i + 1] = row_nnz_f; | 
|  | row_block_ids[i] = row_block_id; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Row-wise creation of CRS structure | 
|  | // Inputs: | 
|  | // - num_rows: number of rows in matrix | 
|  | // - first_cell_in_row_block: index of the first cell of the row-block; size: | 
|  | // num_row_blocks + 1 | 
|  | // - cells: cells of block-sparse structure as a continuous array | 
|  | // - row_blocks: row blocks of block-sparse structure stored sequentially | 
|  | // - col_blocks: column blocks of block-sparse structure stored sequentially | 
|  | // - row_block_ids: index of row-block that corresponds to row | 
|  | // - rows: row-index array of CRS structure | 
|  | // Outputs: | 
|  | // - cols: column-index array of CRS structure | 
|  | // Computaion is perform row-wise | 
|  | template <bool partitioned> | 
|  | __global__ void ComputeColumns(const int num_rows, | 
|  | const int num_row_blocks_e, | 
|  | const int num_col_blocks_e, | 
|  | const int* __restrict__ first_cell_in_row_block, | 
|  | const Cell* __restrict__ cells, | 
|  | const Block* __restrict__ row_blocks, | 
|  | const Block* __restrict__ col_blocks, | 
|  | const int* __restrict__ row_block_ids, | 
|  | const int* __restrict__ rows_e, | 
|  | int* __restrict__ cols_e, | 
|  | const int* __restrict__ rows_f, | 
|  | int* __restrict__ cols_f) { | 
|  | const int row = blockIdx.x * blockDim.x + threadIdx.x; | 
|  | if (row >= num_rows) { | 
|  | // No synchronization is performed in this kernel, thus it is safe to return | 
|  | return; | 
|  | } | 
|  | const int row_block_id = row_block_ids[row]; | 
|  | // position in crs matrix | 
|  | auto first_cell = cells + first_cell_in_row_block[row_block_id]; | 
|  | const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; | 
|  | const int num_cols_e = col_blocks[num_col_blocks_e].position; | 
|  | // For reach cell of row-block only current row is being filled | 
|  | if (partitioned && row_block_id < num_row_blocks_e) { | 
|  | // The first cell is cell from E | 
|  | const auto& col_block = col_blocks[first_cell->block_id]; | 
|  | const int col_block_size = col_block.size; | 
|  | int column_idx = col_block.position; | 
|  | int crs_position_e = rows_e[row]; | 
|  | // Column indices for each element of row_in_block row of current cell | 
|  | for (int i = 0; i < col_block_size; ++i, ++crs_position_e) { | 
|  | cols_e[crs_position_e] = column_idx++; | 
|  | } | 
|  | ++first_cell; | 
|  | } | 
|  | int crs_position_f = rows_f[row]; | 
|  | for (auto cell = first_cell; cell < last_cell; ++cell) { | 
|  | const auto& col_block = col_blocks[cell->block_id]; | 
|  | const int col_block_size = col_block.size; | 
|  | int column_idx = col_block.position - num_cols_e; | 
|  | // Column indices for each element of row_in_block row of current cell | 
|  | for (int i = 0; i < col_block_size; ++i, ++crs_position_f) { | 
|  | cols_f[crs_position_f] = column_idx++; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | void FillCRSStructure(const int num_row_blocks, | 
|  | const int num_rows, | 
|  | const int* first_cell_in_row_block, | 
|  | const Cell* cells, | 
|  | const Block* row_blocks, | 
|  | const Block* col_blocks, | 
|  | int* rows, | 
|  | int* cols, | 
|  | cudaStream_t stream, | 
|  | bool memory_pools_supported) { | 
|  | // Set number of non-zeros per row in rows array and row to row-block map in | 
|  | // row_block_ids array | 
|  | int* row_block_ids = | 
|  | CudaAllocate<int>(num_rows, stream, memory_pools_supported); | 
|  | const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); | 
|  | RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( | 
|  | num_row_blocks, | 
|  | 0, | 
|  | 0, | 
|  | first_cell_in_row_block, | 
|  | cells, | 
|  | row_blocks, | 
|  | col_blocks, | 
|  | nullptr, | 
|  | rows, | 
|  | row_block_ids); | 
|  | // Finalize row-index array of CRS strucure by computing prefix sum | 
|  | thrust::inclusive_scan( | 
|  | ThrustCudaStreamExecutionPolicy(stream), rows, rows + num_rows + 1, rows); | 
|  |  | 
|  | // Fill cols array of CRS structure | 
|  | const int num_blocks_rowwise = NumBlocksInGrid(num_rows); | 
|  | ComputeColumns<false><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>( | 
|  | num_rows, | 
|  | 0, | 
|  | 0, | 
|  | first_cell_in_row_block, | 
|  | cells, | 
|  | row_blocks, | 
|  | col_blocks, | 
|  | row_block_ids, | 
|  | nullptr, | 
|  | nullptr, | 
|  | rows, | 
|  | cols); | 
|  | CudaFree(row_block_ids, stream, memory_pools_supported); | 
|  | } | 
|  |  | 
|  | void FillCRSStructurePartitioned(const int num_row_blocks, | 
|  | const int num_rows, | 
|  | const int num_row_blocks_e, | 
|  | const int num_col_blocks_e, | 
|  | const int num_nonzeros_e, | 
|  | const int* first_cell_in_row_block, | 
|  | const Cell* cells, | 
|  | const Block* row_blocks, | 
|  | const Block* col_blocks, | 
|  | int* rows_e, | 
|  | int* cols_e, | 
|  | int* rows_f, | 
|  | int* cols_f, | 
|  | cudaStream_t stream, | 
|  | bool memory_pools_supported) { | 
|  | // Set number of non-zeros per row in rows array and row to row-block map in | 
|  | // row_block_ids array | 
|  | int* row_block_ids = | 
|  | CudaAllocate<int>(num_rows, stream, memory_pools_supported); | 
|  | const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); | 
|  | RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( | 
|  | num_row_blocks, | 
|  | num_col_blocks_e, | 
|  | num_row_blocks_e, | 
|  | first_cell_in_row_block, | 
|  | cells, | 
|  | row_blocks, | 
|  | col_blocks, | 
|  | rows_e, | 
|  | rows_f, | 
|  | row_block_ids); | 
|  | // Finalize row-index array of CRS strucure by computing prefix sum | 
|  | thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream), | 
|  | rows_e, | 
|  | rows_e + num_rows + 1, | 
|  | rows_e); | 
|  | thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream), | 
|  | rows_f, | 
|  | rows_f + num_rows + 1, | 
|  | rows_f); | 
|  |  | 
|  | // Fill cols array of CRS structure | 
|  | const int num_blocks_rowwise = NumBlocksInGrid(num_rows); | 
|  | ComputeColumns<true><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>( | 
|  | num_rows, | 
|  | num_row_blocks_e, | 
|  | num_col_blocks_e, | 
|  | first_cell_in_row_block, | 
|  | cells, | 
|  | row_blocks, | 
|  | col_blocks, | 
|  | row_block_ids, | 
|  | rows_e, | 
|  | cols_e, | 
|  | rows_f, | 
|  | cols_f); | 
|  | CudaFree(row_block_ids, stream, memory_pools_supported); | 
|  | } | 
|  |  | 
|  | template <typename T, typename Predicate> | 
|  | __device__ int PartitionPoint(const T* data, | 
|  | int first, | 
|  | int last, | 
|  | Predicate&& predicate) { | 
|  | if (!predicate(data[first])) { | 
|  | return first; | 
|  | } | 
|  | while (last - first > 1) { | 
|  | const auto midpoint = first + (last - first) / 2; | 
|  | if (predicate(data[midpoint])) { | 
|  | first = midpoint; | 
|  | } else { | 
|  | last = midpoint; | 
|  | } | 
|  | } | 
|  | return last; | 
|  | } | 
|  |  | 
|  | // Element-wise reordering of block-sparse values | 
|  | // - first_cell_in_row_block - position of the first cell of row-block | 
|  | // - block_sparse_values - segment of block-sparse values starting from | 
|  | // block_sparse_offset, containing num_values | 
|  | template <bool partitioned> | 
|  | __global__ void PermuteToCrsKernel( | 
|  | const int block_sparse_offset, | 
|  | const int num_values, | 
|  | const int num_row_blocks, | 
|  | const int num_row_blocks_e, | 
|  | const int* __restrict__ first_cell_in_row_block, | 
|  | const int* __restrict__ value_offset_row_block_f, | 
|  | const Cell* __restrict__ cells, | 
|  | const Block* __restrict__ row_blocks, | 
|  | const Block* __restrict__ col_blocks, | 
|  | const int* __restrict__ crs_rows, | 
|  | const double* __restrict__ block_sparse_values, | 
|  | double* __restrict__ crs_values) { | 
|  | const int value_id = blockIdx.x * blockDim.x + threadIdx.x; | 
|  | if (value_id >= num_values) { | 
|  | return; | 
|  | } | 
|  | const int block_sparse_value_id = value_id + block_sparse_offset; | 
|  | // Find the corresponding row-block with a binary search | 
|  | const int row_block_id = | 
|  | (partitioned | 
|  | ? PartitionPoint(value_offset_row_block_f, | 
|  | 0, | 
|  | num_row_blocks, | 
|  | [block_sparse_value_id] __device__( | 
|  | const int row_block_offset) { | 
|  | return row_block_offset <= block_sparse_value_id; | 
|  | }) | 
|  | : PartitionPoint(first_cell_in_row_block, | 
|  | 0, | 
|  | num_row_blocks, | 
|  | [cells, block_sparse_value_id] __device__( | 
|  | const int row_block_offset) { | 
|  | return cells[row_block_offset].position <= | 
|  | block_sparse_value_id; | 
|  | })) - | 
|  | 1; | 
|  | // Find cell and calculate offset within the row with a linear scan | 
|  | const auto& row_block = row_blocks[row_block_id]; | 
|  | auto first_cell = cells + first_cell_in_row_block[row_block_id]; | 
|  | const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; | 
|  | const int row_block_size = row_block.size; | 
|  | int num_cols_before = 0; | 
|  | if (partitioned && row_block_id < num_row_blocks_e) { | 
|  | ++first_cell; | 
|  | } | 
|  | for (const Cell* cell = first_cell; cell < last_cell; ++cell) { | 
|  | const auto& col_block = col_blocks[cell->block_id]; | 
|  | const int col_block_size = col_block.size; | 
|  | const int cell_size = row_block_size * col_block_size; | 
|  | if (cell->position + cell_size > block_sparse_value_id) { | 
|  | const int pos_in_cell = block_sparse_value_id - cell->position; | 
|  | const int row_in_cell = pos_in_cell / col_block_size; | 
|  | const int col_in_cell = pos_in_cell % col_block_size; | 
|  | const int row = row_in_cell + row_block.position; | 
|  | crs_values[crs_rows[row] + num_cols_before + col_in_cell] = | 
|  | block_sparse_values[value_id]; | 
|  | break; | 
|  | } | 
|  | num_cols_before += col_block_size; | 
|  | } | 
|  | } | 
|  |  | 
|  | void PermuteToCRS(const int block_sparse_offset, | 
|  | const int num_values, | 
|  | const int num_row_blocks, | 
|  | const int* first_cell_in_row_block, | 
|  | const Cell* cells, | 
|  | const Block* row_blocks, | 
|  | const Block* col_blocks, | 
|  | const int* crs_rows, | 
|  | const double* block_sparse_values, | 
|  | double* crs_values, | 
|  | cudaStream_t stream) { | 
|  | const int num_blocks_valuewise = NumBlocksInGrid(num_values); | 
|  | PermuteToCrsKernel<false> | 
|  | <<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>( | 
|  | block_sparse_offset, | 
|  | num_values, | 
|  | num_row_blocks, | 
|  | 0, | 
|  | first_cell_in_row_block, | 
|  | nullptr, | 
|  | cells, | 
|  | row_blocks, | 
|  | col_blocks, | 
|  | crs_rows, | 
|  | block_sparse_values, | 
|  | crs_values); | 
|  | } | 
|  |  | 
|  | void PermuteToCRSPartitionedF(const int block_sparse_offset, | 
|  | const int num_values, | 
|  | const int num_row_blocks, | 
|  | const int num_row_blocks_e, | 
|  | const int* first_cell_in_row_block, | 
|  | const int* value_offset_row_block_f, | 
|  | const Cell* cells, | 
|  | const Block* row_blocks, | 
|  | const Block* col_blocks, | 
|  | const int* crs_rows, | 
|  | const double* block_sparse_values, | 
|  | double* crs_values, | 
|  | cudaStream_t stream) { | 
|  | const int num_blocks_valuewise = NumBlocksInGrid(num_values); | 
|  | PermuteToCrsKernel<true><<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>( | 
|  | block_sparse_offset, | 
|  | num_values, | 
|  | num_row_blocks, | 
|  | num_row_blocks_e, | 
|  | first_cell_in_row_block, | 
|  | value_offset_row_block_f, | 
|  | cells, | 
|  | row_blocks, | 
|  | col_blocks, | 
|  | crs_rows, | 
|  | block_sparse_values, | 
|  | crs_values); | 
|  | } | 
|  |  | 
|  | }  // namespace internal | 
|  | }  // namespace ceres |