CUDA partitioned matrix view Converts BlockSparseMatrix into two instances of CudaSparseMatrix, corresponding to left and right sub-matrix. Values of submatrix E are always just copied as-is, and values of submatrix F are copied if each row-block of F submatrix satisfies at least one of the following conditions: - There is at most one cell in row-block - Row block has height of 1 row Otherwise, indices of values in CRS order corresponding to value indices in block-sparse order are computed on-the-fly. Change-Id: I14eee00c36ee74b6b83fc85927907641383abfc7
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index b67b10e..aff218e 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -186,6 +186,7 @@ covariance.cc covariance_impl.cc cuda_block_sparse_crs_view.cc + cuda_partitioned_block_sparse_crs_view.cc cuda_block_structure.cc cuda_sparse_matrix.cc cuda_vector.cc @@ -478,6 +479,7 @@ ceres_test(cost_function_to_functor) ceres_test(covariance) ceres_test(cubic_interpolation) + ceres_test(cuda_partitioned_block_sparse_crs_view) ceres_test(cuda_block_sparse_crs_view) ceres_test(cuda_block_structure) ceres_test(cuda_dense_cholesky)
diff --git a/internal/ceres/block_jacobian_writer.cc b/internal/ceres/block_jacobian_writer.cc index f74d64d..b009e96 100644 --- a/internal/ceres/block_jacobian_writer.cc +++ b/internal/ceres/block_jacobian_writer.cc
@@ -54,6 +54,13 @@ // the first num_eliminate_blocks parameter blocks as indicated by the parameter // block ordering. The remaining parameter blocks are the F blocks. // +// In order to simplify handling block-sparse to CRS conversion, cells within +// the row-block of non-partitioned matrix are stored in memory sequentially in +// the order of increasing column-block id. In case of partitioned matrices, +// cells corresponding to F sub-matrix are stored sequentially in the order of +// increasing column-block id (with cells corresponding to E sub-matrix stored +// separately). +// // TODO(keir): Consider if we should use a boolean for each parameter block // instead of num_eliminate_blocks. void BuildJacobianLayout(const Program& program, @@ -95,29 +102,54 @@ int e_block_pos = 0; int* jacobian_pos = jacobian_layout_storage->data(); + std::vector<std::pair<int, int>> active_parameter_blocks; for (int i = 0; i < residual_blocks.size(); ++i) { const ResidualBlock* residual_block = residual_blocks[i]; const int num_residuals = residual_block->NumResiduals(); const int num_parameter_blocks = residual_block->NumParameterBlocks(); (*jacobian_layout)[i] = jacobian_pos; + // Cells from F sub-matrix are to be stored sequentially with increasing + // column block id. For each non-constant parameter block, a pair of indices + // (index in the list of active parameter blocks and index in the list of + // all parameter blocks) is computed, and index pairs are sorted by the + // index of corresponding column block id. 
+ active_parameter_blocks.clear(); + active_parameter_blocks.reserve(num_parameter_blocks); for (int j = 0; j < num_parameter_blocks; ++j) { ParameterBlock* parameter_block = residual_block->parameter_blocks()[j]; - const int parameter_block_index = parameter_block->index(); if (parameter_block->IsConstant()) { continue; } + const int k = active_parameter_blocks.size(); + active_parameter_blocks.emplace_back(k, j); + } + std::sort(active_parameter_blocks.begin(), + active_parameter_blocks.end(), + [&residual_block](const std::pair<int, int>& a, + const std::pair<int, int>& b) { + return residual_block->parameter_blocks()[a.second]->index() < + residual_block->parameter_blocks()[b.second]->index(); + }); + // Cell positions for each active parameter block are filled in the order of + // active parameter block indices sorted by column block index. This + // guarantees that cells are laid out sequentially with increasing column + // block indices. + for (const auto& indices : active_parameter_blocks) { + const auto [k, j] = indices; + ParameterBlock* parameter_block = residual_block->parameter_blocks()[j]; + const int parameter_block_index = parameter_block->index(); const int jacobian_block_size = num_residuals * parameter_block->TangentSize(); if (parameter_block_index < num_eliminate_blocks) { - *jacobian_pos = e_block_pos; + jacobian_pos[k] = e_block_pos; e_block_pos += jacobian_block_size; } else { - *jacobian_pos = f_block_pos; + jacobian_pos[k] = f_block_pos; f_block_pos += jacobian_block_size; } - jacobian_pos++; } + jacobian_pos += active_parameter_blocks.size(); } }
diff --git a/internal/ceres/block_jacobian_writer.h b/internal/ceres/block_jacobian_writer.h index 61f69b3..30b5f02 100644 --- a/internal/ceres/block_jacobian_writer.h +++ b/internal/ceres/block_jacobian_writer.h
@@ -53,6 +53,17 @@ // TODO(sameeragarwal): This class needs documentation. class CERES_NO_EXPORT BlockJacobianWriter { public: + // Pre-computes positions of cells in block-sparse jacobian. + // Two possible memory layouts are implemented: + // - Non-partitioned case + // - Partitioned case (for Schur type linear solver) + // + // In non-partitioned case, cells are stored sequentially in the + // lexicographic order of (row block id, column block id). + // + // In the case of partitioned matrix, cells of each sub-matrix (E and F) are + // stored sequentially in the lexicographic order of (row block id, column + // block id) and cells from E sub-matrix precede cells from F sub-matrix. BlockJacobianWriter(const Evaluator::Options& options, Program* program); // JacobianWriter interface.
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc index bb267da..6d4c6b0 100644 --- a/internal/ceres/cuda_block_sparse_crs_view.cc +++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -41,8 +41,6 @@ : context_(context) { block_structure_ = std::make_unique<CudaBlockSparseStructure>( *bsm.block_structure(), context); - // Only block-sparse matrices with sequential layout of cells are supported - CHECK(block_structure_->sequential_layout()); crs_matrix_ = std::make_unique<CudaSparseMatrix>( bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context); FillCRSStructure(block_structure_->num_row_blocks(),
diff --git a/internal/ceres/cuda_block_structure.cc b/internal/ceres/cuda_block_structure.cc index 5817cd7..3685775 100644 --- a/internal/ceres/cuda_block_structure.cc +++ b/internal/ceres/cuda_block_structure.cc
@@ -43,10 +43,16 @@ return last.size + last.position; } } // namespace - CudaBlockSparseStructure::CudaBlockSparseStructure( const CompressedRowBlockStructure& block_structure, ContextImpl* context) + : CudaBlockSparseStructure(block_structure, 0, context) {} + +CudaBlockSparseStructure::CudaBlockSparseStructure( + const CompressedRowBlockStructure& block_structure, + const int num_col_blocks_e, + ContextImpl* context) : first_cell_in_row_block_(context), + value_offset_row_block_f_(context), cells_(context), row_blocks_(context), col_blocks_(context) { @@ -57,40 +63,137 @@ // Row block offset is an index of the first cell corresponding to row block std::vector<int> first_cell_in_row_block; + // Offset of the first value in the first non-empty row-block of F sub-matrix + std::vector<int> value_offset_row_block_f; // Flat array of all cells from all row-blocks std::vector<Cell> cells; - int f_values_offset = 0; + int f_values_offset = -1; + num_nonzeros_e_ = 0; is_crs_compatible_ = true; num_row_blocks_ = block_structure.rows.size(); num_col_blocks_ = col_blocks.size(); row_blocks.reserve(num_row_blocks_); first_cell_in_row_block.reserve(num_row_blocks_ + 1); + value_offset_row_block_f.reserve(num_row_blocks_ + 1); num_nonzeros_ = 0; - sequential_layout_ = true; - for (const auto& r : block_structure.rows) { + // Block-sparse matrices arising from block-jacobian writer are expected to + // have sequential layout (for partitioned matrices - it is expected that both + // E and F sub-matrices have sequential layout). 
+ bool sequential_layout = true; + int row_block_id = 0; + num_row_blocks_e_ = 0; + for (; row_block_id < num_row_blocks_; ++row_block_id) { + const auto& r = block_structure.rows[row_block_id]; const int row_block_size = r.block.size; - if (r.cells.size() > 1 && row_block_size > 1) { + const int num_cells = r.cells.size(); + + if (num_col_blocks_e == 0 || r.cells.size() == 0 || + r.cells[0].block_id >= num_col_blocks_e) { + break; + } + num_row_blocks_e_ = row_block_id + 1; + // In E sub-matrix there is exactly a single E cell in the row + // since E cells are stored separately from F cells, crs-compatiblity of + // F sub-matrix only breaks if there are more than 2 cells in row (that + // is, more than 1 cell in F sub-matrix) + if (num_cells > 2 && row_block_size > 1) { is_crs_compatible_ = false; } row_blocks.emplace_back(r.block); first_cell_in_row_block.push_back(cells.size()); + + for (int cell_id = 0; cell_id < num_cells; ++cell_id) { + const auto& c = r.cells[cell_id]; + const int col_block_size = col_blocks[c.block_id].size; + const int cell_size = col_block_size * row_block_size; + cells.push_back(c); + if (cell_id == 0) { + DCHECK(c.position == num_nonzeros_e_); + num_nonzeros_e_ += cell_size; + } else { + if (f_values_offset == -1) { + num_nonzeros_ = c.position; + f_values_offset = c.position; + } + sequential_layout &= c.position == num_nonzeros_; + num_nonzeros_ += cell_size; + if (cell_id == 1) { + // Correct value_offset_row_block_f for empty row-blocks of F + // preceding this one + for (auto it = value_offset_row_block_f.rbegin(); + it != value_offset_row_block_f.rend(); + ++it) { + if (*it != -1) break; + *it = c.position; + } + value_offset_row_block_f.push_back(c.position); + } + } + } + if (num_cells == 1) { + value_offset_row_block_f.push_back(-1); + } + } + for (; row_block_id < num_row_blocks_; ++row_block_id) { + const auto& r = block_structure.rows[row_block_id]; + const int row_block_size = r.block.size; + const int num_cells = 
r.cells.size(); + // After num_row_blocks_e_ row-blocks, there should be no cells in E + // sub-matrix. Thus crs-compatibility of F sub-matrix breaks if there are + // more than one cells in the row-block + if (num_cells > 1 && row_block_size > 1) { + is_crs_compatible_ = false; + } + row_blocks.emplace_back(r.block); + first_cell_in_row_block.push_back(cells.size()); + + if (r.cells.empty()) { + value_offset_row_block_f.push_back(-1); + } else { + for (auto it = value_offset_row_block_f.rbegin(); + it != value_offset_row_block_f.rend(); + --it) { + if (*it != -1) break; + *it = cells[0].position; + } + value_offset_row_block_f.push_back(r.cells[0].position); + } for (const auto& c : r.cells) { const int col_block_size = col_blocks[c.block_id].size; const int cell_size = col_block_size * row_block_size; cells.push_back(c); - sequential_layout_ &= c.position == num_nonzeros_; + DCHECK(c.block_id >= num_col_blocks_e); + if (f_values_offset == -1) { + num_nonzeros_ = c.position; + f_values_offset = c.position; + } + sequential_layout &= c.position == num_nonzeros_; num_nonzeros_ += cell_size; } } + + if (f_values_offset == -1) { + f_values_offset = num_nonzeros_e_; + num_nonzeros_ = num_nonzeros_e_; + } + // Fill non-zero offsets for the last rows of F submatrix + for (auto it = value_offset_row_block_f.rbegin(); + it != value_offset_row_block_f.rend(); + ++it) { + if (*it != -1) break; + *it = num_nonzeros_; + } + value_offset_row_block_f.push_back(num_nonzeros_); + CHECK_EQ(num_nonzeros_e_, f_values_offset); first_cell_in_row_block.push_back(cells.size()); num_cells_ = cells.size(); num_rows_ = Dimension(row_blocks); num_cols_ = Dimension(col_blocks); - is_crs_compatible_ &= sequential_layout_; + CHECK(sequential_layout); if (VLOG_IS_ON(3)) { const size_t first_cell_in_row_block_size = @@ -122,6 +225,9 @@ cells_.CopyFromCpuVector(cells); row_blocks_.CopyFromCpuVector(row_blocks); col_blocks_.CopyFromCpuVector(col_blocks); + if (num_col_blocks_e || num_row_blocks_e_) 
{ + value_offset_row_block_f_.CopyFromCpuVector(value_offset_row_block_f); + } } } // namespace ceres::internal
diff --git a/internal/ceres/cuda_block_structure.h b/internal/ceres/cuda_block_structure.h index 28e36b2..6da6fdd 100644 --- a/internal/ceres/cuda_block_structure.h +++ b/internal/ceres/cuda_block_structure.h
@@ -56,28 +56,40 @@ // array of cells on cpu and transfer it to the gpu. CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure, ContextImpl* context); + // In the case of partitioned matrices, number of non-zeros in E and layout of + // F are computed + CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure, + const int num_col_blocks_e, + ContextImpl* context); int num_rows() const { return num_rows_; } int num_cols() const { return num_cols_; } int num_cells() const { return num_cells_; } int num_nonzeros() const { return num_nonzeros_; } + // When partitioned matrix constructor was used, returns number of non-zeros + // in E sub-matrix + int num_nonzeros_e() const { return num_nonzeros_e_; } int num_row_blocks() const { return num_row_blocks_; } + int num_row_blocks_e() const { return num_row_blocks_e_; } int num_col_blocks() const { return num_col_blocks_; } - // Returns true if values from block-sparse matrix can be copied to CRS matrix - // as-is. This is possible if each row-block is stored in CRS order: + // Returns true if values from block-sparse matrix (F sub-matrix in + // partitioned case) can be copied to CRS matrix as-is. 
This is possible if + // each row-block is stored in CRS order: // - Row-block consists of a single row // - Row-block contains a single cell bool IsCrsCompatible() const { return is_crs_compatible_; } - // Returns true if block-sparse structure corresponds to block-sparse matrix - // with sequential cell positions - bool sequential_layout() const { return sequential_layout_; } // Device pointer to array of num_row_blocks + 1 indices of the first cell of // row block const int* first_cell_in_row_block() const { return first_cell_in_row_block_.data(); } + // Device pointer to array of num_row_blocks + 1 indices of the first value in + // this or subsequent row-blocks of submatrix F + const int* value_offset_row_block_f() const { + return value_offset_row_block_f_.data(); + } // Device pointer to array of num_cells cells, sorted by row-block const Cell* cells() const { return cells_.data(); } // Device pointer to array of row blocks @@ -90,11 +102,13 @@ int num_cols_; int num_cells_; int num_nonzeros_; + int num_nonzeros_e_; int num_row_blocks_; + int num_row_blocks_e_; int num_col_blocks_; bool is_crs_compatible_; - bool sequential_layout_; CudaBuffer<int> first_cell_in_row_block_; + CudaBuffer<int> value_offset_row_block_f_; CudaBuffer<Cell> cells_; CudaBuffer<Block> row_blocks_; CudaBuffer<Block> col_blocks_;
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc index a1ea6af..0a21c8d 100644 --- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc +++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -93,13 +93,17 @@ // - row_block_ids: row_block_ids[i] will be set to index of row-block that // contains i-th row. // Computation is perform row-block-wise +template <bool partitioned = false> __global__ void RowBlockIdAndNNZ( - int num_row_blocks, + const int num_row_blocks, + const int num_col_blocks_e, + const int num_row_blocks_e, const int* __restrict__ first_cell_in_row_block, const Cell* __restrict__ cells, const Block* __restrict__ row_blocks, const Block* __restrict__ col_blocks, - int* __restrict__ rows, + int* __restrict__ rows_e, + int* __restrict__ rows_f, int* __restrict__ row_block_ids) { const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x; if (row_block_id > num_row_blocks) { @@ -108,20 +112,32 @@ } if (row_block_id == num_row_blocks) { // one extra thread sets the first element - rows[0] = 0; + rows_f[0] = 0; + if constexpr (partitioned) { + rows_e[0] = 0; + } return; } const auto& row_block = row_blocks[row_block_id]; - int row_nnz = 0; - const auto first_cell = cells + first_cell_in_row_block[row_block_id]; + auto first_cell = cells + first_cell_in_row_block[row_block_id]; const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; + int row_nnz_e = 0; + if (partitioned && row_block_id < num_row_blocks_e) { + // First cell is a cell from E + row_nnz_e = col_blocks[first_cell->block_id].size; + ++first_cell; + } + int row_nnz_f = 0; for (auto cell = first_cell; cell < last_cell; ++cell) { - row_nnz += col_blocks[cell->block_id].size; + row_nnz_f += col_blocks[cell->block_id].size; } const int first_row = row_block.position; const int last_row = first_row + row_block.size; for (int i = first_row; i < last_row; ++i) { - rows[i + 1] = row_nnz; + if constexpr (partitioned) { + rows_e[i + 1] = row_nnz_e; + } + rows_f[i + 1] = row_nnz_f; row_block_ids[i] = row_block_id; } } @@ -139,15 +155,19 @@ // Outputs: // - cols: column-index array of CRS structure // Computaion is perform row-wise -__global__ void 
ComputeColumnsAndPermutation( - int num_rows, - const int* __restrict__ first_cell_in_row_block, - const Cell* __restrict__ cells, - const Block* __restrict__ row_blocks, - const Block* __restrict__ col_blocks, - const int* __restrict__ row_block_ids, - const int* __restrict__ rows, - int* __restrict__ cols) { +template <bool partitioned> +__global__ void ComputeColumns(const int num_rows, + const int num_row_blocks_e, + const int num_col_blocks_e, + const int* __restrict__ first_cell_in_row_block, + const Cell* __restrict__ cells, + const Block* __restrict__ row_blocks, + const Block* __restrict__ col_blocks, + const int* __restrict__ row_block_ids, + const int* __restrict__ rows_e, + int* __restrict__ cols_e, + const int* __restrict__ rows_f, + int* __restrict__ cols_f) { const int row = blockIdx.x * blockDim.x + threadIdx.x; if (row >= num_rows) { // No synchronization is performed in this kernel, thus it is safe to return @@ -155,17 +175,30 @@ } const int row_block_id = row_block_ids[row]; // position in crs matrix - int crs_position = rows[row]; - const auto first_cell = cells + first_cell_in_row_block[row_block_id]; + auto first_cell = cells + first_cell_in_row_block[row_block_id]; const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; + const int num_cols_e = col_blocks[num_col_blocks_e].position; // For reach cell of row-block only current row is being filled + if (partitioned && row_block_id < num_row_blocks_e) { + // The first cell is cell from E + const auto& col_block = col_blocks[first_cell->block_id]; + const int col_block_size = col_block.size; + int column_idx = col_block.position; + int crs_position_e = rows_e[row]; + // Column indices for each element of row_in_block row of current cell + for (int i = 0; i < col_block_size; ++i, ++crs_position_e) { + cols_e[crs_position_e] = column_idx++; + } + ++first_cell; + } + int crs_position_f = rows_f[row]; for (auto cell = first_cell; cell < last_cell; ++cell) { const auto& col_block = 
col_blocks[cell->block_id]; const int col_block_size = col_block.size; - int column_idx = col_block.position; + int column_idx = col_block.position - num_cols_e; // Column indices for each element of row_in_block row of current cell - for (int i = 0; i < col_block_size; ++i, ++crs_position) { - cols[crs_position] = column_idx++; + for (int i = 0; i < col_block_size; ++i, ++crs_position_f) { + cols_f[crs_position_f] = column_idx++; } } } @@ -183,31 +216,93 @@ // row_block_ids array int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream); const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); - RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( + RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( num_row_blocks, + 0, + 0, first_cell_in_row_block, cells, row_blocks, col_blocks, + nullptr, rows, row_block_ids); // Finalize row-index array of CRS strucure by computing prefix sum thrust::inclusive_scan( ThrustCudaStreamExecutionPolicy(stream), rows, rows + num_rows + 1, rows); - // Fill cols array of CRS structure and permutation from block-sparse to CRS + // Fill cols array of CRS structure const int num_blocks_rowwise = NumBlocksInGrid(num_rows); - ComputeColumnsAndPermutation<<<num_blocks_rowwise, - kCudaBlockSize, - 0, - stream>>>(num_rows, - first_cell_in_row_block, - cells, - row_blocks, - col_blocks, - row_block_ids, - rows, - cols); + ComputeColumns<false><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>( + num_rows, + 0, + 0, + first_cell_in_row_block, + cells, + row_blocks, + col_blocks, + row_block_ids, + nullptr, + nullptr, + rows, + cols); + FreeTemporaryMemory(row_block_ids, stream); +} + +void FillCRSStructurePartitioned(const int num_row_blocks, + const int num_rows, + const int num_row_blocks_e, + const int num_col_blocks_e, + const int num_nonzeros_e, + const int* first_cell_in_row_block, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + int* rows_e, + 
int* cols_e, + int* rows_f, + int* cols_f, + cudaStream_t stream) { + // Set number of non-zeros per row in rows array and row to row-block map in + // row_block_ids array + int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream); + const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); + RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( + num_row_blocks, + num_col_blocks_e, + num_row_blocks_e, + first_cell_in_row_block, + cells, + row_blocks, + col_blocks, + rows_e, + rows_f, + row_block_ids); + // Finalize row-index array of CRS strucure by computing prefix sum + thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream), + rows_e, + rows_e + num_rows + 1, + rows_e); + thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream), + rows_f, + rows_f + num_rows + 1, + rows_f); + + // Fill cols array of CRS structure + const int num_blocks_rowwise = NumBlocksInGrid(num_rows); + ComputeColumns<true><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>( + num_rows, + num_row_blocks_e, + num_col_blocks_e, + first_cell_in_row_block, + cells, + row_blocks, + col_blocks, + row_block_ids, + rows_e, + cols_e, + rows_f, + cols_f); FreeTemporaryMemory(row_block_ids, stream); } @@ -231,16 +326,17 @@ } // Element-wise reordering of block-sparse values -// - first_cell_pos - position of the first cell of corresponding sub-matrix; -// only used if use_first_cell is true; otherwise the first cell in row-block is -// used +// - first_cell_in_row_block - position of the first cell of row-block // - block_sparse_values - segment of block-sparse values starting from // block_sparse_offset, containing num_values +template <bool partitioned> __global__ void PermuteToCrsKernel( const int block_sparse_offset, const int num_values, const int num_row_blocks, + const int num_row_blocks_e, const int* __restrict__ first_cell_in_row_block, + const int* __restrict__ value_offset_row_block_f, const Cell* __restrict__ cells, const Block* 
__restrict__ row_blocks, const Block* __restrict__ col_blocks, @@ -254,21 +350,32 @@ const int block_sparse_value_id = value_id + block_sparse_offset; // Find the corresponding row-block with a binary search const int row_block_id = - PartitionPoint( - first_cell_in_row_block, - 0, - num_row_blocks, - [cells, - block_sparse_value_id] __device__(const int row_block_offset) { - return cells[row_block_offset].position <= block_sparse_value_id; - }) - + (partitioned + ? PartitionPoint(value_offset_row_block_f, + 0, + num_row_blocks, + [block_sparse_value_id] __device__( + const int row_block_offset) { + return row_block_offset <= block_sparse_value_id; + }) + : PartitionPoint(first_cell_in_row_block, + 0, + num_row_blocks, + [cells, block_sparse_value_id] __device__( + const int row_block_offset) { + return cells[row_block_offset].position <= + block_sparse_value_id; + })) - 1; // Find cell and calculate offset within the row with a linear scan const auto& row_block = row_blocks[row_block_id]; - const auto first_cell = cells + first_cell_in_row_block[row_block_id]; + auto first_cell = cells + first_cell_in_row_block[row_block_id]; const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; const int row_block_size = row_block.size; int num_cols_before = 0; + if (partitioned && row_block_id < num_row_blocks_e) { + ++first_cell; + } for (const Cell* cell = first_cell; cell < last_cell; ++cell) { const auto& col_block = col_blocks[cell->block_id]; const int col_block_size = col_block.size; @@ -298,11 +405,43 @@ double* crs_values, cudaStream_t stream) { const int num_blocks_valuewise = NumBlocksInGrid(num_values); - PermuteToCrsKernel<<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>( + PermuteToCrsKernel<false> + <<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>( + block_sparse_offset, + num_values, + num_row_blocks, + 0, + first_cell_in_row_block, + nullptr, + cells, + row_blocks, + col_blocks, + crs_rows, + block_sparse_values, + crs_values); +} + 
+void PermuteToCRSPartitionedF(const int block_sparse_offset, + const int num_values, + const int num_row_blocks, + const int num_row_blocks_e, + const int* first_cell_in_row_block, + const int* value_offset_row_block_f, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + const int* crs_rows, + const double* block_sparse_values, + double* crs_values, + cudaStream_t stream) { + const int num_blocks_valuewise = NumBlocksInGrid(num_values); + PermuteToCrsKernel<true><<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>( block_sparse_offset, num_values, num_row_blocks, + num_row_blocks_e, first_cell_in_row_block, + value_offset_row_block_f, cells, row_blocks, col_blocks,
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h index b6da69b..90b4882 100644 --- a/internal/ceres/cuda_kernels_bsm_to_crs.h +++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -54,6 +54,23 @@ int* cols, cudaStream_t stream); +// Compute structure of partitioned CRS matrix using block-sparse structure. +// Arrays corresponding to CRS matrices are to be allocated by caller +void FillCRSStructurePartitioned(const int num_row_blocks, + const int num_rows, + const int num_row_blocks_e, + const int num_col_blocks_e, + const int num_nonzeros_e, + const int* first_cell_in_row_block, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + int* rows_e, + int* cols_e, + int* rows_f, + int* cols_f, + cudaStream_t stream); + // Permute segment of values from block-sparse matrix with sequential layout to // CRS order. Segment starts at block_sparse_offset and has length of num_values void PermuteToCRS(const int block_sparse_offset, @@ -68,6 +85,24 @@ double* crs_values, cudaStream_t stream); +// Permute segment of values from F sub-matrix of block-sparse partitioned +// matrix with sequential layout to CRS order. Segment starts at +// block_sparse_offset (including the offset induced by values of E submatrix) +// and has length of num_values +void PermuteToCRSPartitionedF(const int block_sparse_offset, + const int num_values, + const int num_row_blocks, + const int num_row_blocks_e, + const int* first_cell_in_row_block, + const int* value_offset_row_block_f, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + const int* crs_rows, + const double* block_sparse_values, + double* crs_values, + cudaStream_t stream); + } // namespace internal } // namespace ceres
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc new file mode 100644 index 0000000..703be84 --- /dev/null +++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
@@ -0,0 +1,145 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/cuda_partitioned_block_sparse_crs_view.h" + +#ifndef CERES_NO_CUDA + +#include "ceres/cuda_block_structure.h" +#include "ceres/cuda_kernels_bsm_to_crs.h" + +namespace ceres::internal { + +CudaPartitionedBlockSparseCRSView::CudaPartitionedBlockSparseCRSView( + const BlockSparseMatrix& bsm, + const int num_col_blocks_e, + ContextImpl* context) + : + + context_(context) { + const auto& bs = *bsm.block_structure(); + block_structure_ = + std::make_unique<CudaBlockSparseStructure>(bs, num_col_blocks_e, context); + // Determine number of non-zeros in left submatrix + // Row-blocks are at least 1 row high, thus we can use a temporary array of + // num_rows for ComputeNonZerosInColumnBlockSubMatrix; and later reuse it for + // FillCRSStructurePartitioned + const int num_rows = bsm.num_rows(); + const int num_nonzeros_e = block_structure_->num_nonzeros_e(); + const int num_nonzeros_f = bsm.num_nonzeros() - num_nonzeros_e; + + const int num_cols_e = num_col_blocks_e < bs.cols.size() + ? 
bs.cols[num_col_blocks_e].position + : bsm.num_cols(); + const int num_cols_f = bsm.num_cols() - num_cols_e; + matrix_e_ = std::make_unique<CudaSparseMatrix>( + num_rows, num_cols_e, num_nonzeros_e, context); + matrix_f_ = std::make_unique<CudaSparseMatrix>( + num_rows, num_cols_f, num_nonzeros_f, context); + + num_row_blocks_e_ = block_structure_->num_row_blocks_e(); + FillCRSStructurePartitioned(block_structure_->num_row_blocks(), + num_rows, + num_row_blocks_e_, + num_col_blocks_e, + num_nonzeros_e, + block_structure_->first_cell_in_row_block(), + block_structure_->cells(), + block_structure_->row_blocks(), + block_structure_->col_blocks(), + matrix_e_->mutable_rows(), + matrix_e_->mutable_cols(), + matrix_f_->mutable_rows(), + matrix_f_->mutable_cols(), + context->DefaultStream()); + f_is_crs_compatible_ = block_structure_->IsCrsCompatible(); + if (f_is_crs_compatible_) { + block_structure_ = nullptr; + } else { + streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>( + context, kMaxTemporaryArraySize); + } + CHECK_EQ(bsm.num_nonzeros(), + matrix_e_->num_nonzeros() + matrix_f_->num_nonzeros()); + + UpdateValues(bsm); +} + +void CudaPartitionedBlockSparseCRSView::UpdateValues( + const BlockSparseMatrix& bsm) { + if (f_is_crs_compatible_) { + CHECK_EQ(cudaSuccess, + cudaMemcpyAsync(matrix_e_->mutable_values(), + bsm.values(), + matrix_e_->num_nonzeros() * sizeof(double), + cudaMemcpyHostToDevice, + context_->DefaultStream())); + + CHECK_EQ(cudaSuccess, + cudaMemcpyAsync(matrix_f_->mutable_values(), + bsm.values() + matrix_e_->num_nonzeros(), + matrix_f_->num_nonzeros() * sizeof(double), + cudaMemcpyHostToDevice, + context_->DefaultStream())); + return; + } + streamed_buffer_->CopyToGpu( + bsm.values(), + bsm.num_nonzeros(), + [block_structure = block_structure_.get(), + num_nonzeros_e = matrix_e_->num_nonzeros(), + num_row_blocks_e = num_row_blocks_e_, + values_f = matrix_f_->mutable_values(), + rows_f = matrix_f_->rows()]( + const double* values, int 
num_values, int offset, auto stream) { + PermuteToCRSPartitionedF(num_nonzeros_e + offset, + num_values, + block_structure->num_row_blocks(), + num_row_blocks_e, + block_structure->first_cell_in_row_block(), + block_structure->value_offset_row_block_f(), + block_structure->cells(), + block_structure->row_blocks(), + block_structure->col_blocks(), + rows_f, + values, + values_f, + stream); + }); + CHECK_EQ(cudaSuccess, + cudaMemcpyAsync(matrix_e_->mutable_values(), + bsm.values(), + matrix_e_->num_nonzeros() * sizeof(double), + cudaMemcpyHostToDevice, + context_->DefaultStream())); +} + +} // namespace ceres::internal +#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.h b/internal/ceres/cuda_partitioned_block_sparse_crs_view.h new file mode 100644 index 0000000..3072dea --- /dev/null +++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.h
@@ -0,0 +1,111 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+//
+
+#ifndef CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
+#define CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <memory>
+
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_block_structure.h"
+#include "ceres/cuda_buffer.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_streamed_buffer.h"
+
+namespace ceres::internal {
+// We use cuSPARSE library for SpMV operations. However, it supports neither
+// block-sparse format with varying size of the blocks nor
+// submatrix-vector products. Thus, we perform the following operations in order
+// to compute products of partitioned block-sparse matrices and dense vectors on
+// gpu:
+//  - Once per block-sparse structure update:
+//    - Compute CRS structures of left and right submatrices from block-sparse
+//    structure
+//    - Check if values of F sub-matrix can be copied without permutation
+//    matrices
+//  - Once per block-sparse values update:
+//    - Copy values of E sub-matrix
+//    - Permute or copy values of F sub-matrix
+//
+// It is assumed that cells of block-sparse matrix are laid out sequentially in
+// both of sub-matrices and there is exactly one cell in row-block of E
+// sub-matrix in the first num_row_blocks_e_ row blocks, and no cells in E
+// sub-matrix below num_row_blocks_e_ row blocks.
+//
+// This class avoids storing both CRS and block-sparse values in GPU memory.
+// Instead, block-sparse values are transferred to gpu memory as a disjoint set
+// of small continuous segments with simultaneous permutation of the values into
+// correct order using block-structure.
+class CERES_NO_EXPORT CudaPartitionedBlockSparseCRSView {
+ public:
+  // Initializes internal CRS matrix and block-sparse structure on GPU side
+  // values.
The following objects are stored in gpu memory for the whole + // lifetime of the object + // - matrix_e_: left CRS submatrix + // - matrix_f_: right CRS submatrix + // - block_structure_: copy of block-sparse structure on GPU + // - streamed_buffer_: helper for value updating + CudaPartitionedBlockSparseCRSView(const BlockSparseMatrix& bsm, + const int num_col_blocks_e, + ContextImpl* context); + + // Update values of CRS submatrices using values of block-sparse matrix. + // Assumes that bsm has the same block-sparse structure as matrix that was + // used for construction. + void UpdateValues(const BlockSparseMatrix& bsm); + + const CudaSparseMatrix* matrix_e() const { return matrix_e_.get(); } + const CudaSparseMatrix* matrix_f() const { return matrix_f_.get(); } + CudaSparseMatrix* mutable_matrix_e() { return matrix_e_.get(); } + CudaSparseMatrix* mutable_matrix_f() { return matrix_f_.get(); } + + private: + // Value permutation kernel performs a single element-wise operation per + // thread, thus performing permutation in blocks of 8 megabytes of + // block-sparse values seems reasonable + static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024; + std::unique_ptr<CudaSparseMatrix> matrix_e_; + std::unique_ptr<CudaSparseMatrix> matrix_f_; + std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_; + std::unique_ptr<CudaBlockSparseStructure> block_structure_; + bool f_is_crs_compatible_; + int num_row_blocks_e_; + ContextImpl* context_; +}; + +} // namespace ceres::internal + +#endif // CERES_NO_CUDA +#endif // CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc new file mode 100644 index 0000000..5090b6a --- /dev/null +++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
@@ -0,0 +1,282 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/cuda_partitioned_block_sparse_crs_view.h" + +#include <glog/logging.h> +#include <gtest/gtest.h> + +#ifndef CERES_NO_CUDA + +namespace ceres::internal { + +namespace { +struct RandomPartitionedMatrixOptions { + int num_row_blocks_e; + int num_row_blocks_f; + int num_col_blocks_e; + int num_col_blocks_f; + int min_row_block_size; + int max_row_block_size; + int min_col_block_size; + int max_col_block_size; + double empty_f_probability; + double cell_probability_f; + int max_cells_f; +}; + +std::unique_ptr<BlockSparseMatrix> CreateRandomPartitionedMatrix( + const RandomPartitionedMatrixOptions& options, std::mt19937& rng) { + const int num_row_blocks = + std::max(options.num_row_blocks_e, options.num_row_blocks_f); + const int num_col_blocks = + options.num_col_blocks_e + options.num_col_blocks_f; + + CompressedRowBlockStructure* block_structure = + new CompressedRowBlockStructure; + block_structure->cols.reserve(num_col_blocks); + block_structure->rows.reserve(num_row_blocks); + + // Create column blocks + std::uniform_int_distribution<int> col_size(options.min_col_block_size, + options.max_col_block_size); + int num_cols = 0; + for (int i = 0; i < num_col_blocks; ++i) { + const int size = col_size(rng); + block_structure->cols.emplace_back(size, num_cols); + num_cols += size; + } + + // Prepare column-block indices of E cells + std::vector<int> e_col_block_idx; + e_col_block_idx.reserve(options.num_row_blocks_e); + std::uniform_int_distribution<int> col_e(0, options.num_col_blocks_e - 1); + for (int i = 0; i < options.num_row_blocks_e; ++i) { + e_col_block_idx.emplace_back(col_e(rng)); + } + std::sort(e_col_block_idx.begin(), e_col_block_idx.end()); + + // Prepare cell structure + std::uniform_int_distribution<int> row_size(options.min_row_block_size, + options.max_row_block_size); + std::uniform_real_distribution<double> uniform; + int num_rows = 0; + for (int i = 0; i < 
num_row_blocks; ++i) { + const int size = row_size(rng); + block_structure->rows.emplace_back(); + auto& row = block_structure->rows.back(); + row.block.size = size; + row.block.position = num_rows; + num_rows += size; + if (i < options.num_row_blocks_e) { + row.cells.emplace_back(e_col_block_idx[i], -1); + if (uniform(rng) < options.empty_f_probability) { + continue; + } + } + if (i >= options.num_row_blocks_f) continue; + const int cells_before = row.cells.size(); + for (int j = options.num_col_blocks_e; j < num_col_blocks; ++j) { + if (uniform(rng) > options.cell_probability_f) { + continue; + } + row.cells.emplace_back(j, -1); + } + if (row.cells.size() > cells_before + options.max_cells_f) { + std::shuffle(row.cells.begin() + cells_before, row.cells.end(), rng); + row.cells.resize(cells_before + options.max_cells_f); + std::sort( + row.cells.begin(), row.cells.end(), [](const auto& a, const auto& b) { + return a.block_id < b.block_id; + }); + } + } + + // Fill positions in E sub-matrix + int num_nonzeros = 0; + for (int i = 0; i < options.num_row_blocks_e; ++i) { + CHECK_GE(block_structure->rows[i].cells.size(), 1); + block_structure->rows[i].cells[0].position = num_nonzeros; + const int col_block_size = + block_structure->cols[block_structure->rows[i].cells[0].block_id].size; + const int row_block_size = block_structure->rows[i].block.size; + num_nonzeros += row_block_size * col_block_size; + CHECK_GE(num_nonzeros, 0); + } + // Fill positions in F sub-matrix + for (int i = 0; i < options.num_row_blocks_f; ++i) { + const int row_block_size = block_structure->rows[i].block.size; + for (auto& cell : block_structure->rows[i].cells) { + if (cell.position >= 0) continue; + cell.position = num_nonzeros; + const int col_block_size = block_structure->cols[cell.block_id].size; + num_nonzeros += row_block_size * col_block_size; + CHECK_GE(num_nonzeros, 0); + } + } + // Populate values + auto bsm = std::make_unique<BlockSparseMatrix>(block_structure, true); + for (int i 
= 0; i < num_nonzeros; ++i) { + bsm->mutable_values()[i] = i + 1; + } + return bsm; +} +} // namespace + +class CudaPartitionedBlockSparseCRSViewTest : public ::testing::Test { + static constexpr int kNumColBlocksE = 456; + + protected: + void SetUp() final { + std::string message; + CHECK(context_.InitCuda(&message)) + << "InitCuda() failed because: " << message; + + RandomPartitionedMatrixOptions options; + options.num_row_blocks_f = 123; + options.num_row_blocks_e = 456; + options.num_col_blocks_f = 123; + options.num_col_blocks_e = kNumColBlocksE; + options.min_row_block_size = 1; + options.max_row_block_size = 4; + options.min_col_block_size = 1; + options.max_col_block_size = 4; + options.empty_f_probability = .1; + options.cell_probability_f = .2; + options.max_cells_f = options.num_col_blocks_f; + + std::mt19937 rng; + short_f_ = CreateRandomPartitionedMatrix(options, rng); + + options.num_row_blocks_e = 123; + options.num_row_blocks_f = 456; + short_e_ = CreateRandomPartitionedMatrix(options, rng); + + options.max_cells_f = 1; + options.num_row_blocks_e = options.num_row_blocks_f; + options.num_row_blocks_e = options.num_row_blocks_f; + f_crs_compatible_ = CreateRandomPartitionedMatrix(options, rng); + } + + void TestMatrix(const BlockSparseMatrix& A_) { + const int num_col_blocks_e = 456; + CudaPartitionedBlockSparseCRSView view(A_, kNumColBlocksE, &context_); + + const int num_rows = A_.num_rows(); + const int num_cols = A_.num_cols(); + + const auto& bs = *A_.block_structure(); + const int num_cols_e = bs.cols[num_col_blocks_e].position; + const int num_cols_f = num_cols - num_cols_e; + + // TODO: we definitely would like to use matrix() here, but + // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because + // it might allocate additional storage by request of cuSPARSE + auto matrix_e = view.mutable_matrix_e(); + auto matrix_f = view.mutable_matrix_f(); + ASSERT_EQ(matrix_e->num_cols(), num_cols_e); + ASSERT_EQ(matrix_e->num_rows(), 
num_rows); + ASSERT_EQ(matrix_f->num_cols(), num_cols_f); + ASSERT_EQ(matrix_f->num_rows(), num_rows); + + Vector x(num_cols); + Vector x_left(num_cols_e); + Vector x_right(num_cols_f); + Vector y(num_rows); + CudaVector x_cuda(&context_, num_cols); + CudaVector x_left_cuda(&context_, num_cols_e); + CudaVector x_right_cuda(&context_, num_cols_f); + CudaVector y_cuda(&context_, num_rows); + Vector y_cuda_host(num_rows); + + for (int i = 0; i < num_cols_e; ++i) { + x.setZero(); + x_left.setZero(); + y.setZero(); + y_cuda.SetZero(); + x[i] = 1.; + x_left[i] = 1.; + x_left_cuda.CopyFromCpu(x_left); + A_.RightMultiplyAndAccumulate( + x.data(), y.data(), &context_, std::thread::hardware_concurrency()); + matrix_e->RightMultiplyAndAccumulate(x_left_cuda, &y_cuda); + y_cuda.CopyTo(&y_cuda_host); + // There will be up to 1 non-zero product per row, thus we expect an exact + // match on 32-bit integer indices + EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.); + } + for (int i = num_cols_e; i < num_cols_f; ++i) { + x.setZero(); + x_right.setZero(); + y.setZero(); + y_cuda.SetZero(); + x[i] = 1.; + x_right[i - num_cols_e] = 1.; + x_right_cuda.CopyFromCpu(x_right); + A_.RightMultiplyAndAccumulate( + x.data(), y.data(), &context_, std::thread::hardware_concurrency()); + matrix_f->RightMultiplyAndAccumulate(x_right_cuda, &y_cuda); + y_cuda.CopyTo(&y_cuda_host); + // There will be up to 1 non-zero product per row, thus we expect an exact + // match on 32-bit integer indices + EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.); + } + } + + // E sub-matrix might have less row-blocks with cells than F sub-matrix. This + // test matrix checks if this case is handled properly + std::unique_ptr<BlockSparseMatrix> short_e_; + // In case of non-crs compatible F matrix, permuting values from block-order + // to crs order involves binary search over row-blocks of F. Having lots of + // row-blocks with no F cells is an edge case for this algorithm. 
+ std::unique_ptr<BlockSparseMatrix> short_f_; + // With F matrix being CRS-compatible, update of the values of partitioned + // matrix view reduces to two host->device memcopies, and uses a separate code + // path + std::unique_ptr<BlockSparseMatrix> f_crs_compatible_; + + ContextImpl context_; +}; + +TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortE) { + TestMatrix(*short_e_); +} + +TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortF) { + TestMatrix(*short_f_); +} + +TEST_F(CudaPartitionedBlockSparseCRSViewTest, + CreateUpdateValuesCrsCompatibleF) { + TestMatrix(*f_crs_compatible_); +} +} // namespace ceres::internal + +#endif // CERES_NO_CUDA
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc index c23c881..28a8aff 100644 --- a/internal/ceres/evaluation_benchmark.cc +++ b/internal/ceres/evaluation_benchmark.cc
@@ -37,6 +37,7 @@ #include "ceres/block_sparse_matrix.h" #include "ceres/bundle_adjustment_test_util.h" #include "ceres/cuda_block_sparse_crs_view.h" +#include "ceres/cuda_partitioned_block_sparse_crs_view.h" #include "ceres/cuda_sparse_matrix.h" #include "ceres/cuda_vector.h" #include "ceres/evaluator.h" @@ -476,6 +477,110 @@ } #ifndef CERES_NO_CUDA +static void PMVRightMultiplyAndAccumulateFCuda(benchmark::State& state, + BALData* data, + ContextImpl* context) { + LinearSolver::Options options; + options.elimination_groups.push_back(data->bal_problem->num_points()); + options.context = context; + options.num_threads = 1; + auto jacobian = data->PartitionedMatrixViewJacobian(options); + auto underlying_matrix = data->BlockSparseJacobianPartitioned(context); + CudaPartitionedBlockSparseCRSView view( + *underlying_matrix, jacobian->num_col_blocks_e(), context); + + Vector x = Vector::Random(jacobian->num_cols_f()); + CudaVector cuda_x(context, x.size()); + CudaVector cuda_y(context, jacobian->num_rows()); + + cuda_x.CopyFromCpu(x); + cuda_y.SetZero(); + + auto matrix = view.mutable_matrix_f(); + for (auto _ : state) { + matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y); + } + CHECK_GT(cuda_y.Norm(), 0.); +} + +static void PMVLeftMultiplyAndAccumulateFCuda(benchmark::State& state, + BALData* data, + ContextImpl* context) { + LinearSolver::Options options; + options.elimination_groups.push_back(data->bal_problem->num_points()); + options.context = context; + options.num_threads = 1; + auto jacobian = data->PartitionedMatrixViewJacobian(options); + auto underlying_matrix = data->BlockSparseJacobianPartitioned(context); + CudaPartitionedBlockSparseCRSView view( + *underlying_matrix, jacobian->num_col_blocks_e(), context); + + Vector x = Vector::Random(jacobian->num_rows()); + CudaVector cuda_x(context, x.size()); + CudaVector cuda_y(context, jacobian->num_cols_f()); + + cuda_x.CopyFromCpu(x); + cuda_y.SetZero(); + + auto matrix = view.mutable_matrix_f(); + for (auto _ 
: state) { + matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y); + } + CHECK_GT(cuda_y.Norm(), 0.); +} + +static void PMVRightMultiplyAndAccumulateECuda(benchmark::State& state, + BALData* data, + ContextImpl* context) { + LinearSolver::Options options; + options.elimination_groups.push_back(data->bal_problem->num_points()); + options.context = context; + options.num_threads = 1; + auto jacobian = data->PartitionedMatrixViewJacobian(options); + auto underlying_matrix = data->BlockSparseJacobianPartitioned(context); + CudaPartitionedBlockSparseCRSView view( + *underlying_matrix, jacobian->num_col_blocks_e(), context); + + Vector x = Vector::Random(jacobian->num_cols_e()); + CudaVector cuda_x(context, x.size()); + CudaVector cuda_y(context, jacobian->num_rows()); + + cuda_x.CopyFromCpu(x); + cuda_y.SetZero(); + + auto matrix = view.mutable_matrix_e(); + for (auto _ : state) { + matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y); + } + CHECK_GT(cuda_y.Norm(), 0.); +} + +static void PMVLeftMultiplyAndAccumulateECuda(benchmark::State& state, + BALData* data, + ContextImpl* context) { + LinearSolver::Options options; + options.elimination_groups.push_back(data->bal_problem->num_points()); + options.context = context; + options.num_threads = 1; + auto jacobian = data->PartitionedMatrixViewJacobian(options); + auto underlying_matrix = data->BlockSparseJacobianPartitioned(context); + CudaPartitionedBlockSparseCRSView view( + *underlying_matrix, jacobian->num_col_blocks_e(), context); + + Vector x = Vector::Random(jacobian->num_rows()); + CudaVector cuda_x(context, x.size()); + CudaVector cuda_y(context, jacobian->num_cols_e()); + + cuda_x.CopyFromCpu(x); + cuda_y.SetZero(); + + auto matrix = view.mutable_matrix_e(); + for (auto _ : state) { + matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y); + } + CHECK_GT(cuda_y.Norm(), 0.); +} + // We want CudaBlockSparseCRSView to be not slower than explicit conversion to // CRS on CPU static void JacobianToCRSView(benchmark::State& 
state, @@ -734,6 +839,16 @@ ->Arg(8) ->Arg(16); +#ifndef CERES_NO_CUDA + const std::string name_right_product_partitioned_f_cuda = + "PMVRightMultiplyAndAccumulateFCuda<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_right_product_partitioned_f_cuda.c_str(), + ceres::internal::PMVRightMultiplyAndAccumulateFCuda, + data, + &context); +#endif + const std::string name_right_product_partitioned_e = "PMVRightMultiplyAndAccumulateE<" + path + ">"; ::benchmark::RegisterBenchmark( @@ -747,6 +862,16 @@ ->Arg(8) ->Arg(16); +#ifndef CERES_NO_CUDA + const std::string name_right_product_partitioned_e_cuda = + "PMVRightMultiplyAndAccumulateECuda<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_right_product_partitioned_e_cuda.c_str(), + ceres::internal::PMVRightMultiplyAndAccumulateECuda, + data, + &context); +#endif + const std::string name_update_block_diagonal_ftf = "PMVUpdateBlockDiagonalFtF<" + path + ">"; ::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(), @@ -831,6 +956,16 @@ ->Arg(8) ->Arg(16); +#ifndef CERES_NO_CUDA + const std::string name_left_product_partitioned_f_cuda = + "PMVLeftMultiplyAndAccumulateFCuda<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_left_product_partitioned_f_cuda.c_str(), + ceres::internal::PMVLeftMultiplyAndAccumulateFCuda, + data, + &context); +#endif + const std::string name_left_product_partitioned_e = "PMVLeftMultiplyAndAccumulateE<" + path + ">"; ::benchmark::RegisterBenchmark( @@ -845,6 +980,16 @@ ->Arg(16); #ifndef CERES_NO_CUDA + const std::string name_left_product_partitioned_e_cuda = + "PMVLeftMultiplyAndAccumulateECuda<" + path + ">"; + ::benchmark::RegisterBenchmark( + name_left_product_partitioned_e_cuda.c_str(), + ceres::internal::PMVLeftMultiplyAndAccumulateECuda, + data, + &context); +#endif + +#ifndef CERES_NO_CUDA const std::string name_left_product_cuda = "JacobianLeftMultiplyAndAccumulateCuda<" + path + ">"; ::benchmark::RegisterBenchmark(