CUDA partitioned matrix view
Converts BlockSparseMatrix into two instances of CudaSparseMatrix,
corresponding to the left and right sub-matrices.
Values of the E sub-matrix are always copied as-is, and values of the
F sub-matrix are copied as-is if every row-block of F satisfies at
least one of the following conditions:
- There is at most one cell in the row-block
- The row-block has a height of 1 row
Otherwise, indices of values in CRS order corresponding to value
indices in block-sparse order are computed on-the-fly.
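
For example (illustrative): a row-block of height 2 whose F part
consists of two cells of widths 2 and 1 stores its F values in
block-sparse order as [a00 a01 a10 a11 b00 b10], while CRS order is
[a00 a01 b00 a10 a11 b10]; such a row-block satisfies neither
condition above, so its CRS indices are computed on-the-fly.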
Change-Id: I14eee00c36ee74b6b83fc85927907641383abfc7
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index b67b10e..aff218e 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -186,6 +186,7 @@
covariance.cc
covariance_impl.cc
cuda_block_sparse_crs_view.cc
cuda_block_structure.cc
+ cuda_partitioned_block_sparse_crs_view.cc
cuda_sparse_matrix.cc
cuda_vector.cc
@@ -478,6 +479,7 @@
ceres_test(cost_function_to_functor)
ceres_test(covariance)
ceres_test(cubic_interpolation)
+ ceres_test(cuda_partitioned_block_sparse_crs_view)
ceres_test(cuda_block_sparse_crs_view)
ceres_test(cuda_block_structure)
ceres_test(cuda_dense_cholesky)
diff --git a/internal/ceres/block_jacobian_writer.cc b/internal/ceres/block_jacobian_writer.cc
index f74d64d..b009e96 100644
--- a/internal/ceres/block_jacobian_writer.cc
+++ b/internal/ceres/block_jacobian_writer.cc
@@ -54,6 +54,13 @@
// the first num_eliminate_blocks parameter blocks as indicated by the parameter
// block ordering. The remaining parameter blocks are the F blocks.
//
+// In order to simplify handling of block-sparse to CRS conversion, cells
+// within a row-block of a non-partitioned matrix are stored in memory
+// sequentially in the order of increasing column-block id. In the case of
+// partitioned matrices, cells corresponding to the F sub-matrix are stored
+// sequentially in the order of increasing column-block id (with cells
+// corresponding to the E sub-matrix stored separately).
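+//
+// For example (illustrative): a residual block whose non-constant parameter
+// blocks map to column-blocks {3, 0, 5}, with column-block 0 belonging to E,
+// stores the cell of column-block 0 in the E segment, and the cells of
+// column-blocks 3 and 5, in that order, in the F segment.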
+//
// TODO(keir): Consider if we should use a boolean for each parameter block
// instead of num_eliminate_blocks.
void BuildJacobianLayout(const Program& program,
@@ -95,29 +102,54 @@
int e_block_pos = 0;
int* jacobian_pos = jacobian_layout_storage->data();
+ std::vector<std::pair<int, int>> active_parameter_blocks;
for (int i = 0; i < residual_blocks.size(); ++i) {
const ResidualBlock* residual_block = residual_blocks[i];
const int num_residuals = residual_block->NumResiduals();
const int num_parameter_blocks = residual_block->NumParameterBlocks();
(*jacobian_layout)[i] = jacobian_pos;
+ // Cells from the F sub-matrix are to be stored sequentially with increasing
+ // column-block id. For each non-constant parameter block, a pair of indices
+ // (index in the list of active parameter blocks, index in the list of all
+ // parameter blocks) is computed, and the pairs are sorted by the
+ // corresponding column-block id.
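+ // For example (illustrative): three active parameter blocks with
+ // column-block indices {7, 2, 9} produce the pairs (0, j0), (1, j1),
+ // (2, j2), which sort to (1, j1), (0, j0), (2, j2); positions are then
+ // allocated in column order 2, 7, 9 and written to jacobian_pos[1],
+ // jacobian_pos[0] and jacobian_pos[2], in that order.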
+ active_parameter_blocks.clear();
+ active_parameter_blocks.reserve(num_parameter_blocks);
for (int j = 0; j < num_parameter_blocks; ++j) {
ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
- const int parameter_block_index = parameter_block->index();
if (parameter_block->IsConstant()) {
continue;
}
+ const int k = active_parameter_blocks.size();
+ active_parameter_blocks.emplace_back(k, j);
+ }
+ std::sort(active_parameter_blocks.begin(),
+ active_parameter_blocks.end(),
+ [&residual_block](const std::pair<int, int>& a,
+ const std::pair<int, int>& b) {
+ return residual_block->parameter_blocks()[a.second]->index() <
+ residual_block->parameter_blocks()[b.second]->index();
+ });
+ // Cell positions for each active parameter block are filled in the order of
+ // active parameter block indices sorted by column-block index. This
+ // guarantees that cells are laid out sequentially with increasing column
+ // block indices.
+ for (const auto& indices : active_parameter_blocks) {
+ const auto [k, j] = indices;
+ ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
+ const int parameter_block_index = parameter_block->index();
const int jacobian_block_size =
num_residuals * parameter_block->TangentSize();
if (parameter_block_index < num_eliminate_blocks) {
- *jacobian_pos = e_block_pos;
+ jacobian_pos[k] = e_block_pos;
e_block_pos += jacobian_block_size;
} else {
- *jacobian_pos = f_block_pos;
+ jacobian_pos[k] = f_block_pos;
f_block_pos += jacobian_block_size;
}
- jacobian_pos++;
}
+ jacobian_pos += active_parameter_blocks.size();
}
}
diff --git a/internal/ceres/block_jacobian_writer.h b/internal/ceres/block_jacobian_writer.h
index 61f69b3..30b5f02 100644
--- a/internal/ceres/block_jacobian_writer.h
+++ b/internal/ceres/block_jacobian_writer.h
@@ -53,6 +53,17 @@
// TODO(sameeragarwal): This class needs documentation.
class CERES_NO_EXPORT BlockJacobianWriter {
public:
+ // Pre-computes positions of cells in the block-sparse jacobian.
+ // Two possible memory layouts are implemented:
+ // - Non-partitioned case
+ // - Partitioned case (for Schur type linear solvers)
+ //
+ // In the non-partitioned case, cells are stored sequentially in the
+ // lexicographic order of (row block id, column block id).
+ //
+ // In the case of a partitioned matrix, cells of each sub-matrix (E and F)
+ // are stored sequentially in the lexicographic order of (row block id,
+ // column block id), and cells from the E sub-matrix precede cells from the
+ // F sub-matrix.
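+ //
+ // For example (illustrative): with column-block 0 assigned to E, the cells
+ // (0,0), (0,1), (1,0), (1,2) (as (row block id, column block id) pairs) are
+ // laid out as [E: (0,0), (1,0)] followed by [F: (0,1), (1,2)].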
BlockJacobianWriter(const Evaluator::Options& options, Program* program);
// JacobianWriter interface.
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc
index bb267da..6d4c6b0 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -41,8 +41,6 @@
: context_(context) {
block_structure_ = std::make_unique<CudaBlockSparseStructure>(
*bsm.block_structure(), context);
- // Only block-sparse matrices with sequential layout of cells are supported
- CHECK(block_structure_->sequential_layout());
crs_matrix_ = std::make_unique<CudaSparseMatrix>(
bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context);
FillCRSStructure(block_structure_->num_row_blocks(),
diff --git a/internal/ceres/cuda_block_structure.cc b/internal/ceres/cuda_block_structure.cc
index 5817cd7..3685775 100644
--- a/internal/ceres/cuda_block_structure.cc
+++ b/internal/ceres/cuda_block_structure.cc
@@ -43,10 +43,16 @@
return last.size + last.position;
}
} // namespace
-
CudaBlockSparseStructure::CudaBlockSparseStructure(
const CompressedRowBlockStructure& block_structure, ContextImpl* context)
+ : CudaBlockSparseStructure(block_structure, 0, context) {}
+
+CudaBlockSparseStructure::CudaBlockSparseStructure(
+ const CompressedRowBlockStructure& block_structure,
+ const int num_col_blocks_e,
+ ContextImpl* context)
: first_cell_in_row_block_(context),
+ value_offset_row_block_f_(context),
cells_(context),
row_blocks_(context),
col_blocks_(context) {
@@ -57,40 +63,137 @@
// Row block offset is an index of the first cell corresponding to row block
std::vector<int> first_cell_in_row_block;
+ // For each row-block, offset of the first value in this or the first
+ // subsequent non-empty row-block of the F sub-matrix
+ std::vector<int> value_offset_row_block_f;
// Flat array of all cells from all row-blocks
std::vector<Cell> cells;
- int f_values_offset = 0;
+ int f_values_offset = -1;
+ num_nonzeros_e_ = 0;
is_crs_compatible_ = true;
num_row_blocks_ = block_structure.rows.size();
num_col_blocks_ = col_blocks.size();
row_blocks.reserve(num_row_blocks_);
first_cell_in_row_block.reserve(num_row_blocks_ + 1);
+ value_offset_row_block_f.reserve(num_row_blocks_ + 1);
num_nonzeros_ = 0;
- sequential_layout_ = true;
- for (const auto& r : block_structure.rows) {
+ // Block-sparse matrices arising from the block jacobian writer are expected
+ // to have a sequential layout (for partitioned matrices, both the E and F
+ // sub-matrices are expected to have sequential layouts).
+ bool sequential_layout = true;
+ int row_block_id = 0;
+ num_row_blocks_e_ = 0;
+ for (; row_block_id < num_row_blocks_; ++row_block_id) {
+ const auto& r = block_structure.rows[row_block_id];
const int row_block_size = r.block.size;
- if (r.cells.size() > 1 && row_block_size > 1) {
+ const int num_cells = r.cells.size();
+
+ if (num_col_blocks_e == 0 || r.cells.size() == 0 ||
+ r.cells[0].block_id >= num_col_blocks_e) {
+ break;
+ }
+ num_row_blocks_e_ = row_block_id + 1;
+ // In the E sub-matrix there is exactly one cell per row-block. Since E
+ // cells are stored separately from F cells, CRS-compatibility of the F
+ // sub-matrix only breaks if there are more than 2 cells in a row-block
+ // (that is, more than 1 cell in the F sub-matrix)
+ if (num_cells > 2 && row_block_size > 1) {
is_crs_compatible_ = false;
}
row_blocks.emplace_back(r.block);
first_cell_in_row_block.push_back(cells.size());
+
+ for (int cell_id = 0; cell_id < num_cells; ++cell_id) {
+ const auto& c = r.cells[cell_id];
+ const int col_block_size = col_blocks[c.block_id].size;
+ const int cell_size = col_block_size * row_block_size;
+ cells.push_back(c);
+ if (cell_id == 0) {
+ DCHECK(c.position == num_nonzeros_e_);
+ num_nonzeros_e_ += cell_size;
+ } else {
+ if (f_values_offset == -1) {
+ num_nonzeros_ = c.position;
+ f_values_offset = c.position;
+ }
+ sequential_layout &= c.position == num_nonzeros_;
+ num_nonzeros_ += cell_size;
+ if (cell_id == 1) {
+ // Correct value_offset_row_block_f for empty row-blocks of F
+ // preceding this one
+ for (auto it = value_offset_row_block_f.rbegin();
+ it != value_offset_row_block_f.rend();
+ ++it) {
+ if (*it != -1) break;
+ *it = c.position;
+ }
+ value_offset_row_block_f.push_back(c.position);
+ }
+ }
+ }
+ if (num_cells == 1) {
+ value_offset_row_block_f.push_back(-1);
+ }
+ }
+ for (; row_block_id < num_row_blocks_; ++row_block_id) {
+ const auto& r = block_structure.rows[row_block_id];
+ const int row_block_size = r.block.size;
+ const int num_cells = r.cells.size();
+ // After num_row_blocks_e_ row-blocks there should be no cells in the E
+ // sub-matrix. Thus CRS-compatibility of the F sub-matrix breaks if there is
+ // more than one cell in the row-block
+ if (num_cells > 1 && row_block_size > 1) {
+ is_crs_compatible_ = false;
+ }
+ row_blocks.emplace_back(r.block);
+ first_cell_in_row_block.push_back(cells.size());
+
+ if (r.cells.empty()) {
+ value_offset_row_block_f.push_back(-1);
+ } else {
+ for (auto it = value_offset_row_block_f.rbegin();
+ it != value_offset_row_block_f.rend();
+ ++it) {
+ if (*it != -1) break;
+ *it = r.cells[0].position;
+ }
+ value_offset_row_block_f.push_back(r.cells[0].position);
+ }
for (const auto& c : r.cells) {
const int col_block_size = col_blocks[c.block_id].size;
const int cell_size = col_block_size * row_block_size;
cells.push_back(c);
- sequential_layout_ &= c.position == num_nonzeros_;
+ DCHECK(c.block_id >= num_col_blocks_e);
+ if (f_values_offset == -1) {
+ num_nonzeros_ = c.position;
+ f_values_offset = c.position;
+ }
+ sequential_layout &= c.position == num_nonzeros_;
num_nonzeros_ += cell_size;
}
}
+
+ if (f_values_offset == -1) {
+ f_values_offset = num_nonzeros_e_;
+ num_nonzeros_ = num_nonzeros_e_;
+ }
+ // Fill value offsets for the trailing row-blocks that have no F cells
+ for (auto it = value_offset_row_block_f.rbegin();
+ it != value_offset_row_block_f.rend();
+ ++it) {
+ if (*it != -1) break;
+ *it = num_nonzeros_;
+ }
+ value_offset_row_block_f.push_back(num_nonzeros_);
+ CHECK_EQ(num_nonzeros_e_, f_values_offset);
first_cell_in_row_block.push_back(cells.size());
num_cells_ = cells.size();
num_rows_ = Dimension(row_blocks);
num_cols_ = Dimension(col_blocks);
- is_crs_compatible_ &= sequential_layout_;
+ CHECK(sequential_layout);
if (VLOG_IS_ON(3)) {
const size_t first_cell_in_row_block_size =
@@ -122,6 +225,9 @@
cells_.CopyFromCpuVector(cells);
row_blocks_.CopyFromCpuVector(row_blocks);
col_blocks_.CopyFromCpuVector(col_blocks);
+ if (num_col_blocks_e || num_row_blocks_e_) {
+ value_offset_row_block_f_.CopyFromCpuVector(value_offset_row_block_f);
+ }
}
} // namespace ceres::internal
diff --git a/internal/ceres/cuda_block_structure.h b/internal/ceres/cuda_block_structure.h
index 28e36b2..6da6fdd 100644
--- a/internal/ceres/cuda_block_structure.h
+++ b/internal/ceres/cuda_block_structure.h
@@ -56,28 +56,40 @@
// array of cells on cpu and transfer it to the gpu.
CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure,
ContextImpl* context);
+ // In the case of partitioned matrices, the number of non-zeros in E and the
+ // layout of F are also computed
+ CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure,
+ const int num_col_blocks_e,
+ ContextImpl* context);
int num_rows() const { return num_rows_; }
int num_cols() const { return num_cols_; }
int num_cells() const { return num_cells_; }
int num_nonzeros() const { return num_nonzeros_; }
+ // When the partitioned-matrix constructor was used, returns the number of
+ // non-zeros in the E sub-matrix
int num_row_blocks() const { return num_row_blocks_; }
+ int num_row_blocks_e() const { return num_row_blocks_e_; }
int num_col_blocks() const { return num_col_blocks_; }
- // Returns true if values from block-sparse matrix can be copied to CRS matrix
- // as-is. This is possible if each row-block is stored in CRS order:
+ // Returns true if values from the block-sparse matrix (the F sub-matrix in
+ // the partitioned case) can be copied to the CRS matrix as-is. This is
+ // possible if each row-block is stored in CRS order:
// - Row-block consists of a single row
// - Row-block contains a single cell
bool IsCrsCompatible() const { return is_crs_compatible_; }
- // Returns true if block-sparse structure corresponds to block-sparse matrix
- // with sequential cell positions
- bool sequential_layout() const { return sequential_layout_; }
// Device pointer to array of num_row_blocks + 1 indices of the first cell of
// row block
const int* first_cell_in_row_block() const {
return first_cell_in_row_block_.data();
}
+ // Device pointer to an array of num_row_blocks + 1 offsets of the first
+ // value in this or the first subsequent non-empty row-block of the F
+ // sub-matrix
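+ // For example (illustrative): for three row-blocks where the first has its
+ // first F value at offset p0, the second has no F cells, and the third has
+ // its first F value at offset p2, this array is [p0, p2, p2, num_nonzeros]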
+ const int* value_offset_row_block_f() const {
+ return value_offset_row_block_f_.data();
+ }
// Device pointer to array of num_cells cells, sorted by row-block
const Cell* cells() const { return cells_.data(); }
// Device pointer to array of row blocks
@@ -90,11 +102,13 @@
int num_cols_;
int num_cells_;
int num_nonzeros_;
+ int num_nonzeros_e_;
int num_row_blocks_;
+ int num_row_blocks_e_;
int num_col_blocks_;
bool is_crs_compatible_;
- bool sequential_layout_;
CudaBuffer<int> first_cell_in_row_block_;
+ CudaBuffer<int> value_offset_row_block_f_;
CudaBuffer<Cell> cells_;
CudaBuffer<Block> row_blocks_;
CudaBuffer<Block> col_blocks_;
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
index a1ea6af..0a21c8d 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -93,13 +93,17 @@
// - row_block_ids: row_block_ids[i] will be set to index of row-block that
// contains i-th row.
// Computation is performed row-block-wise
+template <bool partitioned = false>
__global__ void RowBlockIdAndNNZ(
- int num_row_blocks,
+ const int num_row_blocks,
+ const int num_col_blocks_e,
+ const int num_row_blocks_e,
const int* __restrict__ first_cell_in_row_block,
const Cell* __restrict__ cells,
const Block* __restrict__ row_blocks,
const Block* __restrict__ col_blocks,
- int* __restrict__ rows,
+ int* __restrict__ rows_e,
+ int* __restrict__ rows_f,
int* __restrict__ row_block_ids) {
const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x;
if (row_block_id > num_row_blocks) {
@@ -108,20 +112,32 @@
}
if (row_block_id == num_row_blocks) {
// one extra thread sets the first element
- rows[0] = 0;
+ rows_f[0] = 0;
+ if constexpr (partitioned) {
+ rows_e[0] = 0;
+ }
return;
}
const auto& row_block = row_blocks[row_block_id];
- int row_nnz = 0;
- const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+ auto first_cell = cells + first_cell_in_row_block[row_block_id];
const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+ int row_nnz_e = 0;
+ if (partitioned && row_block_id < num_row_blocks_e) {
+ // First cell is a cell from E
+ row_nnz_e = col_blocks[first_cell->block_id].size;
+ ++first_cell;
+ }
+ int row_nnz_f = 0;
for (auto cell = first_cell; cell < last_cell; ++cell) {
- row_nnz += col_blocks[cell->block_id].size;
+ row_nnz_f += col_blocks[cell->block_id].size;
}
const int first_row = row_block.position;
const int last_row = first_row + row_block.size;
for (int i = first_row; i < last_row; ++i) {
- rows[i + 1] = row_nnz;
+ if constexpr (partitioned) {
+ rows_e[i + 1] = row_nnz_e;
+ }
+ rows_f[i + 1] = row_nnz_f;
row_block_ids[i] = row_block_id;
}
}
@@ -139,15 +155,19 @@
// Outputs:
// - cols: column-index array of CRS structure
// Computation is performed row-wise
-__global__ void ComputeColumnsAndPermutation(
- int num_rows,
- const int* __restrict__ first_cell_in_row_block,
- const Cell* __restrict__ cells,
- const Block* __restrict__ row_blocks,
- const Block* __restrict__ col_blocks,
- const int* __restrict__ row_block_ids,
- const int* __restrict__ rows,
- int* __restrict__ cols) {
+template <bool partitioned>
+__global__ void ComputeColumns(const int num_rows,
+ const int num_row_blocks_e,
+ const int num_col_blocks_e,
+ const int* __restrict__ first_cell_in_row_block,
+ const Cell* __restrict__ cells,
+ const Block* __restrict__ row_blocks,
+ const Block* __restrict__ col_blocks,
+ const int* __restrict__ row_block_ids,
+ const int* __restrict__ rows_e,
+ int* __restrict__ cols_e,
+ const int* __restrict__ rows_f,
+ int* __restrict__ cols_f) {
const int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= num_rows) {
// No synchronization is performed in this kernel, thus it is safe to return
@@ -155,17 +175,30 @@
}
const int row_block_id = row_block_ids[row];
// position in crs matrix
- int crs_position = rows[row];
- const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+ auto first_cell = cells + first_cell_in_row_block[row_block_id];
const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+ const int num_cols_e = col_blocks[num_col_blocks_e].position;
// For each cell of the row-block, only the current row is filled
+ if (partitioned && row_block_id < num_row_blocks_e) {
+ // The first cell is cell from E
+ const auto& col_block = col_blocks[first_cell->block_id];
+ const int col_block_size = col_block.size;
+ int column_idx = col_block.position;
+ int crs_position_e = rows_e[row];
+ // Column indices for each element of row_in_block row of current cell
+ for (int i = 0; i < col_block_size; ++i, ++crs_position_e) {
+ cols_e[crs_position_e] = column_idx++;
+ }
+ ++first_cell;
+ }
+ int crs_position_f = rows_f[row];
for (auto cell = first_cell; cell < last_cell; ++cell) {
const auto& col_block = col_blocks[cell->block_id];
const int col_block_size = col_block.size;
- int column_idx = col_block.position;
+ int column_idx = col_block.position - num_cols_e;
// Column indices for each element of row_in_block row of current cell
- for (int i = 0; i < col_block_size; ++i, ++crs_position) {
- cols[crs_position] = column_idx++;
+ for (int i = 0; i < col_block_size; ++i, ++crs_position_f) {
+ cols_f[crs_position_f] = column_idx++;
}
}
}
@@ -183,31 +216,93 @@
// row_block_ids array
int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
- RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
+ RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
num_row_blocks,
+ 0,
+ 0,
first_cell_in_row_block,
cells,
row_blocks,
col_blocks,
+ nullptr,
rows,
row_block_ids);
// Finalize row-index array of CRS structure by computing prefix sum
thrust::inclusive_scan(
ThrustCudaStreamExecutionPolicy(stream), rows, rows + num_rows + 1, rows);
- // Fill cols array of CRS structure and permutation from block-sparse to CRS
+ // Fill cols array of CRS structure
const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
- ComputeColumnsAndPermutation<<<num_blocks_rowwise,
- kCudaBlockSize,
- 0,
- stream>>>(num_rows,
- first_cell_in_row_block,
- cells,
- row_blocks,
- col_blocks,
- row_block_ids,
- rows,
- cols);
+ ComputeColumns<false><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>(
+ num_rows,
+ 0,
+ 0,
+ first_cell_in_row_block,
+ cells,
+ row_blocks,
+ col_blocks,
+ row_block_ids,
+ nullptr,
+ nullptr,
+ rows,
+ cols);
+ FreeTemporaryMemory(row_block_ids, stream);
+}
+
+void FillCRSStructurePartitioned(const int num_row_blocks,
+ const int num_rows,
+ const int num_row_blocks_e,
+ const int num_col_blocks_e,
+ const int num_nonzeros_e,
+ const int* first_cell_in_row_block,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ int* rows_e,
+ int* cols_e,
+ int* rows_f,
+ int* cols_f,
+ cudaStream_t stream) {
+ // Set number of non-zeros per row in rows array and row to row-block map in
+ // row_block_ids array
+ int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
+ const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
+ RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
+ num_row_blocks,
+ num_col_blocks_e,
+ num_row_blocks_e,
+ first_cell_in_row_block,
+ cells,
+ row_blocks,
+ col_blocks,
+ rows_e,
+ rows_f,
+ row_block_ids);
+ // Finalize row-index arrays of CRS structure by computing prefix sums
+ thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream),
+ rows_e,
+ rows_e + num_rows + 1,
+ rows_e);
+ thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream),
+ rows_f,
+ rows_f + num_rows + 1,
+ rows_f);
+
+ // Fill cols array of CRS structure
+ const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
+ ComputeColumns<true><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>(
+ num_rows,
+ num_row_blocks_e,
+ num_col_blocks_e,
+ first_cell_in_row_block,
+ cells,
+ row_blocks,
+ col_blocks,
+ row_block_ids,
+ rows_e,
+ cols_e,
+ rows_f,
+ cols_f);
FreeTemporaryMemory(row_block_ids, stream);
}
@@ -231,16 +326,17 @@
}
// Element-wise reordering of block-sparse values
-// - first_cell_pos - position of the first cell of corresponding sub-matrix;
-// only used if use_first_cell is true; otherwise the first cell in row-block is
-// used
+// - first_cell_in_row_block - position of the first cell of row-block
// - block_sparse_values - segment of block-sparse values starting from
// block_sparse_offset, containing num_values
+template <bool partitioned>
__global__ void PermuteToCrsKernel(
const int block_sparse_offset,
const int num_values,
const int num_row_blocks,
+ const int num_row_blocks_e,
const int* __restrict__ first_cell_in_row_block,
+ const int* __restrict__ value_offset_row_block_f,
const Cell* __restrict__ cells,
const Block* __restrict__ row_blocks,
const Block* __restrict__ col_blocks,
@@ -254,21 +350,32 @@
const int block_sparse_value_id = value_id + block_sparse_offset;
// Find the corresponding row-block with a binary search
const int row_block_id =
- PartitionPoint(
- first_cell_in_row_block,
- 0,
- num_row_blocks,
- [cells,
- block_sparse_value_id] __device__(const int row_block_offset) {
- return cells[row_block_offset].position <= block_sparse_value_id;
- }) -
+ (partitioned
+ ? PartitionPoint(value_offset_row_block_f,
+ 0,
+ num_row_blocks,
+ [block_sparse_value_id] __device__(
+ const int row_block_offset) {
+ return row_block_offset <= block_sparse_value_id;
+ })
+ : PartitionPoint(first_cell_in_row_block,
+ 0,
+ num_row_blocks,
+ [cells, block_sparse_value_id] __device__(
+ const int row_block_offset) {
+ return cells[row_block_offset].position <=
+ block_sparse_value_id;
+ })) -
1;
// Find cell and calculate offset within the row with a linear scan
const auto& row_block = row_blocks[row_block_id];
- const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+ auto first_cell = cells + first_cell_in_row_block[row_block_id];
const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
const int row_block_size = row_block.size;
int num_cols_before = 0;
+ if (partitioned && row_block_id < num_row_blocks_e) {
+ ++first_cell;
+ }
for (const Cell* cell = first_cell; cell < last_cell; ++cell) {
const auto& col_block = col_blocks[cell->block_id];
const int col_block_size = col_block.size;
@@ -298,11 +405,43 @@
double* crs_values,
cudaStream_t stream) {
const int num_blocks_valuewise = NumBlocksInGrid(num_values);
- PermuteToCrsKernel<<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
+ PermuteToCrsKernel<false>
+ <<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
+ block_sparse_offset,
+ num_values,
+ num_row_blocks,
+ 0,
+ first_cell_in_row_block,
+ nullptr,
+ cells,
+ row_blocks,
+ col_blocks,
+ crs_rows,
+ block_sparse_values,
+ crs_values);
+}
+
+void PermuteToCRSPartitionedF(const int block_sparse_offset,
+ const int num_values,
+ const int num_row_blocks,
+ const int num_row_blocks_e,
+ const int* first_cell_in_row_block,
+ const int* value_offset_row_block_f,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ const int* crs_rows,
+ const double* block_sparse_values,
+ double* crs_values,
+ cudaStream_t stream) {
+ const int num_blocks_valuewise = NumBlocksInGrid(num_values);
+ PermuteToCrsKernel<true><<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
block_sparse_offset,
num_values,
num_row_blocks,
+ num_row_blocks_e,
first_cell_in_row_block,
+ value_offset_row_block_f,
cells,
row_blocks,
col_blocks,
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h
index b6da69b..90b4882 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.h
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -54,6 +54,23 @@
int* cols,
cudaStream_t stream);
+// Compute structure of partitioned CRS matrix using block-sparse structure.
+// Arrays corresponding to CRS matrices are to be allocated by the caller
+void FillCRSStructurePartitioned(const int num_row_blocks,
+ const int num_rows,
+ const int num_row_blocks_e,
+ const int num_col_blocks_e,
+ const int num_nonzeros_e,
+ const int* first_cell_in_row_block,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ int* rows_e,
+ int* cols_e,
+ int* rows_f,
+ int* cols_f,
+ cudaStream_t stream);
+
// Permute segment of values from block-sparse matrix with sequential layout to
// CRS order. Segment starts at block_sparse_offset and has length of num_values
void PermuteToCRS(const int block_sparse_offset,
@@ -68,6 +85,24 @@
double* crs_values,
cudaStream_t stream);
+// Permute a segment of values from the F sub-matrix of a partitioned
+// block-sparse matrix with sequential layout into CRS order. The segment
+// starts at block_sparse_offset (which includes the offset induced by the
+// values of the E sub-matrix) and has a length of num_values
+void PermuteToCRSPartitionedF(const int block_sparse_offset,
+ const int num_values,
+ const int num_row_blocks,
+ const int num_row_blocks_e,
+ const int* first_cell_in_row_block,
+ const int* value_offset_row_block_f,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ const int* crs_rows,
+ const double* block_sparse_values,
+ double* crs_values,
+ cudaStream_t stream);
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
new file mode 100644
index 0000000..703be84
--- /dev/null
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
@@ -0,0 +1,145 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "ceres/cuda_block_structure.h"
+#include "ceres/cuda_kernels_bsm_to_crs.h"
+
+namespace ceres::internal {
+
+CudaPartitionedBlockSparseCRSView::CudaPartitionedBlockSparseCRSView(
+ const BlockSparseMatrix& bsm,
+ const int num_col_blocks_e,
+ ContextImpl* context)
+ : context_(context) {
+ const auto& bs = *bsm.block_structure();
+ block_structure_ =
+ std::make_unique<CudaBlockSparseStructure>(bs, num_col_blocks_e, context);
+ // Determine number of non-zeros in left submatrix
+ // Row-blocks are at least 1 row high, thus we can use a temporary array of
+ // num_rows for ComputeNonZerosInColumnBlockSubMatrix; and later reuse it for
+ // FillCRSStructurePartitioned
+ const int num_rows = bsm.num_rows();
+ const int num_nonzeros_e = block_structure_->num_nonzeros_e();
+ const int num_nonzeros_f = bsm.num_nonzeros() - num_nonzeros_e;
+
+ const int num_cols_e = num_col_blocks_e < bs.cols.size()
+ ? bs.cols[num_col_blocks_e].position
+ : bsm.num_cols();
+ const int num_cols_f = bsm.num_cols() - num_cols_e;
+ matrix_e_ = std::make_unique<CudaSparseMatrix>(
+ num_rows, num_cols_e, num_nonzeros_e, context);
+ matrix_f_ = std::make_unique<CudaSparseMatrix>(
+ num_rows, num_cols_f, num_nonzeros_f, context);
+
+ num_row_blocks_e_ = block_structure_->num_row_blocks_e();
+ FillCRSStructurePartitioned(block_structure_->num_row_blocks(),
+ num_rows,
+ num_row_blocks_e_,
+ num_col_blocks_e,
+ num_nonzeros_e,
+ block_structure_->first_cell_in_row_block(),
+ block_structure_->cells(),
+ block_structure_->row_blocks(),
+ block_structure_->col_blocks(),
+ matrix_e_->mutable_rows(),
+ matrix_e_->mutable_cols(),
+ matrix_f_->mutable_rows(),
+ matrix_f_->mutable_cols(),
+ context->DefaultStream());
+ f_is_crs_compatible_ = block_structure_->IsCrsCompatible();
+ if (f_is_crs_compatible_) {
+ block_structure_ = nullptr;
+ } else {
+ streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
+ context, kMaxTemporaryArraySize);
+ }
+ CHECK_EQ(bsm.num_nonzeros(),
+ matrix_e_->num_nonzeros() + matrix_f_->num_nonzeros());
+
+ UpdateValues(bsm);
+}
+
+void CudaPartitionedBlockSparseCRSView::UpdateValues(
+ const BlockSparseMatrix& bsm) {
+ if (f_is_crs_compatible_) {
+ CHECK_EQ(cudaSuccess,
+ cudaMemcpyAsync(matrix_e_->mutable_values(),
+ bsm.values(),
+ matrix_e_->num_nonzeros() * sizeof(double),
+ cudaMemcpyHostToDevice,
+ context_->DefaultStream()));
+
+ CHECK_EQ(cudaSuccess,
+ cudaMemcpyAsync(matrix_f_->mutable_values(),
+ bsm.values() + matrix_e_->num_nonzeros(),
+ matrix_f_->num_nonzeros() * sizeof(double),
+ cudaMemcpyHostToDevice,
+ context_->DefaultStream()));
+ return;
+ }
+ streamed_buffer_->CopyToGpu(
+ bsm.values(),
+ bsm.num_nonzeros(),
+ [block_structure = block_structure_.get(),
+ num_nonzeros_e = matrix_e_->num_nonzeros(),
+ num_row_blocks_e = num_row_blocks_e_,
+ values_f = matrix_f_->mutable_values(),
+ rows_f = matrix_f_->rows()](
+ const double* values, int num_values, int offset, auto stream) {
+ PermuteToCRSPartitionedF(num_nonzeros_e + offset,
+ num_values,
+ block_structure->num_row_blocks(),
+ num_row_blocks_e,
+ block_structure->first_cell_in_row_block(),
+ block_structure->value_offset_row_block_f(),
+ block_structure->cells(),
+ block_structure->row_blocks(),
+ block_structure->col_blocks(),
+ rows_f,
+ values,
+ values_f,
+ stream);
+ });
+ CHECK_EQ(cudaSuccess,
+ cudaMemcpyAsync(matrix_e_->mutable_values(),
+ bsm.values(),
+ matrix_e_->num_nonzeros() * sizeof(double),
+ cudaMemcpyHostToDevice,
+ context_->DefaultStream()));
+}
+
+} // namespace ceres::internal
+#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.h b/internal/ceres/cuda_partitioned_block_sparse_crs_view.h
new file mode 100644
index 0000000..3072dea
--- /dev/null
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.h
@@ -0,0 +1,111 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+//
+
+#ifndef CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
+#define CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <memory>
+
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_block_structure.h"
+#include "ceres/cuda_buffer.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_streamed_buffer.h"
+
+namespace ceres::internal {
+// We use the cuSPARSE library for SpMV operations. However, it supports
+// neither a block-sparse format with varying block sizes nor
+// submatrix-vector products. Thus, we perform the following operations in
+// order to compute products of partitioned block-sparse matrices and dense
+// vectors on gpu:
+// - Once per block-sparse structure update:
+//   - Compute CRS structures of the left and right sub-matrices from the
+//     block-sparse structure
+//   - Check if values of the F sub-matrix can be copied without permutation
+// - Once per block-sparse values update:
+//   - Copy values of the E sub-matrix
+//   - Permute or copy values of the F sub-matrix
+//
+// It is assumed that cells of the block-sparse matrix are laid out
+// sequentially in both sub-matrices, that each of the first num_row_blocks_e_
+// row-blocks contains exactly one cell of the E sub-matrix, and that there
+// are no cells of the E sub-matrix in the row-blocks below num_row_blocks_e_.
+//
+// This class avoids storing both CRS and block-sparse values in GPU memory.
+// Instead, block-sparse values are transferred to GPU memory as a disjoint set
+// of small contiguous segments and are simultaneously permuted into the
+// correct order using the block structure.
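+//
+// A minimal usage sketch (illustrative; bsm, num_col_blocks_e and context are
+// assumed to exist):
+//
+//   CudaPartitionedBlockSparseCRSView view(bsm, num_col_blocks_e, context);
+//   // ... after every in-place update of bsm's values:
+//   view.UpdateValues(bsm);
+//   CudaSparseMatrix* e = view.mutable_matrix_e();
+//   CudaSparseMatrix* f = view.mutable_matrix_f();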
+class CERES_NO_EXPORT CudaPartitionedBlockSparseCRSView {
+ public:
+ // Initializes the internal CRS sub-matrices and the block-sparse structure
+ // on the GPU side. The following objects are stored in GPU memory for the
+ // whole lifetime of the object:
+ // - matrix_e_: left CRS sub-matrix
+ // - matrix_f_: right CRS sub-matrix
+ // - block_structure_: copy of the block-sparse structure on the GPU
+ // - streamed_buffer_: helper for value updates
+ CudaPartitionedBlockSparseCRSView(const BlockSparseMatrix& bsm,
+ const int num_col_blocks_e,
+ ContextImpl* context);
+
+ // Updates values of the CRS sub-matrices using values of the block-sparse
+ // matrix. Assumes that bsm has the same block-sparse structure as the matrix
+ // that was used for construction.
+ void UpdateValues(const BlockSparseMatrix& bsm);
+
+ const CudaSparseMatrix* matrix_e() const { return matrix_e_.get(); }
+ const CudaSparseMatrix* matrix_f() const { return matrix_f_.get(); }
+ CudaSparseMatrix* mutable_matrix_e() { return matrix_e_.get(); }
+ CudaSparseMatrix* mutable_matrix_f() { return matrix_f_.get(); }
+
+ private:
+ // Value permutation kernel performs a single element-wise operation per
+ // thread, thus performing permutation in blocks of 8 megabytes of
+ // block-sparse values seems reasonable
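+ // (1024 * 1024 values of double type at 8 bytes each add up to 8 megabytes)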
+ static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
+ std::unique_ptr<CudaSparseMatrix> matrix_e_;
+ std::unique_ptr<CudaSparseMatrix> matrix_f_;
+ std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_;
+ std::unique_ptr<CudaBlockSparseStructure> block_structure_;
+ bool f_is_crs_compatible_;
+ int num_row_blocks_e_;
+ ContextImpl* context_;
+};
+
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
+#endif // CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
new file mode 100644
index 0000000..5090b6a
--- /dev/null
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
@@ -0,0 +1,282 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
+
+#include <algorithm>
+#include <memory>
+#include <random>
+#include <string>
+#include <thread>
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#ifndef CERES_NO_CUDA
+
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/context_impl.h"
+#include "ceres/cuda_vector.h"
+#include "ceres/internal/eigen.h"
+
+namespace ceres::internal {
+
+namespace {
+struct RandomPartitionedMatrixOptions {
+ int num_row_blocks_e;
+ int num_row_blocks_f;
+ int num_col_blocks_e;
+ int num_col_blocks_f;
+ int min_row_block_size;
+ int max_row_block_size;
+ int min_col_block_size;
+ int max_col_block_size;
+ double empty_f_probability;
+ double cell_probability_f;
+ int max_cells_f;
+};
+
+std::unique_ptr<BlockSparseMatrix> CreateRandomPartitionedMatrix(
+ const RandomPartitionedMatrixOptions& options, std::mt19937& rng) {
+ const int num_row_blocks =
+ std::max(options.num_row_blocks_e, options.num_row_blocks_f);
+ const int num_col_blocks =
+ options.num_col_blocks_e + options.num_col_blocks_f;
+
+ CompressedRowBlockStructure* block_structure =
+ new CompressedRowBlockStructure;
+ block_structure->cols.reserve(num_col_blocks);
+ block_structure->rows.reserve(num_row_blocks);
+
+ // Create column blocks
+ std::uniform_int_distribution<int> col_size(options.min_col_block_size,
+ options.max_col_block_size);
+ int num_cols = 0;
+ for (int i = 0; i < num_col_blocks; ++i) {
+ const int size = col_size(rng);
+ block_structure->cols.emplace_back(size, num_cols);
+ num_cols += size;
+ }
+
+ // Prepare column-block indices of E cells
+ std::vector<int> e_col_block_idx;
+ e_col_block_idx.reserve(options.num_row_blocks_e);
+ std::uniform_int_distribution<int> col_e(0, options.num_col_blocks_e - 1);
+ for (int i = 0; i < options.num_row_blocks_e; ++i) {
+ e_col_block_idx.emplace_back(col_e(rng));
+ }
+ std::sort(e_col_block_idx.begin(), e_col_block_idx.end());
+
+ // Prepare cell structure
+ std::uniform_int_distribution<int> row_size(options.min_row_block_size,
+ options.max_row_block_size);
+ std::uniform_real_distribution<double> uniform;
+ int num_rows = 0;
+ for (int i = 0; i < num_row_blocks; ++i) {
+ const int size = row_size(rng);
+ block_structure->rows.emplace_back();
+ auto& row = block_structure->rows.back();
+ row.block.size = size;
+ row.block.position = num_rows;
+ num_rows += size;
+ if (i < options.num_row_blocks_e) {
+ row.cells.emplace_back(e_col_block_idx[i], -1);
+ if (uniform(rng) < options.empty_f_probability) {
+ continue;
+ }
+ }
+ if (i >= options.num_row_blocks_f) continue;
+ const int cells_before = row.cells.size();
+ for (int j = options.num_col_blocks_e; j < num_col_blocks; ++j) {
+ if (uniform(rng) > options.cell_probability_f) {
+ continue;
+ }
+ row.cells.emplace_back(j, -1);
+ }
+ if (row.cells.size() > cells_before + options.max_cells_f) {
+ std::shuffle(row.cells.begin() + cells_before, row.cells.end(), rng);
+ row.cells.resize(cells_before + options.max_cells_f);
+ std::sort(
+ row.cells.begin(), row.cells.end(), [](const auto& a, const auto& b) {
+ return a.block_id < b.block_id;
+ });
+ }
+ }
+
+ // Fill positions in E sub-matrix
+ int num_nonzeros = 0;
+ for (int i = 0; i < options.num_row_blocks_e; ++i) {
+ CHECK_GE(block_structure->rows[i].cells.size(), 1);
+ block_structure->rows[i].cells[0].position = num_nonzeros;
+ const int col_block_size =
+ block_structure->cols[block_structure->rows[i].cells[0].block_id].size;
+ const int row_block_size = block_structure->rows[i].block.size;
+ num_nonzeros += row_block_size * col_block_size;
+ CHECK_GE(num_nonzeros, 0);
+ }
+ // Fill positions in F sub-matrix
+ for (int i = 0; i < options.num_row_blocks_f; ++i) {
+ const int row_block_size = block_structure->rows[i].block.size;
+ for (auto& cell : block_structure->rows[i].cells) {
+ if (cell.position >= 0) continue;
+ cell.position = num_nonzeros;
+ const int col_block_size = block_structure->cols[cell.block_id].size;
+ num_nonzeros += row_block_size * col_block_size;
+ CHECK_GE(num_nonzeros, 0);
+ }
+ }
+ // Populate values
+ auto bsm = std::make_unique<BlockSparseMatrix>(block_structure, true);
+ for (int i = 0; i < num_nonzeros; ++i) {
+ bsm->mutable_values()[i] = i + 1;
+ }
+ return bsm;
+}
+} // namespace
+
+class CudaPartitionedBlockSparseCRSViewTest : public ::testing::Test {
+ static constexpr int kNumColBlocksE = 456;
+
+ protected:
+ void SetUp() final {
+ std::string message;
+ CHECK(context_.InitCuda(&message))
+ << "InitCuda() failed because: " << message;
+
+ RandomPartitionedMatrixOptions options;
+ options.num_row_blocks_f = 123;
+ options.num_row_blocks_e = 456;
+ options.num_col_blocks_f = 123;
+ options.num_col_blocks_e = kNumColBlocksE;
+ options.min_row_block_size = 1;
+ options.max_row_block_size = 4;
+ options.min_col_block_size = 1;
+ options.max_col_block_size = 4;
+ options.empty_f_probability = .1;
+ options.cell_probability_f = .2;
+ options.max_cells_f = options.num_col_blocks_f;
+
+ std::mt19937 rng;
+ short_f_ = CreateRandomPartitionedMatrix(options, rng);
+
+ options.num_row_blocks_e = 123;
+ options.num_row_blocks_f = 456;
+ short_e_ = CreateRandomPartitionedMatrix(options, rng);
+
+ options.max_cells_f = 1;
+ options.num_row_blocks_e = options.num_row_blocks_f;
+ f_crs_compatible_ = CreateRandomPartitionedMatrix(options, rng);
+ }
+
+ void TestMatrix(const BlockSparseMatrix& A_) {
+ CudaPartitionedBlockSparseCRSView view(A_, kNumColBlocksE, &context_);
+
+ const int num_rows = A_.num_rows();
+ const int num_cols = A_.num_cols();
+
+ const auto& bs = *A_.block_structure();
+ const int num_cols_e = bs.cols[kNumColBlocksE].position;
+ const int num_cols_f = num_cols - num_cols_e;
+
+ // TODO: we definitely would like to use matrix() here, but
+ // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because
+ // it might allocate additional storage at the request of cuSPARSE
+ auto matrix_e = view.mutable_matrix_e();
+ auto matrix_f = view.mutable_matrix_f();
+ ASSERT_EQ(matrix_e->num_cols(), num_cols_e);
+ ASSERT_EQ(matrix_e->num_rows(), num_rows);
+ ASSERT_EQ(matrix_f->num_cols(), num_cols_f);
+ ASSERT_EQ(matrix_f->num_rows(), num_rows);
+
+ Vector x(num_cols);
+ Vector x_left(num_cols_e);
+ Vector x_right(num_cols_f);
+ Vector y(num_rows);
+ CudaVector x_left_cuda(&context_, num_cols_e);
+ CudaVector x_right_cuda(&context_, num_cols_f);
+ CudaVector y_cuda(&context_, num_rows);
+ Vector y_cuda_host(num_rows);
+
+ for (int i = 0; i < num_cols_e; ++i) {
+ x.setZero();
+ x_left.setZero();
+ y.setZero();
+ y_cuda.SetZero();
+ x[i] = 1.;
+ x_left[i] = 1.;
+ x_left_cuda.CopyFromCpu(x_left);
+ A_.RightMultiplyAndAccumulate(
+ x.data(), y.data(), &context_, std::thread::hardware_concurrency());
+ matrix_e->RightMultiplyAndAccumulate(x_left_cuda, &y_cuda);
+ y_cuda.CopyTo(&y_cuda_host);
+ // There will be up to 1 non-zero product per row, thus we expect an exact
+ // match on 32-bit integer indices
+ EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
+ }
+ for (int i = num_cols_e; i < num_cols; ++i) {
+ x.setZero();
+ x_right.setZero();
+ y.setZero();
+ y_cuda.SetZero();
+ x[i] = 1.;
+ x_right[i - num_cols_e] = 1.;
+ x_right_cuda.CopyFromCpu(x_right);
+ A_.RightMultiplyAndAccumulate(
+ x.data(), y.data(), &context_, std::thread::hardware_concurrency());
+ matrix_f->RightMultiplyAndAccumulate(x_right_cuda, &y_cuda);
+ y_cuda.CopyTo(&y_cuda_host);
+ // There will be up to 1 non-zero product per row, thus we expect an exact
+ // match on 32-bit integer indices
+ EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
+ }
+ }
+
+ // E sub-matrix might have fewer row-blocks with cells than F sub-matrix.
+ // This test matrix checks that this case is handled properly
+ std::unique_ptr<BlockSparseMatrix> short_e_;
+ // In the case of a non-CRS-compatible F matrix, permuting values from
+ // block-sparse order to CRS order involves a binary search over the
+ // row-blocks of F. Having lots of row-blocks with no F cells is an edge case
+ // for this algorithm
+ std::unique_ptr<BlockSparseMatrix> short_f_;
+ // With the F matrix being CRS-compatible, updating the values of the
+ // partitioned matrix view reduces to two host->device memcopies and uses a
+ // separate code path
+ std::unique_ptr<BlockSparseMatrix> f_crs_compatible_;
+
+ ContextImpl context_;
+};
+
+TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortE) {
+ TestMatrix(*short_e_);
+}
+
+TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortF) {
+ TestMatrix(*short_f_);
+}
+
+TEST_F(CudaPartitionedBlockSparseCRSViewTest,
+ CreateUpdateValuesCrsCompatibleF) {
+ TestMatrix(*f_crs_compatible_);
+}
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index c23c881..28a8aff 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -37,6 +37,7 @@
#include "ceres/block_sparse_matrix.h"
#include "ceres/bundle_adjustment_test_util.h"
#include "ceres/cuda_block_sparse_crs_view.h"
+#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
@@ -476,6 +477,110 @@
}
#ifndef CERES_NO_CUDA
+static void PMVRightMultiplyAndAccumulateFCuda(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ LinearSolver::Options options;
+ options.elimination_groups.push_back(data->bal_problem->num_points());
+ options.context = context;
+ options.num_threads = 1;
+ auto jacobian = data->PartitionedMatrixViewJacobian(options);
+ auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
+ CudaPartitionedBlockSparseCRSView view(
+ *underlying_matrix, jacobian->num_col_blocks_e(), context);
+
+ Vector x = Vector::Random(jacobian->num_cols_f());
+ CudaVector cuda_x(context, x.size());
+ CudaVector cuda_y(context, jacobian->num_rows());
+
+ cuda_x.CopyFromCpu(x);
+ cuda_y.SetZero();
+
+ auto matrix = view.mutable_matrix_f();
+ for (auto _ : state) {
+ matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
+ }
+ CHECK_GT(cuda_y.Norm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateFCuda(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ LinearSolver::Options options;
+ options.elimination_groups.push_back(data->bal_problem->num_points());
+ options.context = context;
+ options.num_threads = 1;
+ auto jacobian = data->PartitionedMatrixViewJacobian(options);
+ auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
+ CudaPartitionedBlockSparseCRSView view(
+ *underlying_matrix, jacobian->num_col_blocks_e(), context);
+
+ Vector x = Vector::Random(jacobian->num_rows());
+ CudaVector cuda_x(context, x.size());
+ CudaVector cuda_y(context, jacobian->num_cols_f());
+
+ cuda_x.CopyFromCpu(x);
+ cuda_y.SetZero();
+
+ auto matrix = view.mutable_matrix_f();
+ for (auto _ : state) {
+ matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
+ }
+ CHECK_GT(cuda_y.Norm(), 0.);
+}
+
+static void PMVRightMultiplyAndAccumulateECuda(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ LinearSolver::Options options;
+ options.elimination_groups.push_back(data->bal_problem->num_points());
+ options.context = context;
+ options.num_threads = 1;
+ auto jacobian = data->PartitionedMatrixViewJacobian(options);
+ auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
+ CudaPartitionedBlockSparseCRSView view(
+ *underlying_matrix, jacobian->num_col_blocks_e(), context);
+
+ Vector x = Vector::Random(jacobian->num_cols_e());
+ CudaVector cuda_x(context, x.size());
+ CudaVector cuda_y(context, jacobian->num_rows());
+
+ cuda_x.CopyFromCpu(x);
+ cuda_y.SetZero();
+
+ auto matrix = view.mutable_matrix_e();
+ for (auto _ : state) {
+ matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
+ }
+ CHECK_GT(cuda_y.Norm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateECuda(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ LinearSolver::Options options;
+ options.elimination_groups.push_back(data->bal_problem->num_points());
+ options.context = context;
+ options.num_threads = 1;
+ auto jacobian = data->PartitionedMatrixViewJacobian(options);
+ auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
+ CudaPartitionedBlockSparseCRSView view(
+ *underlying_matrix, jacobian->num_col_blocks_e(), context);
+
+ Vector x = Vector::Random(jacobian->num_rows());
+ CudaVector cuda_x(context, x.size());
+ CudaVector cuda_y(context, jacobian->num_cols_e());
+
+ cuda_x.CopyFromCpu(x);
+ cuda_y.SetZero();
+
+ auto matrix = view.mutable_matrix_e();
+ for (auto _ : state) {
+ matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
+ }
+ CHECK_GT(cuda_y.Norm(), 0.);
+}
+
// We want CudaBlockSparseCRSView to be not slower than explicit conversion to
// CRS on CPU
static void JacobianToCRSView(benchmark::State& state,
@@ -734,6 +839,16 @@
->Arg(8)
->Arg(16);
+#ifndef CERES_NO_CUDA
+ const std::string name_right_product_partitioned_f_cuda =
+ "PMVRightMultiplyAndAccumulateFCuda<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_right_product_partitioned_f_cuda.c_str(),
+ ceres::internal::PMVRightMultiplyAndAccumulateFCuda,
+ data,
+ &context);
+#endif
+
const std::string name_right_product_partitioned_e =
"PMVRightMultiplyAndAccumulateE<" + path + ">";
::benchmark::RegisterBenchmark(
@@ -747,6 +862,16 @@
->Arg(8)
->Arg(16);
+#ifndef CERES_NO_CUDA
+ const std::string name_right_product_partitioned_e_cuda =
+ "PMVRightMultiplyAndAccumulateECuda<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_right_product_partitioned_e_cuda.c_str(),
+ ceres::internal::PMVRightMultiplyAndAccumulateECuda,
+ data,
+ &context);
+#endif
+
const std::string name_update_block_diagonal_ftf =
"PMVUpdateBlockDiagonalFtF<" + path + ">";
::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(),
@@ -831,6 +956,16 @@
->Arg(8)
->Arg(16);
+#ifndef CERES_NO_CUDA
+ const std::string name_left_product_partitioned_f_cuda =
+ "PMVLeftMultiplyAndAccumulateFCuda<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_left_product_partitioned_f_cuda.c_str(),
+ ceres::internal::PMVLeftMultiplyAndAccumulateFCuda,
+ data,
+ &context);
+#endif
+
const std::string name_left_product_partitioned_e =
"PMVLeftMultiplyAndAccumulateE<" + path + ">";
::benchmark::RegisterBenchmark(
@@ -845,6 +980,16 @@
->Arg(16);
#ifndef CERES_NO_CUDA
+ const std::string name_left_product_partitioned_e_cuda =
+ "PMVLeftMultiplyAndAccumulateECuda<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_left_product_partitioned_e_cuda.c_str(),
+ ceres::internal::PMVLeftMultiplyAndAccumulateECuda,
+ data,
+ &context);
+#endif
+
+#ifndef CERES_NO_CUDA
const std::string name_left_product_cuda =
"JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
::benchmark::RegisterBenchmark(