Block-sparse to CRS conversion using block-structure
Instead of pre-computing a permutation from block-sparse to CRS order, the
index of a value in the CRS matrix is computed on the fly from the
block-sparse structure while values are being updated.
When values can be updated via a simple host-to-device copy, the block-sparse
structure on the GPU is discarded after the CRS structure has been computed.
Computing the index is significantly slower than using a pre-computed
permutation, but the cost is still hidden by the host-to-device transfer.
On problems from the BAL dataset this reduces the extra GPU memory
consumption from 33% (permutation stored as 32-bit indices) to ~10%
(storing the block-sparse structure).
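For illustration, a simplified host-side sketch of this index computation (a
hypothetical helper, not part of the change; it mirrors the per-value binary
search plus linear cell scan performed by the PermuteToCrsKernel added below,
with Block/Cell reduced to the fields used here):

#include <algorithm>
#include <vector>

struct Block { int size, position; };
struct Cell { int block_id, position; };

// Returns the CRS index of the block-sparse value with index
// block_sparse_value_id, assuming a sequential cell layout.
int BlockSparseToCrsIndex(int block_sparse_value_id,
                          const std::vector<int>& first_cell_in_row_block,
                          const std::vector<Cell>& cells,
                          const std::vector<Block>& row_blocks,
                          const std::vector<Block>& col_blocks,
                          const std::vector<int>& crs_rows) {
  const int num_row_blocks = static_cast<int>(row_blocks.size());
  // Binary search for the row-block containing the value: the last row-block
  // whose first cell starts at or before block_sparse_value_id.
  const auto it = std::upper_bound(
      first_cell_in_row_block.begin(),
      first_cell_in_row_block.begin() + num_row_blocks,
      block_sparse_value_id,
      [&cells](int value_id, int first_cell) {
        return value_id < cells[first_cell].position;
      });
  const int row_block_id =
      static_cast<int>(it - first_cell_in_row_block.begin()) - 1;
  // Linear scan over the cells of the row-block to find the cell containing
  // the value and the offset of the value inside it.
  const Block& row_block = row_blocks[row_block_id];
  int num_cols_before = 0;
  for (int c = first_cell_in_row_block[row_block_id];
       c < first_cell_in_row_block[row_block_id + 1]; ++c) {
    const Cell& cell = cells[c];
    const int col_block_size = col_blocks[cell.block_id].size;
    const int cell_size = row_block.size * col_block_size;
    if (cell.position + cell_size > block_sparse_value_id) {
      const int pos_in_cell = block_sparse_value_id - cell.position;
      const int row = row_block.position + pos_in_cell / col_block_size;
      const int col_in_cell = pos_in_cell % col_block_size;
      return crs_rows[row] + num_cols_before + col_in_cell;
    }
    num_cols_before += col_block_size;
  }
  return -1;  // Unreachable for a well-formed sequential layout.
}

The GPU kernel does the same work once per transferred value, which is why it
is slower than a pre-computed permutation lookup but still cheap enough to be
hidden behind the host-to-device copy.
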
Benchmark results:
======================= CUDA Device Properties ======================
Cuda version : 11.8
Device ID : 0
Device name : NVIDIA GeForce RTX 2080 Ti
Total GPU memory : 11012 MiB
GPU memory available : 10852 MiB
Compute capability : 7.5
Warp size : 32
Max threads per block: 1024
Max threads per dim : 1024 1024 64
Max grid size : 2147483647 65535 65535
Multiprocessor count : 68
====================================================================
Running ./bin/evaluation_benchmark
Run on (112 X 3200 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x56)
L1 Instruction 32 KiB (x56)
L2 Unified 1024 KiB (x56)
L3 Unified 39424 KiB (x2)
Load Average: 24.58, 11.75, 8.52
-----------------------------------------------------------------------
Benchmark Time
-----------------------------------------------------------------------
Using on-the-fly computation of CRS index corresponding to block-sparse
index:
JacobianToCRS<g/final/problem-4585-1324582-pre.txt> 1607 ms
JacobianToCRSView<g/final/problem-4585-1324582-pre.txt> 564 ms
JacobianToCRSMatrix<g/final/problem-4585-1324582-pre.txt> 2226 ms
JacobianToCRSViewUpdate<g/final/problem-4585-1324582-pre.txt> 228 ms
JacobianToCRSMatrixUpdate<g/final/problem-4585-1324582-pre.txt> 400 ms
Using precomputed permutation:
JacobianToCRS</final/problem-4585-1324582-pre.txt> 1656 ms
JacobianToCRSView</final/problem-4585-1324582-pre.txt> 553 ms
JacobianToCRSMatrix</final/problem-4585-1324582-pre.txt> 2255 ms
JacobianToCRSViewUpdate</final/problem-4585-1324582-pre.txt> 228 ms
JacobianToCRSMatrixUpdate</final/problem-4585-1324582-pre.txt> 406 ms
The performance of JacobianToCRSViewUpdate is still limited by the
host-to-device transfer, and JacobianToCRSView remains faster than computing
the CRS structure on the CPU.
Change-Id: Ifb6910fb01ae6071400d36c277846fadc5857964
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index e4b124d..be4bad1 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -49,20 +49,23 @@
libsuitesparse-dev \
ninja-build \
wget
- - name: Setup CUDA toolkit (system repositories)
+ # nvidia cuda toolkit shipped with 20.04 LTS does not support stream-ordered allocations
+ - name: Setup CUDA toolkit repositories (20.04)
if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:20.04'
run: |
- apt-get install -y \
- nvidia-cuda-dev \
- nvidia-cuda-toolkit
+ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
+ dpkg -i cuda-keyring_1.0-1_all.deb
# nvidia cuda toolkit + gcc combo shipped with 22.04LTS is broken
# and is not able to compile code that uses thrust
# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1006962
- - name: Setup CUDA toolkit (nvidia repositories)
+ - name: Setup CUDA toolkit repositories (22.04)
if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:22.04'
run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
dpkg -i cuda-keyring_1.0-1_all.deb
+ - name: Setup CUDA toolkit
+ if: matrix.gpu == 'cuda'
+ run: |
apt-get update
apt-get install -y cuda
echo "CUDACXX=/usr/local/cuda/bin/nvcc" >> $GITHUB_ENV
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 9cb99b4..b67b10e 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -140,9 +140,12 @@
COMMAND ${CUDAToolkit_BIN_DIR}/cuda-memcheck --leak-check full
$<TARGET_FILE:cuda_dense_cholesky_test>)
endif (BUILD_TESTING AND GFLAGS)
- set_source_files_properties(cuda_kernels.cu.cc PROPERTIES LANGUAGE CUDA)
- add_library(ceres_cuda_kernels STATIC cuda_kernels.cu.cc)
+ set_source_files_properties(cuda_kernels_vector_ops.cu.cc PROPERTIES LANGUAGE CUDA)
+ set_source_files_properties(cuda_kernels_bsm_to_crs.cu.cc PROPERTIES LANGUAGE CUDA)
+ add_library(ceres_cuda_kernels STATIC cuda_kernels_vector_ops.cu.cc cuda_kernels_bsm_to_crs.cu.cc)
target_compile_features(ceres_cuda_kernels PRIVATE cxx_std_14)
+ # Enable __host__ / __device__ annotations in lambda declarations
+ target_compile_options(ceres_cuda_kernels PRIVATE --extended-lambda)
target_include_directories(ceres_cuda_kernels PRIVATE ${Ceres_SOURCE_DIR}/internal ${Ceres_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR})
list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ceres_cuda_kernels)
endif (USE_CUDA)
@@ -479,7 +482,7 @@
ceres_test(cuda_block_structure)
ceres_test(cuda_dense_cholesky)
ceres_test(cuda_dense_qr)
- ceres_test(cuda_kernels)
+ ceres_test(cuda_kernels_vector_ops)
ceres_test(cuda_sparse_matrix)
ceres_test(cuda_streamed_buffer)
ceres_test(cuda_vector)
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index ab1d746..01dbfbe 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -709,7 +709,9 @@
}
std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix(
- const BlockSparseMatrix::RandomMatrixOptions& options, std::mt19937& prng) {
+ const BlockSparseMatrix::RandomMatrixOptions& options,
+ std::mt19937& prng,
+ bool use_page_locked_memory) {
CHECK_GT(options.num_row_blocks, 0);
CHECK_GT(options.min_row_block_size, 0);
CHECK_GT(options.max_row_block_size, 0);
@@ -766,7 +768,8 @@
}
}
- auto matrix = std::make_unique<BlockSparseMatrix>(bs.release());
+ auto matrix =
+ std::make_unique<BlockSparseMatrix>(bs.release(), use_page_locked_memory);
double* values = matrix->mutable_values();
std::normal_distribution<double> standard_normal_distribution;
std::generate_n(
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h
index 0d99e15..cd13b06 100644
--- a/internal/ceres/block_sparse_matrix.h
+++ b/internal/ceres/block_sparse_matrix.h
@@ -157,7 +157,9 @@
// distributed and whose structure is determined by
// RandomMatrixOptions.
static std::unique_ptr<BlockSparseMatrix> CreateRandomMatrix(
- const RandomMatrixOptions& options, std::mt19937& prng);
+ const RandomMatrixOptions& options,
+ std::mt19937& prng,
+ bool use_page_locked_memory = false);
private:
double* AllocateValues(int size);
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc
index a03d01c..bb267da 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -32,41 +32,69 @@
#ifndef CERES_NO_CUDA
-#include "ceres/cuda_block_structure.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_bsm_to_crs.h"
namespace ceres::internal {
CudaBlockSparseCRSView::CudaBlockSparseCRSView(const BlockSparseMatrix& bsm,
ContextImpl* context)
- : permutation_(context, bsm.num_nonzeros()),
- streamed_buffer_(context, kMaxTemporaryArraySize) {
- CudaBlockSparseStructure block_structure(*bsm.block_structure(), context);
+ : context_(context) {
+ block_structure_ = std::make_unique<CudaBlockSparseStructure>(
+ *bsm.block_structure(), context);
+ // Only block-sparse matrices with sequential layout of cells are supported
+ CHECK(block_structure_->sequential_layout());
crs_matrix_ = std::make_unique<CudaSparseMatrix>(
bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context);
- CudaBuffer<int> temp_rows(context, bsm.num_rows());
- FillCRSStructure(block_structure.num_row_blocks(),
+ FillCRSStructure(block_structure_->num_row_blocks(),
bsm.num_rows(),
- block_structure.row_block_offsets(),
- block_structure.cells(),
- block_structure.row_blocks(),
- block_structure.col_blocks(),
+ block_structure_->first_cell_in_row_block(),
+ block_structure_->cells(),
+ block_structure_->row_blocks(),
+ block_structure_->col_blocks(),
crs_matrix_->mutable_rows(),
crs_matrix_->mutable_cols(),
- temp_rows.data(),
- permutation_.data(),
context->DefaultStream());
+ is_crs_compatible_ = block_structure_->IsCrsCompatible();
+ // If the matrix is CRS-compatible we can drop the block-structure and don't
+ // need streamed_buffer_
+ if (is_crs_compatible_) {
+ VLOG(3) << "Block-sparse matrix is compatible with CRS, discarding "
+ "block-structure";
+ block_structure_ = nullptr;
+ } else {
+ streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
+ context_, kMaxTemporaryArraySize);
+ }
UpdateValues(bsm);
}
+
void CudaBlockSparseCRSView::UpdateValues(const BlockSparseMatrix& bsm) {
- streamed_buffer_.CopyToGpu(
+ if (is_crs_compatible_) {
+ // Values of CRS-compatible matrices can be copied as-is
+ CHECK_EQ(cudaSuccess,
+ cudaMemcpyAsync(crs_matrix_->mutable_values(),
+ bsm.values(),
+ bsm.num_nonzeros() * sizeof(double),
+ cudaMemcpyHostToDevice,
+ context_->DefaultStream()));
+ return;
+ }
+ streamed_buffer_->CopyToGpu(
bsm.values(),
bsm.num_nonzeros(),
- [permutation = permutation_.data(),
- values_to = crs_matrix_->mutable_values()](
+ [bs = block_structure_.get(), crs = crs_matrix_.get()](
const double* values, int num_values, int offset, auto stream) {
- PermuteValues(
- offset, num_values, permutation, values, values_to, stream);
+ PermuteToCRS(offset,
+ num_values,
+ bs->num_row_blocks(),
+ bs->first_cell_in_row_block(),
+ bs->cells(),
+ bs->row_blocks(),
+ bs->col_blocks(),
+ crs->rows(),
+ values,
+ crs->mutable_values(),
+ stream);
});
}
diff --git a/internal/ceres/cuda_block_sparse_crs_view.h b/internal/ceres/cuda_block_sparse_crs_view.h
index 3ea8498..2ae8721 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.h
+++ b/internal/ceres/cuda_block_sparse_crs_view.h
@@ -39,6 +39,7 @@
#include <memory>
#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_block_structure.h"
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_streamed_buffer.h"
@@ -49,42 +50,23 @@
// following operations in order to compute products of block-sparse matrices
// and dense vectors on gpu:
// - Once per block-sparse structure update:
-// - Compute CRS structure from block-sparse structure
-// - Compute permutation from block-sparse values to CRS values
+// - Compute CRS structure from block-sparse structure and check if values of
+// block-sparse matrix would have the same order as values of CRS matrix
// - Once per block-sparse values update:
// - Update values in CRS matrix with values of block-sparse matrix
//
-// Since there are no constraints on positions of cells in value array of
-// block-sparse matrix, a permutation from block-sparse values to CRS
-// values is stored explicitly.
+// Only block-sparse matrices with sequential order of cells are supported.
//
-// Example: given matrix with the following block-structure
-// [ 1 2 | | 6 7 ]
-// [ 3 4 | | 8 9 ]
-// [-----+-+-----]
-// [ |5| ]
-// with values stored as values_block_sparse = [1, 2, 3, 4, 5, 6, 7, 8, 9],
-// permutation from block-sparse to CRS is p = [0, 1, 4, 5, 8, 2, 3, 6, 7];
-// and i-th block-sparse value has index p[i] in CRS values array
-//
-// This allows to avoid storing both CRS and block-sparse values in GPU memory.
-// Instead, block-sparse values are transferred to gpu memory as a disjoint set
-// of small continuous segments with simultaneous permutation of the values into
-// correct order
+// UpdateValues method updates values:
+// - In a single host-to-device copy for matrices with CRS-compatible value
+// layout
+// - Simultaneously transferring and permuting values using CudaStreamedBuffer
+// otherwise
class CERES_NO_EXPORT CudaBlockSparseCRSView {
public:
- // Initializes internal CRS matrix and permutation from block-sparse to CRS
- // values. The following objects are stored in gpu memory for the whole
- // lifetime of the object
- // - crs_matrix_: CRS matrix
- // - permutation_: permutation from block-sparse to CRS value order
- // (num_nonzeros integer values)
- // - streamed_buffer_: helper for value updating
- // The following objects are created temporarily during construction:
- // - CudaBlockSparseStructure: block-sparse structure of block-sparse matrix
- // - num_rows integer values: row to row-block map
- // If copy_values flag is set to false, only structure of block-sparse matrix
- // bsm is captured, and values are left uninitialized
+ // Initializes internal CRS matrix using structure and values of block-sparse
+ // matrix. For block-sparse matrices that have a value layout different from
+ // CRS, the block-sparse structure will be stored.
CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, ContextImpl* context);
const CudaSparseMatrix* crs_matrix() const { return crs_matrix_.get(); }
@@ -95,16 +77,21 @@
// used for construction.
void UpdateValues(const BlockSparseMatrix& bsm);
+ // Returns true if block-sparse matrix had CRS-compatible value layout
+ bool IsCrsCompatible() const { return is_crs_compatible_; }
+
private:
// Value permutation kernel performs a single element-wise operation per
// thread, thus performing permutation in blocks of 8 megabytes of
// block-sparse values seems reasonable
static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
std::unique_ptr<CudaSparseMatrix> crs_matrix_;
- // Permutation from block-sparse to CRS value order.
- // permutation_[i] = index of i-th block-sparse value in CRS values
- CudaBuffer<int> permutation_;
- CudaStreamedBuffer<double> streamed_buffer_;
+ // Only created if block-sparse matrix has non-CRS value layout
+ std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_;
+ // Only stored if block-sparse matrix has non-CRS value layout
+ std::unique_ptr<CudaBlockSparseStructure> block_structure_;
+ bool is_crs_compatible_;
+ ContextImpl* context_;
};
} // namespace ceres::internal
diff --git a/internal/ceres/cuda_block_sparse_crs_view_test.cc b/internal/ceres/cuda_block_sparse_crs_view_test.cc
index f8d1cdf..e80d026 100644
--- a/internal/ceres/cuda_block_sparse_crs_view_test.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view_test.cc
@@ -52,48 +52,110 @@
options.max_col_block_size = 10;
options.block_density = 0.2;
std::mt19937 rng;
- A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng);
- std::iota(
- A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1);
+
+ // Block-sparse matrix with order of values different from CRS
+ block_sparse_non_crs_compatible_ =
+ BlockSparseMatrix::CreateRandomMatrix(options, rng, true);
+ std::iota(block_sparse_non_crs_compatible_->mutable_values(),
+ block_sparse_non_crs_compatible_->mutable_values() +
+ block_sparse_non_crs_compatible_->num_nonzeros(),
+ 1);
+
+ options.max_row_block_size = 1;
+ // Block-sparse matrix with CRS order of values (row-blocks are rows)
+ block_sparse_crs_compatible_rows_ =
+ BlockSparseMatrix::CreateRandomMatrix(options, rng, true);
+ std::iota(block_sparse_crs_compatible_rows_->mutable_values(),
+ block_sparse_crs_compatible_rows_->mutable_values() +
+ block_sparse_crs_compatible_rows_->num_nonzeros(),
+ 1);
+ // Block-sparse matrix with CRS order of values (single cell per row-block)
+ auto bs = std::make_unique<CompressedRowBlockStructure>(
+ *block_sparse_non_crs_compatible_->block_structure());
+
+ int num_nonzeros = 0;
+ for (auto& r : bs->rows) {
+ const int num_cells = r.cells.size();
+ if (num_cells > 1) {
+ std::uniform_int_distribution<int> uniform_cell(0, num_cells - 1);
+ const int selected_cell = uniform_cell(rng);
+ std::swap(r.cells[0], r.cells[selected_cell]);
+ r.cells.resize(1);
+ }
+ const int row_block_size = r.block.size;
+ for (auto& c : r.cells) {
+ c.position = num_nonzeros;
+ const int col_block_size = bs->cols[c.block_id].size;
+ num_nonzeros += col_block_size * row_block_size;
+ }
+ }
+ block_sparse_crs_compatible_single_cell_ =
+ std::make_unique<BlockSparseMatrix>(bs.release());
+ std::iota(block_sparse_crs_compatible_single_cell_->mutable_values(),
+ block_sparse_crs_compatible_single_cell_->mutable_values() +
+ block_sparse_crs_compatible_single_cell_->num_nonzeros(),
+ 1);
}
- std::unique_ptr<BlockSparseMatrix> A_;
+ void Compare(const BlockSparseMatrix& bsm, CudaSparseMatrix& csm) {
+ ASSERT_EQ(csm.num_cols(), bsm.num_cols());
+ ASSERT_EQ(csm.num_rows(), bsm.num_rows());
+ ASSERT_EQ(csm.num_nonzeros(), bsm.num_nonzeros());
+ const int num_rows = bsm.num_rows();
+ const int num_cols = bsm.num_cols();
+ Vector x(num_cols);
+ Vector y(num_rows);
+ CudaVector x_cuda(&context_, num_cols);
+ CudaVector y_cuda(&context_, num_rows);
+ Vector y_cuda_host(num_rows);
+
+ for (int i = 0; i < num_cols; ++i) {
+ x.setZero();
+ y.setZero();
+ y_cuda.SetZero();
+ x[i] = 1.;
+ x_cuda.CopyFromCpu(x);
+ csm.RightMultiplyAndAccumulate(x_cuda, &y_cuda);
+ bsm.RightMultiplyAndAccumulate(
+ x.data(), y.data(), &context_, std::thread::hardware_concurrency());
+ y_cuda.CopyTo(&y_cuda_host);
+ // There will be up to 1 non-zero product per row, thus we expect an exact
+ // match on 32-bit integer indices
+ EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
+ }
+ }
+
+ std::unique_ptr<BlockSparseMatrix> block_sparse_non_crs_compatible_;
+ std::unique_ptr<BlockSparseMatrix> block_sparse_crs_compatible_rows_;
+ std::unique_ptr<BlockSparseMatrix> block_sparse_crs_compatible_single_cell_;
ContextImpl context_;
};
-TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValues) {
- auto view = CudaBlockSparseCRSView(*A_, &context_);
+TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesNonCompatible) {
+ auto view =
+ CudaBlockSparseCRSView(*block_sparse_non_crs_compatible_, &context_);
+ ASSERT_EQ(view.IsCrsCompatible(), false);
- // TODO: we definitely would like to use crs_matrix() here, but
- // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because
- // it might allocate additional storage by request of cuSPARSE
auto matrix = view.mutable_crs_matrix();
- ASSERT_EQ(matrix->num_cols(), A_->num_cols());
- ASSERT_EQ(matrix->num_rows(), A_->num_rows());
- ASSERT_EQ(matrix->num_nonzeros(), A_->num_nonzeros());
+ Compare(*block_sparse_non_crs_compatible_, *matrix);
+}
- const int num_rows = A_->num_rows();
- const int num_cols = A_->num_cols();
- Vector x(num_cols);
- Vector y(num_rows);
- CudaVector x_cuda(&context_, num_cols);
- CudaVector y_cuda(&context_, num_rows);
- Vector y_cuda_host(num_rows);
+TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesCompatibleRows) {
+ auto view =
+ CudaBlockSparseCRSView(*block_sparse_crs_compatible_rows_, &context_);
+ ASSERT_EQ(view.IsCrsCompatible(), true);
- for (int i = 0; i < num_cols; ++i) {
- x.setZero();
- y.setZero();
- y_cuda.SetZero();
- x[i] = 1.;
- x_cuda.CopyFromCpu(x);
- A_->RightMultiplyAndAccumulate(
- x.data(), y.data(), &context_, std::thread::hardware_concurrency());
- matrix->RightMultiplyAndAccumulate(x_cuda, &y_cuda);
- y_cuda.CopyTo(&y_cuda_host);
- // There will be up to 1 non-zero product per row, thus we expect an exact
- // match on 32-bit integer indices
- EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
- }
+ auto matrix = view.mutable_crs_matrix();
+ Compare(*block_sparse_crs_compatible_rows_, *matrix);
+}
+
+TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesCompatibleSingleCell) {
+ auto view = CudaBlockSparseCRSView(*block_sparse_crs_compatible_single_cell_,
+ &context_);
+ ASSERT_EQ(view.IsCrsCompatible(), true);
+
+ auto matrix = view.mutable_crs_matrix();
+ Compare(*block_sparse_crs_compatible_single_cell_, *matrix);
}
} // namespace ceres::internal
diff --git a/internal/ceres/cuda_block_structure.cc b/internal/ceres/cuda_block_structure.cc
index 8e549e1..5817cd7 100644
--- a/internal/ceres/cuda_block_structure.cc
+++ b/internal/ceres/cuda_block_structure.cc
@@ -46,7 +46,7 @@
CudaBlockSparseStructure::CudaBlockSparseStructure(
const CompressedRowBlockStructure& block_structure, ContextImpl* context)
- : row_block_offsets_(context),
+ : first_cell_in_row_block_(context),
cells_(context),
row_blocks_(context),
col_blocks_(context) {
@@ -56,44 +56,56 @@
const auto& col_blocks = block_structure.cols;
// Row block offset is an index of the first cell corresponding to row block
- std::vector<int> row_block_offsets;
+ std::vector<int> first_cell_in_row_block;
// Flat array of all cells from all row-blocks
std::vector<Cell> cells;
+ int f_values_offset = 0;
+ is_crs_compatible_ = true;
num_row_blocks_ = block_structure.rows.size();
num_col_blocks_ = col_blocks.size();
row_blocks.reserve(num_row_blocks_);
- row_block_offsets.reserve(num_row_blocks_ + 1);
+ first_cell_in_row_block.reserve(num_row_blocks_ + 1);
num_nonzeros_ = 0;
- num_cells_ = 0;
+ sequential_layout_ = true;
for (const auto& r : block_structure.rows) {
const int row_block_size = r.block.size;
+ if (r.cells.size() > 1 && row_block_size > 1) {
+ is_crs_compatible_ = false;
+ }
row_blocks.emplace_back(r.block);
- row_block_offsets.push_back(num_cells_);
+ first_cell_in_row_block.push_back(cells.size());
for (const auto& c : r.cells) {
- cells.emplace_back(c);
const int col_block_size = col_blocks[c.block_id].size;
- num_nonzeros_ += col_block_size * row_block_size;
- ++num_cells_;
+ const int cell_size = col_block_size * row_block_size;
+ cells.push_back(c);
+ sequential_layout_ &= c.position == num_nonzeros_;
+ num_nonzeros_ += cell_size;
}
}
- row_block_offsets.push_back(num_cells_);
+ first_cell_in_row_block.push_back(cells.size());
+ num_cells_ = cells.size();
num_rows_ = Dimension(row_blocks);
num_cols_ = Dimension(col_blocks);
+ is_crs_compatible_ &= sequential_layout_;
+
if (VLOG_IS_ON(3)) {
- const size_t row_block_offsets_size =
- row_block_offsets.size() * sizeof(int);
+ const size_t first_cell_in_row_block_size =
+ first_cell_in_row_block.size() * sizeof(int);
const size_t cells_size = cells.size() * sizeof(Cell);
const size_t row_blocks_size = row_blocks.size() * sizeof(Block);
const size_t col_blocks_size = col_blocks.size() * sizeof(Block);
- const size_t total_size =
- row_block_offsets_size + cells_size + col_blocks_size + row_blocks_size;
+ const size_t total_size = first_cell_in_row_block_size + cells_size +
+ col_blocks_size + row_blocks_size;
+ const double ratio =
+ (100. * total_size) / (num_nonzeros_ * (sizeof(int) + sizeof(double)) +
+ num_rows_ * sizeof(int));
VLOG(3) << "\nCudaBlockSparseStructure:\n"
"\tRow block offsets: "
- << row_block_offsets_size
+ << first_cell_in_row_block_size
<< " bytes\n"
"\tColumn blocks: "
<< col_blocks_size
@@ -102,13 +114,11 @@
<< row_blocks_size
<< " bytes\n"
"\tCells: "
- << cells_size
- << " bytes\n"
- "\tTotal: "
- << total_size << " bytes of GPU memory";
+ << cells_size << " bytes\n\tTotal: " << total_size
+ << " bytes of GPU memory (" << ratio << "% of CRS matrix size)";
}
- row_block_offsets_.CopyFromCpuVector(row_block_offsets);
+ first_cell_in_row_block_.CopyFromCpuVector(first_cell_in_row_block);
cells_.CopyFromCpuVector(cells);
row_blocks_.CopyFromCpuVector(row_blocks);
col_blocks_.CopyFromCpuVector(col_blocks);
diff --git a/internal/ceres/cuda_block_structure.h b/internal/ceres/cuda_block_structure.h
index a41d0b3..28e36b2 100644
--- a/internal/ceres/cuda_block_structure.h
+++ b/internal/ceres/cuda_block_structure.h
@@ -44,12 +44,11 @@
// This class stores a read-only block-sparse structure in gpu memory.
// Invariants are the same as those of CompressedRowBlockStructure.
// In order to simplify allocation and copying data to gpu, cells from all
-// row-blocks are stored in a single array sequentially. Additional array
-// row_block_offsets of size num_row_blocks + 1 allows to identify range of
-// cells corresponding to a row-block.
-// Cells corresponding to i-th row-block are stored in sub-array
-// cells[row_block_offsets[i]; ... row_block_offsets[i + 1] - 1], and their
-// order is preserved.
+// row-blocks are stored in a single array sequentially. The array
+// first_cell_in_row_block of size num_row_blocks + 1 makes it possible to
+// identify the range of cells corresponding to a row-block. Cells of the i-th
+// row-block are stored in the sub-array cells[first_cell_in_row_block[i]; ...
+// first_cell_in_row_block[i + 1] - 1], and their order is preserved.
class CERES_NO_EXPORT CudaBlockSparseStructure {
public:
// CompressedRowBlockStructure is contains a vector of CompressedLists, with
@@ -65,9 +64,20 @@
int num_row_blocks() const { return num_row_blocks_; }
int num_col_blocks() const { return num_col_blocks_; }
+ // Returns true if values from block-sparse matrix can be copied to CRS matrix
+ // as-is. This is possible if every row-block is stored in CRS order, i.e.
+ // either:
+ // - Row-block consists of a single row, or
+ // - Row-block contains a single cell
+ bool IsCrsCompatible() const { return is_crs_compatible_; }
+ // Returns true if block-sparse structure corresponds to block-sparse matrix
+ // with sequential cell positions
+ bool sequential_layout() const { return sequential_layout_; }
+
// Device pointer to array of num_row_blocks + 1 indices of the first cell of
// row block
- const int* row_block_offsets() const { return row_block_offsets_.data(); }
+ const int* first_cell_in_row_block() const {
+ return first_cell_in_row_block_.data();
+ }
// Device pointer to array of num_cells cells, sorted by row-block
const Cell* cells() const { return cells_.data(); }
// Device pointer to array of row blocks
@@ -82,7 +92,9 @@
int num_nonzeros_;
int num_row_blocks_;
int num_col_blocks_;
- CudaBuffer<int> row_block_offsets_;
+ bool is_crs_compatible_;
+ bool sequential_layout_;
+ CudaBuffer<int> first_cell_in_row_block_;
CudaBuffer<Cell> cells_;
CudaBuffer<Block> row_blocks_;
CudaBuffer<Block> col_blocks_;
diff --git a/internal/ceres/cuda_block_structure_test.cc b/internal/ceres/cuda_block_structure_test.cc
index 95cb26b..000b6bb 100644
--- a/internal/ceres/cuda_block_structure_test.cc
+++ b/internal/ceres/cuda_block_structure_test.cc
@@ -83,10 +83,10 @@
}
std::vector<int> GetRowBlockOffsets(
const CudaBlockSparseStructure& structure) {
- const auto& cuda_buffer = structure.row_block_offsets_;
- std::vector<int> row_block_offsets(cuda_buffer.size());
- cuda_buffer.CopyToCpu(row_block_offsets.data(), row_block_offsets.size());
- return row_block_offsets;
+ const auto& cuda_buffer = structure.first_cell_in_row_block_;
+ std::vector<int> first_cell_in_row_block(cuda_buffer.size());
+ cuda_buffer.CopyToCpu(first_cell_in_row_block.data(), first_cell_in_row_block.size());
+ return first_cell_in_row_block;
}
std::unique_ptr<BlockSparseMatrix> A_;
@@ -114,19 +114,19 @@
}
std::vector<Cell> cells = GetCells(cuda_block_structure);
- std::vector<int> row_block_offsets = GetRowBlockOffsets(cuda_block_structure);
+ std::vector<int> first_cell_in_row_block = GetRowBlockOffsets(cuda_block_structure);
blocks = GetRowBlocks(cuda_block_structure);
ASSERT_EQ(blocks.size(), num_row_blocks);
- ASSERT_EQ(row_block_offsets.size(), num_row_blocks + 1);
- ASSERT_EQ(row_block_offsets.back(), cells.size());
+ ASSERT_EQ(first_cell_in_row_block.size(), num_row_blocks + 1);
+ ASSERT_EQ(first_cell_in_row_block.back(), cells.size());
for (int i = 0; i < num_row_blocks; ++i) {
const int num_cells = block_structure->rows[i].cells.size();
EXPECT_EQ(blocks[i].position, block_structure->rows[i].block.position);
EXPECT_EQ(blocks[i].size, block_structure->rows[i].block.size);
- const int first_cell = row_block_offsets[i];
- const int last_cell = row_block_offsets[i + 1];
+ const int first_cell = first_cell_in_row_block[i];
+ const int last_cell = first_cell_in_row_block[i + 1];
ASSERT_EQ(last_cell - first_cell, num_cells);
for (int j = 0; j < num_cells; ++j) {
EXPECT_EQ(cells[first_cell + j].block_id,
diff --git a/internal/ceres/cuda_kernels.cu.cc b/internal/ceres/cuda_kernels.cu.cc
deleted file mode 100644
index 3d93ae8..0000000
--- a/internal/ceres/cuda_kernels.cu.cc
+++ /dev/null
@@ -1,304 +0,0 @@
-// Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2022 Google Inc. All rights reserved.
-// http://ceres-solver.org/
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-// * Neither the name of Google Inc. nor the names of its contributors may be
-// used to endorse or promote products derived from this software without
-// specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
-
-//#include "ceres/cuda_kernels.h"
-
-#include <thrust/execution_policy.h>
-#include <thrust/scan.h>
-
-#include "block_structure.h"
-#include "cuda_runtime.h"
-
-namespace ceres {
-namespace internal {
-
-// As the CUDA Toolkit documentation says, "although arbitrary in this case, is
-// a common choice". This is determined by the warp size, max block size, and
-// multiprocessor sizes of recent GPUs. For complex kernels with significant
-// register usage and unusual memory patterns, the occupancy calculator API
-// might provide better performance. See "Occupancy Calculator" under the CUDA
-// toolkit documentation.
-constexpr int kCudaBlockSize = 256;
-inline int NumBlocks(int size) {
- return (size + kCudaBlockSize - 1) / kCudaBlockSize;
-}
-
-template <typename SrcType, typename DstType>
-__global__ void TypeConversionKernel(const SrcType* __restrict__ input,
- DstType* __restrict__ output,
- const int size) {
- const int i = blockIdx.x * blockDim.x + threadIdx.x;
- if (i < size) {
- output[i] = static_cast<DstType>(input[i]);
- }
-}
-
-void CudaFP64ToFP32(const double* input,
- float* output,
- const int size,
- cudaStream_t stream) {
- const int num_blocks = NumBlocks(size);
- TypeConversionKernel<double, float>
- <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
-}
-
-void CudaFP32ToFP64(const float* input,
- double* output,
- const int size,
- cudaStream_t stream) {
- const int num_blocks = NumBlocks(size);
- TypeConversionKernel<float, double>
- <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
-}
-
-template <typename T>
-__global__ void SetZeroKernel(T* __restrict__ output, const int size) {
- const int i = blockIdx.x * blockDim.x + threadIdx.x;
- if (i < size) {
- output[i] = T(0.0);
- }
-}
-
-void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) {
- const int num_blocks = NumBlocks(size);
- SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
-}
-
-void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
- const int num_blocks = NumBlocks(size);
- SetZeroKernel<double>
- <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
-}
-
-template <typename SrcType, typename DstType>
-__global__ void XPlusEqualsYKernel(DstType* __restrict__ x,
- const SrcType* __restrict__ y,
- const int size) {
- const int i = blockIdx.x * blockDim.x + threadIdx.x;
- if (i < size) {
- x[i] = x[i] + DstType(y[i]);
- }
-}
-
-void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) {
- const int num_blocks = NumBlocks(size);
- XPlusEqualsYKernel<float, double>
- <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
-}
-
-__global__ void CudaDtDxpyKernel(double* __restrict__ y,
- const double* D,
- const double* __restrict__ x,
- const int size) {
- const int i = blockIdx.x * blockDim.x + threadIdx.x;
- if (i < size) {
- y[i] = y[i] + D[i] * D[i] * x[i];
- }
-}
-
-void CudaDtDxpy(double* y,
- const double* D,
- const double* x,
- const int size,
- cudaStream_t stream) {
- const int num_blocks = NumBlocks(size);
- CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size);
-}
-
-// Fill row block id and nnz for each row using block-sparse structure
-// represented by a set of flat arrays.
-// Inputs:
-// - num_row_blocks: number of row-blocks in block-sparse structure
-// - row_block_offsets: index of the first cell of the row-block; size:
-// num_row_blocks + 1
-// - cells: cells of block-sparse structure as a continuous array
-// - row_blocks: row blocks of block-sparse structure stored sequentially
-// - col_blocks: column blocks of block-sparse structure stored sequentially
-// Outputs:
-// - rows: rows[i + 1] will contain number of non-zeros in i-th row, rows[0]
-// will be set to 0; rows are filled with a shift by one element in order
-// to obtain row-index array of CRS matrix with a inclusive scan afterwards
-// - row_block_ids: row_block_ids[i] will be set to index of row-block that
-// contains i-th row.
-// Computation is perform row-block-wise
-__global__ void RowBlockIdAndNNZ(int num_row_blocks,
- const int* __restrict__ row_block_offsets,
- const Cell* __restrict__ cells,
- const Block* __restrict__ row_blocks,
- const Block* __restrict__ col_blocks,
- int* __restrict__ rows,
- int* __restrict__ row_block_ids) {
- const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x;
- if (row_block_id > num_row_blocks) {
- // No synchronization is performed in this kernel, thus it is safe to return
- return;
- }
- if (row_block_id == num_row_blocks) {
- // one extra thread sets the first element
- rows[0] = 0;
- return;
- }
- const auto& row_block = row_blocks[row_block_id];
- int row_nnz = 0;
- const auto first_cell = cells + row_block_offsets[row_block_id];
- const auto last_cell = cells + row_block_offsets[row_block_id + 1];
- for (auto cell = first_cell; cell < last_cell; ++cell) {
- row_nnz += col_blocks[cell->block_id].size;
- }
- const int first_row = row_block.position;
- const int last_row = first_row + row_block.size;
- for (int i = first_row; i < last_row; ++i) {
- rows[i + 1] = row_nnz;
- row_block_ids[i] = row_block_id;
- }
-}
-
-// Row-wise creation of CRS structure
-// Inputs:
-// - num_rows: number of rows in matrix
-// - row_block_offsets: index of the first cell of the row-block; size:
-// num_row_blocks + 1
-// - cells: cells of block-sparse structure as a continuous array
-// - row_blocks: row blocks of block-sparse structure stored sequentially
-// - col_blocks: column blocks of block-sparse structure stored sequentially
-// - row_block_ids: index of row-block that corresponds to row
-// - rows: row-index array of CRS structure
-// Outputs:
-// - cols: column-index array of CRS structure
-// - permutation: permutation from block-sparse to crs order
-// Computaion is perform row-wise
-__global__ void ComputeColumnsAndPermutation(
- int num_rows,
- const int* __restrict__ row_block_offsets,
- const Cell* __restrict__ cells,
- const Block* __restrict__ row_blocks,
- const Block* __restrict__ col_blocks,
- const int* __restrict__ row_block_ids,
- const int* __restrict__ rows,
- int* __restrict__ cols,
- int* __restrict__ permutation) {
- const int row = blockIdx.x * blockDim.x + threadIdx.x;
- if (row >= num_rows) {
- // No synchronization is performed in this kernel, thus it is safe to return
- return;
- }
- const int row_block_id = row_block_ids[row];
- // zero-based index of row in row-block
- const int row_in_block = row - row_blocks[row_block_id].position;
- // position in crs matrix
- int crs_position = rows[row];
- const auto first_cell = cells + row_block_offsets[row_block_id];
- const auto last_cell = cells + row_block_offsets[row_block_id + 1];
- // For reach cell of row-block only current row is being filled
- for (auto cell = first_cell; cell < last_cell; ++cell) {
- const auto& col_block = col_blocks[cell->block_id];
- const int col_block_size = col_block.size;
- int column_idx = col_block.position;
- int bs_position = cell->position + row_in_block * col_block_size;
- // Fill permutation and column indices for each element of row_in_block
- // row of current cell
- for (int i = 0; i < col_block_size; ++i, ++crs_position) {
- permutation[bs_position++] = crs_position;
- cols[crs_position] = column_idx++;
- }
- }
-}
-
-void FillCRSStructure(const int num_row_blocks,
- const int num_rows,
- const int* row_block_offsets,
- const Cell* cells,
- const Block* row_blocks,
- const Block* col_blocks,
- int* rows,
- int* cols,
- int* row_block_ids,
- int* permutation,
- cudaStream_t stream) {
- // Set number of non-zeros per row in rows array and row to row-block map in
- // row_block_ids array
- const int num_blocks_blockwise = NumBlocks(num_row_blocks + 1);
- RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
- num_row_blocks,
- row_block_offsets,
- cells,
- row_blocks,
- col_blocks,
- rows,
- row_block_ids);
- // Finalize row-index array of CRS strucure by computing prefix sum
- thrust::inclusive_scan(
- thrust::cuda::par.on(stream), rows, rows + num_rows + 1, rows);
-
- // Fill cols array of CRS structure and permutation from block-sparse to CRS
- const int num_blocks_rowwise = NumBlocks(num_rows);
- ComputeColumnsAndPermutation<<<num_blocks_rowwise,
- kCudaBlockSize,
- 0,
- stream>>>(num_rows,
- row_block_offsets,
- cells,
- row_blocks,
- col_blocks,
- row_block_ids,
- rows,
- cols,
- permutation);
-}
-
-// Updates values of CRS matrix with a continuous block of values of
-// block-sparse matrix. With permutation[i] being an index of i-th value of
-// block-sparse matrix in values of CRS matrix updating values is quite
-// efficient when performed element-wise (reads of permutation and values arrays
-// are coalesced when offset is divisable by 16)
-__global__ void PermuteValuesKernel(const int offset,
- const int num_values,
- const int* permutation,
- const double* block_sparse_values,
- double* crs_values) {
- const int value_id = blockIdx.x * blockDim.x + threadIdx.x;
- if (value_id < num_values) {
- crs_values[permutation[offset + value_id]] = block_sparse_values[value_id];
- }
-}
-
-void PermuteValues(const int offset,
- const int num_values,
- const int* permutation,
- const double* block_sparse_values,
- double* crs_values,
- cudaStream_t stream) {
- const int num_blocks = NumBlocks(num_values);
- PermuteValuesKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(
- offset, num_values, permutation, block_sparse_values, crs_values);
-}
-
-} // namespace internal
-} // namespace ceres
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
new file mode 100644
index 0000000..8be3553
--- /dev/null
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -0,0 +1,279 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/cuda_kernels_bsm_to_crs.h"
+
+#include <cuda_runtime.h>
+#include <thrust/execution_policy.h>
+#include <thrust/scan.h>
+
+#include "ceres/block_structure.h"
+#include "ceres/cuda_kernels_utils.h"
+
+namespace ceres {
+namespace internal {
+
+// Fill row block id and nnz for each row using block-sparse structure
+// represented by a set of flat arrays.
+// Inputs:
+// - num_row_blocks: number of row-blocks in block-sparse structure
+// - first_cell_in_row_block: index of the first cell of the row-block; size:
+// num_row_blocks + 1
+// - cells: cells of block-sparse structure as a continuous array
+// - row_blocks: row blocks of block-sparse structure stored sequentially
+// - col_blocks: column blocks of block-sparse structure stored sequentially
+// Outputs:
+// - rows: rows[i + 1] will contain number of non-zeros in i-th row, rows[0]
+// will be set to 0; rows are filled with a shift by one element in order
+// to obtain row-index array of CRS matrix with an inclusive scan afterwards
+// - row_block_ids: row_block_ids[i] will be set to index of row-block that
+// contains i-th row.
+// Computation is performed row-block-wise
+__global__ void RowBlockIdAndNNZ(
+ int num_row_blocks,
+ const int* __restrict__ first_cell_in_row_block,
+ const Cell* __restrict__ cells,
+ const Block* __restrict__ row_blocks,
+ const Block* __restrict__ col_blocks,
+ int* __restrict__ rows,
+ int* __restrict__ row_block_ids) {
+ const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x;
+ if (row_block_id > num_row_blocks) {
+ // No synchronization is performed in this kernel, thus it is safe to return
+ return;
+ }
+ if (row_block_id == num_row_blocks) {
+ // one extra thread sets the first element
+ rows[0] = 0;
+ return;
+ }
+ const auto& row_block = row_blocks[row_block_id];
+ int row_nnz = 0;
+ const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+ const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+ for (auto cell = first_cell; cell < last_cell; ++cell) {
+ row_nnz += col_blocks[cell->block_id].size;
+ }
+ const int first_row = row_block.position;
+ const int last_row = first_row + row_block.size;
+ for (int i = first_row; i < last_row; ++i) {
+ rows[i + 1] = row_nnz;
+ row_block_ids[i] = row_block_id;
+ }
+}
+
+// Row-wise creation of CRS structure
+// Inputs:
+// - num_rows: number of rows in matrix
+// - first_cell_in_row_block: index of the first cell of the row-block; size:
+// num_row_blocks + 1
+// - cells: cells of block-sparse structure as a continuous array
+// - row_blocks: row blocks of block-sparse structure stored sequentially
+// - col_blocks: column blocks of block-sparse structure stored sequentially
+// - row_block_ids: index of row-block that corresponds to row
+// - rows: row-index array of CRS structure
+// Outputs:
+// - cols: column-index array of CRS structure
+// Computation is performed row-wise
+__global__ void ComputeColumnsAndPermutation(
+ int num_rows,
+ const int* __restrict__ first_cell_in_row_block,
+ const Cell* __restrict__ cells,
+ const Block* __restrict__ row_blocks,
+ const Block* __restrict__ col_blocks,
+ const int* __restrict__ row_block_ids,
+ const int* __restrict__ rows,
+ int* __restrict__ cols) {
+ const int row = blockIdx.x * blockDim.x + threadIdx.x;
+ if (row >= num_rows) {
+ // No synchronization is performed in this kernel, thus it is safe to return
+ return;
+ }
+ const int row_block_id = row_block_ids[row];
+ // position in crs matrix
+ int crs_position = rows[row];
+ const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+ const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+ // For each cell of the row-block only the current row is being filled
+ for (auto cell = first_cell; cell < last_cell; ++cell) {
+ const auto& col_block = col_blocks[cell->block_id];
+ const int col_block_size = col_block.size;
+ int column_idx = col_block.position;
+ // Fill column indices for each element of the current row of the cell
+ for (int i = 0; i < col_block_size; ++i, ++crs_position) {
+ cols[crs_position] = column_idx++;
+ }
+ }
+}
+
+void FillCRSStructure(const int num_row_blocks,
+ const int num_rows,
+ const int* first_cell_in_row_block,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ int* rows,
+ int* cols,
+ cudaStream_t stream) {
+ // Set number of non-zeros per row in rows array and row to row-block map in
+ // row_block_ids array
+ int* row_block_ids;
+ cudaMallocAsync(&row_block_ids, sizeof(int) * num_rows, stream);
+ const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
+ RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
+ num_row_blocks,
+ first_cell_in_row_block,
+ cells,
+ row_blocks,
+ col_blocks,
+ rows,
+ row_block_ids);
+ // Finalize row-index array of CRS structure by computing prefix sum
+ thrust::inclusive_scan(
+ thrust::cuda::par_nosync.on(stream), rows, rows + num_rows + 1, rows);
+
+ // Fill cols array of CRS structure
+ const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
+ ComputeColumnsAndPermutation<<<num_blocks_rowwise,
+ kCudaBlockSize,
+ 0,
+ stream>>>(num_rows,
+ first_cell_in_row_block,
+ cells,
+ row_blocks,
+ col_blocks,
+ row_block_ids,
+ rows,
+ cols);
+ cudaFreeAsync(row_block_ids, stream);
+}
+
+template <typename T, typename Predicate>
+__device__ int PartitionPoint(const T* data,
+ int first,
+ int last,
+ Predicate&& predicate) {
+ if (!predicate(data[first])) {
+ return first;
+ }
+ while (last - first > 1) {
+ const auto midpoint = first + (last - first) / 2;
+ if (predicate(data[midpoint])) {
+ first = midpoint;
+ } else {
+ last = midpoint;
+ }
+ }
+ return last;
+}
+
+// Element-wise reordering of block-sparse values
+// - block_sparse_offset - offset of the segment being processed within the
+// values of the block-sparse matrix
+// - block_sparse_values - segment of block-sparse values starting from
+// block_sparse_offset, containing num_values
+__global__ void PermuteToCrsKernel(
+ const int block_sparse_offset,
+ const int num_values,
+ const int num_row_blocks,
+ const int* __restrict__ first_cell_in_row_block,
+ const Cell* __restrict__ cells,
+ const Block* __restrict__ row_blocks,
+ const Block* __restrict__ col_blocks,
+ const int* __restrict__ crs_rows,
+ const double* __restrict__ block_sparse_values,
+ double* __restrict__ crs_values) {
+ const int value_id = blockIdx.x * blockDim.x + threadIdx.x;
+ if (value_id >= num_values) {
+ return;
+ }
+ const int block_sparse_value_id = value_id + block_sparse_offset;
+ // Find the corresponding row-block with a binary search
+ const int row_block_id =
+ PartitionPoint(
+ first_cell_in_row_block,
+ 0,
+ num_row_blocks,
+ [cells,
+ block_sparse_value_id] __device__(const int row_block_offset) {
+ return cells[row_block_offset].position <= block_sparse_value_id;
+ }) -
+ 1;
+ // Find cell and calculate offset within the row with a linear scan
+ const auto& row_block = row_blocks[row_block_id];
+ const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+ const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+ const int row_block_size = row_block.size;
+ int num_cols_before = 0;
+ for (const Cell* cell = first_cell; cell < last_cell; ++cell) {
+ const auto& col_block = col_blocks[cell->block_id];
+ const int col_block_size = col_block.size;
+ const int cell_size = row_block_size * col_block_size;
+ if (cell->position + cell_size > block_sparse_value_id) {
+ const int pos_in_cell = block_sparse_value_id - cell->position;
+ const int row_in_cell = pos_in_cell / col_block_size;
+ const int col_in_cell = pos_in_cell % col_block_size;
+ const int row = row_in_cell + row_block.position;
+ crs_values[crs_rows[row] + num_cols_before + col_in_cell] =
+ block_sparse_values[value_id];
+ break;
+ }
+ num_cols_before += col_block_size;
+ }
+}
+
+void PermuteToCRS(const int block_sparse_offset,
+ const int num_values,
+ const int num_row_blocks,
+ const int* first_cell_in_row_block,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ const int* crs_rows,
+ const double* block_sparse_values,
+ double* crs_values,
+ cudaStream_t stream) {
+ const int num_blocks_valuewise = NumBlocksInGrid(num_values);
+ PermuteToCrsKernel<<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
+ block_sparse_offset,
+ num_values,
+ num_row_blocks,
+ first_cell_in_row_block,
+ cells,
+ row_blocks,
+ col_blocks,
+ crs_rows,
+ block_sparse_values,
+ crs_values);
+}
+
+} // namespace internal
+} // namespace ceres
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h
new file mode 100644
index 0000000..b6da69b
--- /dev/null
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -0,0 +1,76 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#ifndef CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
+#define CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "cuda_runtime.h"
+
+namespace ceres {
+namespace internal {
+class Block;
+class Cell;
+
+// Compute structure of CRS matrix using block-sparse structure.
+// Arrays corresponding to CRS matrix are to be allocated by caller
+void FillCRSStructure(const int num_row_blocks,
+ const int num_rows,
+ const int* first_cell_in_row_block,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ int* rows,
+ int* cols,
+ cudaStream_t stream);
+
+// Permute segment of values from block-sparse matrix with sequential layout to
+// CRS order. Segment starts at block_sparse_offset and has length of num_values
+void PermuteToCRS(const int block_sparse_offset,
+ const int num_values,
+ const int num_row_blocks,
+ const int* first_cell_in_row_block,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ const int* crs_rows,
+ const double* block_sparse_values,
+ double* crs_values,
+ cudaStream_t stream);
+
+} // namespace internal
+} // namespace ceres
+
+#endif // CERES_NO_CUDA
+
+#endif // CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
diff --git a/internal/ceres/cuda_kernels_utils.h b/internal/ceres/cuda_kernels_utils.h
new file mode 100644
index 0000000..45d5d3b
--- /dev/null
+++ b/internal/ceres/cuda_kernels_utils.h
@@ -0,0 +1,56 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#ifndef CERES_INTERNAL_CUDA_KERNELS_UTILS_H_
+#define CERES_INTERNAL_CUDA_KERNELS_UTILS_H_
+
+namespace ceres {
+namespace internal {
+
+// Parallel execution on CUDA device requires splitting job into blocks of a
+// fixed size. We use block-size of kCudaBlockSize for all kernels that do not
+// require any specific block size. As the CUDA Toolkit documentation says,
+// "although arbitrary in this case, is a common choice". This is determined by
+// the warp size, max block size, and multiprocessor sizes of recent GPUs. For
+// complex kernels with significant register usage and unusual memory patterns,
+// the occupancy calculator API might provide better performance. See "Occupancy
+// Calculator" under the CUDA toolkit documentation.
+constexpr int kCudaBlockSize = 256;
+
+// Compute number of blocks of kCudaBlockSize that span over 1-d grid with
+// dimension size. Note that 1-d grid dimension is limited by 2^31-1 in CUDA,
+// thus a signed int is used as an argument.
+inline int NumBlocksInGrid(int size) {
+ return (size + kCudaBlockSize - 1) / kCudaBlockSize;
+}
+} // namespace internal
+} // namespace ceres
+
+#endif // CERES_INTERNAL_CUDA_KERNELS_UTILS_H_
diff --git a/internal/ceres/cuda_kernels_vector_ops.cu.cc b/internal/ceres/cuda_kernels_vector_ops.cu.cc
new file mode 100644
index 0000000..b5a8e40
--- /dev/null
+++ b/internal/ceres/cuda_kernels_vector_ops.cu.cc
@@ -0,0 +1,123 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include "ceres/cuda_kernels_vector_ops.h"
+
+#include <cuda_runtime.h>
+
+#include "ceres/cuda_kernels_utils.h"
+
+namespace ceres {
+namespace internal {
+
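+// Casts each element of input from SrcType to DstType; one element per thread.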
+template <typename SrcType, typename DstType>
+__global__ void TypeConversionKernel(const SrcType* __restrict__ input,
+ DstType* __restrict__ output,
+ const int size) {
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < size) {
+ output[i] = static_cast<DstType>(input[i]);
+ }
+}
+
+void CudaFP64ToFP32(const double* input,
+ float* output,
+ const int size,
+ cudaStream_t stream) {
+ const int num_blocks = NumBlocksInGrid(size);
+ TypeConversionKernel<double, float>
+ <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
+}
+
+void CudaFP32ToFP64(const float* input,
+ double* output,
+ const int size,
+ cudaStream_t stream) {
+ const int num_blocks = NumBlocksInGrid(size);
+ TypeConversionKernel<float, double>
+ <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
+}
+
+template <typename T>
+__global__ void SetZeroKernel(T* __restrict__ output, const int size) {
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < size) {
+ output[i] = T(0.0);
+ }
+}
+
+void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) {
+ const int num_blocks = NumBlocksInGrid(size);
+ SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
+}
+
+void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
+ const int num_blocks = NumBlocksInGrid(size);
+ SetZeroKernel<double>
+ <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
+}
+
+template <typename SrcType, typename DstType>
+__global__ void XPlusEqualsYKernel(DstType* __restrict__ x,
+ const SrcType* __restrict__ y,
+ const int size) {
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < size) {
+ x[i] = x[i] + DstType(y[i]);
+ }
+}
+
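+// Accumulates a single-precision vector into a double-precision one:
+// x[i] += static_cast<double>(y[i]).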
+void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) {
+ const int num_blocks = NumBlocksInGrid(size);
+ XPlusEqualsYKernel<float, double>
+ <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
+}
+
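+// Element-wise y += D' * D * x for a diagonal matrix with diagonal D, i.e.
+// y[i] += D[i] * D[i] * x[i].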
+__global__ void CudaDtDxpyKernel(double* __restrict__ y,
+ const double* D,
+ const double* __restrict__ x,
+ const int size) {
+ const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < size) {
+ y[i] = y[i] + D[i] * D[i] * x[i];
+ }
+}
+
+void CudaDtDxpy(double* y,
+ const double* D,
+ const double* x,
+ const int size,
+ cudaStream_t stream) {
+ const int num_blocks = NumBlocksInGrid(size);
+ CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size);
+}
+
+} // namespace internal
+} // namespace ceres
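(Usage illustration only, not part of the patch: a minimal host-side sketch
that exercises CudaSetZeroFP32 and CudaDsxpy through the standard CUDA runtime
API. The function name and setup here are hypothetical.)

    #include <vector>

    #include <cuda_runtime.h>

    #include "ceres/cuda_kernels_vector_ops.h"

    // Accumulate a (here zero-initialized) single-precision increment into a
    // double-precision vector on the GPU: x += y.
    void MixedPrecisionUpdateExample(std::vector<double>* x_host) {
      const int n = static_cast<int>(x_host->size());
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      double* d_x = nullptr;
      float* d_y = nullptr;
      cudaMalloc((void**)&d_x, n * sizeof(double));
      cudaMalloc((void**)&d_y, n * sizeof(float));
      cudaMemcpyAsync(d_x, x_host->data(), n * sizeof(double),
                      cudaMemcpyHostToDevice, stream);
      // In a real solver a single-precision computation would fill d_y; here
      // it is simply cleared.
      ceres::internal::CudaSetZeroFP32(d_y, n, stream);
      // x[i] += static_cast<double>(y[i]) for all i.
      ceres::internal::CudaDsxpy(d_x, d_y, n, stream);
      cudaMemcpyAsync(x_host->data(), d_x, n * sizeof(double),
                      cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);
      cudaFree(d_x);
      cudaFree(d_y);
      cudaStreamDestroy(stream);
    }
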
diff --git a/internal/ceres/cuda_kernels.h b/internal/ceres/cuda_kernels_vector_ops.h
similarity index 66%
rename from internal/ceres/cuda_kernels.h
rename to internal/ceres/cuda_kernels_vector_ops.h
index 61f945a..b179201 100644
--- a/internal/ceres/cuda_kernels.h
+++ b/internal/ceres/cuda_kernels_vector_ops.h
@@ -28,8 +28,8 @@
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
-#ifndef CERES_INTERNAL_CUDA_KERNELS_H_
-#define CERES_INTERNAL_CUDA_KERNELS_H_
+#ifndef CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_
+#define CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_
#include "ceres/internal/config.h"
@@ -75,39 +75,9 @@
const int size,
cudaStream_t stream);
-// Compute structure of CRS matrix and permutation of values using block-sparse
-// structure and temporary array row_block_ids. Array row_block_ids of size
-// num_rows will be filled with indices of row-blocks corresponding to rows of
-// CRS matrix. Arrays corresponding to CRS matrix, permutation and row_block_ids
-// arrays are to be allocated by caller
-void FillCRSStructure(const int num_row_blocks,
- const int num_rows,
- const int* row_block_offsets,
- const Cell* cells,
- const Block* row_blocks,
- const Block* col_blocks,
- int* rows,
- int* cols,
- int* row_block_ids,
- int* permutation,
- cudaStream_t stream);
-
-// Permute block of block-sparse values using permutation
-// Pointer block_sparse_values corresponds to a block of num_values values from
-// block-sparse matrix at the offset from begining. Pointer output corresponds
-// to values of CRS matrix. Array permutation stores permutation from
-// block-sparse to CRS matrix with permutation[i] being an index of i-th value
-// of block-sparse matrix in values of CRS matrix
-void PermuteValues(const int offset,
- const int num_values,
- const int* permutation,
- const double* block_sparse_values,
- double* crs_values,
- cudaStream_t stream);
-
} // namespace internal
} // namespace ceres
#endif // CERES_NO_CUDA
-#endif // CERES_INTERNAL_CUDA_KERNELS_H_
+#endif // CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_vector_ops_test.cc
similarity index 99%
rename from internal/ceres/cuda_kernels_test.cc
rename to internal/ceres/cuda_kernels_vector_ops_test.cc
index 41036a3..7f6e229 100644
--- a/internal/ceres/cuda_kernels_test.cc
+++ b/internal/ceres/cuda_kernels_vector_ops_test.cc
@@ -28,7 +28,7 @@
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
#include <math.h>
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index 0ae8c5d..e9ca9cc 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -52,7 +52,7 @@
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
#include "ceres/cuda_vector.h"
#include "cuda_runtime_api.h"
#include "cusparse.h"
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
index ad55170..f5fcb91 100644
--- a/internal/ceres/cuda_sparse_matrix.h
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -82,6 +82,10 @@
int num_cols() const { return num_cols_; }
int num_nonzeros() const { return num_nonzeros_; }
+ const int32_t* rows() const { return rows_.data(); }
+ const int32_t* cols() const { return cols_.data(); }
+ const double* values() const { return values_.data(); }
+
int32_t* mutable_rows() { return rows_.data(); }
int32_t* mutable_cols() { return cols_.data(); }
double* mutable_values() { return values_.data(); }
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index e2143de..d434f36 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -44,7 +44,7 @@
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
#include "ceres/cuda_vector.h"
#include "cublas_v2.h"
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h
index 34f3947..46661cf 100644
--- a/internal/ceres/cuda_vector.h
+++ b/internal/ceres/cuda_vector.h
@@ -50,7 +50,7 @@
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
#include "ceres/internal/eigen.h"
#include "cublas_v2.h"
#include "cusparse.h"
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index f19ca7e..fb95536 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -41,7 +41,7 @@
#ifndef CERES_NO_CUDA
#include "ceres/context_impl.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
#include "cuda_runtime.h"
#include "cusolverDn.h"
#endif // CERES_NO_CUDA
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 2ba03ed..c23c881 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -96,7 +96,7 @@
}
std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
- ContextImpl* context) {
+ ContextImpl* context, bool sequential) {
auto problem = bal_problem->mutable_problem();
auto problem_impl = problem->mutable_impl();
CHECK(problem_impl != nullptr);
@@ -116,6 +116,30 @@
auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
CHECK(block_sparse != nullptr);
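+  // If requested, rebuild the block structure with cell positions assigned
+  // sequentially, so that the values of the block-sparse matrix are laid out
+  // contiguously, row-block by row-block, cell by cell.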
+ if (sequential) {
+ auto block_structure_sequential =
+ std::make_unique<CompressedRowBlockStructure>(
+ *block_sparse->block_structure());
+ int num_nonzeros = 0;
+ for (auto& row_block : block_structure_sequential->rows) {
+ const int row_block_size = row_block.block.size;
+ for (auto& cell : row_block.cells) {
+ const int col_block_size =
+ block_structure_sequential->cols[cell.block_id].size;
+ cell.position = num_nonzeros;
+ num_nonzeros += col_block_size * row_block_size;
+ }
+ }
+ block_sparse = std::make_unique<BlockSparseMatrix>(
+ block_structure_sequential.release(),
+#ifndef CERES_NO_CUDA
+ true
+#else
+ false
+#endif
+ );
+ }
+
std::mt19937 rng;
std::normal_distribution<double> rnorm;
const int nnz = block_sparse->num_nonzeros();
@@ -123,6 +147,7 @@
for (int i = 0; i < nnz; ++i) {
values[i] = rnorm(rng);
}
+
return block_sparse;
}
@@ -134,11 +159,20 @@
const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) {
if (!block_sparse_jacobian) {
- block_sparse_jacobian = CreateBlockSparseJacobian(context);
+ block_sparse_jacobian = CreateBlockSparseJacobian(context, true);
}
return block_sparse_jacobian.get();
}
+ const BlockSparseMatrix* BlockSparseJacobianPartitioned(
+ ContextImpl* context) {
+ if (!block_sparse_jacobian_partitioned) {
+ block_sparse_jacobian_partitioned =
+ CreateBlockSparseJacobian(context, false);
+ }
+ return block_sparse_jacobian_partitioned.get();
+ }
+
const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
ContextImpl* context) {
if (!crs_jacobian) {
@@ -149,7 +183,7 @@
std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
const LinearSolver::Options& options) {
- auto block_sparse = BlockSparseJacobian(options.context);
+ auto block_sparse = BlockSparseJacobianPartitioned(options.context);
return std::make_unique<PartitionedView>(options, *block_sparse);
}
@@ -171,7 +205,7 @@
const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
const LinearSolver::Options& options) {
- auto block_sparse = BlockSparseJacobian(options.context);
+ auto block_sparse = BlockSparseJacobianPartitioned(options.context);
implicit_schur_complement =
std::make_unique<ImplicitSchurComplement>(options);
implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
@@ -180,7 +214,7 @@
const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
const LinearSolver::Options& options) {
- auto block_sparse = BlockSparseJacobian(options.context);
+ auto block_sparse = BlockSparseJacobianPartitioned(options.context);
implicit_schur_complement_diag =
std::make_unique<ImplicitSchurComplement>(options);
implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
@@ -192,6 +226,7 @@
Vector b;
std::unique_ptr<BundleAdjustmentProblem> bal_problem;
std::unique_ptr<PreprocessedProblem> preprocessed_problem;
+ std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_partitioned;
std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;