Block-sparse to CRS conversion using block-structure Instead of pre-computing pemutation from block-sparse to CRS order, index of value in CRS matrix is computed in the process of updating values using block-sparse structure. When it is possible to update values via a simple host-to-device copy, block-sparse structure on GPU is discarded after computing CRS structure. Computing index is significantly slower than using pre-computed permutation, but is still hidden by host-to-device transfer. On problems from BAL dataset this results into reduction of extra gpu memory consumption from 33% (permutation stored as 32-bit indices) to ~10% for storing block-sparse structure. Benchmark results: ======================= CUDA Device Properties ====================== Cuda version : 11.8 Device ID : 0 Device name : NVIDIA GeForce RTX 2080 Ti Total GPU memory : 11012 MiB GPU memory available : 10852 MiB Compute capability : 7.5 Warp size : 32 Max threads per block: 1024 Max threads per dim : 1024 1024 64 Max grid size : 2147483647 65535 65535 Multiprocessor count : 68 ==================================================================== Running ./bin/evaluation_benchmark Run on (112 X 3200 MHz CPU s) CPU Caches: L1 Data 32 KiB (x56) L1 Instruction 32 KiB (x56) L2 Unified 1024 KiB (x56) L3 Unified 39424 KiB (x2) Load Average: 24.58, 11.75, 8.52 ----------------------------------------------------------------------- Benchmark Time ----------------------------------------------------------------------- Using on-the-fly computation of CRS index corresponding to block-sparse index: JacobianToCRS<g/final/problem-4585-1324582-pre.txt> 1607 ms JacobianToCRSView<g/final/problem-4585-1324582-pre.txt> 564 ms JacobianToCRSMatrix<g/final/problem-4585-1324582-pre.txt> 2226 ms JacobianToCRSViewUpdate<g/final/problem-4585-1324582-pre.txt> 228 ms JacobianToCRSMatrixUpdate<g/final/problem-4585-1324582-pre.txt> 400 ms Using precomputed permutation: JacobianToCRS</final/problem-4585-1324582-pre.txt> 1656 ms JacobianToCRSView</final/problem-4585-1324582-pre.txt> 553 ms JacobianToCRSMatrix</final/problem-4585-1324582-pre.txt> 2255 ms JacobianToCRSViewUpdate</final/problem-4585-1324582-pre.txt> 228 ms JacobianToCRSMatrixUpdate</final/problem-4585-1324582-pre.txt> 406 ms Performance of JacobianToCRSViewUpdate is still limited by host-to-device transfer, and JacobianToCRSView is faster than computing CRS structure on CPU. Change-Id: Ifb6910fb01ae6071400d36c277846fadc5857964
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index e4b124d..be4bad1 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml
@@ -49,20 +49,23 @@ libsuitesparse-dev \ ninja-build \ wget - - name: Setup CUDA toolkit (system repositories) + # nvidia cuda toolkit shipped with 20.04 LTS does not support stream-ordered allocations + - name: Setup CUDA toolkit repositories (20.04) if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:20.04' run: | - apt-get install -y \ - nvidia-cuda-dev \ - nvidia-cuda-toolkit + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb + dpkg -i cuda-keyring_1.0-1_all.deb # nvidia cuda toolkit + gcc combo shipped with 22.04LTS is broken # and is not able to compile code that uses thrust # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1006962 - - name: Setup CUDA toolkit (nvidia repositories) + - name: Setup CUDA toolkit repositories (22.04) if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:22.04' run: | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb dpkg -i cuda-keyring_1.0-1_all.deb + - name: Setup CUDA toolkit + if: matrix.gpu == 'cuda' + run: | apt-get update apt-get install -y cuda echo "CUDACXX=/usr/local/cuda/bin/nvcc" >> $GITHUB_ENV
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 9cb99b4..b67b10e 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -140,9 +140,12 @@ COMMAND ${CUDAToolkit_BIN_DIR}/cuda-memcheck --leak-check full $<TARGET_FILE:cuda_dense_cholesky_test>) endif (BUILD_TESTING AND GFLAGS) - set_source_files_properties(cuda_kernels.cu.cc PROPERTIES LANGUAGE CUDA) - add_library(ceres_cuda_kernels STATIC cuda_kernels.cu.cc) + set_source_files_properties(cuda_kernels_vector_ops.cu.cc PROPERTIES LANGUAGE CUDA) + set_source_files_properties(cuda_kernels_bsm_to_crs.cu.cc PROPERTIES LANGUAGE CUDA) + add_library(ceres_cuda_kernels STATIC cuda_kernels_vector_ops.cu.cc cuda_kernels_bsm_to_crs.cu.cc) target_compile_features(ceres_cuda_kernels PRIVATE cxx_std_14) + # Enable __host__ / __device__ annotations in lambda declarations + target_compile_options(ceres_cuda_kernels PRIVATE --extended-lambda) target_include_directories(ceres_cuda_kernels PRIVATE ${Ceres_SOURCE_DIR}/internal ${Ceres_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR}) list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ceres_cuda_kernels) endif (USE_CUDA) @@ -479,7 +482,7 @@ ceres_test(cuda_block_structure) ceres_test(cuda_dense_cholesky) ceres_test(cuda_dense_qr) - ceres_test(cuda_kernels) + ceres_test(cuda_kernels_vector_ops) ceres_test(cuda_sparse_matrix) ceres_test(cuda_streamed_buffer) ceres_test(cuda_vector)
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc index ab1d746..01dbfbe 100644 --- a/internal/ceres/block_sparse_matrix.cc +++ b/internal/ceres/block_sparse_matrix.cc
@@ -709,7 +709,9 @@ } std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix( - const BlockSparseMatrix::RandomMatrixOptions& options, std::mt19937& prng) { + const BlockSparseMatrix::RandomMatrixOptions& options, + std::mt19937& prng, + bool use_page_locked_memory) { CHECK_GT(options.num_row_blocks, 0); CHECK_GT(options.min_row_block_size, 0); CHECK_GT(options.max_row_block_size, 0); @@ -766,7 +768,8 @@ } } - auto matrix = std::make_unique<BlockSparseMatrix>(bs.release()); + auto matrix = + std::make_unique<BlockSparseMatrix>(bs.release(), use_page_locked_memory); double* values = matrix->mutable_values(); std::normal_distribution<double> standard_normal_distribution; std::generate_n(
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h index 0d99e15..cd13b06 100644 --- a/internal/ceres/block_sparse_matrix.h +++ b/internal/ceres/block_sparse_matrix.h
@@ -157,7 +157,9 @@ // distributed and whose structure is determined by // RandomMatrixOptions. static std::unique_ptr<BlockSparseMatrix> CreateRandomMatrix( - const RandomMatrixOptions& options, std::mt19937& prng); + const RandomMatrixOptions& options, + std::mt19937& prng, + bool use_page_locked_memory = false); private: double* AllocateValues(int size);
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc index a03d01c..bb267da 100644 --- a/internal/ceres/cuda_block_sparse_crs_view.cc +++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -32,41 +32,69 @@ #ifndef CERES_NO_CUDA -#include "ceres/cuda_block_structure.h" -#include "ceres/cuda_kernels.h" +#include "ceres/cuda_kernels_bsm_to_crs.h" namespace ceres::internal { CudaBlockSparseCRSView::CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, ContextImpl* context) - : permutation_(context, bsm.num_nonzeros()), - streamed_buffer_(context, kMaxTemporaryArraySize) { - CudaBlockSparseStructure block_structure(*bsm.block_structure(), context); + : context_(context) { + block_structure_ = std::make_unique<CudaBlockSparseStructure>( + *bsm.block_structure(), context); + // Only block-sparse matrices with sequential layout of cells are supported + CHECK(block_structure_->sequential_layout()); crs_matrix_ = std::make_unique<CudaSparseMatrix>( bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context); - CudaBuffer<int> temp_rows(context, bsm.num_rows()); - FillCRSStructure(block_structure.num_row_blocks(), + FillCRSStructure(block_structure_->num_row_blocks(), bsm.num_rows(), - block_structure.row_block_offsets(), - block_structure.cells(), - block_structure.row_blocks(), - block_structure.col_blocks(), + block_structure_->first_cell_in_row_block(), + block_structure_->cells(), + block_structure_->row_blocks(), + block_structure_->col_blocks(), crs_matrix_->mutable_rows(), crs_matrix_->mutable_cols(), - temp_rows.data(), - permutation_.data(), context->DefaultStream()); + is_crs_compatible_ = block_structure_->IsCrsCompatible(); + // if matrix is crs-compatible - we can drop block-structure and don't need + // streamed_buffer_ + if (is_crs_compatible_) { + VLOG(3) << "Block-sparse matrix is compatible with CRS, discarding " + "block-structure"; + block_structure_ = nullptr; + } else { + streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>( + context_, kMaxTemporaryArraySize); + } UpdateValues(bsm); } + void CudaBlockSparseCRSView::UpdateValues(const BlockSparseMatrix& bsm) { - streamed_buffer_.CopyToGpu( + if (is_crs_compatible_) { + // Values of CRS-compatible matrices can be copied as-is + CHECK_EQ(cudaSuccess, + cudaMemcpyAsync(crs_matrix_->mutable_values(), + bsm.values(), + bsm.num_nonzeros() * sizeof(double), + cudaMemcpyHostToDevice, + context_->DefaultStream())); + return; + } + streamed_buffer_->CopyToGpu( bsm.values(), bsm.num_nonzeros(), - [permutation = permutation_.data(), - values_to = crs_matrix_->mutable_values()]( + [bs = block_structure_.get(), crs = crs_matrix_.get()]( const double* values, int num_values, int offset, auto stream) { - PermuteValues( - offset, num_values, permutation, values, values_to, stream); + PermuteToCRS(offset, + num_values, + bs->num_row_blocks(), + bs->first_cell_in_row_block(), + bs->cells(), + bs->row_blocks(), + bs->col_blocks(), + crs->rows(), + values, + crs->mutable_values(), + stream); }); }
diff --git a/internal/ceres/cuda_block_sparse_crs_view.h b/internal/ceres/cuda_block_sparse_crs_view.h index 3ea8498..2ae8721 100644 --- a/internal/ceres/cuda_block_sparse_crs_view.h +++ b/internal/ceres/cuda_block_sparse_crs_view.h
@@ -39,6 +39,7 @@ #include <memory> #include "ceres/block_sparse_matrix.h" +#include "ceres/cuda_block_structure.h" #include "ceres/cuda_buffer.h" #include "ceres/cuda_sparse_matrix.h" #include "ceres/cuda_streamed_buffer.h" @@ -49,42 +50,23 @@ // following operations in order to compute products of block-sparse matrices // and dense vectors on gpu: // - Once per block-sparse structure update: -// - Compute CRS structure from block-sparse structure -// - Compute permutation from block-sparse values to CRS values +// - Compute CRS structure from block-sparse structure and check if values of +// block-sparse matrix would have the same order as values of CRS matrix // - Once per block-sparse values update: // - Update values in CRS matrix with values of block-sparse matrix // -// Since there are no constraints on positions of cells in value array of -// block-sparse matrix, a permutation from block-sparse values to CRS -// values is stored explicitly. +// Only block-sparse matrices with sequential order of cells are supported. // -// Example: given matrix with the following block-structure -// [ 1 2 | | 6 7 ] -// [ 3 4 | | 8 9 ] -// [-----+-+-----] -// [ |5| ] -// with values stored as values_block_sparse = [1, 2, 3, 4, 5, 6, 7, 8, 9], -// permutation from block-sparse to CRS is p = [0, 1, 4, 5, 8, 2, 3, 6, 7]; -// and i-th block-sparse value has index p[i] in CRS values array -// -// This allows to avoid storing both CRS and block-sparse values in GPU memory. -// Instead, block-sparse values are transferred to gpu memory as a disjoint set -// of small continuous segments with simultaneous permutation of the values into -// correct order +// UpdateValues method updates values: +// - In a single host-to-device copy for matrices with CRS-compatible value +// layout +// - Simultaneously transferring and permuting values using CudaStreamedBuffer +// otherwise class CERES_NO_EXPORT CudaBlockSparseCRSView { public: - // Initializes internal CRS matrix and permutation from block-sparse to CRS - // values. The following objects are stored in gpu memory for the whole - // lifetime of the object - // - crs_matrix_: CRS matrix - // - permutation_: permutation from block-sparse to CRS value order - // (num_nonzeros integer values) - // - streamed_buffer_: helper for value updating - // The following objects are created temporarily during construction: - // - CudaBlockSparseStructure: block-sparse structure of block-sparse matrix - // - num_rows integer values: row to row-block map - // If copy_values flag is set to false, only structure of block-sparse matrix - // bsm is captured, and values are left uninitialized + // Initializes internal CRS matrix using structure and values of block-sparse + // matrix For block-sparse matrices that have value layout different from CRS + // block-sparse structure will be stored/ CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, ContextImpl* context); const CudaSparseMatrix* crs_matrix() const { return crs_matrix_.get(); } @@ -95,16 +77,21 @@ // used for construction. void UpdateValues(const BlockSparseMatrix& bsm); + // Returns true if block-sparse matrix had CRS-compatible value layout + bool IsCrsCompatible() const { return is_crs_compatible_; } + private: // Value permutation kernel performs a single element-wise operation per // thread, thus performing permutation in blocks of 8 megabytes of // block-sparse values seems reasonable static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024; std::unique_ptr<CudaSparseMatrix> crs_matrix_; - // Permutation from block-sparse to CRS value order. - // permutation_[i] = index of i-th block-sparse value in CRS values - CudaBuffer<int> permutation_; - CudaStreamedBuffer<double> streamed_buffer_; + // Only created if block-sparse matrix has non-CRS value layout + std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_; + // Only stored if block-sparse matrix has non-CRS value layout + std::unique_ptr<CudaBlockSparseStructure> block_structure_; + bool is_crs_compatible_; + ContextImpl* context_; }; } // namespace ceres::internal
diff --git a/internal/ceres/cuda_block_sparse_crs_view_test.cc b/internal/ceres/cuda_block_sparse_crs_view_test.cc index f8d1cdf..e80d026 100644 --- a/internal/ceres/cuda_block_sparse_crs_view_test.cc +++ b/internal/ceres/cuda_block_sparse_crs_view_test.cc
@@ -52,48 +52,110 @@ options.max_col_block_size = 10; options.block_density = 0.2; std::mt19937 rng; - A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng); - std::iota( - A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1); + + // Block-sparse matrix with order of values different from CRS + block_sparse_non_crs_compatible_ = + BlockSparseMatrix::CreateRandomMatrix(options, rng, true); + std::iota(block_sparse_non_crs_compatible_->mutable_values(), + block_sparse_non_crs_compatible_->mutable_values() + + block_sparse_non_crs_compatible_->num_nonzeros(), + 1); + + options.max_row_block_size = 1; + // Block-sparse matrix with CRS order of values (row-blocks are rows) + block_sparse_crs_compatible_rows_ = + BlockSparseMatrix::CreateRandomMatrix(options, rng, true); + std::iota(block_sparse_crs_compatible_rows_->mutable_values(), + block_sparse_crs_compatible_rows_->mutable_values() + + block_sparse_crs_compatible_rows_->num_nonzeros(), + 1); + // Block-sparse matrix with CRS order of values (single cell per row-block) + auto bs = std::make_unique<CompressedRowBlockStructure>( + *block_sparse_non_crs_compatible_->block_structure()); + + int num_nonzeros = 0; + for (auto& r : bs->rows) { + const int num_cells = r.cells.size(); + if (num_cells > 1) { + std::uniform_int_distribution<int> uniform_cell(0, num_cells - 1); + const int selected_cell = uniform_cell(rng); + std::swap(r.cells[0], r.cells[selected_cell]); + r.cells.resize(1); + } + const int row_block_size = r.block.size; + for (auto& c : r.cells) { + c.position = num_nonzeros; + const int col_block_size = bs->cols[c.block_id].size; + num_nonzeros += col_block_size * row_block_size; + } + } + block_sparse_crs_compatible_single_cell_ = + std::make_unique<BlockSparseMatrix>(bs.release()); + std::iota(block_sparse_crs_compatible_single_cell_->mutable_values(), + block_sparse_crs_compatible_single_cell_->mutable_values() + + block_sparse_crs_compatible_single_cell_->num_nonzeros(), + 1); } - std::unique_ptr<BlockSparseMatrix> A_; + void Compare(const BlockSparseMatrix& bsm, CudaSparseMatrix& csm) { + ASSERT_EQ(csm.num_cols(), bsm.num_cols()); + ASSERT_EQ(csm.num_rows(), bsm.num_rows()); + ASSERT_EQ(csm.num_nonzeros(), bsm.num_nonzeros()); + const int num_rows = bsm.num_rows(); + const int num_cols = bsm.num_cols(); + Vector x(num_cols); + Vector y(num_rows); + CudaVector x_cuda(&context_, num_cols); + CudaVector y_cuda(&context_, num_rows); + Vector y_cuda_host(num_rows); + + for (int i = 0; i < num_cols; ++i) { + x.setZero(); + y.setZero(); + y_cuda.SetZero(); + x[i] = 1.; + x_cuda.CopyFromCpu(x); + csm.RightMultiplyAndAccumulate(x_cuda, &y_cuda); + bsm.RightMultiplyAndAccumulate( + x.data(), y.data(), &context_, std::thread::hardware_concurrency()); + y_cuda.CopyTo(&y_cuda_host); + // There will be up to 1 non-zero product per row, thus we expect an exact + // match on 32-bit integer indices + EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.); + } + } + + std::unique_ptr<BlockSparseMatrix> block_sparse_non_crs_compatible_; + std::unique_ptr<BlockSparseMatrix> block_sparse_crs_compatible_rows_; + std::unique_ptr<BlockSparseMatrix> block_sparse_crs_compatible_single_cell_; ContextImpl context_; }; -TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValues) { - auto view = CudaBlockSparseCRSView(*A_, &context_); +TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesNonCompatible) { + auto view = + CudaBlockSparseCRSView(*block_sparse_non_crs_compatible_, &context_); + ASSERT_EQ(view.IsCrsCompatible(), false); - // TODO: we definitely would like to use crs_matrix() here, but - // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because - // it might allocate additional storage by request of cuSPARSE auto matrix = view.mutable_crs_matrix(); - ASSERT_EQ(matrix->num_cols(), A_->num_cols()); - ASSERT_EQ(matrix->num_rows(), A_->num_rows()); - ASSERT_EQ(matrix->num_nonzeros(), A_->num_nonzeros()); + Compare(*block_sparse_non_crs_compatible_, *matrix); +} - const int num_rows = A_->num_rows(); - const int num_cols = A_->num_cols(); - Vector x(num_cols); - Vector y(num_rows); - CudaVector x_cuda(&context_, num_cols); - CudaVector y_cuda(&context_, num_rows); - Vector y_cuda_host(num_rows); +TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesCompatibleRows) { + auto view = + CudaBlockSparseCRSView(*block_sparse_crs_compatible_rows_, &context_); + ASSERT_EQ(view.IsCrsCompatible(), true); - for (int i = 0; i < num_cols; ++i) { - x.setZero(); - y.setZero(); - y_cuda.SetZero(); - x[i] = 1.; - x_cuda.CopyFromCpu(x); - A_->RightMultiplyAndAccumulate( - x.data(), y.data(), &context_, std::thread::hardware_concurrency()); - matrix->RightMultiplyAndAccumulate(x_cuda, &y_cuda); - y_cuda.CopyTo(&y_cuda_host); - // There will be up to 1 non-zero product per row, thus we expect an exact - // match on 32-bit integer indices - EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.); - } + auto matrix = view.mutable_crs_matrix(); + Compare(*block_sparse_crs_compatible_rows_, *matrix); +} + +TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesCompatibleSingleCell) { + auto view = CudaBlockSparseCRSView(*block_sparse_crs_compatible_single_cell_, + &context_); + ASSERT_EQ(view.IsCrsCompatible(), true); + + auto matrix = view.mutable_crs_matrix(); + Compare(*block_sparse_crs_compatible_single_cell_, *matrix); } } // namespace ceres::internal
diff --git a/internal/ceres/cuda_block_structure.cc b/internal/ceres/cuda_block_structure.cc index 8e549e1..5817cd7 100644 --- a/internal/ceres/cuda_block_structure.cc +++ b/internal/ceres/cuda_block_structure.cc
@@ -46,7 +46,7 @@ CudaBlockSparseStructure::CudaBlockSparseStructure( const CompressedRowBlockStructure& block_structure, ContextImpl* context) - : row_block_offsets_(context), + : first_cell_in_row_block_(context), cells_(context), row_blocks_(context), col_blocks_(context) { @@ -56,44 +56,56 @@ const auto& col_blocks = block_structure.cols; // Row block offset is an index of the first cell corresponding to row block - std::vector<int> row_block_offsets; + std::vector<int> first_cell_in_row_block; // Flat array of all cells from all row-blocks std::vector<Cell> cells; + int f_values_offset = 0; + is_crs_compatible_ = true; num_row_blocks_ = block_structure.rows.size(); num_col_blocks_ = col_blocks.size(); row_blocks.reserve(num_row_blocks_); - row_block_offsets.reserve(num_row_blocks_ + 1); + first_cell_in_row_block.reserve(num_row_blocks_ + 1); num_nonzeros_ = 0; - num_cells_ = 0; + sequential_layout_ = true; for (const auto& r : block_structure.rows) { const int row_block_size = r.block.size; + if (r.cells.size() > 1 && row_block_size > 1) { + is_crs_compatible_ = false; + } row_blocks.emplace_back(r.block); - row_block_offsets.push_back(num_cells_); + first_cell_in_row_block.push_back(cells.size()); for (const auto& c : r.cells) { - cells.emplace_back(c); const int col_block_size = col_blocks[c.block_id].size; - num_nonzeros_ += col_block_size * row_block_size; - ++num_cells_; + const int cell_size = col_block_size * row_block_size; + cells.push_back(c); + sequential_layout_ &= c.position == num_nonzeros_; + num_nonzeros_ += cell_size; } } - row_block_offsets.push_back(num_cells_); + first_cell_in_row_block.push_back(cells.size()); + num_cells_ = cells.size(); num_rows_ = Dimension(row_blocks); num_cols_ = Dimension(col_blocks); + is_crs_compatible_ &= sequential_layout_; + if (VLOG_IS_ON(3)) { - const size_t row_block_offsets_size = - row_block_offsets.size() * sizeof(int); + const size_t first_cell_in_row_block_size = + first_cell_in_row_block.size() * sizeof(int); const size_t cells_size = cells.size() * sizeof(Cell); const size_t row_blocks_size = row_blocks.size() * sizeof(Block); const size_t col_blocks_size = col_blocks.size() * sizeof(Block); - const size_t total_size = - row_block_offsets_size + cells_size + col_blocks_size + row_blocks_size; + const size_t total_size = first_cell_in_row_block_size + cells_size + + col_blocks_size + row_blocks_size; + const double ratio = + (100. * total_size) / (num_nonzeros_ * (sizeof(int) + sizeof(double)) + + num_rows_ * sizeof(int)); VLOG(3) << "\nCudaBlockSparseStructure:\n" "\tRow block offsets: " - << row_block_offsets_size + << first_cell_in_row_block_size << " bytes\n" "\tColumn blocks: " << col_blocks_size @@ -102,13 +114,11 @@ << row_blocks_size << " bytes\n" "\tCells: " - << cells_size - << " bytes\n" - "\tTotal: " - << total_size << " bytes of GPU memory"; + << cells_size << " bytes\n\tTotal: " << total_size + << " bytes of GPU memory (" << ratio << "% of CRS matrix size)"; } - row_block_offsets_.CopyFromCpuVector(row_block_offsets); + first_cell_in_row_block_.CopyFromCpuVector(first_cell_in_row_block); cells_.CopyFromCpuVector(cells); row_blocks_.CopyFromCpuVector(row_blocks); col_blocks_.CopyFromCpuVector(col_blocks);
diff --git a/internal/ceres/cuda_block_structure.h b/internal/ceres/cuda_block_structure.h index a41d0b3..28e36b2 100644 --- a/internal/ceres/cuda_block_structure.h +++ b/internal/ceres/cuda_block_structure.h
@@ -44,12 +44,11 @@ // This class stores a read-only block-sparse structure in gpu memory. // Invariants are the same as those of CompressedRowBlockStructure. // In order to simplify allocation and copying data to gpu, cells from all -// row-blocks are stored in a single array sequentially. Additional array -// row_block_offsets of size num_row_blocks + 1 allows to identify range of -// cells corresponding to a row-block. -// Cells corresponding to i-th row-block are stored in sub-array -// cells[row_block_offsets[i]; ... row_block_offsets[i + 1] - 1], and their -// order is preserved. +// row-blocks are stored in a single array sequentially. Array +// first_cell_in_row_block of size num_row_blocks + 1 allows to identify range +// of cells corresponding to a row-block. Cells corresponding to i-th row-block +// are stored in sub-array cells[first_cell_in_row_block[i]; ... +// first_cell_in_row_block[i + 1] - 1], and their order is preserved. class CERES_NO_EXPORT CudaBlockSparseStructure { public: // CompressedRowBlockStructure is contains a vector of CompressedLists, with @@ -65,9 +64,20 @@ int num_row_blocks() const { return num_row_blocks_; } int num_col_blocks() const { return num_col_blocks_; } + // Returns true if values from block-sparse matrix can be copied to CRS matrix + // as-is. This is possible if each row-block is stored in CRS order: + // - Row-block consists of a single row + // - Row-block contains a single cell + bool IsCrsCompatible() const { return is_crs_compatible_; } + // Returns true if block-sparse structure corresponds to block-sparse matrix + // with sequential cell positions + bool sequential_layout() const { return sequential_layout_; } + // Device pointer to array of num_row_blocks + 1 indices of the first cell of // row block - const int* row_block_offsets() const { return row_block_offsets_.data(); } + const int* first_cell_in_row_block() const { + return first_cell_in_row_block_.data(); + } // Device pointer to array of num_cells cells, sorted by row-block const Cell* cells() const { return cells_.data(); } // Device pointer to array of row blocks @@ -82,7 +92,9 @@ int num_nonzeros_; int num_row_blocks_; int num_col_blocks_; - CudaBuffer<int> row_block_offsets_; + bool is_crs_compatible_; + bool sequential_layout_; + CudaBuffer<int> first_cell_in_row_block_; CudaBuffer<Cell> cells_; CudaBuffer<Block> row_blocks_; CudaBuffer<Block> col_blocks_;
diff --git a/internal/ceres/cuda_block_structure_test.cc b/internal/ceres/cuda_block_structure_test.cc index 95cb26b..000b6bb 100644 --- a/internal/ceres/cuda_block_structure_test.cc +++ b/internal/ceres/cuda_block_structure_test.cc
@@ -83,10 +83,10 @@ } std::vector<int> GetRowBlockOffsets( const CudaBlockSparseStructure& structure) { - const auto& cuda_buffer = structure.row_block_offsets_; - std::vector<int> row_block_offsets(cuda_buffer.size()); - cuda_buffer.CopyToCpu(row_block_offsets.data(), row_block_offsets.size()); - return row_block_offsets; + const auto& cuda_buffer = structure.first_cell_in_row_block_; + std::vector<int> first_cell_in_row_block(cuda_buffer.size()); + cuda_buffer.CopyToCpu(first_cell_in_row_block.data(), first_cell_in_row_block.size()); + return first_cell_in_row_block; } std::unique_ptr<BlockSparseMatrix> A_; @@ -114,19 +114,19 @@ } std::vector<Cell> cells = GetCells(cuda_block_structure); - std::vector<int> row_block_offsets = GetRowBlockOffsets(cuda_block_structure); + std::vector<int> first_cell_in_row_block = GetRowBlockOffsets(cuda_block_structure); blocks = GetRowBlocks(cuda_block_structure); ASSERT_EQ(blocks.size(), num_row_blocks); - ASSERT_EQ(row_block_offsets.size(), num_row_blocks + 1); - ASSERT_EQ(row_block_offsets.back(), cells.size()); + ASSERT_EQ(first_cell_in_row_block.size(), num_row_blocks + 1); + ASSERT_EQ(first_cell_in_row_block.back(), cells.size()); for (int i = 0; i < num_row_blocks; ++i) { const int num_cells = block_structure->rows[i].cells.size(); EXPECT_EQ(blocks[i].position, block_structure->rows[i].block.position); EXPECT_EQ(blocks[i].size, block_structure->rows[i].block.size); - const int first_cell = row_block_offsets[i]; - const int last_cell = row_block_offsets[i + 1]; + const int first_cell = first_cell_in_row_block[i]; + const int last_cell = first_cell_in_row_block[i + 1]; ASSERT_EQ(last_cell - first_cell, num_cells); for (int j = 0; j < num_cells; ++j) { EXPECT_EQ(cells[first_cell + j].block_id,
diff --git a/internal/ceres/cuda_kernels.cu.cc b/internal/ceres/cuda_kernels.cu.cc deleted file mode 100644 index 3d93ae8..0000000 --- a/internal/ceres/cuda_kernels.cu.cc +++ /dev/null
@@ -1,304 +0,0 @@ -// Ceres Solver - A fast non-linear least squares minimizer -// Copyright 2022 Google Inc. All rights reserved. -// http://ceres-solver.org/ -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// * Neither the name of Google Inc. nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) - -//#include "ceres/cuda_kernels.h" - -#include <thrust/execution_policy.h> -#include <thrust/scan.h> - -#include "block_structure.h" -#include "cuda_runtime.h" - -namespace ceres { -namespace internal { - -// As the CUDA Toolkit documentation says, "although arbitrary in this case, is -// a common choice". This is determined by the warp size, max block size, and -// multiprocessor sizes of recent GPUs. For complex kernels with significant -// register usage and unusual memory patterns, the occupancy calculator API -// might provide better performance. See "Occupancy Calculator" under the CUDA -// toolkit documentation. -constexpr int kCudaBlockSize = 256; -inline int NumBlocks(int size) { - return (size + kCudaBlockSize - 1) / kCudaBlockSize; -} - -template <typename SrcType, typename DstType> -__global__ void TypeConversionKernel(const SrcType* __restrict__ input, - DstType* __restrict__ output, - const int size) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) { - output[i] = static_cast<DstType>(input[i]); - } -} - -void CudaFP64ToFP32(const double* input, - float* output, - const int size, - cudaStream_t stream) { - const int num_blocks = NumBlocks(size); - TypeConversionKernel<double, float> - <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size); -} - -void CudaFP32ToFP64(const float* input, - double* output, - const int size, - cudaStream_t stream) { - const int num_blocks = NumBlocks(size); - TypeConversionKernel<float, double> - <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size); -} - -template <typename T> -__global__ void SetZeroKernel(T* __restrict__ output, const int size) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) { - output[i] = T(0.0); - } -} - -void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) { - const int num_blocks = NumBlocks(size); - SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size); -} - -void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) { - const int num_blocks = NumBlocks(size); - SetZeroKernel<double> - <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size); -} - -template <typename SrcType, typename DstType> -__global__ void XPlusEqualsYKernel(DstType* __restrict__ x, - const SrcType* __restrict__ y, - const int size) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) { - x[i] = x[i] + DstType(y[i]); - } -} - -void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) { - const int num_blocks = NumBlocks(size); - XPlusEqualsYKernel<float, double> - <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size); -} - -__global__ void CudaDtDxpyKernel(double* __restrict__ y, - const double* D, - const double* __restrict__ x, - const int size) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < size) { - y[i] = y[i] + D[i] * D[i] * x[i]; - } -} - -void CudaDtDxpy(double* y, - const double* D, - const double* x, - const int size, - cudaStream_t stream) { - const int num_blocks = NumBlocks(size); - CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size); -} - -// Fill row block id and nnz for each row using block-sparse structure -// represented by a set of flat arrays. -// Inputs: -// - num_row_blocks: number of row-blocks in block-sparse structure -// - row_block_offsets: index of the first cell of the row-block; size: -// num_row_blocks + 1 -// - cells: cells of block-sparse structure as a continuous array -// - row_blocks: row blocks of block-sparse structure stored sequentially -// - col_blocks: column blocks of block-sparse structure stored sequentially -// Outputs: -// - rows: rows[i + 1] will contain number of non-zeros in i-th row, rows[0] -// will be set to 0; rows are filled with a shift by one element in order -// to obtain row-index array of CRS matrix with a inclusive scan afterwards -// - row_block_ids: row_block_ids[i] will be set to index of row-block that -// contains i-th row. -// Computation is perform row-block-wise -__global__ void RowBlockIdAndNNZ(int num_row_blocks, - const int* __restrict__ row_block_offsets, - const Cell* __restrict__ cells, - const Block* __restrict__ row_blocks, - const Block* __restrict__ col_blocks, - int* __restrict__ rows, - int* __restrict__ row_block_ids) { - const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x; - if (row_block_id > num_row_blocks) { - // No synchronization is performed in this kernel, thus it is safe to return - return; - } - if (row_block_id == num_row_blocks) { - // one extra thread sets the first element - rows[0] = 0; - return; - } - const auto& row_block = row_blocks[row_block_id]; - int row_nnz = 0; - const auto first_cell = cells + row_block_offsets[row_block_id]; - const auto last_cell = cells + row_block_offsets[row_block_id + 1]; - for (auto cell = first_cell; cell < last_cell; ++cell) { - row_nnz += col_blocks[cell->block_id].size; - } - const int first_row = row_block.position; - const int last_row = first_row + row_block.size; - for (int i = first_row; i < last_row; ++i) { - rows[i + 1] = row_nnz; - row_block_ids[i] = row_block_id; - } -} - -// Row-wise creation of CRS structure -// Inputs: -// - num_rows: number of rows in matrix -// - row_block_offsets: index of the first cell of the row-block; size: -// num_row_blocks + 1 -// - cells: cells of block-sparse structure as a continuous array -// - row_blocks: row blocks of block-sparse structure stored sequentially -// - col_blocks: column blocks of block-sparse structure stored sequentially -// - row_block_ids: index of row-block that corresponds to row -// - rows: row-index array of CRS structure -// Outputs: -// - cols: column-index array of CRS structure -// - permutation: permutation from block-sparse to crs order -// Computaion is perform row-wise -__global__ void ComputeColumnsAndPermutation( - int num_rows, - const int* __restrict__ row_block_offsets, - const Cell* __restrict__ cells, - const Block* __restrict__ row_blocks, - const Block* __restrict__ col_blocks, - const int* __restrict__ row_block_ids, - const int* __restrict__ rows, - int* __restrict__ cols, - int* __restrict__ permutation) { - const int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row >= num_rows) { - // No synchronization is performed in this kernel, thus it is safe to return - return; - } - const int row_block_id = row_block_ids[row]; - // zero-based index of row in row-block - const int row_in_block = row - row_blocks[row_block_id].position; - // position in crs matrix - int crs_position = rows[row]; - const auto first_cell = cells + row_block_offsets[row_block_id]; - const auto last_cell = cells + row_block_offsets[row_block_id + 1]; - // For reach cell of row-block only current row is being filled - for (auto cell = first_cell; cell < last_cell; ++cell) { - const auto& col_block = col_blocks[cell->block_id]; - const int col_block_size = col_block.size; - int column_idx = col_block.position; - int bs_position = cell->position + row_in_block * col_block_size; - // Fill permutation and column indices for each element of row_in_block - // row of current cell - for (int i = 0; i < col_block_size; ++i, ++crs_position) { - permutation[bs_position++] = crs_position; - cols[crs_position] = column_idx++; - } - } -} - -void FillCRSStructure(const int num_row_blocks, - const int num_rows, - const int* row_block_offsets, - const Cell* cells, - const Block* row_blocks, - const Block* col_blocks, - int* rows, - int* cols, - int* row_block_ids, - int* permutation, - cudaStream_t stream) { - // Set number of non-zeros per row in rows array and row to row-block map in - // row_block_ids array - const int num_blocks_blockwise = NumBlocks(num_row_blocks + 1); - RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( - num_row_blocks, - row_block_offsets, - cells, - row_blocks, - col_blocks, - rows, - row_block_ids); - // Finalize row-index array of CRS strucure by computing prefix sum - thrust::inclusive_scan( - thrust::cuda::par.on(stream), rows, rows + num_rows + 1, rows); - - // Fill cols array of CRS structure and permutation from block-sparse to CRS - const int num_blocks_rowwise = NumBlocks(num_rows); - ComputeColumnsAndPermutation<<<num_blocks_rowwise, - kCudaBlockSize, - 0, - stream>>>(num_rows, - row_block_offsets, - cells, - row_blocks, - col_blocks, - row_block_ids, - rows, - cols, - permutation); -} - -// Updates values of CRS matrix with a continuous block of values of -// block-sparse matrix. With permutation[i] being an index of i-th value of -// block-sparse matrix in values of CRS matrix updating values is quite -// efficient when performed element-wise (reads of permutation and values arrays -// are coalesced when offset is divisable by 16) -__global__ void PermuteValuesKernel(const int offset, - const int num_values, - const int* permutation, - const double* block_sparse_values, - double* crs_values) { - const int value_id = blockIdx.x * blockDim.x + threadIdx.x; - if (value_id < num_values) { - crs_values[permutation[offset + value_id]] = block_sparse_values[value_id]; - } -} - -void PermuteValues(const int offset, - const int num_values, - const int* permutation, - const double* block_sparse_values, - double* crs_values, - cudaStream_t stream) { - const int num_blocks = NumBlocks(num_values); - PermuteValuesKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>( - offset, num_values, permutation, block_sparse_values, crs_values); -} - -} // namespace internal -} // namespace ceres
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc new file mode 100644 index 0000000..8be3553 --- /dev/null +++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -0,0 +1,279 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/cuda_kernels_bsm_to_crs.h" + +#include <cuda_runtime.h> +#include <thrust/execution_policy.h> +#include <thrust/scan.h> + +#include "ceres/block_structure.h" +#include "ceres/cuda_kernels_utils.h" + +namespace ceres { +namespace internal { + +// Fill row block id and nnz for each row using block-sparse structure +// represented by a set of flat arrays. +// Inputs: +// - num_row_blocks: number of row-blocks in block-sparse structure +// - first_cell_in_row_block: index of the first cell of the row-block; size: +// num_row_blocks + 1 +// - cells: cells of block-sparse structure as a continuous array +// - row_blocks: row blocks of block-sparse structure stored sequentially +// - col_blocks: column blocks of block-sparse structure stored sequentially +// Outputs: +// - rows: rows[i + 1] will contain number of non-zeros in i-th row, rows[0] +// will be set to 0; rows are filled with a shift by one element in order +// to obtain row-index array of CRS matrix with a inclusive scan afterwards +// - row_block_ids: row_block_ids[i] will be set to index of row-block that +// contains i-th row. +// Computation is perform row-block-wise +__global__ void RowBlockIdAndNNZ( + int num_row_blocks, + const int* __restrict__ first_cell_in_row_block, + const Cell* __restrict__ cells, + const Block* __restrict__ row_blocks, + const Block* __restrict__ col_blocks, + int* __restrict__ rows, + int* __restrict__ row_block_ids) { + const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x; + if (row_block_id > num_row_blocks) { + // No synchronization is performed in this kernel, thus it is safe to return + return; + } + if (row_block_id == num_row_blocks) { + // one extra thread sets the first element + rows[0] = 0; + return; + } + const auto& row_block = row_blocks[row_block_id]; + int row_nnz = 0; + const auto first_cell = cells + first_cell_in_row_block[row_block_id]; + const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; + for (auto cell = first_cell; cell < last_cell; ++cell) { + row_nnz += col_blocks[cell->block_id].size; + } + const int first_row = row_block.position; + const int last_row = first_row + row_block.size; + for (int i = first_row; i < last_row; ++i) { + rows[i + 1] = row_nnz; + row_block_ids[i] = row_block_id; + } +} + +// Row-wise creation of CRS structure +// Inputs: +// - num_rows: number of rows in matrix +// - first_cell_in_row_block: index of the first cell of the row-block; size: +// num_row_blocks + 1 +// - cells: cells of block-sparse structure as a continuous array +// - row_blocks: row blocks of block-sparse structure stored sequentially +// - col_blocks: column blocks of block-sparse structure stored sequentially +// - row_block_ids: index of row-block that corresponds to row +// - rows: row-index array of CRS structure +// Outputs: +// - cols: column-index array of CRS structure +// Computaion is perform row-wise +__global__ void ComputeColumnsAndPermutation( + int num_rows, + const int* __restrict__ first_cell_in_row_block, + const Cell* __restrict__ cells, + const Block* __restrict__ row_blocks, + const Block* __restrict__ col_blocks, + const int* __restrict__ row_block_ids, + const int* __restrict__ rows, + int* __restrict__ cols) { + const int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= num_rows) { + // No synchronization is performed in this kernel, thus it is safe to return + return; + } + const int row_block_id = row_block_ids[row]; + // position in crs matrix + int crs_position = rows[row]; + const auto first_cell = cells + first_cell_in_row_block[row_block_id]; + const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; + // For reach cell of row-block only current row is being filled + for (auto cell = first_cell; cell < last_cell; ++cell) { + const auto& col_block = col_blocks[cell->block_id]; + const int col_block_size = col_block.size; + int column_idx = col_block.position; + // Column indices for each element of row_in_block row of current cell + for (int i = 0; i < col_block_size; ++i, ++crs_position) { + cols[crs_position] = column_idx++; + } + } +} + +void FillCRSStructure(const int num_row_blocks, + const int num_rows, + const int* first_cell_in_row_block, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + int* rows, + int* cols, + cudaStream_t stream) { + // Set number of non-zeros per row in rows array and row to row-block map in + // row_block_ids array + int* row_block_ids; + cudaMallocAsync(&row_block_ids, sizeof(int) * num_rows, stream); + const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1); + RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( + num_row_blocks, + first_cell_in_row_block, + cells, + row_blocks, + col_blocks, + rows, + row_block_ids); + // Finalize row-index array of CRS strucure by computing prefix sum + thrust::inclusive_scan( + thrust::cuda::par_nosync.on(stream), rows, rows + num_rows + 1, rows); + + // Fill cols array of CRS structure and permutation from block-sparse to CRS + const int num_blocks_rowwise = NumBlocksInGrid(num_rows); + ComputeColumnsAndPermutation<<<num_blocks_rowwise, + kCudaBlockSize, + 0, + stream>>>(num_rows, + first_cell_in_row_block, + cells, + row_blocks, + col_blocks, + row_block_ids, + rows, + cols); + cudaFreeAsync(row_block_ids, stream); +} + +template <typename T, typename Predicate> +__device__ int PartitionPoint(const T* data, + int first, + int last, + Predicate&& predicate) { + if (!predicate(data[first])) { + return first; + } + while (last - first > 1) { + const auto midpoint = first + (last - first) / 2; + if (predicate(data[midpoint])) { + first = midpoint; + } else { + last = midpoint; + } + } + return last; +} + +// Element-wise reordering of block-sparse values +// - first_cell_pos - position of the first cell of corresponding sub-matrix; +// only used if use_first_cell is true; otherwise the first cell in row-block is +// used +// - block_sparse_values - segment of block-sparse values starting from +// block_sparse_offset, containing num_values +__global__ void PermuteToCrsKernel( + const int block_sparse_offset, + const int num_values, + const int num_row_blocks, + const int* __restrict__ first_cell_in_row_block, + const Cell* __restrict__ cells, + const Block* __restrict__ row_blocks, + const Block* __restrict__ col_blocks, + const int* __restrict__ crs_rows, + const double* __restrict__ block_sparse_values, + double* __restrict__ crs_values) { + const int value_id = blockIdx.x * blockDim.x + threadIdx.x; + if (value_id >= num_values) { + return; + } + const int block_sparse_value_id = value_id + block_sparse_offset; + // Find the corresponding row-block with a binary search + const int row_block_id = + PartitionPoint( + first_cell_in_row_block, + 0, + num_row_blocks, + [cells, + block_sparse_value_id] __device__(const int row_block_offset) { + return cells[row_block_offset].position <= block_sparse_value_id; + }) - + 1; + // Find cell and calculate offset within the row with a linear scan + const auto& row_block = row_blocks[row_block_id]; + const auto first_cell = cells + first_cell_in_row_block[row_block_id]; + const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1]; + const int row_block_size = row_block.size; + int num_cols_before = 0; + for (const Cell* cell = first_cell; cell < last_cell; ++cell) { + const auto& col_block = col_blocks[cell->block_id]; + const int col_block_size = col_block.size; + const int cell_size = row_block_size * col_block_size; + if (cell->position + cell_size > block_sparse_value_id) { + const int pos_in_cell = block_sparse_value_id - cell->position; + const int row_in_cell = pos_in_cell / col_block_size; + const int col_in_cell = pos_in_cell % col_block_size; + const int row = row_in_cell + row_block.position; + crs_values[crs_rows[row] + num_cols_before + col_in_cell] = + block_sparse_values[value_id]; + break; + } + num_cols_before += col_block_size; + } +} + +void PermuteToCRS(const int block_sparse_offset, + const int num_values, + const int num_row_blocks, + const int* first_cell_in_row_block, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + const int* crs_rows, + const double* block_sparse_values, + double* crs_values, + cudaStream_t stream) { + const int num_blocks_valuewise = NumBlocksInGrid(num_values); + PermuteToCrsKernel<<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>( + block_sparse_offset, + num_values, + num_row_blocks, + first_cell_in_row_block, + cells, + row_blocks, + col_blocks, + crs_rows, + block_sparse_values, + crs_values); +} + +} // namespace internal +} // namespace ceres
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h new file mode 100644 index 0000000..b6da69b --- /dev/null +++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -0,0 +1,76 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#ifndef CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_ +#define CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_ + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA + +#include "cuda_runtime.h" + +namespace ceres { +namespace internal { +class Block; +class Cell; + +// Compute structure of CRS matrix using block-sparse structure. +// Arrays corresponding to CRS matrix are to be allocated by caller +void FillCRSStructure(const int num_row_blocks, + const int num_rows, + const int* first_cell_in_row_block, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + int* rows, + int* cols, + cudaStream_t stream); + +// Permute segment of values from block-sparse matrix with sequential layout to +// CRS order. Segment starts at block_sparse_offset and has length of num_values +void PermuteToCRS(const int block_sparse_offset, + const int num_values, + const int num_row_blocks, + const int* first_cell_in_row_block, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + const int* crs_rows, + const double* block_sparse_values, + double* crs_values, + cudaStream_t stream); + +} // namespace internal +} // namespace ceres + +#endif // CERES_NO_CUDA + +#endif // CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
diff --git a/internal/ceres/cuda_kernels_utils.h b/internal/ceres/cuda_kernels_utils.h new file mode 100644 index 0000000..45d5d3b --- /dev/null +++ b/internal/ceres/cuda_kernels_utils.h
@@ -0,0 +1,56 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) + +#ifndef CERES_INTERNAL_CUDA_KERNELS_UTILS_H_ +#define CERES_INTERNAL_CUDA_KERNELS_UTILS_H_ + +namespace ceres { +namespace internal { + +// Parallel execution on CUDA device requires splitting job into blocks of a +// fixed size. We use block-size of kCudaBlockSize for all kernels that do not +// require any specific block size. As the CUDA Toolkit documentation says, +// "although arbitrary in this case, is a common choice". This is determined by +// the warp size, max block size, and multiprocessor sizes of recent GPUs. For +// complex kernels with significant register usage and unusual memory patterns, +// the occupancy calculator API might provide better performance. See "Occupancy +// Calculator" under the CUDA toolkit documentation. +constexpr int kCudaBlockSize = 256; + +// Compute number of blocks of kCudaBlockSize that span over 1-d grid with +// dimension size. Note that 1-d grid dimension is limited by 2^31-1 in CUDA, +// thus a signed int is used as an argument. +inline int NumBlocksInGrid(int size) { + return (size + kCudaBlockSize - 1) / kCudaBlockSize; +} +} // namespace internal +} // namespace ceres + +#endif // CERES_INTERNAL_CUDA_KERNELS_UTILS_H_
diff --git a/internal/ceres/cuda_kernels_vector_ops.cu.cc b/internal/ceres/cuda_kernels_vector_ops.cu.cc new file mode 100644 index 0000000..b5a8e40 --- /dev/null +++ b/internal/ceres/cuda_kernels_vector_ops.cu.cc
@@ -0,0 +1,123 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2022 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: joydeepb@cs.utexas.edu (Joydeep Biswas) + +#include "ceres/cuda_kernels_vector_ops.h" + +#include <cuda_runtime.h> + +#include "ceres/cuda_kernels_utils.h" + +namespace ceres { +namespace internal { + +template <typename SrcType, typename DstType> +__global__ void TypeConversionKernel(const SrcType* __restrict__ input, + DstType* __restrict__ output, + const int size) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + output[i] = static_cast<DstType>(input[i]); + } +} + +void CudaFP64ToFP32(const double* input, + float* output, + const int size, + cudaStream_t stream) { + const int num_blocks = NumBlocksInGrid(size); + TypeConversionKernel<double, float> + <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size); +} + +void CudaFP32ToFP64(const float* input, + double* output, + const int size, + cudaStream_t stream) { + const int num_blocks = NumBlocksInGrid(size); + TypeConversionKernel<float, double> + <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size); +} + +template <typename T> +__global__ void SetZeroKernel(T* __restrict__ output, const int size) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + output[i] = T(0.0); + } +} + +void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) { + const int num_blocks = NumBlocksInGrid(size); + SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size); +} + +void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) { + const int num_blocks = NumBlocksInGrid(size); + SetZeroKernel<double> + <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size); +} + +template <typename SrcType, typename DstType> +__global__ void XPlusEqualsYKernel(DstType* __restrict__ x, + const SrcType* __restrict__ y, + const int size) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + x[i] = x[i] + DstType(y[i]); + } +} + +void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) { + const int num_blocks = NumBlocksInGrid(size); + XPlusEqualsYKernel<float, double> + <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size); +} + +__global__ void CudaDtDxpyKernel(double* __restrict__ y, + const double* D, + const double* __restrict__ x, + const int size) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + y[i] = y[i] + D[i] * D[i] * x[i]; + } +} + +void CudaDtDxpy(double* y, + const double* D, + const double* x, + const int size, + cudaStream_t stream) { + const int num_blocks = NumBlocksInGrid(size); + CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size); +} + +} // namespace internal +} // namespace ceres
diff --git a/internal/ceres/cuda_kernels.h b/internal/ceres/cuda_kernels_vector_ops.h similarity index 66% rename from internal/ceres/cuda_kernels.h rename to internal/ceres/cuda_kernels_vector_ops.h index 61f945a..b179201 100644 --- a/internal/ceres/cuda_kernels.h +++ b/internal/ceres/cuda_kernels_vector_ops.h
@@ -28,8 +28,8 @@ // // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) -#ifndef CERES_INTERNAL_CUDA_KERNELS_H_ -#define CERES_INTERNAL_CUDA_KERNELS_H_ +#ifndef CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_ +#define CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_ #include "ceres/internal/config.h" @@ -75,39 +75,9 @@ const int size, cudaStream_t stream); -// Compute structure of CRS matrix and permutation of values using block-sparse -// structure and temporary array row_block_ids. Array row_block_ids of size -// num_rows will be filled with indices of row-blocks corresponding to rows of -// CRS matrix. Arrays corresponding to CRS matrix, permutation and row_block_ids -// arrays are to be allocated by caller -void FillCRSStructure(const int num_row_blocks, - const int num_rows, - const int* row_block_offsets, - const Cell* cells, - const Block* row_blocks, - const Block* col_blocks, - int* rows, - int* cols, - int* row_block_ids, - int* permutation, - cudaStream_t stream); - -// Permute block of block-sparse values using permutation -// Pointer block_sparse_values corresponds to a block of num_values values from -// block-sparse matrix at the offset from begining. Pointer output corresponds -// to values of CRS matrix. Array permutation stores permutation from -// block-sparse to CRS matrix with permutation[i] being an index of i-th value -// of block-sparse matrix in values of CRS matrix -void PermuteValues(const int offset, - const int num_values, - const int* permutation, - const double* block_sparse_values, - double* crs_values, - cudaStream_t stream); - } // namespace internal } // namespace ceres #endif // CERES_NO_CUDA -#endif // CERES_INTERNAL_CUDA_KERNELS_H_ +#endif // CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_vector_ops_test.cc similarity index 99% rename from internal/ceres/cuda_kernels_test.cc rename to internal/ceres/cuda_kernels_vector_ops_test.cc index 41036a3..7f6e229 100644 --- a/internal/ceres/cuda_kernels_test.cc +++ b/internal/ceres/cuda_kernels_vector_ops_test.cc
@@ -28,7 +28,7 @@ // // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) -#include "ceres/cuda_kernels.h" +#include "ceres/cuda_kernels_vector_ops.h" #include <math.h>
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc index 0ae8c5d..e9ca9cc 100644 --- a/internal/ceres/cuda_sparse_matrix.cc +++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -52,7 +52,7 @@ #ifndef CERES_NO_CUDA #include "ceres/cuda_buffer.h" -#include "ceres/cuda_kernels.h" +#include "ceres/cuda_kernels_vector_ops.h" #include "ceres/cuda_vector.h" #include "cuda_runtime_api.h" #include "cusparse.h"
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h index ad55170..f5fcb91 100644 --- a/internal/ceres/cuda_sparse_matrix.h +++ b/internal/ceres/cuda_sparse_matrix.h
@@ -82,6 +82,10 @@ int num_cols() const { return num_cols_; } int num_nonzeros() const { return num_nonzeros_; } + const int32_t* rows() const { return rows_.data(); } + const int32_t* cols() const { return cols_.data(); } + const double* values() const { return values_.data(); } + int32_t* mutable_rows() { return rows_.data(); } int32_t* mutable_cols() { return cols_.data(); } double* mutable_values() { return values_.data(); }
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc index e2143de..d434f36 100644 --- a/internal/ceres/cuda_vector.cc +++ b/internal/ceres/cuda_vector.cc
@@ -44,7 +44,7 @@ #ifndef CERES_NO_CUDA #include "ceres/cuda_buffer.h" -#include "ceres/cuda_kernels.h" +#include "ceres/cuda_kernels_vector_ops.h" #include "ceres/cuda_vector.h" #include "cublas_v2.h"
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h index 34f3947..46661cf 100644 --- a/internal/ceres/cuda_vector.h +++ b/internal/ceres/cuda_vector.h
@@ -50,7 +50,7 @@ #ifndef CERES_NO_CUDA #include "ceres/cuda_buffer.h" -#include "ceres/cuda_kernels.h" +#include "ceres/cuda_kernels_vector_ops.h" #include "ceres/internal/eigen.h" #include "cublas_v2.h" #include "cusparse.h"
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc index f19ca7e..fb95536 100644 --- a/internal/ceres/dense_cholesky.cc +++ b/internal/ceres/dense_cholesky.cc
@@ -41,7 +41,7 @@ #ifndef CERES_NO_CUDA #include "ceres/context_impl.h" -#include "ceres/cuda_kernels.h" +#include "ceres/cuda_kernels_vector_ops.h" #include "cuda_runtime.h" #include "cusolverDn.h" #endif // CERES_NO_CUDA
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc index 2ba03ed..c23c881 100644 --- a/internal/ceres/evaluation_benchmark.cc +++ b/internal/ceres/evaluation_benchmark.cc
@@ -96,7 +96,7 @@ } std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian( - ContextImpl* context) { + ContextImpl* context, bool sequential) { auto problem = bal_problem->mutable_problem(); auto problem_impl = problem->mutable_impl(); CHECK(problem_impl != nullptr); @@ -116,6 +116,30 @@ auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian); CHECK(block_sparse != nullptr); + if (sequential) { + auto block_structure_sequential = + std::make_unique<CompressedRowBlockStructure>( + *block_sparse->block_structure()); + int num_nonzeros = 0; + for (auto& row_block : block_structure_sequential->rows) { + const int row_block_size = row_block.block.size; + for (auto& cell : row_block.cells) { + const int col_block_size = + block_structure_sequential->cols[cell.block_id].size; + cell.position = num_nonzeros; + num_nonzeros += col_block_size * row_block_size; + } + } + block_sparse = std::make_unique<BlockSparseMatrix>( + block_structure_sequential.release(), +#ifndef CERES_NO_CUDA + true +#else + false +#endif + ); + } + std::mt19937 rng; std::normal_distribution<double> rnorm; const int nnz = block_sparse->num_nonzeros(); @@ -123,6 +147,7 @@ for (int i = 0; i < nnz; ++i) { values[i] = rnorm(rng); } + return block_sparse; } @@ -134,11 +159,20 @@ const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) { if (!block_sparse_jacobian) { - block_sparse_jacobian = CreateBlockSparseJacobian(context); + block_sparse_jacobian = CreateBlockSparseJacobian(context, true); } return block_sparse_jacobian.get(); } + const BlockSparseMatrix* BlockSparseJacobianPartitioned( + ContextImpl* context) { + if (!block_sparse_jacobian_partitioned) { + block_sparse_jacobian_partitioned = + CreateBlockSparseJacobian(context, false); + } + return block_sparse_jacobian_partitioned.get(); + } + const CompressedRowSparseMatrix* CompressedRowSparseJacobian( ContextImpl* context) { if (!crs_jacobian) { @@ -149,7 +183,7 @@ std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian( const LinearSolver::Options& options) { - auto block_sparse = BlockSparseJacobian(options.context); + auto block_sparse = BlockSparseJacobianPartitioned(options.context); return std::make_unique<PartitionedView>(options, *block_sparse); } @@ -171,7 +205,7 @@ const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal( const LinearSolver::Options& options) { - auto block_sparse = BlockSparseJacobian(options.context); + auto block_sparse = BlockSparseJacobianPartitioned(options.context); implicit_schur_complement = std::make_unique<ImplicitSchurComplement>(options); implicit_schur_complement->Init(*block_sparse, nullptr, b.data()); @@ -180,7 +214,7 @@ const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal( const LinearSolver::Options& options) { - auto block_sparse = BlockSparseJacobian(options.context); + auto block_sparse = BlockSparseJacobianPartitioned(options.context); implicit_schur_complement_diag = std::make_unique<ImplicitSchurComplement>(options); implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data()); @@ -192,6 +226,7 @@ Vector b; std::unique_ptr<BundleAdjustmentProblem> bal_problem; std::unique_ptr<PreprocessedProblem> preprocessed_problem; + std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_partitioned; std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian; std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian; std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;