Permutation-based conversion from block-sparse to crs
Change-Id: Ic33a6476c033187dff61886deb6d1761524943f0
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 682c27f..e4b124d 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -47,15 +47,25 @@
liblapack-dev \
libmetis-dev \
libsuitesparse-dev \
- ninja-build
-
- - name: Setup CUDA toolkit
- if: matrix.gpu == 'cuda'
+ ninja-build \
+ wget
+ - name: Setup CUDA toolkit (system repositories)
+ if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:20.04'
run: |
apt-get install -y \
nvidia-cuda-dev \
nvidia-cuda-toolkit
-
+ # nvidia cuda toolkit + gcc combo shipped with 22.04 LTS is broken
+ # and is not able to compile code that uses thrust
+ # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1006962
+ - name: Setup CUDA toolkit (nvidia repositories)
+ if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:22.04'
+ run: |
+ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
+ dpkg -i cuda-keyring_1.0-1_all.deb
+ apt-get update
+ apt-get install -y cuda
+ echo "CUDACXX=/usr/local/cuda/bin/nvcc" >> $GITHUB_ENV
- name: Cache Build
id: cache-build
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 9729ef6..9cb99b4 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -141,8 +141,9 @@
$<TARGET_FILE:cuda_dense_cholesky_test>)
endif (BUILD_TESTING AND GFLAGS)
set_source_files_properties(cuda_kernels.cu.cc PROPERTIES LANGUAGE CUDA)
- add_library(ceres_cuda_kernels cuda_kernels.cu.cc)
+ add_library(ceres_cuda_kernels STATIC cuda_kernels.cu.cc)
target_compile_features(ceres_cuda_kernels PRIVATE cxx_std_14)
+ target_include_directories(ceres_cuda_kernels PRIVATE ${Ceres_SOURCE_DIR}/internal ${Ceres_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR})
list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ceres_cuda_kernels)
endif (USE_CUDA)
@@ -181,6 +182,8 @@
cost_function.cc
covariance.cc
covariance_impl.cc
+ cuda_block_sparse_crs_view.cc
+ cuda_block_structure.cc
cuda_sparse_matrix.cc
cuda_vector.cc
dense_cholesky.cc
@@ -472,10 +475,13 @@
ceres_test(cost_function_to_functor)
ceres_test(covariance)
ceres_test(cubic_interpolation)
+ ceres_test(cuda_block_sparse_crs_view)
+ ceres_test(cuda_block_structure)
ceres_test(cuda_dense_cholesky)
ceres_test(cuda_dense_qr)
ceres_test(cuda_kernels)
ceres_test(cuda_sparse_matrix)
+ ceres_test(cuda_streamed_buffer)
ceres_test(cuda_vector)
ceres_test(dense_linear_solver)
ceres_test(dense_cholesky)
diff --git a/internal/ceres/block_structure.h b/internal/ceres/block_structure.h
index aaa4080..bccfd8b 100644
--- a/internal/ceres/block_structure.h
+++ b/internal/ceres/block_structure.h
@@ -43,7 +43,11 @@
#include "ceres/internal/export.h"
-namespace ceres::internal {
+// This file is being included into source files that are compiled with nvcc.
+// nvcc shipped with ubuntu 20.04 does not support some features of c++17,
+// including nested namespace definitions
+namespace ceres {
+namespace internal {
using BlockSize = int32_t;
@@ -189,6 +193,7 @@
std::vector<Block> Tail(const std::vector<Block>& blocks, int n);
int SumSquaredSizes(const std::vector<Block>& blocks);
-} // namespace ceres::internal
+} // namespace internal
+} // namespace ceres
#endif // CERES_INTERNAL_BLOCK_STRUCTURE_H_
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc
index cc01b54..ea3016b 100644
--- a/internal/ceres/context_impl.cc
+++ b/internal/ceres/context_impl.cc
@@ -60,9 +60,11 @@
cusparseDestroy(cusparse_handle_);
cusparse_handle_ = nullptr;
}
- if (stream_ != nullptr) {
- cudaStreamDestroy(stream_);
- stream_ = nullptr;
+ for (auto& s : streams_) {
+ if (s != nullptr) {
+ cudaStreamDestroy(s);
+ s = nullptr;
+ }
}
is_cuda_initialized_ = false;
}
@@ -143,19 +145,22 @@
return false;
}
event_logger.AddEvent("cusparseCreate");
- if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) !=
- cudaSuccess) {
- *message =
- "CUDA initialization failed because CUDA::cudaStreamCreateWithFlags "
- "failed.";
- TearDown();
- return false;
+ for (auto& s : streams_) {
+ if (cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking) != cudaSuccess) {
+ *message =
+ "CUDA initialization failed because CUDA::cudaStreamCreateWithFlags "
+ "failed.";
+ TearDown();
+ return false;
+ }
}
event_logger.AddEvent("cudaStreamCreateWithFlags");
- if (cusolverDnSetStream(cusolver_handle_, stream_) !=
+ if (cusolverDnSetStream(cusolver_handle_, DefaultStream()) !=
CUSOLVER_STATUS_SUCCESS ||
- cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS ||
- cusparseSetStream(cusparse_handle_, stream_) != CUSPARSE_STATUS_SUCCESS) {
+ cublasSetStream(cublas_handle_, DefaultStream()) !=
+ CUBLAS_STATUS_SUCCESS ||
+ cusparseSetStream(cusparse_handle_, DefaultStream()) !=
+ CUSPARSE_STATUS_SUCCESS) {
*message = "CUDA initialization failed because SetStream failed.";
TearDown();
return false;
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index a18d638..776ca31 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -78,11 +78,32 @@
// Ceres, Ceres will use that GPU.
// Note on Ceres' use of CUDA Streams:
- // All operations on the GPU are performed using a single stream. This ensures
- // that the order of operations are stream-ordered, but we do not need to
- // explicitly synchronize the stream at the end of every operation. Stream
- // synchronization occurs only before GPU to CPU transfers, and is handled by
- // CudaBuffer.
+ // Most operations on the GPU are performed using a single stream. In
+ // those cases DefaultStream() should be used. This ensures that operations
+ // are stream-ordered, and might be concurrent with cpu processing with no
+ // additional efforts.
+ //
+ // a. Single-stream workloads
+ // - Only use default stream
+ // - Return control to the callee without synchronization whenever possible
+ // - Stream synchronization occurs only after GPU to CPU transfers, and is
+ // handled by CudaBuffer
+ //
+ // b. Multi-stream workloads
+ // Multi-stream workloads are more restricted in order to make it harder to
+ // get a race-condition.
+ // - Should always synchronize the default stream on entry
+ // - Should always synchronize all utilized streams on exit
+ // - Should not make any assumptions on one of streams_[] being default
+ //
+ // With those rules in place
+ // - All single-stream asynchronous workloads are serialized using default
+ // stream
+ // - Multi-stream workloads always wait for single-stream workloads to
+ // finish and leave no running computations on exit.
+ // This slightly penalizes multi-stream workloads, but makes it easier to
+ // avoid race conditions when multiple-stream workload depends on results of
+ // any preceding GPU computations.
// Initializes cuBLAS, cuSOLVER, and cuSPARSE contexts, creates an
// asynchronous CUDA stream, and associates the stream with the contexts.
@@ -101,7 +122,14 @@
cusolverDnHandle_t cusolver_handle_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr;
- cudaStream_t stream_ = nullptr;
+
+ // Default stream.
+ // Kernel invocations and memory copies on this stream can be left without
+ // synchronization.
+ cudaStream_t DefaultStream() { return streams_[0]; }
+ static constexpr int kNumCudaStreams = 2;
+ cudaStream_t streams_[kNumCudaStreams] = {0};
+
cusparseHandle_t cusparse_handle_ = nullptr;
bool is_cuda_initialized_ = false;
int gpu_device_id_in_use_ = -1;
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc
new file mode 100644
index 0000000..a03d01c
--- /dev/null
+++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -0,0 +1,74 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/cuda_block_sparse_crs_view.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "ceres/cuda_block_structure.h"
+#include "ceres/cuda_kernels.h"
+
+namespace ceres::internal {
+
+CudaBlockSparseCRSView::CudaBlockSparseCRSView(const BlockSparseMatrix& bsm,
+ ContextImpl* context)
+ : permutation_(context, bsm.num_nonzeros()),
+ streamed_buffer_(context, kMaxTemporaryArraySize) {
+ CudaBlockSparseStructure block_structure(*bsm.block_structure(), context);
+ crs_matrix_ = std::make_unique<CudaSparseMatrix>(
+ bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context);
+ CudaBuffer<int> temp_rows(context, bsm.num_rows());
+ FillCRSStructure(block_structure.num_row_blocks(),
+ bsm.num_rows(),
+ block_structure.row_block_offsets(),
+ block_structure.cells(),
+ block_structure.row_blocks(),
+ block_structure.col_blocks(),
+ crs_matrix_->mutable_rows(),
+ crs_matrix_->mutable_cols(),
+ temp_rows.data(),
+ permutation_.data(),
+ context->DefaultStream());
+ UpdateValues(bsm);
+}
+void CudaBlockSparseCRSView::UpdateValues(const BlockSparseMatrix& bsm) {
+ streamed_buffer_.CopyToGpu(
+ bsm.values(),
+ bsm.num_nonzeros(),
+ [permutation = permutation_.data(),
+ values_to = crs_matrix_->mutable_values()](
+ const double* values, int num_values, int offset, auto stream) {
+ PermuteValues(
+ offset, num_values, permutation, values, values_to, stream);
+ });
+}
+
+} // namespace ceres::internal
+#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_block_sparse_crs_view.h b/internal/ceres/cuda_block_sparse_crs_view.h
new file mode 100644
index 0000000..3ea8498
--- /dev/null
+++ b/internal/ceres/cuda_block_sparse_crs_view.h
@@ -0,0 +1,113 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+//
+
+#ifndef CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_
+#define CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <memory>
+
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_buffer.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_streamed_buffer.h"
+
+namespace ceres::internal {
+// We use cuSPARSE library for SpMV operations. However, it does not support
+// block-sparse format with varying size of the blocks. Thus, we perform the
+// following operations in order to compute products of block-sparse matrices
+// and dense vectors on gpu:
+// - Once per block-sparse structure update:
+// - Compute CRS structure from block-sparse structure
+// - Compute permutation from block-sparse values to CRS values
+// - Once per block-sparse values update:
+// - Update values in CRS matrix with values of block-sparse matrix
+//
+// Since there are no constraints on positions of cells in value array of
+// block-sparse matrix, a permutation from block-sparse values to CRS
+// values is stored explicitly.
+//
+// Example: given matrix with the following block-structure
+// [ 1 2 | | 6 7 ]
+// [ 3 4 | | 8 9 ]
+// [-----+-+-----]
+// [ |5| ]
+// with values stored as values_block_sparse = [1, 2, 3, 4, 5, 6, 7, 8, 9],
+// permutation from block-sparse to CRS is p = [0, 1, 4, 5, 8, 2, 3, 6, 7];
+// and i-th block-sparse value has index p[i] in CRS values array
+//
+// This makes it possible to avoid storing both CRS and block-sparse values in
+// Instead, block-sparse values are transferred to gpu memory as a disjoint set
+// of small continuous segments with simultaneous permutation of the values into
+// correct order
+class CERES_NO_EXPORT CudaBlockSparseCRSView {
+ public:
+ // Initializes internal CRS matrix and permutation from block-sparse to CRS
+ // values. The following objects are stored in gpu memory for the whole
+ // lifetime of the object
+ // - crs_matrix_: CRS matrix
+ // - permutation_: permutation from block-sparse to CRS value order
+ // (num_nonzeros integer values)
+ // - streamed_buffer_: helper for value updating
+ // The following objects are created temporarily during construction:
+ // - CudaBlockSparseStructure: block-sparse structure of block-sparse matrix
+ // - num_rows integer values: row to row-block map
+ // As the final step of construction, values of bsm are permuted into CRS
+ // order via UpdateValues()
+ CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, ContextImpl* context);
+
+ const CudaSparseMatrix* crs_matrix() const { return crs_matrix_.get(); }
+ CudaSparseMatrix* mutable_crs_matrix() { return crs_matrix_.get(); }
+
+ // Update values of crs_matrix_ using values of block-sparse matrix.
+ // Assumes that bsm has the same block-sparse structure as matrix that was
+ // used for construction.
+ void UpdateValues(const BlockSparseMatrix& bsm);
+
+ private:
+ // Value permutation kernel performs a single element-wise operation per
+ // thread, thus performing permutation in blocks of 8 megabytes of
+ // block-sparse values seems reasonable
+ static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
+ std::unique_ptr<CudaSparseMatrix> crs_matrix_;
+ // Permutation from block-sparse to CRS value order.
+ // permutation_[i] = index of i-th block-sparse value in CRS values
+ CudaBuffer<int> permutation_;
+ CudaStreamedBuffer<double> streamed_buffer_;
+};
+
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
+#endif // CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_
diff --git a/internal/ceres/cuda_block_sparse_crs_view_test.cc b/internal/ceres/cuda_block_sparse_crs_view_test.cc
new file mode 100644
index 0000000..f8d1cdf
--- /dev/null
+++ b/internal/ceres/cuda_block_sparse_crs_view_test.cc
@@ -0,0 +1,100 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/cuda_block_sparse_crs_view.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#ifndef CERES_NO_CUDA
+
+namespace ceres::internal {
+class CudaBlockSparseCRSViewTest : public ::testing::Test {
+ protected:
+ void SetUp() final {
+ std::string message;
+ CHECK(context_.InitCuda(&message))
+ << "InitCuda() failed because: " << message;
+
+ BlockSparseMatrix::RandomMatrixOptions options;
+ options.num_row_blocks = 1234;
+ options.min_row_block_size = 1;
+ options.max_row_block_size = 10;
+ options.num_col_blocks = 567;
+ options.min_col_block_size = 1;
+ options.max_col_block_size = 10;
+ options.block_density = 0.2;
+ std::mt19937 rng;
+ A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng);
+ std::iota(
+ A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1);
+ }
+
+ std::unique_ptr<BlockSparseMatrix> A_;
+ ContextImpl context_;
+};
+
+TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValues) {
+ auto view = CudaBlockSparseCRSView(*A_, &context_);
+
+ // TODO: we definitely would like to use crs_matrix() here, but
+ // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because
+ // it might allocate additional storage by request of cuSPARSE
+ auto matrix = view.mutable_crs_matrix();
+ ASSERT_EQ(matrix->num_cols(), A_->num_cols());
+ ASSERT_EQ(matrix->num_rows(), A_->num_rows());
+ ASSERT_EQ(matrix->num_nonzeros(), A_->num_nonzeros());
+
+ const int num_rows = A_->num_rows();
+ const int num_cols = A_->num_cols();
+ Vector x(num_cols);
+ Vector y(num_rows);
+ CudaVector x_cuda(&context_, num_cols);
+ CudaVector y_cuda(&context_, num_rows);
+ Vector y_cuda_host(num_rows);
+
+ for (int i = 0; i < num_cols; ++i) {
+ x.setZero();
+ y.setZero();
+ y_cuda.SetZero();
+ x[i] = 1.;
+ x_cuda.CopyFromCpu(x);
+ A_->RightMultiplyAndAccumulate(
+ x.data(), y.data(), &context_, std::thread::hardware_concurrency());
+ matrix->RightMultiplyAndAccumulate(x_cuda, &y_cuda);
+ y_cuda.CopyTo(&y_cuda_host);
+ // There will be up to 1 non-zero product per row, thus we expect an exact
+ // match on 32-bit integer indices
+ EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
+ }
+}
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_block_structure.cc b/internal/ceres/cuda_block_structure.cc
new file mode 100644
index 0000000..8e549e1
--- /dev/null
+++ b/internal/ceres/cuda_block_structure.cc
@@ -0,0 +1,118 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/cuda_block_structure.h"
+
+#ifndef CERES_NO_CUDA
+
+namespace ceres::internal {
+namespace {
+// Dimension of a sorted array of blocks
+inline int Dimension(const std::vector<Block>& blocks) {
+ if (blocks.empty()) {
+ return 0;
+ }
+ const auto& last = blocks.back();
+ return last.size + last.position;
+}
+} // namespace
+
+CudaBlockSparseStructure::CudaBlockSparseStructure(
+ const CompressedRowBlockStructure& block_structure, ContextImpl* context)
+ : row_block_offsets_(context),
+ cells_(context),
+ row_blocks_(context),
+ col_blocks_(context) {
+ // Row blocks extracted from CompressedRowBlockStructure::rows
+ std::vector<Block> row_blocks;
+ // Column blocks can be reused as-is
+ const auto& col_blocks = block_structure.cols;
+
+ // Row block offset is an index of the first cell corresponding to row block
+ std::vector<int> row_block_offsets;
+ // Flat array of all cells from all row-blocks
+ std::vector<Cell> cells;
+
+ num_row_blocks_ = block_structure.rows.size();
+ num_col_blocks_ = col_blocks.size();
+
+ row_blocks.reserve(num_row_blocks_);
+ row_block_offsets.reserve(num_row_blocks_ + 1);
+ num_nonzeros_ = 0;
+ num_cells_ = 0;
+ for (const auto& r : block_structure.rows) {
+ const int row_block_size = r.block.size;
+ row_blocks.emplace_back(r.block);
+ row_block_offsets.push_back(num_cells_);
+ for (const auto& c : r.cells) {
+ cells.emplace_back(c);
+ const int col_block_size = col_blocks[c.block_id].size;
+ num_nonzeros_ += col_block_size * row_block_size;
+ ++num_cells_;
+ }
+ }
+ row_block_offsets.push_back(num_cells_);
+
+ num_rows_ = Dimension(row_blocks);
+ num_cols_ = Dimension(col_blocks);
+
+ if (VLOG_IS_ON(3)) {
+ const size_t row_block_offsets_size =
+ row_block_offsets.size() * sizeof(int);
+ const size_t cells_size = cells.size() * sizeof(Cell);
+ const size_t row_blocks_size = row_blocks.size() * sizeof(Block);
+ const size_t col_blocks_size = col_blocks.size() * sizeof(Block);
+ const size_t total_size =
+ row_block_offsets_size + cells_size + col_blocks_size + row_blocks_size;
+ VLOG(3) << "\nCudaBlockSparseStructure:\n"
+ "\tRow block offsets: "
+ << row_block_offsets_size
+ << " bytes\n"
+ "\tColumn blocks: "
+ << col_blocks_size
+ << " bytes\n"
+ "\tRow blocks: "
+ << row_blocks_size
+ << " bytes\n"
+ "\tCells: "
+ << cells_size
+ << " bytes\n"
+ "\tTotal: "
+ << total_size << " bytes of GPU memory";
+ }
+
+ row_block_offsets_.CopyFromCpuVector(row_block_offsets);
+ cells_.CopyFromCpuVector(cells);
+ row_blocks_.CopyFromCpuVector(row_blocks);
+ col_blocks_.CopyFromCpuVector(col_blocks);
+}
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_block_structure.h b/internal/ceres/cuda_block_structure.h
new file mode 100644
index 0000000..a41d0b3
--- /dev/null
+++ b/internal/ceres/cuda_block_structure.h
@@ -0,0 +1,94 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#ifndef CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_
+#define CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "ceres/block_structure.h"
+#include "ceres/cuda_buffer.h"
+
+namespace ceres::internal {
+class CudaBlockStructureTest;
+
+// This class stores a read-only block-sparse structure in gpu memory.
+// Invariants are the same as those of CompressedRowBlockStructure.
+// In order to simplify allocation and copying data to gpu, cells from all
+// row-blocks are stored in a single array sequentially. Additional array
+// row_block_offsets of size num_row_blocks + 1 makes it possible to identify
+// the range of cells corresponding to a row-block.
+// Cells corresponding to i-th row-block are stored in sub-array
+// cells[row_block_offsets[i]; ... row_block_offsets[i + 1] - 1], and their
+// order is preserved.
+class CERES_NO_EXPORT CudaBlockSparseStructure {
+ public:
+ // CompressedRowBlockStructure contains a vector of CompressedLists, with
+ // each CompressedList containing a vector of Cells. We precompute a flat
+ // array of cells on cpu and transfer it to the gpu.
+ CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure,
+ ContextImpl* context);
+
+ int num_rows() const { return num_rows_; }
+ int num_cols() const { return num_cols_; }
+ int num_cells() const { return num_cells_; }
+ int num_nonzeros() const { return num_nonzeros_; }
+ int num_row_blocks() const { return num_row_blocks_; }
+ int num_col_blocks() const { return num_col_blocks_; }
+
+ // Device pointer to array of num_row_blocks + 1 indices of the first cell of
+ // row block
+ const int* row_block_offsets() const { return row_block_offsets_.data(); }
+ // Device pointer to array of num_cells cells, sorted by row-block
+ const Cell* cells() const { return cells_.data(); }
+ // Device pointer to array of row blocks
+ const Block* row_blocks() const { return row_blocks_.data(); }
+ // Device pointer to array of column blocks
+ const Block* col_blocks() const { return col_blocks_.data(); }
+
+ private:
+ int num_rows_;
+ int num_cols_;
+ int num_cells_;
+ int num_nonzeros_;
+ int num_row_blocks_;
+ int num_col_blocks_;
+ CudaBuffer<int> row_block_offsets_;
+ CudaBuffer<Cell> cells_;
+ CudaBuffer<Block> row_blocks_;
+ CudaBuffer<Block> col_blocks_;
+ friend class CudaBlockStructureTest;
+};
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
+#endif // CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_
diff --git a/internal/ceres/cuda_block_structure_test.cc b/internal/ceres/cuda_block_structure_test.cc
new file mode 100644
index 0000000..95cb26b
--- /dev/null
+++ b/internal/ceres/cuda_block_structure_test.cc
@@ -0,0 +1,142 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <numeric>
+
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_block_structure.h"
+
+namespace ceres::internal {
+
+class CudaBlockStructureTest : public ::testing::Test {
+ protected:
+ void SetUp() final {
+ std::string message;
+ CHECK(context_.InitCuda(&message))
+ << "InitCuda() failed because: " << message;
+
+ BlockSparseMatrix::RandomMatrixOptions options;
+ options.num_row_blocks = 1234;
+ options.min_row_block_size = 1;
+ options.max_row_block_size = 10;
+ options.num_col_blocks = 567;
+ options.min_col_block_size = 1;
+ options.max_col_block_size = 10;
+ options.block_density = 0.2;
+ std::mt19937 rng;
+ A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng);
+ std::iota(
+ A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1);
+ }
+
+ std::vector<Cell> GetCells(const CudaBlockSparseStructure& structure) {
+ const auto& cuda_buffer = structure.cells_;
+ std::vector<Cell> cells(cuda_buffer.size());
+ cuda_buffer.CopyToCpu(cells.data(), cells.size());
+ return cells;
+ }
+ std::vector<Block> GetRowBlocks(const CudaBlockSparseStructure& structure) {
+ const auto& cuda_buffer = structure.row_blocks_;
+ std::vector<Block> blocks(cuda_buffer.size());
+ cuda_buffer.CopyToCpu(blocks.data(), blocks.size());
+ return blocks;
+ }
+ std::vector<Block> GetColBlocks(const CudaBlockSparseStructure& structure) {
+ const auto& cuda_buffer = structure.col_blocks_;
+ std::vector<Block> blocks(cuda_buffer.size());
+ cuda_buffer.CopyToCpu(blocks.data(), blocks.size());
+ return blocks;
+ }
+ std::vector<int> GetRowBlockOffsets(
+ const CudaBlockSparseStructure& structure) {
+ const auto& cuda_buffer = structure.row_block_offsets_;
+ std::vector<int> row_block_offsets(cuda_buffer.size());
+ cuda_buffer.CopyToCpu(row_block_offsets.data(), row_block_offsets.size());
+ return row_block_offsets;
+ }
+
+ std::unique_ptr<BlockSparseMatrix> A_;
+ ContextImpl context_;
+};
+
+TEST_F(CudaBlockStructureTest, StructureIdentity) {
+ auto block_structure = A_->block_structure();
+ const int num_row_blocks = block_structure->rows.size();
+ const int num_col_blocks = block_structure->cols.size();
+
+ CudaBlockSparseStructure cuda_block_structure(*block_structure, &context_);
+
+ ASSERT_EQ(cuda_block_structure.num_rows(), A_->num_rows());
+ ASSERT_EQ(cuda_block_structure.num_cols(), A_->num_cols());
+ ASSERT_EQ(cuda_block_structure.num_nonzeros(), A_->num_nonzeros());
+ ASSERT_EQ(cuda_block_structure.num_row_blocks(), num_row_blocks);
+ ASSERT_EQ(cuda_block_structure.num_col_blocks(), num_col_blocks);
+
+ std::vector<Block> blocks = GetColBlocks(cuda_block_structure);
+ ASSERT_EQ(blocks.size(), num_col_blocks);
+ for (int i = 0; i < num_col_blocks; ++i) {
+ EXPECT_EQ(block_structure->cols[i].position, blocks[i].position);
+ EXPECT_EQ(block_structure->cols[i].size, blocks[i].size);
+ }
+
+ std::vector<Cell> cells = GetCells(cuda_block_structure);
+ std::vector<int> row_block_offsets = GetRowBlockOffsets(cuda_block_structure);
+ blocks = GetRowBlocks(cuda_block_structure);
+
+ ASSERT_EQ(blocks.size(), num_row_blocks);
+ ASSERT_EQ(row_block_offsets.size(), num_row_blocks + 1);
+ ASSERT_EQ(row_block_offsets.back(), cells.size());
+
+ for (int i = 0; i < num_row_blocks; ++i) {
+ const int num_cells = block_structure->rows[i].cells.size();
+ EXPECT_EQ(blocks[i].position, block_structure->rows[i].block.position);
+ EXPECT_EQ(blocks[i].size, block_structure->rows[i].block.size);
+ const int first_cell = row_block_offsets[i];
+ const int last_cell = row_block_offsets[i + 1];
+ ASSERT_EQ(last_cell - first_cell, num_cells);
+ for (int j = 0; j < num_cells; ++j) {
+ EXPECT_EQ(cells[first_cell + j].block_id,
+ block_structure->rows[i].cells[j].block_id);
+ EXPECT_EQ(cells[first_cell + j].position,
+ block_structure->rows[i].cells[j].position);
+ }
+ }
+}
+
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h
index 97d126c..d6abb15 100644
--- a/internal/ceres/cuda_buffer.h
+++ b/internal/ceres/cuda_buffer.h
@@ -86,7 +86,7 @@
data,
size * sizeof(T),
cudaMemcpyHostToDevice,
- context_->stream_),
+ context_->DefaultStream()),
cudaSuccess);
}
@@ -98,7 +98,7 @@
data.data(),
data.size() * sizeof(T),
cudaMemcpyHostToDevice,
- context_->stream_),
+ context_->DefaultStream()),
cudaSuccess);
}
@@ -110,7 +110,7 @@
data,
size * sizeof(T),
cudaMemcpyDeviceToDevice,
- context_->stream_),
+ context_->DefaultStream()),
cudaSuccess);
}
@@ -126,9 +126,9 @@
data_,
size * sizeof(T),
cudaMemcpyDeviceToHost,
- context_->stream_),
+ context_->DefaultStream()),
cudaSuccess);
- CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
+ CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess);
}
// Copy N items from another GPU memory array to the GPU memory managed by
@@ -142,7 +142,7 @@
other.data_,
size_ * sizeof(T),
cudaMemcpyDeviceToDevice,
- context_->stream_),
+ context_->DefaultStream()),
cudaSuccess);
}
diff --git a/internal/ceres/cuda_kernels.cu.cc b/internal/ceres/cuda_kernels.cu.cc
index c0f5453..3d93ae8 100644
--- a/internal/ceres/cuda_kernels.cu.cc
+++ b/internal/ceres/cuda_kernels.cu.cc
@@ -28,6 +28,12 @@
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+//#include "ceres/cuda_kernels.h"
+
+#include <thrust/execution_policy.h>
+#include <thrust/scan.h>
+
+#include "block_structure.h"
#include "cuda_runtime.h"
namespace ceres {
@@ -40,6 +46,9 @@
// might provide better performance. See "Occupancy Calculator" under the CUDA
// toolkit documentation.
constexpr int kCudaBlockSize = 256;
+inline int NumBlocks(int size) {
+ return (size + kCudaBlockSize - 1) / kCudaBlockSize;
+}
template <typename SrcType, typename DstType>
__global__ void TypeConversionKernel(const SrcType* __restrict__ input,
@@ -55,7 +64,7 @@
float* output,
const int size,
cudaStream_t stream) {
- const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ const int num_blocks = NumBlocks(size);
TypeConversionKernel<double, float>
<<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
}
@@ -64,7 +73,7 @@
double* output,
const int size,
cudaStream_t stream) {
- const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ const int num_blocks = NumBlocks(size);
TypeConversionKernel<float, double>
<<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
}
@@ -78,12 +87,12 @@
}
void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) {
- const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ const int num_blocks = NumBlocks(size);
SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
}
void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
- const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ const int num_blocks = NumBlocks(size);
SetZeroKernel<double>
<<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
}
@@ -99,7 +108,7 @@
}
void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) {
- const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ const int num_blocks = NumBlocks(size);
XPlusEqualsYKernel<float, double>
<<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
}
@@ -119,9 +128,177 @@
const double* x,
const int size,
cudaStream_t stream) {
- const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
+ const int num_blocks = NumBlocks(size);
CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size);
}
+// Fill row block id and nnz for each row using block-sparse structure
+// represented by a set of flat arrays.
+// Inputs:
+// - num_row_blocks: number of row-blocks in block-sparse structure
+// - row_block_offsets: index of the first cell of the row-block; size:
+// num_row_blocks + 1
+// - cells: cells of block-sparse structure as a continuous array
+// - row_blocks: row blocks of block-sparse structure stored sequentially
+// - col_blocks: column blocks of block-sparse structure stored sequentially
+// Outputs:
+// - rows: rows[i + 1] will contain number of non-zeros in i-th row, rows[0]
+// will be set to 0; rows are filled with a shift by one element in order
+//    to obtain row-index array of CRS matrix with an inclusive scan afterwards
+// - row_block_ids: row_block_ids[i] will be set to index of row-block that
+// contains i-th row.
+// Computation is performed row-block-wise
+__global__ void RowBlockIdAndNNZ(int num_row_blocks,
+ const int* __restrict__ row_block_offsets,
+ const Cell* __restrict__ cells,
+ const Block* __restrict__ row_blocks,
+ const Block* __restrict__ col_blocks,
+ int* __restrict__ rows,
+ int* __restrict__ row_block_ids) {
+ const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x;
+ if (row_block_id > num_row_blocks) {
+ // No synchronization is performed in this kernel, thus it is safe to return
+ return;
+ }
+ if (row_block_id == num_row_blocks) {
+ // one extra thread sets the first element
+ rows[0] = 0;
+ return;
+ }
+ const auto& row_block = row_blocks[row_block_id];
+ int row_nnz = 0;
+ const auto first_cell = cells + row_block_offsets[row_block_id];
+ const auto last_cell = cells + row_block_offsets[row_block_id + 1];
+ for (auto cell = first_cell; cell < last_cell; ++cell) {
+ row_nnz += col_blocks[cell->block_id].size;
+ }
+ const int first_row = row_block.position;
+ const int last_row = first_row + row_block.size;
+ for (int i = first_row; i < last_row; ++i) {
+ rows[i + 1] = row_nnz;
+ row_block_ids[i] = row_block_id;
+ }
+}
+
+// Row-wise creation of CRS structure
+// Inputs:
+// - num_rows: number of rows in matrix
+// - row_block_offsets: index of the first cell of the row-block; size:
+// num_row_blocks + 1
+// - cells: cells of block-sparse structure as a continuous array
+// - row_blocks: row blocks of block-sparse structure stored sequentially
+// - col_blocks: column blocks of block-sparse structure stored sequentially
+// - row_block_ids: index of row-block that corresponds to row
+// - rows: row-index array of CRS structure
+// Outputs:
+// - cols: column-index array of CRS structure
+// - permutation: permutation from block-sparse to crs order
+// Computation is performed row-wise
+__global__ void ComputeColumnsAndPermutation(
+ int num_rows,
+ const int* __restrict__ row_block_offsets,
+ const Cell* __restrict__ cells,
+ const Block* __restrict__ row_blocks,
+ const Block* __restrict__ col_blocks,
+ const int* __restrict__ row_block_ids,
+ const int* __restrict__ rows,
+ int* __restrict__ cols,
+ int* __restrict__ permutation) {
+ const int row = blockIdx.x * blockDim.x + threadIdx.x;
+ if (row >= num_rows) {
+ // No synchronization is performed in this kernel, thus it is safe to return
+ return;
+ }
+ const int row_block_id = row_block_ids[row];
+ // zero-based index of row in row-block
+ const int row_in_block = row - row_blocks[row_block_id].position;
+ // position in crs matrix
+ int crs_position = rows[row];
+ const auto first_cell = cells + row_block_offsets[row_block_id];
+ const auto last_cell = cells + row_block_offsets[row_block_id + 1];
+  // For each cell of the row-block only the current row is being filled
+ for (auto cell = first_cell; cell < last_cell; ++cell) {
+ const auto& col_block = col_blocks[cell->block_id];
+ const int col_block_size = col_block.size;
+ int column_idx = col_block.position;
+ int bs_position = cell->position + row_in_block * col_block_size;
+ // Fill permutation and column indices for each element of row_in_block
+ // row of current cell
+ for (int i = 0; i < col_block_size; ++i, ++crs_position) {
+ permutation[bs_position++] = crs_position;
+ cols[crs_position] = column_idx++;
+ }
+ }
+}
+
+void FillCRSStructure(const int num_row_blocks,
+ const int num_rows,
+ const int* row_block_offsets,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ int* rows,
+ int* cols,
+ int* row_block_ids,
+ int* permutation,
+ cudaStream_t stream) {
+ // Set number of non-zeros per row in rows array and row to row-block map in
+ // row_block_ids array
+ const int num_blocks_blockwise = NumBlocks(num_row_blocks + 1);
+ RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
+ num_row_blocks,
+ row_block_offsets,
+ cells,
+ row_blocks,
+ col_blocks,
+ rows,
+ row_block_ids);
+  // Finalize row-index array of CRS structure by computing prefix sum
+ thrust::inclusive_scan(
+ thrust::cuda::par.on(stream), rows, rows + num_rows + 1, rows);
+
+ // Fill cols array of CRS structure and permutation from block-sparse to CRS
+ const int num_blocks_rowwise = NumBlocks(num_rows);
+ ComputeColumnsAndPermutation<<<num_blocks_rowwise,
+ kCudaBlockSize,
+ 0,
+ stream>>>(num_rows,
+ row_block_offsets,
+ cells,
+ row_blocks,
+ col_blocks,
+ row_block_ids,
+ rows,
+ cols,
+ permutation);
+}
+
+// Updates values of CRS matrix with a continuous block of values of
+// block-sparse matrix. With permutation[i] being an index of i-th value of
+// block-sparse matrix in values of CRS matrix updating values is quite
+// efficient when performed element-wise (reads of permutation and values arrays
+// are coalesced when offset is divisible by 16)
+__global__ void PermuteValuesKernel(const int offset,
+ const int num_values,
+ const int* permutation,
+ const double* block_sparse_values,
+ double* crs_values) {
+ const int value_id = blockIdx.x * blockDim.x + threadIdx.x;
+ if (value_id < num_values) {
+ crs_values[permutation[offset + value_id]] = block_sparse_values[value_id];
+ }
+}
+
+void PermuteValues(const int offset,
+ const int num_values,
+ const int* permutation,
+ const double* block_sparse_values,
+ double* crs_values,
+ cudaStream_t stream) {
+ const int num_blocks = NumBlocks(num_values);
+ PermuteValuesKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(
+ offset, num_values, permutation, block_sparse_values, crs_values);
+}
+
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/cuda_kernels.h b/internal/ceres/cuda_kernels.h
index d347c20..61f945a 100644
--- a/internal/ceres/cuda_kernels.h
+++ b/internal/ceres/cuda_kernels.h
@@ -37,7 +37,10 @@
#include "cuda_runtime.h"
-namespace ceres::internal {
+namespace ceres {
+namespace internal {
+class Block;
+class Cell;
// Convert an array of double (FP64) values to float (FP32). Both arrays must
// already be on GPU memory.
@@ -72,7 +75,38 @@
const int size,
cudaStream_t stream);
-} // namespace ceres::internal
+// Compute structure of CRS matrix and permutation of values using block-sparse
+// structure and temporary array row_block_ids. Array row_block_ids of size
+// num_rows will be filled with indices of row-blocks corresponding to rows of
+// CRS matrix. Arrays corresponding to CRS matrix, permutation and row_block_ids
+// arrays are to be allocated by caller
+void FillCRSStructure(const int num_row_blocks,
+ const int num_rows,
+ const int* row_block_offsets,
+ const Cell* cells,
+ const Block* row_blocks,
+ const Block* col_blocks,
+ int* rows,
+ int* cols,
+ int* row_block_ids,
+ int* permutation,
+ cudaStream_t stream);
+
+// Permute block of block-sparse values using permutation
+// Pointer block_sparse_values corresponds to a block of num_values values from
+// block-sparse matrix at the offset from beginning. Pointer output corresponds
+// to values of CRS matrix. Array permutation stores permutation from
+// block-sparse to CRS matrix with permutation[i] being an index of i-th value
+// of block-sparse matrix in values of CRS matrix
+void PermuteValues(const int offset,
+ const int num_values,
+ const int* permutation,
+ const double* block_sparse_values,
+ double* crs_values,
+ cudaStream_t stream);
+
+} // namespace internal
+} // namespace ceres
#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc
index 32ff55a..41036a3 100644
--- a/internal/ceres/cuda_kernels_test.cc
+++ b/internal/ceres/cuda_kernels_test.cc
@@ -57,8 +57,10 @@
fp64_gpu.CopyFromCpuVector(fp64_cpu);
CudaBuffer<float> fp32_gpu(&context);
fp32_gpu.Reserve(fp64_cpu.size());
- CudaFP64ToFP32(
- fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), context.stream_);
+ CudaFP64ToFP32(fp64_gpu.data(),
+ fp32_gpu.data(),
+ fp64_cpu.size(),
+ context.DefaultStream());
std::vector<float> fp32_cpu(fp64_cpu.size());
fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size());
for (int i = 0; i < fp32_cpu.size(); ++i) {
@@ -82,8 +84,10 @@
fp64_gpu.CopyFromCpuVector(fp64_cpu);
CudaBuffer<float> fp32_gpu(&context);
fp32_gpu.Reserve(fp64_cpu.size());
- CudaFP64ToFP32(
- fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), context.stream_);
+ CudaFP64ToFP32(fp64_gpu.data(),
+ fp32_gpu.data(),
+ fp64_cpu.size(),
+ context.DefaultStream());
std::vector<float> fp32_cpu(fp64_cpu.size());
fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size());
EXPECT_EQ(fp32_cpu[0], 0.0f);
@@ -101,8 +105,10 @@
fp32_gpu.CopyFromCpuVector(fp32_cpu);
CudaBuffer<double> fp64_gpu(&context);
fp64_gpu.Reserve(fp32_cpu.size());
- CudaFP32ToFP64(
- fp32_gpu.data(), fp64_gpu.data(), fp32_cpu.size(), context.stream_);
+ CudaFP32ToFP64(fp32_gpu.data(),
+ fp64_gpu.data(),
+ fp32_cpu.size(),
+ context.DefaultStream());
std::vector<double> fp64_cpu(fp32_cpu.size());
fp64_gpu.CopyToCpu(fp64_cpu.data(), fp64_cpu.size());
for (int i = 0; i < fp64_cpu.size(); ++i) {
@@ -117,7 +123,7 @@
std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
CudaBuffer<float> fp32_gpu(&context);
fp32_gpu.CopyFromCpuVector(fp32_cpu);
- CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), context.stream_);
+ CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), context.DefaultStream());
std::vector<float> fp32_cpu_zero(fp32_cpu.size());
fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size());
for (int i = 0; i < fp32_cpu_zero.size(); ++i) {
@@ -132,7 +138,7 @@
std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0};
CudaBuffer<double> fp64_gpu(&context);
fp64_gpu.CopyFromCpuVector(fp64_cpu);
- CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), context.stream_);
+ CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), context.DefaultStream());
std::vector<double> fp64_cpu_zero(fp64_cpu.size());
fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size());
for (int i = 0; i < fp64_cpu_zero.size(); ++i) {
@@ -151,8 +157,10 @@
fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a);
CudaBuffer<double> fp64_gpu_b(&context);
fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b);
- CudaDsxpy(
- fp64_gpu_b.data(), fp32_gpu_a.data(), fp32_gpu_a.size(), context.stream_);
+ CudaDsxpy(fp64_gpu_b.data(),
+ fp32_gpu_a.data(),
+ fp32_gpu_a.size(),
+ context.DefaultStream());
fp64_gpu_b.CopyToCpu(fp64_cpu_b.data(), fp64_cpu_b.size());
for (int i = 0; i < fp64_cpu_b.size(); ++i) {
EXPECT_DOUBLE_EQ(fp64_cpu_b[i], 2.0 * fp32_cpu_a[i]);
@@ -172,8 +180,11 @@
y_gpu.CopyFromCpuVector(y_cpu);
CudaBuffer<double> d_gpu(&context);
d_gpu.CopyFromCpuVector(d_cpu);
- CudaDtDxpy(
- y_gpu.data(), d_gpu.data(), x_gpu.data(), y_gpu.size(), context.stream_);
+ CudaDtDxpy(y_gpu.data(),
+ d_gpu.data(),
+ x_gpu.data(),
+ y_gpu.size(),
+ context.DefaultStream());
y_gpu.CopyToCpu(y_cpu.data(), y_cpu.size());
EXPECT_DOUBLE_EQ(y_cpu[0], 4.0 + 10.0 * 10.0 * 1.0);
EXPECT_DOUBLE_EQ(y_cpu[1], 3.0 + 20.0 * 20.0 * 2.0);
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index 7ee1761..0ae8c5d 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -59,6 +59,31 @@
namespace ceres::internal {
+CudaSparseMatrix::CudaSparseMatrix(int num_rows,
+ int num_cols,
+ int num_nonzeros,
+ ContextImpl* context)
+ : context_(context),
+ rows_(context, num_rows + 1),
+ cols_(context, num_nonzeros),
+ values_(context, num_nonzeros),
+ spmv_buffer_(context),
+ num_rows_(num_rows),
+ num_cols_(num_cols),
+ num_nonzeros_(num_nonzeros) {
+ cusparseCreateCsr(&descr_,
+ num_rows_,
+ num_cols_,
+ num_nonzeros_,
+ rows_.data(),
+ cols_.data(),
+ values_.data(),
+ CUSPARSE_INDEX_32I,
+ CUSPARSE_INDEX_32I,
+ CUSPARSE_INDEX_BASE_ZERO,
+ CUDA_R_64F);
+}
+
CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
const CompressedRowSparseMatrix& crs_matrix)
: context_(context),
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
index d5f3332..ad55170 100644
--- a/internal/ceres/cuda_sparse_matrix.h
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -64,6 +64,13 @@
CudaSparseMatrix(ContextImpl* context,
const CompressedRowSparseMatrix& crs_matrix);
+ // Creates a "blank" matrix with an appropriate amount of memory allocated.
+ // The object itself is left in an inconsistent state.
+ CudaSparseMatrix(int num_rows,
+ int num_cols,
+ int num_nonzeros,
+ ContextImpl* context);
+
~CudaSparseMatrix();
// y = y + Ax;
@@ -75,6 +82,10 @@
int num_cols() const { return num_cols_; }
int num_nonzeros() const { return num_nonzeros_; }
+ int32_t* mutable_rows() { return rows_.data(); }
+ int32_t* mutable_cols() { return cols_.data(); }
+ double* mutable_values() { return values_.data(); }
+
// If subsequent uses of this matrix involve only numerical changes and no
// structural changes, then this method can be used to copy the updated
// non-zero values -- the row and column index arrays are kept the same. It
diff --git a/internal/ceres/cuda_streamed_buffer.h b/internal/ceres/cuda_streamed_buffer.h
new file mode 100644
index 0000000..8761ef4
--- /dev/null
+++ b/internal/ceres/cuda_streamed_buffer.h
@@ -0,0 +1,335 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
+#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+#include "ceres/cuda_buffer.h"
+
+namespace ceres::internal {
+
+// Most contemporary CUDA devices are capable of simultaneous code execution and
+// host-to-device transfer. This class copies batches of data to GPU memory and
+// executes processing of copied data in parallel (asynchronously).
+// Data is copied to a fixed-size buffer on GPU (containing at most
+// max_buffer_size values), and this memory is re-used when the previous
+// batch of values is processed by user-provided callback
+// Host-to-device copy uses a temporary buffer if required. Each batch of values
+// has size of kValuesPerBatch, except the last one.
+template <typename T>
+class CERES_NO_EXPORT CudaStreamedBuffer {
+ public:
+ // If hardware supports only one host-to-device copy or one host-to-device
+ // copy is able to reach peak bandwidth, two streams are sufficient to reach
+ // maximum efficiency:
+ // - If transferring batch of values takes more time, than processing it on
+ // gpu, then at every moment of time one of the streams will be transferring
+ // data and other stream will be either processing data or idle; the whole
+ // process will be bounded by host-to-device copy.
+ // - If transferring batch of values takes less time, than processing it on
+ // gpu, then at every moment of time one of the streams will be processing
+ // data and other stream will be either performing computations or
+ // transferring data, and the whole process will be bounded by computations.
+ static constexpr int kNumBatches = 2;
+ // max_buffer_size is the maximal size (in elements of type T) of array
+ // to be pre-allocated in gpu memory. The size of array determines size of
+ // batch of values for simultaneous copying and processing. It should be large
+ // enough to allow highly-parallel execution of user kernels; making it too
+ // large increases latency.
+ CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
+ : kValuesPerBatch(max_buffer_size / kNumBatches),
+ context_(context),
+ values_gpu_(context, kValuesPerBatch * kNumBatches) {
+ static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
+ CHECK_GE(max_buffer_size, kNumBatches);
+ // Pre-allocate a buffer of page-locked memory for transfers from a regular
+ // cpu memory. Because we will be only writing into that buffer from cpu,
+ // memory is allocated with cudaHostAllocWriteCombined flag.
+ CHECK_EQ(cudaSuccess,
+ cudaHostAlloc(&values_cpu_pinned_,
+ sizeof(T) * kValuesPerBatch * kNumBatches,
+ cudaHostAllocWriteCombined));
+ for (auto& e : copy_finished_) {
+ CHECK_EQ(cudaSuccess,
+ cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
+ }
+ }
+
+ CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;
+
+ ~CudaStreamedBuffer() {
+ CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
+ for (auto& e : copy_finished_) {
+ CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
+ }
+ }
+
+ // Transfer num_values at host-memory pointer from, calling
+ // callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use)
+ // after scheduling transfer of each batch of data. User-provided callback
+ // should perform processing of data at device_pointer only in
+ // stream_to_use stream (device_pointer will be re-used in the next
+ // callback invocation with the same stream).
+ //
+ // Two diagrams below describe operation in two possible scenarios, depending
+ // on input data being stored in page-locked memory. In this example we will
+ // have max_buffer_size = 2 * K, num_values = N * K and callback
+ // scheduling a single asynchronous launch of
+ // Kernel<<..., stream_to_use>>(device_pointer,
+ // size_of_batch,
+ // offset_of_batch)
+ //
+ // a. Copying from page-locked memory
+ // In this case no copy on the host-side is necessary, and this method just
+ // schedules a bunch of interleaved memory copies and callback invocations:
+ //
+ // cudaStreamSynchronize(context->DefaultStream());
+ // - Iteration #0:
+ // - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0)
+ // - callback(values_gpu_, K, 0, stream_0)
+ // - Iteration #1:
+ // - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D,
+ // stream_1)
+ // - callback(values_gpu_ + K, K, K, stream_1)
+ // - Iteration #2:
+ // - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D,
+ // stream_0)
+ // - callback(values_gpu_, K, 2 * K, stream_0)
+ // - Iteration #3:
+ // - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D,
+ // stream_1)
+ // - callback(values_gpu_ + K, K, 3 * K, stream_1)
+ // ...
+ // - Iteration #i:
+ // - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K, K *
+ // sizeof(T), H->D, stream_(i % 2))
+ // - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2)
+ // ...
+ // cudaStreamSynchronize(stream_0)
+ // cudaStreamSynchronize(stream_1)
+ //
+ // This sequence of calls results in following activity on gpu (assuming that
+ // kernel invoked by callback takes less time than host-to-device copy):
+ // +-------------------+-------------------+
+ // | Stream #0 | Stream #1 |
+ // +-------------------+-------------------+
+ // | Copy host->device | |
+ // | | |
+ // | | |
+ // +-------------------+-------------------+
+ // | Kernel | Copy host->device |
+ // +-------------------+ |
+ // | | |
+ // +-------------------+-------------------+
+ // | Copy host->device | Kernel |
+ // | +-------------------+
+ // | | |
+ // +-------------------+-------------------+
+ // | Kernel | Copy host->device |
+ // | ... |
+ // +---------------------------------------+
+ //
+ // b. Copying from regular memory
+ // In this case a copy from regular memory to page-locked memory is required
+// in order to get asynchronous operation. Because pinned memory on host-side
+// is reused, additional synchronization is required. On each iteration of the
+// method the following actions are performed:
+ // - Wait till previous copy operation in stream is completed
+ // - Copy batch of values from input array into pinned memory
+ // - Asynchronously launch host-to-device copy
+ // - Setup event for synchronization on copy completion
+ // - Invoke callback (that launches kernel asynchronously)
+ //
+ // Invocations are performed with the following arguments
+ // cudaStreamSynchronize(context->DefaultStream());
+ // - Iteration #0:
+ // - cudaEventSynchronize(copy_finished_0)
+ // - std::copy_n(from, K, values_cpu_pinned_)
+ // - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
+ // stream_0)
+ // - cudaEventRecord(copy_finished_0, stream_0)
+ // - callback(values_gpu_, K, 0, stream_0)
+ // - Iteration #1:
+ // - cudaEventSynchronize(copy_finished_1)
+ // - std::copy_n(from + K, K, values_cpu_pinned_ + K)
+ // - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
+ // sizeof(T), H->D, stream_1)
+ // - cudaEventRecord(copy_finished_1, stream_1)
+ // - callback(values_gpu_ + K, K, K, stream_1)
+ // - Iteration #2:
+ // - cudaEventSynchronize(copy_finished_0)
+ // - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
+ // - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
+ // stream_0)
+ // - cudaEventRecord(copy_finished_0, stream_0)
+ // - callback(values_gpu_, K, 2 * K, stream_0)
+ // - Iteration #3:
+ // - cudaEventSynchronize(copy_finished_1)
+ // - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
+ // - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
+ // sizeof(T), H->D, stream_1)
+ // - cudaEventRecord(copy_finished_1, stream_1)
+ // - callback(values_gpu_ + K, K, 3 * K, stream_1)
+ // ...
+ // - Iteration #i:
+ // - cudaEventSynchronize(copy_finished_(i % 2))
+ // - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
+ // - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, values_cpu_pinned_ + (i %
+ // 2) * K, K * sizeof(T), H->D, stream_(i % 2))
+ // - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
+ // - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
+ // ...
+ // cudaStreamSynchronize(stream_0)
+ // cudaStreamSynchronize(stream_1)
+ //
+ // This sequence of calls results in the following activity on cpu and gpu
+ // (assuming that kernel invoked by callback takes less time than
+ // host-to-device copy and copy in cpu memory, and copy in cpu memory is
+ // faster than host-to-device copy):
+ // +----------------------------+-------------------+-------------------+
+ // | CPU thread | Stream #0 | Stream #1 |
+ // +----------------------------+-------------------+-------------------+
+ // | Copy to pinned memory | | |
+ // | | | |
+ // +----------------------------+-------------------| |
+ // | Copy to pinned memory | Copy host->device | |
+ // | | | |
+ // +----------------------------+ | |
+ // | Waiting previous h->d copy | | |
+ // +----------------------------+-------------------+-------------------+
+ // | Copy to pinned memory | Kernel | Copy host->device |
+ // | +-------------------+ |
+ // +----------------------------+ | |
+ // | Waiting previous h->d copy | | |
+ // +----------------------------+-------------------+-------------------+
+ // | Copy to pinned memory | Copy host->device | Kernel |
+ // | | +-------------------+
+ // | ... ... |
+ // +----------------------------+---------------------------------------+
+ //
+ template <typename Fun>
+ void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
+ // This synchronization is not required in some cases, but we perform it in
+ // order to avoid a situation where the user-supplied callback depends on
+ // data that is still to be computed in the default stream
+ CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));
+
+ // If pointer to input data does not correspond to page-locked memory,
+ // host-to-device memory copy might be executed synchronously (with a copy
+ // to pinned memory happening inside the driver). In that case we perform
+ // copy to a pre-allocated array of page-locked memory.
+ const bool copy_to_pinned_memory = MemoryTypeResultsInSynchronousCopy(from);
+ T* batch_values_gpu[kNumBatches];
+ T* batch_values_cpu[kNumBatches];
+ auto streams = context_->streams_;
+ for (int i = 0; i < kNumBatches; ++i) {
+ batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
+ batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
+ }
+ int batch_id = 0;
+ for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
+ const int num_values_batch =
+ std::min(num_values - offset, kValuesPerBatch);
+ const T* batch_from = from + offset;
+ T* batch_to = batch_values_gpu[batch_id];
+ auto stream = streams[batch_id];
+ auto copy_finished = copy_finished_[batch_id];
+
+ if (copy_to_pinned_memory) {
+ // Copying values to a temporary buffer should be started only after the
+ // previous copy from temporary buffer to device is completed.
+ CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
+ std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
+ batch_from = batch_values_cpu[batch_id];
+ }
+ CHECK_EQ(cudaSuccess,
+ cudaMemcpyAsync(batch_to,
+ batch_from,
+ sizeof(T) * num_values_batch,
+ cudaMemcpyHostToDevice,
+ stream));
+ if (copy_to_pinned_memory) {
+ // Next copy to a temporary buffer can start straight after asynchronous
+ // copy is completed (and might be started before kernels asynchronously
+ // executed in stream by user-supplied callback are completed).
+ // No explicit synchronization is required when copying data from
+ // page-locked memory, because memory copy and user kernel execution
+ // with corresponding part of values_gpu_ array is serialized using
+ // stream
+ CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
+ }
+ callback(batch_to, num_values_batch, offset, stream);
+ batch_id = (batch_id + 1) % kNumBatches;
+ }
+ // Explicitly synchronize on all CUDA streams that were utilized.
+ for (int i = 0; i < kNumBatches; ++i) {
+ CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
+ }
+ }
+
+ private:
+ // It is necessary to have all host-to-device copies to be completely
+ // asynchronous. This requires source memory to be allocated in page-locked
+ // memory.
+ static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
+ cudaPointerAttributes attributes;
+ auto status = cudaPointerGetAttributes(&attributes, ptr);
+#if CUDART_VERSION < 11000
+ // In CUDA versions prior to 11, a call to cudaPointerGetAttributes with a
+ // host pointer will return cudaErrorInvalidValue
+ if (status == cudaErrorInvalidValue) {
+ return true;
+ }
+#endif
+ CHECK_EQ(status, cudaSuccess);
+ // This class only supports cpu memory as a source
+ CHECK_NE(attributes.type, cudaMemoryTypeDevice);
+ // If host memory was allocated (or registered) with CUDA API, or is a
+ // managed memory, then call to cudaMemcpyAsync will be asynchronous. In case
+ // of managed memory it might be slightly better to perform a single call of
+ // user-provided call-back (and hope that page migration will provide a
+ // similar throughput with zero efforts from our side).
+ return attributes.type == cudaMemoryTypeUnregistered;
+ }
+
+ const int kValuesPerBatch;
+ ContextImpl* context_ = nullptr;
+ CudaBuffer<T> values_gpu_;
+ T* values_cpu_pinned_ = nullptr;
+ cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
+};
+
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
+#endif // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
diff --git a/internal/ceres/cuda_streamed_buffer_test.cc b/internal/ceres/cuda_streamed_buffer_test.cc
new file mode 100644
index 0000000..4837005
--- /dev/null
+++ b/internal/ceres/cuda_streamed_buffer_test.cc
@@ -0,0 +1,169 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <numeric>
+
+#include "ceres/cuda_streamed_buffer.h"
+
+namespace ceres::internal {
+
+TEST(CudaStreamedBufferTest, IntegerCopy) {
+ // Offsets and sizes of batches supplied to callback
+ std::vector<std::pair<int, int>> batches;
+ const int kMaxTemporaryArraySize = 16;
+ const int kInputSize = kMaxTemporaryArraySize * 7 + 3;
+ ContextImpl context;
+ std::string message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
+
+ std::vector<int> inputs(kInputSize);
+ std::vector<int> outputs(kInputSize, -1);
+ std::iota(inputs.begin(), inputs.end(), 0);
+
+ CudaStreamedBuffer<int> streamed_buffer(&context, kMaxTemporaryArraySize);
+ streamed_buffer.CopyToGpu(inputs.data(),
+ kInputSize,
+ [&outputs, &batches](const int* device_pointer,
+ int size,
+ int offset,
+ cudaStream_t stream) {
+ batches.emplace_back(offset, size);
+ CHECK_EQ(cudaSuccess,
+ cudaMemcpyAsync(outputs.data() + offset,
+ device_pointer,
+ sizeof(int) * size,
+ cudaMemcpyDeviceToHost,
+ stream));
+ });
+ // All operations in all streams should be completed when CopyToGpu returns
+ // control to the caller
+ for (int i = 0; i < ContextImpl::kNumCudaStreams; ++i) {
+ CHECK_EQ(cudaSuccess, cudaStreamQuery(context.streams_[i]));
+ }
+
+ // Check if every element was visited
+ for (int i = 0; i < kInputSize; ++i) {
+ CHECK_EQ(outputs[i], i);
+ }
+
+ // Check if there is no overlap between batches
+ std::sort(batches.begin(), batches.end());
+ const int num_batches = batches.size();
+ for (int i = 0; i < num_batches; ++i) {
+ const auto [begin, size] = batches[i];
+ const int end = begin + size;
+ CHECK_GE(begin, 0);
+ CHECK_LT(begin, kInputSize);
+
+ CHECK_GT(size, 0);
+ CHECK_LE(end, kInputSize);
+
+ if (i + 1 == num_batches) continue;
+ CHECK_EQ(end, batches[i + 1].first);
+ }
+}
+
+TEST(CudaStreamedBufferTest, IntegerNoCopy) {
+ // Offsets and sizes of batches supplied to callback
+ std::vector<std::pair<int, int>> batches;
+ const int kMaxTemporaryArraySize = 16;
+ const int kInputSize = kMaxTemporaryArraySize * 7 + 3;
+ ContextImpl context;
+ std::string message;
+ CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
+
+ int* inputs;
+ int* outputs;
+ CHECK_EQ(cudaSuccess,
+ cudaHostAlloc(
+ &inputs, sizeof(int) * kInputSize, cudaHostAllocWriteCombined));
+ CHECK_EQ(
+ cudaSuccess,
+ cudaHostAlloc(&outputs, sizeof(int) * kInputSize, cudaHostAllocDefault));
+
+ std::fill(outputs, outputs + kInputSize, -1);
+ std::iota(inputs, inputs + kInputSize, 0);
+
+ CudaStreamedBuffer<int> streamed_buffer(&context, kMaxTemporaryArraySize);
+ streamed_buffer.CopyToGpu(inputs,
+ kInputSize,
+ [outputs, &batches](const int* device_pointer,
+ int size,
+ int offset,
+ cudaStream_t stream) {
+ batches.emplace_back(offset, size);
+ CHECK_EQ(cudaSuccess,
+ cudaMemcpyAsync(outputs + offset,
+ device_pointer,
+ sizeof(int) * size,
+ cudaMemcpyDeviceToHost,
+ stream));
+ });
+ // All operations in all streams should be completed when CopyToGpu returns
+ // control to the caller
+ for (int i = 0; i < ContextImpl::kNumCudaStreams; ++i) {
+ CHECK_EQ(cudaSuccess, cudaStreamQuery(context.streams_[i]));
+ }
+
+ // Check if every element was visited
+ for (int i = 0; i < kInputSize; ++i) {
+ CHECK_EQ(outputs[i], i);
+ }
+
+ // Check if there is no overlap between batches
+ std::sort(batches.begin(), batches.end());
+ const int num_batches = batches.size();
+ for (int i = 0; i < num_batches; ++i) {
+ const auto [begin, size] = batches[i];
+ const int end = begin + size;
+ CHECK_GE(begin, 0);
+ CHECK_LT(begin, kInputSize);
+
+ CHECK_GT(size, 0);
+ CHECK_LE(end, kInputSize);
+
+ if (i + 1 == num_batches) continue;
+ CHECK_EQ(end, batches[i + 1].first);
+ }
+
+ CHECK_EQ(cudaSuccess, cudaFreeHost(inputs));
+ CHECK_EQ(cudaSuccess, cudaFreeHost(outputs));
+}
+
+} // namespace ceres::internal
+
+#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index 274a47a..e2143de 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -127,7 +127,7 @@
void CudaVector::SetZero() {
CHECK(data_.data() != nullptr);
- CudaSetZeroFP64(data_.data(), num_rows_, context_->stream_);
+ CudaSetZeroFP64(data_.data(), num_rows_, context_->DefaultStream());
}
void CudaVector::Axpby(double a, const CudaVector& x, double b) {
@@ -160,7 +160,7 @@
D.data().data(),
x.data().data(),
num_rows_,
- context_->stream_);
+ context_->DefaultStream());
}
void CudaVector::Scale(double s) {
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index 93e9f0d..f19ca7e 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -516,7 +516,7 @@
residual_fp32_.data(),
num_cols_ * sizeof(float),
cudaMemcpyDeviceToDevice,
- context_->stream_),
+ context_->DefaultStream()),
cudaSuccess);
if (cusolverDnSpotrs(context_->cusolver_handle_,
CUBLAS_FILL_MODE_LOWER,
@@ -569,7 +569,7 @@
CudaFP64ToFP32(lhs_fp64_.data(),
lhs_fp32_.data(),
num_cols * num_cols,
- context_->stream_);
+ context_->DefaultStream());
// Factorize lhs_fp32.
factorize_result_ = CudaCholeskyFactorize(message);
@@ -592,7 +592,7 @@
residual_fp64_.Reserve(num_cols_);
// Initialize x = 0.
- CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->stream_);
+ CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->DefaultStream());
// Initialize residual = rhs.
rhs_fp64_.CopyFromCpu(rhs, num_cols_);
@@ -603,15 +603,17 @@
CudaFP64ToFP32(residual_fp64_.data(),
residual_fp32_.data(),
num_cols_,
- context_->stream_);
+ context_->DefaultStream());
// [fp32] c = lhs^-1 * residual.
auto result = CudaCholeskySolve(message);
if (result != LinearSolverTerminationType::SUCCESS) {
return result;
}
// [fp64] x += c.
- CudaDsxpy(
- x_fp64_.data(), correction_fp32_.data(), num_cols_, context_->stream_);
+ CudaDsxpy(x_fp64_.data(),
+ correction_fp32_.data(),
+ num_cols_,
+ context_->DefaultStream());
if (i < max_num_refinement_iterations_) {
// [fp64] residual = rhs - lhs * x
// This is done in two steps:
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index ae1a65d..2ba03ed 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -36,6 +36,7 @@
#include "benchmark/benchmark.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/bundle_adjustment_test_util.h"
+#include "ceres/cuda_block_sparse_crs_view.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
@@ -427,6 +428,75 @@
CHECK_GT(y.squaredNorm(), 0.);
}
+static void JacobianToCRS(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ auto jacobian = data->BlockSparseJacobian(context);
+
+ std::unique_ptr<CompressedRowSparseMatrix> matrix;
+ for (auto _ : state) {
+ matrix = jacobian->ToCompressedRowSparseMatrix();
+ }
+ CHECK(matrix != nullptr);
+}
+
+#ifndef CERES_NO_CUDA
+// We want CudaBlockSparseCRSView to be not slower than explicit conversion to
+// CRS on CPU
+static void JacobianToCRSView(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ auto jacobian = data->BlockSparseJacobian(context);
+
+ std::unique_ptr<CudaBlockSparseCRSView> matrix;
+ for (auto _ : state) {
+ matrix = std::make_unique<CudaBlockSparseCRSView>(*jacobian, context);
+ }
+ CHECK(matrix != nullptr);
+}
+static void JacobianToCRSMatrix(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ auto jacobian = data->BlockSparseJacobian(context);
+
+ std::unique_ptr<CudaSparseMatrix> matrix;
+ std::unique_ptr<CompressedRowSparseMatrix> matrix_cpu;
+ for (auto _ : state) {
+ matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
+ matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
+ }
+ CHECK(matrix != nullptr);
+}
+// Updating values in CudaBlockSparseCRSView should be +- as fast as just
+// copying values (time spent in value permutation has to be hidden by PCIe
+// transfer)
+static void JacobianToCRSViewUpdate(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ auto jacobian = data->BlockSparseJacobian(context);
+
+ auto matrix = CudaBlockSparseCRSView(*jacobian, context);
+ for (auto _ : state) {
+ matrix.UpdateValues(*jacobian);
+ }
+}
+static void JacobianToCRSMatrixUpdate(benchmark::State& state,
+ BALData* data,
+ ContextImpl* context) {
+ auto jacobian = data->BlockSparseJacobian(context);
+
+ auto matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
+ auto matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
+ for (auto _ : state) {
+ CHECK_EQ(cudaSuccess,
+ cudaMemcpy(matrix->mutable_values(),
+ matrix_cpu->values(),
+ matrix->num_nonzeros() * sizeof(double),
+ cudaMemcpyHostToDevice));
+ }
+}
+#endif
+
static void JacobianSquaredColumnNorm(benchmark::State& state,
BALData* data,
ContextImpl* context) {
@@ -772,6 +842,34 @@
->Arg(4)
->Arg(8)
->Arg(16);
+
+ const std::string name_to_crs = "JacobianToCRS<" + path + ">";
+ ::benchmark::RegisterBenchmark(
+ name_to_crs.c_str(), ceres::internal::JacobianToCRS, data, &context);
+#ifndef CERES_NO_CUDA
+ const std::string name_to_crs_view = "JacobianToCRSView<" + path + ">";
+ ::benchmark::RegisterBenchmark(name_to_crs_view.c_str(),
+ ceres::internal::JacobianToCRSView,
+ data,
+ &context);
+ const std::string name_to_crs_matrix = "JacobianToCRSMatrix<" + path + ">";
+ ::benchmark::RegisterBenchmark(name_to_crs_matrix.c_str(),
+ ceres::internal::JacobianToCRSMatrix,
+ data,
+ &context);
+ const std::string name_to_crs_view_update =
+ "JacobianToCRSViewUpdate<" + path + ">";
+ ::benchmark::RegisterBenchmark(name_to_crs_view_update.c_str(),
+ ceres::internal::JacobianToCRSViewUpdate,
+ data,
+ &context);
+ const std::string name_to_crs_matrix_update =
+ "JacobianToCRSMatrixUpdate<" + path + ">";
+ ::benchmark::RegisterBenchmark(name_to_crs_matrix_update.c_str(),
+ ceres::internal::JacobianToCRSMatrixUpdate,
+ data,
+ &context);
+#endif
}
::benchmark::RunSpecifiedBenchmarks();