Permutation-based conversion from block-sparse to CRS Change-Id: Ic33a6476c033187dff61886deb6d1761524943f0
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 682c27f..e4b124d 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml
@@ -47,15 +47,25 @@ liblapack-dev \ libmetis-dev \ libsuitesparse-dev \ - ninja-build - - - name: Setup CUDA toolkit - if: matrix.gpu == 'cuda' + ninja-build \ + wget + - name: Setup CUDA toolkit (system repositories) + if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:20.04' run: | apt-get install -y \ nvidia-cuda-dev \ nvidia-cuda-toolkit - + # The nvidia-cuda-toolkit + gcc combo shipped with Ubuntu 22.04 LTS is + # broken and is not able to compile code that uses Thrust: + # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1006962 + - name: Setup CUDA toolkit (nvidia repositories) + if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:22.04' + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb + dpkg -i cuda-keyring_1.0-1_all.deb + apt-get update + apt-get install -y cuda + echo "CUDACXX=/usr/local/cuda/bin/nvcc" >> $GITHUB_ENV - name: Cache Build id: cache-build
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 9729ef6..9cb99b4 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -141,8 +141,9 @@ $<TARGET_FILE:cuda_dense_cholesky_test>) endif (BUILD_TESTING AND GFLAGS) set_source_files_properties(cuda_kernels.cu.cc PROPERTIES LANGUAGE CUDA) - add_library(ceres_cuda_kernels cuda_kernels.cu.cc) + add_library(ceres_cuda_kernels STATIC cuda_kernels.cu.cc) target_compile_features(ceres_cuda_kernels PRIVATE cxx_std_14) + target_include_directories(ceres_cuda_kernels PRIVATE ${Ceres_SOURCE_DIR}/internal ${Ceres_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR}) list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ceres_cuda_kernels) endif (USE_CUDA) @@ -181,6 +182,8 @@ cost_function.cc covariance.cc covariance_impl.cc + cuda_block_sparse_crs_view.cc + cuda_block_structure.cc cuda_sparse_matrix.cc cuda_vector.cc dense_cholesky.cc @@ -472,10 +475,13 @@ ceres_test(cost_function_to_functor) ceres_test(covariance) ceres_test(cubic_interpolation) + ceres_test(cuda_block_sparse_crs_view) + ceres_test(cuda_block_structure) ceres_test(cuda_dense_cholesky) ceres_test(cuda_dense_qr) ceres_test(cuda_kernels) ceres_test(cuda_sparse_matrix) + ceres_test(cuda_streamed_buffer) ceres_test(cuda_vector) ceres_test(dense_linear_solver) ceres_test(dense_cholesky)
diff --git a/internal/ceres/block_structure.h b/internal/ceres/block_structure.h index aaa4080..bccfd8b 100644 --- a/internal/ceres/block_structure.h +++ b/internal/ceres/block_structure.h
@@ -43,7 +43,11 @@ #include "ceres/internal/export.h" -namespace ceres::internal { +// This file is included into source files that are compiled with nvcc. +// The nvcc shipped with Ubuntu 20.04 does not support some C++17 features, +// including nested namespace definitions. +namespace ceres { +namespace internal { using BlockSize = int32_t; @@ -189,6 +193,7 @@ std::vector<Block> Tail(const std::vector<Block>& blocks, int n); int SumSquaredSizes(const std::vector<Block>& blocks); -} // namespace ceres::internal +} // namespace internal +} // namespace ceres #endif // CERES_INTERNAL_BLOCK_STRUCTURE_H_
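For reference, a minimal illustration of the C++17 feature being avoided (editorial sketch, not part of the change):

    // C++17 nested namespace definition, rejected by the nvcc shipped with
    // Ubuntu 20.04:
    namespace ceres::internal {}

    // Pre-C++17 spelling used in this change instead:
    namespace ceres {
    namespace internal {}
    }  // namespace ceres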
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc index cc01b54..ea3016b 100644 --- a/internal/ceres/context_impl.cc +++ b/internal/ceres/context_impl.cc
@@ -60,9 +60,11 @@ cusparseDestroy(cusparse_handle_); cusparse_handle_ = nullptr; } - if (stream_ != nullptr) { - cudaStreamDestroy(stream_); - stream_ = nullptr; + for (auto& s : streams_) { + if (s != nullptr) { + cudaStreamDestroy(s); + s = nullptr; + } } is_cuda_initialized_ = false; } @@ -143,19 +145,22 @@ return false; } event_logger.AddEvent("cusparseCreate"); - if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) != - cudaSuccess) { - *message = - "CUDA initialization failed because CUDA::cudaStreamCreateWithFlags " - "failed."; - TearDown(); - return false; + for (auto& s : streams_) { + if (cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking) != cudaSuccess) { + *message = + "CUDA initialization failed because CUDA::cudaStreamCreateWithFlags " + "failed."; + TearDown(); + return false; + } } event_logger.AddEvent("cudaStreamCreateWithFlags"); - if (cusolverDnSetStream(cusolver_handle_, stream_) != + if (cusolverDnSetStream(cusolver_handle_, DefaultStream()) != CUSOLVER_STATUS_SUCCESS || - cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS || - cusparseSetStream(cusparse_handle_, stream_) != CUSPARSE_STATUS_SUCCESS) { + cublasSetStream(cublas_handle_, DefaultStream()) != + CUBLAS_STATUS_SUCCESS || + cusparseSetStream(cusparse_handle_, DefaultStream()) != + CUSPARSE_STATUS_SUCCESS) { *message = "CUDA initialization failed because SetStream failed."; TearDown(); return false;
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h index a18d638..776ca31 100644 --- a/internal/ceres/context_impl.h +++ b/internal/ceres/context_impl.h
@@ -78,11 +78,32 @@ // Ceres, Ceres will use that GPU. // Note on Ceres' use of CUDA Streams: - // All operations on the GPU are performed using a single stream. This ensures - // that the order of operations are stream-ordered, but we do not need to - // explicitly synchronize the stream at the end of every operation. Stream - // synchronization occurs only before GPU to CPU transfers, and is handled by - // CudaBuffer. + // Most operations on the GPU are performed using a single stream. In + // those cases DefaultStream() should be used. This ensures that operations + // are stream-ordered, and may overlap with CPU processing with no + // additional effort. + // + // a. Single-stream workloads + // - Only use the default stream + // - Return control to the caller without synchronization whenever possible + // - Stream synchronization occurs only after GPU to CPU transfers, and is + // handled by CudaBuffer + // + // b. Multi-stream workloads + // Multi-stream workloads are more restricted in order to make race + // conditions harder to introduce. + // - Should always synchronize the default stream on entry + // - Should always synchronize all utilized streams on exit + // - Should not assume that any particular entry of streams_[] is the + // default stream + // + // With those rules in place: + // - All single-stream asynchronous workloads are serialized using the + // default stream + // - Multi-stream workloads always wait for single-stream workloads to + // finish and leave no running computations on exit. + // This slightly penalizes multi-stream workloads, but makes it easier to + // avoid race conditions when a multi-stream workload depends on the results + // of any preceding GPU computations. // Initializes cuBLAS, cuSOLVER, and cuSPARSE contexts, creates an // asynchronous CUDA stream, and associates the stream with the contexts. @@ -101,7 +122,14 @@ cusolverDnHandle_t cusolver_handle_ = nullptr; cublasHandle_t cublas_handle_ = nullptr; - cudaStream_t stream_ = nullptr; + + // Default stream. + // Kernel invocations and memory copies on this stream can be left without + // synchronization. + cudaStream_t DefaultStream() { return streams_[0]; } + static constexpr int kNumCudaStreams = 2; + cudaStream_t streams_[kNumCudaStreams] = {0}; + cusparseHandle_t cusparse_handle_ = nullptr; bool is_cuda_initialized_ = false; int gpu_device_id_in_use_ = -1;
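A minimal sketch of a multi-stream workload that follows the rules above (editorial illustration, not part of this change; DoWorkOnStream is a hypothetical helper that only launches asynchronous work on the given stream):

    // Editorial sketch: obeys the multi-stream rules documented in
    // context_impl.h.
    void MultiStreamWorkload(ceres::internal::ContextImpl* context) {
      // Synchronize the default stream on entry, so that this workload
      // observes the results of all preceding single-stream work.
      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context->DefaultStream()));
      for (auto& stream : context->streams_) {
        DoWorkOnStream(stream);  // async kernel launches / copies only
      }
      // Synchronize all utilized streams on exit, leaving no running
      // computations behind.
      for (auto& stream : context->streams_) {
        CHECK_EQ(cudaSuccess, cudaStreamSynchronize(stream));
      }
    }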
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc new file mode 100644 index 0000000..a03d01c --- /dev/null +++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -0,0 +1,74 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/cuda_block_sparse_crs_view.h" + +#ifndef CERES_NO_CUDA + +#include "ceres/cuda_block_structure.h" +#include "ceres/cuda_kernels.h" + +namespace ceres::internal { + +CudaBlockSparseCRSView::CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, + ContextImpl* context) + : permutation_(context, bsm.num_nonzeros()), + streamed_buffer_(context, kMaxTemporaryArraySize) { + CudaBlockSparseStructure block_structure(*bsm.block_structure(), context); + crs_matrix_ = std::make_unique<CudaSparseMatrix>( + bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context); + CudaBuffer<int> temp_rows(context, bsm.num_rows()); + FillCRSStructure(block_structure.num_row_blocks(), + bsm.num_rows(), + block_structure.row_block_offsets(), + block_structure.cells(), + block_structure.row_blocks(), + block_structure.col_blocks(), + crs_matrix_->mutable_rows(), + crs_matrix_->mutable_cols(), + temp_rows.data(), + permutation_.data(), + context->DefaultStream()); + UpdateValues(bsm); +} +void CudaBlockSparseCRSView::UpdateValues(const BlockSparseMatrix& bsm) { + streamed_buffer_.CopyToGpu( + bsm.values(), + bsm.num_nonzeros(), + [permutation = permutation_.data(), + values_to = crs_matrix_->mutable_values()]( + const double* values, int num_values, int offset, auto stream) { + PermuteValues( + offset, num_values, permutation, values, values_to, stream); + }); +} + +} // namespace ceres::internal +#endif // CERES_NO_CUDA
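A hypothetical usage sketch of the new view (editorial, not part of this change; assumes an existing BlockSparseMatrix* bsm whose values change between iterations while its structure stays fixed):

    // Editorial sketch: construct the CRS view once per sparsity structure,
    // then refresh only the values.
    ceres::internal::ContextImpl context;
    std::string message;
    CHECK(context.InitCuda(&message)) << message;

    ceres::internal::CudaBlockSparseCRSView view(*bsm, &context);
    // ... bsm->mutable_values() updated elsewhere, same structure ...
    view.UpdateValues(*bsm);
    ceres::internal::CudaSparseMatrix* crs = view.mutable_crs_matrix();
    // crs is now ready for SpMV via RightMultiplyAndAccumulate().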
diff --git a/internal/ceres/cuda_block_sparse_crs_view.h b/internal/ceres/cuda_block_sparse_crs_view.h new file mode 100644 index 0000000..3ea8498 --- /dev/null +++ b/internal/ceres/cuda_block_sparse_crs_view.h
@@ -0,0 +1,113 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) +// + +#ifndef CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_ +#define CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_ + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA + +#include <memory> + +#include "ceres/block_sparse_matrix.h" +#include "ceres/cuda_buffer.h" +#include "ceres/cuda_sparse_matrix.h" +#include "ceres/cuda_streamed_buffer.h" + +namespace ceres::internal { +// We use the cuSPARSE library for SpMV operations. However, it does not +// support the block-sparse format with varying block sizes. Thus, we perform +// the following operations in order to compute products of block-sparse +// matrices and dense vectors on the GPU: +// - Once per block-sparse structure update: +// - Compute CRS structure from block-sparse structure +// - Compute permutation from block-sparse values to CRS values +// - Once per block-sparse values update: +// - Update values in CRS matrix with values of block-sparse matrix +// +// Since there are no constraints on the positions of cells in the value array +// of the block-sparse matrix, a permutation from block-sparse values to CRS +// values is stored explicitly. +// +// Example: given a matrix with the following block structure +// [ 1 2 | | 6 7 ] +// [ 3 4 | | 8 9 ] +// [-----+-+-----] +// [ |5| ] +// with values stored as values_block_sparse = [1, 2, 3, 4, 5, 6, 7, 8, 9], +// the permutation from block-sparse to CRS is p = [0, 1, 4, 5, 8, 2, 3, 6, 7], +// and the i-th block-sparse value has index p[i] in the CRS values array. +// +// This makes it possible to avoid storing both CRS and block-sparse values in +// GPU memory.
+// Instead, block-sparse values are transferred to GPU memory as a disjoint +// set of small contiguous segments, and are simultaneously permuted into the +// correct order. +class CERES_NO_EXPORT CudaBlockSparseCRSView { + public: + // Initializes the internal CRS matrix and the permutation from block-sparse + // to CRS values. The following objects are stored in GPU memory for the + // whole lifetime of the object: + // - crs_matrix_: CRS matrix + // - permutation_: permutation from block-sparse to CRS value order + // (num_nonzeros integer values) + // - streamed_buffer_: helper for value updating + // The following objects are created temporarily during construction: + // - CudaBlockSparseStructure: block-sparse structure of block-sparse matrix + // - num_rows integer values: row to row-block map + CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, ContextImpl* context); + + const CudaSparseMatrix* crs_matrix() const { return crs_matrix_.get(); } + CudaSparseMatrix* mutable_crs_matrix() { return crs_matrix_.get(); } + + // Update values of crs_matrix_ using values of the block-sparse matrix. + // Assumes that bsm has the same block-sparse structure as the matrix that + // was used for construction. + void UpdateValues(const BlockSparseMatrix& bsm); + + private: + // The value-permutation kernel performs a single element-wise operation per + // thread, thus performing the permutation in blocks of 8 megabytes (1M + // doubles) of block-sparse values seems reasonable + static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024; + std::unique_ptr<CudaSparseMatrix> crs_matrix_; + // Permutation from block-sparse to CRS value order. + // permutation_[i] = index of i-th block-sparse value in CRS values + CudaBuffer<int> permutation_; + CudaStreamedBuffer<double> streamed_buffer_; +}; + +} // namespace ceres::internal + +#endif // CERES_NO_CUDA +#endif // CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_
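The permutation in the example above can be verified with a few lines of standalone CPU code (editorial illustration, not part of this change):

    #include <cassert>

    int main() {
      const double bs[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // block-sparse order
      const int p[] = {0, 1, 4, 5, 8, 2, 3, 6, 7};      // block-sparse -> CRS
      double crs[9];
      for (int i = 0; i < 9; ++i) crs[p[i]] = bs[i];
      // CRS stores each row contiguously: [1 2 6 7], [3 4 8 9], [5].
      const double expected[] = {1, 2, 6, 7, 3, 4, 8, 9, 5};
      for (int i = 0; i < 9; ++i) assert(crs[i] == expected[i]);
      return 0;
    }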
diff --git a/internal/ceres/cuda_block_sparse_crs_view_test.cc b/internal/ceres/cuda_block_sparse_crs_view_test.cc new file mode 100644 index 0000000..f8d1cdf --- /dev/null +++ b/internal/ceres/cuda_block_sparse_crs_view_test.cc
@@ -0,0 +1,106 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/cuda_block_sparse_crs_view.h" + +#include <glog/logging.h> +#include <gtest/gtest.h> + +#include <numeric> +#include <random> +#include <thread> + +#ifndef CERES_NO_CUDA + +#include "ceres/cuda_vector.h" + +namespace ceres::internal { +class CudaBlockSparseCRSViewTest : public ::testing::Test { + protected: + void SetUp() final { + std::string message; + CHECK(context_.InitCuda(&message)) + << "InitCuda() failed because: " << message; + + BlockSparseMatrix::RandomMatrixOptions options; + options.num_row_blocks = 1234; + options.min_row_block_size = 1; + options.max_row_block_size = 10; + options.num_col_blocks = 567; + options.min_col_block_size = 1; + options.max_col_block_size = 10; + options.block_density = 0.2; + std::mt19937 rng; + A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng); + std::iota( + A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1); + } + + std::unique_ptr<BlockSparseMatrix> A_; + ContextImpl context_; +}; + +TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValues) { + auto view = CudaBlockSparseCRSView(*A_, &context_); + + // TODO: we definitely would like to use crs_matrix() here, but + // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because + // it might allocate additional storage at the request of cuSPARSE + auto matrix = view.mutable_crs_matrix(); + ASSERT_EQ(matrix->num_cols(), A_->num_cols()); + ASSERT_EQ(matrix->num_rows(), A_->num_rows()); + ASSERT_EQ(matrix->num_nonzeros(), A_->num_nonzeros()); + + const int num_rows = A_->num_rows(); + const int num_cols = A_->num_cols(); + Vector x(num_cols); + Vector y(num_rows); + CudaVector x_cuda(&context_, num_cols); + CudaVector y_cuda(&context_, num_rows); + Vector y_cuda_host(num_rows); + + for (int i = 0; i < num_cols; ++i) { + x.setZero(); + y.setZero(); + y_cuda.SetZero(); + x[i] = 1.; + x_cuda.CopyFromCpu(x); + A_->RightMultiplyAndAccumulate(
x.data(), y.data(), &context_, std::thread::hardware_concurrency()); + matrix->RightMultiplyAndAccumulate(x_cuda, &y_cuda); + y_cuda.CopyTo(&y_cuda_host); + // There is at most one non-zero product per row, and the values are small + // integers that are exactly representable, thus we expect an exact match + EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.); + } +} +} // namespace ceres::internal + +#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_block_structure.cc b/internal/ceres/cuda_block_structure.cc new file mode 100644 index 0000000..8e549e1 --- /dev/null +++ b/internal/ceres/cuda_block_structure.cc
@@ -0,0 +1,118 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/cuda_block_structure.h" + +#ifndef CERES_NO_CUDA + +namespace ceres::internal { +namespace { +// Dimension of a sorted array of blocks +inline int Dimension(const std::vector<Block>& blocks) { + if (blocks.empty()) { + return 0; + } + const auto& last = blocks.back(); + return last.size + last.position; +} +} // namespace + +CudaBlockSparseStructure::CudaBlockSparseStructure( + const CompressedRowBlockStructure& block_structure, ContextImpl* context) + : row_block_offsets_(context), + cells_(context), + row_blocks_(context), + col_blocks_(context) { + // Row blocks extracted from CompressedRowBlockStructure::rows + std::vector<Block> row_blocks; + // Column blocks can be reused as-is + const auto& col_blocks = block_structure.cols; + + // Row-block offset is the index of the first cell corresponding to the + // row-block + std::vector<int> row_block_offsets; + // Flat array of all cells from all row-blocks + std::vector<Cell> cells; + + num_row_blocks_ = block_structure.rows.size(); + num_col_blocks_ = col_blocks.size(); + + row_blocks.reserve(num_row_blocks_); + row_block_offsets.reserve(num_row_blocks_ + 1); + num_nonzeros_ = 0; + num_cells_ = 0; + for (const auto& r : block_structure.rows) { + const int row_block_size = r.block.size; + row_blocks.emplace_back(r.block); + row_block_offsets.push_back(num_cells_); + for (const auto& c : r.cells) { + cells.emplace_back(c); + const int col_block_size = col_blocks[c.block_id].size; + num_nonzeros_ += col_block_size * row_block_size; + ++num_cells_; + } + } + row_block_offsets.push_back(num_cells_); + + num_rows_ = Dimension(row_blocks); + num_cols_ = Dimension(col_blocks); + + if (VLOG_IS_ON(3)) { + const size_t row_block_offsets_size = + row_block_offsets.size() * sizeof(int); + const size_t cells_size = cells.size() * sizeof(Cell);
+ const size_t row_blocks_size = row_blocks.size() * sizeof(Block); + const size_t col_blocks_size = col_blocks.size() * sizeof(Block); + const size_t total_size = + row_block_offsets_size + cells_size + col_blocks_size + row_blocks_size; + VLOG(3) << "\nCudaBlockSparseStructure:\n" + "\tRow block offsets: " + << row_block_offsets_size + << " bytes\n" + "\tColumn blocks: " + << col_blocks_size + << " bytes\n" + "\tRow blocks: " + << row_blocks_size + << " bytes\n" + "\tCells: " + << cells_size + << " bytes\n" + "\tTotal: " + << total_size << " bytes of GPU memory"; + } + + row_block_offsets_.CopyFromCpuVector(row_block_offsets); + cells_.CopyFromCpuVector(cells); + row_blocks_.CopyFromCpuVector(row_blocks); + col_blocks_.CopyFromCpuVector(col_blocks); +} +} // namespace ceres::internal + +#endif // CERES_NO_CUDA
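A worked example may make the flat representation easier to follow (editorial note, not part of this change). For the matrix used in the cuda_block_sparse_crs_view.h comment, with block-sparse values [1..9] laid out as cell(row-block 0, col-block 0) at value position 0, cell(row-block 1, col-block 1) at position 4, and cell(row-block 0, col-block 2) at position 5, this constructor builds:

    // Blocks are (size, position); cells are (block_id, position in values).
    row_blocks        = {Block(2, 0), Block(1, 2)};
    col_blocks        = {Block(2, 0), Block(1, 2), Block(2, 3)};
    row_block_offsets = {0, 2, 3};  // row-block 0 owns cells[0..1]
    cells             = {Cell(0, 0), Cell(2, 5), Cell(1, 4)};
    // num_cells_ = 3, num_nonzeros_ = 4 + 4 + 1 = 9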
diff --git a/internal/ceres/cuda_block_structure.h b/internal/ceres/cuda_block_structure.h new file mode 100644 index 0000000..a41d0b3 --- /dev/null +++ b/internal/ceres/cuda_block_structure.h
@@ -0,0 +1,94 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#ifndef CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_ +#define CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_ + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA + +#include "ceres/block_structure.h" +#include "ceres/cuda_buffer.h" + +namespace ceres::internal { +class CudaBlockStructureTest; + +// This class stores a read-only block-sparse structure in GPU memory. +// Invariants are the same as those of CompressedRowBlockStructure. +// In order to simplify allocation and copying data to the GPU, cells from all +// row-blocks are stored in a single array sequentially. An additional array +// row_block_offsets of size num_row_blocks + 1 makes it possible to identify +// the range of cells corresponding to a row-block. +// Cells corresponding to the i-th row-block are stored in the sub-array +// cells[row_block_offsets[i]; ... row_block_offsets[i + 1] - 1], and their +// order is preserved. +class CERES_NO_EXPORT CudaBlockSparseStructure { + public: + // CompressedRowBlockStructure contains a vector of CompressedLists, with + // each CompressedList containing a vector of Cells. We precompute a flat + // array of cells on the CPU and transfer it to the GPU.
+ CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure, + ContextImpl* context); + + int num_rows() const { return num_rows_; } + int num_cols() const { return num_cols_; } + int num_cells() const { return num_cells_; } + int num_nonzeros() const { return num_nonzeros_; } + int num_row_blocks() const { return num_row_blocks_; } + int num_col_blocks() const { return num_col_blocks_; } + + // Device pointer to an array of num_row_blocks + 1 indices of the first + // cell of each row-block + const int* row_block_offsets() const { return row_block_offsets_.data(); } + // Device pointer to an array of num_cells cells, sorted by row-block + const Cell* cells() const { return cells_.data(); } + // Device pointer to the array of row blocks + const Block* row_blocks() const { return row_blocks_.data(); } + // Device pointer to the array of column blocks + const Block* col_blocks() const { return col_blocks_.data(); } + + private: + int num_rows_; + int num_cols_; + int num_cells_; + int num_nonzeros_; + int num_row_blocks_; + int num_col_blocks_; + CudaBuffer<int> row_block_offsets_; + CudaBuffer<Cell> cells_; + CudaBuffer<Block> row_blocks_; + CudaBuffer<Block> col_blocks_; + friend class CudaBlockStructureTest; +}; +} // namespace ceres::internal + +#endif // CERES_NO_CUDA +#endif // CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_
diff --git a/internal/ceres/cuda_block_structure_test.cc b/internal/ceres/cuda_block_structure_test.cc new file mode 100644 index 0000000..95cb26b --- /dev/null +++ b/internal/ceres/cuda_block_structure_test.cc
@@ -0,0 +1,143 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA + +#include <glog/logging.h> +#include <gtest/gtest.h> + +#include <numeric> +#include <random> + +#include "ceres/block_sparse_matrix.h" +#include "ceres/cuda_block_structure.h" + +namespace ceres::internal { + +class CudaBlockStructureTest : public ::testing::Test { + protected: + void SetUp() final { + std::string message; + CHECK(context_.InitCuda(&message)) + << "InitCuda() failed because: " << message; + + BlockSparseMatrix::RandomMatrixOptions options; + options.num_row_blocks = 1234; + options.min_row_block_size = 1; + options.max_row_block_size = 10; + options.num_col_blocks = 567; + options.min_col_block_size = 1; + options.max_col_block_size = 10; + options.block_density = 0.2; + std::mt19937 rng; + A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng); + std::iota( + A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1); + } + + std::vector<Cell> GetCells(const CudaBlockSparseStructure& structure) { + const auto& cuda_buffer = structure.cells_; + std::vector<Cell> cells(cuda_buffer.size()); + cuda_buffer.CopyToCpu(cells.data(), cells.size()); + return cells; + } + std::vector<Block> GetRowBlocks(const CudaBlockSparseStructure& structure) { + const auto& cuda_buffer = structure.row_blocks_; + std::vector<Block> blocks(cuda_buffer.size()); + cuda_buffer.CopyToCpu(blocks.data(), blocks.size()); + return blocks; + } + std::vector<Block> GetColBlocks(const CudaBlockSparseStructure& structure) { + const auto& cuda_buffer = structure.col_blocks_; + std::vector<Block> blocks(cuda_buffer.size()); + cuda_buffer.CopyToCpu(blocks.data(), blocks.size()); + return blocks; + } + std::vector<int> GetRowBlockOffsets( + const CudaBlockSparseStructure& structure) { + const auto& cuda_buffer = structure.row_block_offsets_; +
std::vector<int> row_block_offsets(cuda_buffer.size()); + cuda_buffer.CopyToCpu(row_block_offsets.data(), row_block_offsets.size()); + return row_block_offsets; + } + + std::unique_ptr<BlockSparseMatrix> A_; + ContextImpl context_; +}; + +TEST_F(CudaBlockStructureTest, StructureIdentity) { + auto block_structure = A_->block_structure(); + const int num_row_blocks = block_structure->rows.size(); + const int num_col_blocks = block_structure->cols.size(); + + CudaBlockSparseStructure cuda_block_structure(*block_structure, &context_); + + ASSERT_EQ(cuda_block_structure.num_rows(), A_->num_rows()); + ASSERT_EQ(cuda_block_structure.num_cols(), A_->num_cols()); + ASSERT_EQ(cuda_block_structure.num_nonzeros(), A_->num_nonzeros()); + ASSERT_EQ(cuda_block_structure.num_row_blocks(), num_row_blocks); + ASSERT_EQ(cuda_block_structure.num_col_blocks(), num_col_blocks); + + std::vector<Block> blocks = GetColBlocks(cuda_block_structure); + ASSERT_EQ(blocks.size(), num_col_blocks); + for (int i = 0; i < num_col_blocks; ++i) { + EXPECT_EQ(block_structure->cols[i].position, blocks[i].position); + EXPECT_EQ(block_structure->cols[i].size, blocks[i].size); + } + + std::vector<Cell> cells = GetCells(cuda_block_structure); + std::vector<int> row_block_offsets = GetRowBlockOffsets(cuda_block_structure); + blocks = GetRowBlocks(cuda_block_structure); + + ASSERT_EQ(blocks.size(), num_row_blocks); + ASSERT_EQ(row_block_offsets.size(), num_row_blocks + 1); + ASSERT_EQ(row_block_offsets.back(), cells.size()); + + for (int i = 0; i < num_row_blocks; ++i) { + const int num_cells = block_structure->rows[i].cells.size(); + EXPECT_EQ(blocks[i].position, block_structure->rows[i].block.position); + EXPECT_EQ(blocks[i].size, block_structure->rows[i].block.size); + const int first_cell = row_block_offsets[i]; + const int last_cell = row_block_offsets[i + 1]; + ASSERT_EQ(last_cell - first_cell, num_cells); + for (int j = 0; j < num_cells; ++j) { + EXPECT_EQ(cells[first_cell + j].block_id, + block_structure->rows[i].cells[j].block_id); + EXPECT_EQ(cells[first_cell + j].position, + block_structure->rows[i].cells[j].position); + } + } +} + +} // namespace ceres::internal + +#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h index 97d126c..d6abb15 100644 --- a/internal/ceres/cuda_buffer.h +++ b/internal/ceres/cuda_buffer.h
@@ -86,7 +86,7 @@ data, size * sizeof(T), cudaMemcpyHostToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); } @@ -98,7 +98,7 @@ data.data(), data.size() * sizeof(T), cudaMemcpyHostToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); } @@ -110,7 +110,7 @@ data, size * sizeof(T), cudaMemcpyDeviceToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); } @@ -126,9 +126,9 @@ data_, size * sizeof(T), cudaMemcpyDeviceToHost, - context_->stream_), + context_->DefaultStream()), cudaSuccess); - CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess); + CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess); } // Copy N items from another GPU memory array to the GPU memory managed by @@ -142,7 +142,7 @@ other.data_, size_ * sizeof(T), cudaMemcpyDeviceToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); }
diff --git a/internal/ceres/cuda_kernels.cu.cc b/internal/ceres/cuda_kernels.cu.cc index c0f5453..3d93ae8 100644 --- a/internal/ceres/cuda_kernels.cu.cc +++ b/internal/ceres/cuda_kernels.cu.cc
@@ -28,6 +28,12 @@ // // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) +#include "ceres/cuda_kernels.h" + +#include <thrust/execution_policy.h> +#include <thrust/scan.h> + +#include "block_structure.h" #include "cuda_runtime.h" namespace ceres { @@ -40,6 +46,9 @@ // might provide better performance. See "Occupancy Calculator" under the CUDA // toolkit documentation. constexpr int kCudaBlockSize = 256; +inline int NumBlocks(int size) { + return (size + kCudaBlockSize - 1) / kCudaBlockSize; +} template <typename SrcType, typename DstType> __global__ void TypeConversionKernel(const SrcType* __restrict__ input, @@ -55,7 +64,7 @@ float* output, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); TypeConversionKernel<double, float> <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size); } @@ -64,7 +73,7 @@ double* output, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); TypeConversionKernel<float, double> <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size); } @@ -78,12 +87,12 @@ } void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size); } void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); SetZeroKernel<double> <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size); } @@ -99,7 +108,7 @@ } void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); XPlusEqualsYKernel<float, double> <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size); } @@ -119,9 +128,177 @@ const double* x, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size); } +// Fill row block id and nnz for each row using a block-sparse structure +// represented by a set of flat arrays. +// Inputs: +// - num_row_blocks: number of row-blocks in block-sparse structure +// - row_block_offsets: index of the first cell of the row-block; size: +// num_row_blocks + 1 +// - cells: cells of block-sparse structure as a contiguous array +// - row_blocks: row blocks of block-sparse structure stored sequentially +// - col_blocks: column blocks of block-sparse structure stored sequentially +// Outputs: +// - rows: rows[i + 1] will contain the number of non-zeros in the i-th row, +// and rows[0] will be set to 0; rows are filled with a shift by one element +// in order to obtain the row-index array of the CRS matrix with an +// inclusive scan afterwards +// - row_block_ids: row_block_ids[i] will be set to the index of the +// row-block that contains the i-th row.
+// Computation is performed row-block-wise +__global__ void RowBlockIdAndNNZ(int num_row_blocks, + const int* __restrict__ row_block_offsets, + const Cell* __restrict__ cells, + const Block* __restrict__ row_blocks, + const Block* __restrict__ col_blocks, + int* __restrict__ rows, + int* __restrict__ row_block_ids) { + const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x; + if (row_block_id > num_row_blocks) { + // No synchronization is performed in this kernel, thus it is safe to return + return; + } + if (row_block_id == num_row_blocks) { + // one extra thread sets the first element + rows[0] = 0; + return; + } + const auto& row_block = row_blocks[row_block_id]; + int row_nnz = 0; + const auto first_cell = cells + row_block_offsets[row_block_id]; + const auto last_cell = cells + row_block_offsets[row_block_id + 1]; + for (auto cell = first_cell; cell < last_cell; ++cell) { + row_nnz += col_blocks[cell->block_id].size; + } + const int first_row = row_block.position; + const int last_row = first_row + row_block.size; + for (int i = first_row; i < last_row; ++i) { + rows[i + 1] = row_nnz; + row_block_ids[i] = row_block_id; + } +} + +// Row-wise creation of CRS structure +// Inputs: +// - num_rows: number of rows in matrix +// - row_block_offsets: index of the first cell of the row-block; size: +// num_row_blocks + 1 +// - cells: cells of block-sparse structure as a contiguous array +// - row_blocks: row blocks of block-sparse structure stored sequentially +// - col_blocks: column blocks of block-sparse structure stored sequentially +// - row_block_ids: index of the row-block that corresponds to each row +// - rows: row-index array of CRS structure +// Outputs: +// - cols: column-index array of CRS structure +// - permutation: permutation from block-sparse to CRS order +// Computation is performed row-wise +__global__ void ComputeColumnsAndPermutation( + int num_rows, + const int* __restrict__ row_block_offsets, + const Cell* __restrict__ cells, + const Block* __restrict__ row_blocks, + const Block* __restrict__ col_blocks, + const int* __restrict__ row_block_ids, + const int* __restrict__ rows, + int* __restrict__ cols, + int* __restrict__ permutation) { + const int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= num_rows) { + // No synchronization is performed in this kernel, thus it is safe to return + return; + } + const int row_block_id = row_block_ids[row]; + // zero-based index of row in row-block + const int row_in_block = row - row_blocks[row_block_id].position; + // position in CRS matrix + int crs_position = rows[row]; + const auto first_cell = cells + row_block_offsets[row_block_id]; + const auto last_cell = cells + row_block_offsets[row_block_id + 1]; + // For each cell of the row-block, only the current row is filled + for (auto cell = first_cell; cell < last_cell; ++cell) { + const auto& col_block = col_blocks[cell->block_id]; + const int col_block_size = col_block.size; + int column_idx = col_block.position; + int bs_position = cell->position + row_in_block * col_block_size; + // Fill permutation and column indices for each element of the + // row_in_block-th row of the current cell + for (int i = 0; i < col_block_size; ++i, ++crs_position) { + permutation[bs_position++] = crs_position; + cols[crs_position] = column_idx++; + } + } +} + +void FillCRSStructure(const int num_row_blocks, + const int num_rows, + const int* row_block_offsets, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + int* rows, + int* cols, + int* row_block_ids, + int* permutation, +
cudaStream_t stream) { + // Set the number of non-zeros per row in the rows array and the row to + // row-block map in the row_block_ids array + const int num_blocks_blockwise = NumBlocks(num_row_blocks + 1); + RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( + num_row_blocks, + row_block_offsets, + cells, + row_blocks, + col_blocks, + rows, + row_block_ids); + // Finalize the row-index array of the CRS structure by computing the + // inclusive prefix sum + thrust::inclusive_scan( + thrust::cuda::par.on(stream), rows, rows + num_rows + 1, rows); + + // Fill the cols array of the CRS structure and the permutation from + // block-sparse to CRS + const int num_blocks_rowwise = NumBlocks(num_rows); + ComputeColumnsAndPermutation<<<num_blocks_rowwise, + kCudaBlockSize, + 0, + stream>>>(num_rows, + row_block_offsets, + cells, + row_blocks, + col_blocks, + row_block_ids, + rows, + cols, + permutation); +} + +// Updates values of the CRS matrix with a contiguous block of values of the +// block-sparse matrix. Since permutation[i] is the index of the i-th +// block-sparse value in the CRS values array, updating values is quite +// efficient when performed element-wise (reads of the permutation and values +// arrays are coalesced when offset is divisible by 16) +__global__ void PermuteValuesKernel(const int offset, + const int num_values, + const int* permutation, + const double* block_sparse_values, + double* crs_values) { + const int value_id = blockIdx.x * blockDim.x + threadIdx.x; + if (value_id < num_values) { + crs_values[permutation[offset + value_id]] = block_sparse_values[value_id]; + } +} + +void PermuteValues(const int offset, + const int num_values, + const int* permutation, + const double* block_sparse_values, + double* crs_values, + cudaStream_t stream) { + const int num_blocks = NumBlocks(num_values); + PermuteValuesKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>( + offset, num_values, permutation, block_sparse_values, crs_values); +} + } // namespace internal } // namespace ceres
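To make the two-kernel pipeline easier to follow, here is a serial CPU sketch of what FillCRSStructure computes (editorial illustration, not part of this change; it folds both kernels and the inclusive scan into plain loops and omits the row_block_ids scratch array, which the GPU version needs only to parallelize the second phase by row):

    // Editorial CPU reference for FillCRSStructure.
    void FillCRSStructureOnCpu(int num_row_blocks, int num_rows,
                               const int* row_block_offsets, const Cell* cells,
                               const Block* row_blocks, const Block* col_blocks,
                               int* rows, int* cols, int* permutation) {
      // Phase 1 (RowBlockIdAndNNZ): per-row non-zero counts, shifted by one.
      rows[0] = 0;
      for (int rb = 0; rb < num_row_blocks; ++rb) {
        int row_nnz = 0;
        for (int c = row_block_offsets[rb]; c < row_block_offsets[rb + 1]; ++c) {
          row_nnz += col_blocks[cells[c].block_id].size;
        }
        for (int i = 0; i < row_blocks[rb].size; ++i) {
          rows[row_blocks[rb].position + i + 1] = row_nnz;
        }
      }
      // Inclusive scan turns the shifted counts into CRS row offsets.
      for (int i = 1; i <= num_rows; ++i) rows[i] += rows[i - 1];
      // Phase 2 (ComputeColumnsAndPermutation): column indices and the
      // block-sparse -> CRS value permutation, row by row.
      for (int rb = 0; rb < num_row_blocks; ++rb) {
        for (int r = 0; r < row_blocks[rb].size; ++r) {
          int crs_position = rows[row_blocks[rb].position + r];
          for (int c = row_block_offsets[rb]; c < row_block_offsets[rb + 1];
               ++c) {
            const Block& col_block = col_blocks[cells[c].block_id];
            int bs_position = cells[c].position + r * col_block.size;
            for (int i = 0; i < col_block.size; ++i, ++crs_position) {
              permutation[bs_position++] = crs_position;
              cols[crs_position] = col_block.position + i;
            }
          }
        }
      }
    }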
diff --git a/internal/ceres/cuda_kernels.h b/internal/ceres/cuda_kernels.h index d347c20..61f945a 100644 --- a/internal/ceres/cuda_kernels.h +++ b/internal/ceres/cuda_kernels.h
@@ -37,7 +37,10 @@ #include "cuda_runtime.h" -namespace ceres::internal { +namespace ceres { +namespace internal { +struct Block; +struct Cell; // Convert an array of double (FP64) values to float (FP32). Both arrays must // already be on GPU memory. @@ -72,7 +75,38 @@ const int size, cudaStream_t stream); -} // namespace ceres::internal +// Compute the structure of the CRS matrix and the permutation of values using +// the block-sparse structure and the temporary array row_block_ids. The array +// row_block_ids of size num_rows will be filled with indices of the +// row-blocks corresponding to the rows of the CRS matrix. The arrays backing +// the CRS matrix, the permutation, and row_block_ids are to be allocated by +// the caller +void FillCRSStructure(const int num_row_blocks, + const int num_rows, + const int* row_block_offsets, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + int* rows, + int* cols, + int* row_block_ids, + int* permutation, + cudaStream_t stream); + +// Permute a block of block-sparse values using the permutation. +// Pointer block_sparse_values corresponds to a block of num_values values of +// the block-sparse matrix at the given offset from the beginning. Pointer +// crs_values corresponds to the values of the CRS matrix. The array +// permutation stores the permutation from block-sparse to CRS order, with +// permutation[i] being the index of the i-th block-sparse value in the CRS +// values array +void PermuteValues(const int offset, + const int num_values, + const int* permutation, + const double* block_sparse_values, + double* crs_values, + cudaStream_t stream); + +} // namespace internal +} // namespace ceres #endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc index 32ff55a..41036a3 100644 --- a/internal/ceres/cuda_kernels_test.cc +++ b/internal/ceres/cuda_kernels_test.cc
@@ -57,8 +57,10 @@ fp64_gpu.CopyFromCpuVector(fp64_cpu); CudaBuffer<float> fp32_gpu(&context); fp32_gpu.Reserve(fp64_cpu.size()); - CudaFP64ToFP32( - fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), context.stream_); + CudaFP64ToFP32(fp64_gpu.data(), + fp32_gpu.data(), + fp64_cpu.size(), + context.DefaultStream()); std::vector<float> fp32_cpu(fp64_cpu.size()); fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size()); for (int i = 0; i < fp32_cpu.size(); ++i) { @@ -82,8 +84,10 @@ fp64_gpu.CopyFromCpuVector(fp64_cpu); CudaBuffer<float> fp32_gpu(&context); fp32_gpu.Reserve(fp64_cpu.size()); - CudaFP64ToFP32( - fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), context.stream_); + CudaFP64ToFP32(fp64_gpu.data(), + fp32_gpu.data(), + fp64_cpu.size(), + context.DefaultStream()); std::vector<float> fp32_cpu(fp64_cpu.size()); fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size()); EXPECT_EQ(fp32_cpu[0], 0.0f); @@ -101,8 +105,10 @@ fp32_gpu.CopyFromCpuVector(fp32_cpu); CudaBuffer<double> fp64_gpu(&context); fp64_gpu.Reserve(fp32_cpu.size()); - CudaFP32ToFP64( - fp32_gpu.data(), fp64_gpu.data(), fp32_cpu.size(), context.stream_); + CudaFP32ToFP64(fp32_gpu.data(), + fp64_gpu.data(), + fp32_cpu.size(), + context.DefaultStream()); std::vector<double> fp64_cpu(fp32_cpu.size()); fp64_gpu.CopyToCpu(fp64_cpu.data(), fp64_cpu.size()); for (int i = 0; i < fp64_cpu.size(); ++i) { @@ -117,7 +123,7 @@ std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; CudaBuffer<float> fp32_gpu(&context); fp32_gpu.CopyFromCpuVector(fp32_cpu); - CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), context.stream_); + CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), context.DefaultStream()); std::vector<float> fp32_cpu_zero(fp32_cpu.size()); fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size()); for (int i = 0; i < fp32_cpu_zero.size(); ++i) { @@ -132,7 +138,7 @@ std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; CudaBuffer<double> fp64_gpu(&context); fp64_gpu.CopyFromCpuVector(fp64_cpu); - CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), context.stream_); + CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), context.DefaultStream()); std::vector<double> fp64_cpu_zero(fp64_cpu.size()); fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size()); for (int i = 0; i < fp64_cpu_zero.size(); ++i) { @@ -151,8 +157,10 @@ fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a); CudaBuffer<double> fp64_gpu_b(&context); fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b); - CudaDsxpy( - fp64_gpu_b.data(), fp32_gpu_a.data(), fp32_gpu_a.size(), context.stream_); + CudaDsxpy(fp64_gpu_b.data(), + fp32_gpu_a.data(), + fp32_gpu_a.size(), + context.DefaultStream()); fp64_gpu_b.CopyToCpu(fp64_cpu_b.data(), fp64_cpu_b.size()); for (int i = 0; i < fp64_cpu_b.size(); ++i) { EXPECT_DOUBLE_EQ(fp64_cpu_b[i], 2.0 * fp32_cpu_a[i]); @@ -172,8 +180,11 @@ y_gpu.CopyFromCpuVector(y_cpu); CudaBuffer<double> d_gpu(&context); d_gpu.CopyFromCpuVector(d_cpu); - CudaDtDxpy( - y_gpu.data(), d_gpu.data(), x_gpu.data(), y_gpu.size(), context.stream_); + CudaDtDxpy(y_gpu.data(), + d_gpu.data(), + x_gpu.data(), + y_gpu.size(), + context.DefaultStream()); y_gpu.CopyToCpu(y_cpu.data(), y_cpu.size()); EXPECT_DOUBLE_EQ(y_cpu[0], 4.0 + 10.0 * 10.0 * 1.0); EXPECT_DOUBLE_EQ(y_cpu[1], 3.0 + 20.0 * 20.0 * 2.0);
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc index 7ee1761..0ae8c5d 100644 --- a/internal/ceres/cuda_sparse_matrix.cc +++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -59,6 +59,31 @@ namespace ceres::internal { +CudaSparseMatrix::CudaSparseMatrix(int num_rows, + int num_cols, + int num_nonzeros, + ContextImpl* context) + : context_(context), + rows_(context, num_rows + 1), + cols_(context, num_nonzeros), + values_(context, num_nonzeros), + spmv_buffer_(context), + num_rows_(num_rows), + num_cols_(num_cols), + num_nonzeros_(num_nonzeros) { + cusparseCreateCsr(&descr_, + num_rows_, + num_cols_, + num_nonzeros_, + rows_.data(), + cols_.data(), + values_.data(), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} + CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) : context_(context),
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h index d5f3332..ad55170 100644 --- a/internal/ceres/cuda_sparse_matrix.h +++ b/internal/ceres/cuda_sparse_matrix.h
@@ -64,6 +64,13 @@ CudaSparseMatrix(ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix); + // Creates a "blank" matrix with an appropriate amount of memory allocated. + // The object is inconsistent until rows, cols, and values are filled in. + CudaSparseMatrix(int num_rows, + int num_cols, + int num_nonzeros, + ContextImpl* context); + ~CudaSparseMatrix(); // y = y + Ax; @@ -75,6 +82,10 @@ int num_cols() const { return num_cols_; } int num_nonzeros() const { return num_nonzeros_; } + int32_t* mutable_rows() { return rows_.data(); } + int32_t* mutable_cols() { return cols_.data(); } + double* mutable_values() { return values_.data(); } + // If subsequent uses of this matrix involve only numerical changes and no // structural changes, then this method can be used to copy the updated // non-zero values -- the row and column index arrays are kept the same. It
diff --git a/internal/ceres/cuda_streamed_buffer.h b/internal/ceres/cuda_streamed_buffer.h new file mode 100644 index 0000000..8761ef4 --- /dev/null +++ b/internal/ceres/cuda_streamed_buffer.h
@@ -0,0 +1,335 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_ +#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_ + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA +#include "ceres/cuda_buffer.h" + +namespace ceres::internal { + +// Most contemporary CUDA devices are capable of simultaneous code execution +// and host-to-device transfers. This class copies batches of data to GPU +// memory and executes processing of the copied data in parallel +// (asynchronously). Data is copied to a fixed-size buffer on the GPU +// (containing at most max_buffer_size values), and this memory is re-used once +// the previous batch of values has been processed by the user-provided +// callback. The host-to-device copy uses a temporary buffer if required. Each +// batch of values has a size of kValuesPerBatch, except possibly the last one. +template <typename T> +class CERES_NO_EXPORT CudaStreamedBuffer { + public: + // If the hardware supports only one concurrent host-to-device copy, or a + // single host-to-device copy is able to reach peak bandwidth, two streams + // are sufficient to reach maximum efficiency: + // - If transferring a batch of values takes more time than processing it on + // the GPU, then at every moment one of the streams will be transferring + // data and the other stream will be either processing data or idle; the + // whole process will be bounded by host-to-device copies. + // - If transferring a batch of values takes less time than processing it on + // the GPU, then at every moment one of the streams will be processing data + // and the other stream will be either performing computations or + // transferring data; the whole process will be bounded by computations. + static constexpr int kNumBatches = 2; + // max_buffer_size is the maximal size (in elements of type T) of the array + // to be pre-allocated in GPU memory.
The size of the array determines the + // size of the batch of values for simultaneous copying and processing. It + // should be large enough to allow highly-parallel execution of user kernels; + // making it too large increases latency. + CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size) + : kValuesPerBatch(max_buffer_size / kNumBatches), + context_(context), + values_gpu_(context, kValuesPerBatch * kNumBatches) { + static_assert(ContextImpl::kNumCudaStreams >= kNumBatches); + CHECK_GE(max_buffer_size, kNumBatches); + // Pre-allocate a buffer of page-locked memory for transfers from regular + // CPU memory. Because we will only be writing into that buffer from the + // CPU, the memory is allocated with the cudaHostAllocWriteCombined flag. + CHECK_EQ(cudaSuccess, + cudaHostAlloc(&values_cpu_pinned_, + sizeof(T) * kValuesPerBatch * kNumBatches, + cudaHostAllocWriteCombined)); + for (auto& e : copy_finished_) { + CHECK_EQ(cudaSuccess, + cudaEventCreateWithFlags(&e, cudaEventDisableTiming)); + } + } + + CudaStreamedBuffer(const CudaStreamedBuffer&) = delete; + + ~CudaStreamedBuffer() { + CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_)); + for (auto& e : copy_finished_) { + CHECK_EQ(cudaSuccess, cudaEventDestroy(e)); + } + } + + // Transfer num_values from the host-memory pointer from, calling + // callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use) + // after scheduling the transfer of each batch of data. The user-provided + // callback should process the data at device_pointer only in the + // stream_to_use stream (device_pointer will be re-used in the next + // callback invocation with the same stream). + // + // The two diagrams below describe operation in the two possible scenarios, + // depending on whether the input data is stored in page-locked memory. In + // this example we will have max_buffer_size = 2 * K, num_values = N * K and + // callback scheduling a single asynchronous launch of + // Kernel<<..., stream_to_use>>(device_pointer, + // size_of_batch, + // offset_of_batch) + // + // a. Copying from page-locked memory + // In this case no host-side copy is necessary, and this method just + // schedules a bunch of interleaved memory copies and callback invocations: + // + // cudaStreamSynchronize(context->DefaultStream()); + // - Iteration #0: + // - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0) + // - callback(values_gpu_, K, 0, stream_0) + // - Iteration #1: + // - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D, + // stream_1) + // - callback(values_gpu_ + K, K, K, stream_1) + // - Iteration #2: + // - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D, + // stream_0) + // - callback(values_gpu_, K, 2 * K, stream_0) + // - Iteration #3: + // - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D, + // stream_1) + // - callback(values_gpu_ + K, K, 3 * K, stream_1) + // ... + // - Iteration #i: + // - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K, K * + // sizeof(T), H->D, stream_(i % 2)) + // - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2)) + // ...
+  //  cudaStreamSynchronize(stream_0)
+  //  cudaStreamSynchronize(stream_1)
+  //
+  // This sequence of calls results in the following activity on the gpu
+  // (assuming that the kernel invoked by the callback takes less time than the
+  // host-to-device copy):
+  //  +-------------------+-------------------+
+  //  |     Stream #0     |     Stream #1     |
+  //  +-------------------+-------------------+
+  //  | Copy host->device |                   |
+  //  |                   |                   |
+  //  |                   |                   |
+  //  +-------------------+-------------------+
+  //  | Kernel            | Copy host->device |
+  //  +-------------------+                   |
+  //  |                   |                   |
+  //  +-------------------+-------------------+
+  //  | Copy host->device | Kernel            |
+  //  |                   +-------------------+
+  //  |                   |                   |
+  //  +-------------------+-------------------+
+  //  | Kernel            | Copy host->device |
+  //  |                  ...                  |
+  //  +---------------------------------------+
+  //
+  // b. Copying from regular memory
+  // In this case a copy from regular memory to page-locked memory is required
+  // in order to get asynchronous operation. Because the pinned memory on the
+  // host side is reused, additional synchronization is required. On each
+  // iteration the method performs the following actions:
+  //  - Wait until the previous copy operation in the stream is completed
+  //  - Copy a batch of values from the input array into pinned memory
+  //  - Asynchronously launch the host-to-device copy
+  //  - Set up an event for synchronization on copy completion
+  //  - Invoke the callback (which launches the kernel asynchronously)
+  //
+  // Invocations are performed with the following arguments:
+  //  cudaStreamSynchronize(context->DefaultStream());
+  //  - Iteration #0:
+  //    - cudaEventSynchronize(copy_finished_0)
+  //    - std::copy_n(from, K, values_cpu_pinned_)
+  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
+  //    stream_0)
+  //    - cudaEventRecord(copy_finished_0, stream_0)
+  //    - callback(values_gpu_, K, 0, stream_0)
+  //  - Iteration #1:
+  //    - cudaEventSynchronize(copy_finished_1)
+  //    - std::copy_n(from + K, K, values_cpu_pinned_ + K)
+  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
+  //    sizeof(T), H->D, stream_1)
+  //    - cudaEventRecord(copy_finished_1, stream_1)
+  //    - callback(values_gpu_ + K, K, K, stream_1)
+  //  - Iteration #2:
+  //    - cudaEventSynchronize(copy_finished_0)
+  //    - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
+  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
+  //    stream_0)
+  //    - cudaEventRecord(copy_finished_0, stream_0)
+  //    - callback(values_gpu_, K, 2 * K, stream_0)
+  //  - Iteration #3:
+  //    - cudaEventSynchronize(copy_finished_1)
+  //    - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
+  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
+  //    sizeof(T), H->D, stream_1)
+  //    - cudaEventRecord(copy_finished_1, stream_1)
+  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
+  //  ...
+  //  - Iteration #i:
+  //    - cudaEventSynchronize(copy_finished_(i % 2))
+  //    - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
+  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, values_cpu_pinned_ + (i %
+  //    2) * K, K * sizeof(T), H->D, stream_(i % 2))
+  //    - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
+  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
+  //  ... 
+  //  cudaStreamSynchronize(stream_0)
+  //  cudaStreamSynchronize(stream_1)
+  //
+  // This sequence of calls results in the following activity on the cpu and
+  // gpu (assuming that the kernel invoked by the callback takes less time than
+  // the host-to-device copy and the copy within cpu memory, and that the copy
+  // within cpu memory is faster than the host-to-device copy):
+  //  +----------------------------+-------------------+-------------------+
+  //  |         CPU thread         |     Stream #0     |     Stream #1     |
+  //  +----------------------------+-------------------+-------------------+
+  //  | Copy to pinned memory      |                   |                   |
+  //  |                            |                   |                   |
+  //  +----------------------------+-------------------|                   |
+  //  | Copy to pinned memory      | Copy host->device |                   |
+  //  |                            |                   |                   |
+  //  +----------------------------+                   |                   |
+  //  | Wait for prev. h->d copy   |                   |                   |
+  //  +----------------------------+-------------------+-------------------+
+  //  | Copy to pinned memory      | Kernel            | Copy host->device |
+  //  |                            +-------------------+                   |
+  //  +----------------------------+                   |                   |
+  //  | Wait for prev. h->d copy   |                   |                   |
+  //  +----------------------------+-------------------+-------------------+
+  //  | Copy to pinned memory      | Copy host->device | Kernel            |
+  //  |                            |                   +-------------------+
+  //  |    ...                                    ...                      |
+  //  +----------------------------+---------------------------------------+
+  //
+  template <typename Fun>
+  void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
+    // This synchronization is not required in some cases, but we perform it
+    // in order to avoid a situation where the user callback depends on data
+    // that has not yet been computed in the default stream.
+    CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));
+
+    // If the pointer to the input data does not correspond to page-locked
+    // memory, the host-to-device memory copy might be executed synchronously
+    // (with a copy to pinned memory happening inside the driver). In that case
+    // we perform a copy to a pre-allocated array of page-locked memory.
+    const bool copy_to_pinned_memory =
+        MemoryTypeResultsInSynchronousCopy(from);
+    T* batch_values_gpu[kNumBatches];
+    T* batch_values_cpu[kNumBatches];
+    auto streams = context_->streams_;
+    for (int i = 0; i < kNumBatches; ++i) {
+      batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
+      batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
+    }
+    int batch_id = 0;
+    for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
+      const int num_values_batch =
+          std::min(num_values - offset, kValuesPerBatch);
+      const T* batch_from = from + offset;
+      T* batch_to = batch_values_gpu[batch_id];
+      auto stream = streams[batch_id];
+      auto copy_finished = copy_finished_[batch_id];
+
+      if (copy_to_pinned_memory) {
+        // Copying values to the temporary buffer should start only after the
+        // previous copy from the temporary buffer to the device is completed.
+        CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
+        std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
+        batch_from = batch_values_cpu[batch_id];
+      }
+      CHECK_EQ(cudaSuccess,
+               cudaMemcpyAsync(batch_to,
+                               batch_from,
+                               sizeof(T) * num_values_batch,
+                               cudaMemcpyHostToDevice,
+                               stream));
+      if (copy_to_pinned_memory) {
+        // The next copy to the temporary buffer can start straight after the
+        // asynchronous copy is completed (and might be started before kernels
+        // asynchronously executed in the stream by the user-supplied callback
+        // are completed). 
+        // No explicit synchronization is required when copying directly from
+        // page-locked memory, because the memory copy and the user kernel
+        // execution touching the corresponding part of the values_gpu_ array
+        // are serialized via the stream.
+        CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
+      }
+      callback(batch_to, num_values_batch, offset, stream);
+      batch_id = (batch_id + 1) % kNumBatches;
+    }
+    // Explicitly synchronize on all CUDA streams that were utilized.
+    for (int i = 0; i < kNumBatches; ++i) {
+      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
+    }
+  }
+
+ private:
+  // All host-to-device copies need to be completely asynchronous. This
+  // requires the source memory to be allocated in page-locked memory.
+  static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
+    cudaPointerAttributes attributes;
+    auto status = cudaPointerGetAttributes(&attributes, ptr);
+#if CUDART_VERSION < 11000
+    // In CUDA versions prior to 11, a call to cudaPointerGetAttributes with a
+    // host pointer returns cudaErrorInvalidValue
+    if (status == cudaErrorInvalidValue) {
+      return true;
+    }
+#endif
+    CHECK_EQ(status, cudaSuccess);
+    // This class only supports cpu memory as a source
+    CHECK_NE(attributes.type, cudaMemoryTypeDevice);
+    // If the host memory was allocated (or registered) with the CUDA API, or
+    // is managed memory, then the call to cudaMemcpyAsync will be
+    // asynchronous. In the case of managed memory it might be slightly better
+    // to perform a single invocation of the user-provided callback (and hope
+    // that page migration provides similar throughput with zero effort on our
+    // side).
+    return attributes.type == cudaMemoryTypeUnregistered;
+  }
+
+  const int kValuesPerBatch;
+  ContextImpl* context_ = nullptr;
+  CudaBuffer<T> values_gpu_;
+  T* values_cpu_pinned_ = nullptr;
+  cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
+};
+
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
+#endif  // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
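For illustration, a minimal usage sketch of the class above (not part of this
change; ScaleKernel, ScaleOnGpu and the chosen buffer size are hypothetical,
and the file would need to be compiled as CUDA):

#include "ceres/context_impl.h"
#include "ceres/cuda_streamed_buffer.h"

namespace ceres::internal {

// Hypothetical kernel: scales a batch of values in-place.
__global__ void ScaleKernel(double* values, int size, double scale) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size) values[i] *= scale;
}

// Streams n values from host memory through the double-buffer and scales them
// on the GPU. When CopyToGpu returns, all streams have been synchronized.
void ScaleOnGpu(ContextImpl* context, const double* host_values, int n) {
  CudaStreamedBuffer<double> buffer(context, /* max_buffer_size = */ 1 << 20);
  buffer.CopyToGpu(
      host_values,
      n,
      [](double* device_pointer, int size, int /* offset */,
         cudaStream_t stream) {
        // Launch work only in the stream supplied to the callback.
        constexpr int kThreadsPerBlock = 256;
        const int num_blocks = (size + kThreadsPerBlock - 1) / kThreadsPerBlock;
        ScaleKernel<<<num_blocks, kThreadsPerBlock, 0, stream>>>(
            device_pointer, size, 2.0);
      });
}

}  // namespace ceres::internal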
diff --git a/internal/ceres/cuda_streamed_buffer_test.cc b/internal/ceres/cuda_streamed_buffer_test.cc new file mode 100644 index 0000000..4837005 --- /dev/null +++ b/internal/ceres/cuda_streamed_buffer_test.cc
@@ -0,0 +1,169 @@ +// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <numeric>
+
+#include "ceres/cuda_streamed_buffer.h"
+
+namespace ceres::internal {
+
+TEST(CudaStreamedBufferTest, IntegerCopy) {
+  // Offsets and sizes of batches supplied to the callback
+  std::vector<std::pair<int, int>> batches;
+  const int kMaxTemporaryArraySize = 16;
+  const int kInputSize = kMaxTemporaryArraySize * 7 + 3;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
+
+  std::vector<int> inputs(kInputSize);
+  std::vector<int> outputs(kInputSize, -1);
+  std::iota(inputs.begin(), inputs.end(), 0);
+
+  CudaStreamedBuffer<int> streamed_buffer(&context, kMaxTemporaryArraySize);
+  streamed_buffer.CopyToGpu(inputs.data(),
+                            kInputSize,
+                            [&outputs, &batches](const int* device_pointer,
+                                                 int size,
+                                                 int offset,
+                                                 cudaStream_t stream) {
+                              batches.emplace_back(offset, size);
+                              CHECK_EQ(cudaSuccess,
+                                       cudaMemcpyAsync(outputs.data() + offset,
+                                                       device_pointer,
+                                                       sizeof(int) * size,
+                                                       cudaMemcpyDeviceToHost,
+                                                       stream));
+                            });
+  // All operations in all streams should be completed when CopyToGpu returns
+  // control to the caller
+  for (int i = 0; i < ContextImpl::kNumCudaStreams; ++i) {
+    CHECK_EQ(cudaSuccess, cudaStreamQuery(context.streams_[i]));
+  }
+
+  // Check that every element was visited
+  for (int i = 0; i < kInputSize; ++i) {
+    CHECK_EQ(outputs[i], i);
+  }
+
+  // Check that there is no overlap between batches
+  std::sort(batches.begin(), batches.end());
+  const int num_batches = batches.size();
+  for (int i = 0; i < num_batches; ++i) {
+    const auto [begin, size] = batches[i];
+    const int end = begin + size;
+    CHECK_GE(begin, 0);
+    CHECK_LT(begin, kInputSize);
+
+    CHECK_GT(size, 0);
+    
CHECK_LE(end, kInputSize);
+
+    if (i + 1 == num_batches) continue;
+    CHECK_EQ(end, batches[i + 1].first);
+  }
+}
+
+TEST(CudaStreamedBufferTest, IntegerNoCopy) {
+  // Offsets and sizes of batches supplied to the callback
+  std::vector<std::pair<int, int>> batches;
+  const int kMaxTemporaryArraySize = 16;
+  const int kInputSize = kMaxTemporaryArraySize * 7 + 3;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
+
+  int* inputs;
+  int* outputs;
+  CHECK_EQ(cudaSuccess,
+           cudaHostAlloc(
+               &inputs, sizeof(int) * kInputSize, cudaHostAllocWriteCombined));
+  CHECK_EQ(
+      cudaSuccess,
+      cudaHostAlloc(&outputs, sizeof(int) * kInputSize, cudaHostAllocDefault));
+
+  std::fill(outputs, outputs + kInputSize, -1);
+  std::iota(inputs, inputs + kInputSize, 0);
+
+  CudaStreamedBuffer<int> streamed_buffer(&context, kMaxTemporaryArraySize);
+  streamed_buffer.CopyToGpu(inputs,
+                            kInputSize,
+                            [outputs, &batches](const int* device_pointer,
+                                                int size,
+                                                int offset,
+                                                cudaStream_t stream) {
+                              batches.emplace_back(offset, size);
+                              CHECK_EQ(cudaSuccess,
+                                       cudaMemcpyAsync(outputs + offset,
+                                                       device_pointer,
+                                                       sizeof(int) * size,
+                                                       cudaMemcpyDeviceToHost,
+                                                       stream));
+                            });
+  // All operations in all streams should be completed when CopyToGpu returns
+  // control to the caller
+  for (int i = 0; i < ContextImpl::kNumCudaStreams; ++i) {
+    CHECK_EQ(cudaSuccess, cudaStreamQuery(context.streams_[i]));
+  }
+
+  // Check that every element was visited
+  for (int i = 0; i < kInputSize; ++i) {
+    CHECK_EQ(outputs[i], i);
+  }
+
+  // Check that there is no overlap between batches
+  std::sort(batches.begin(), batches.end());
+  const int num_batches = batches.size();
+  for (int i = 0; i < num_batches; ++i) {
+    const auto [begin, size] = batches[i];
+    const int end = begin + size;
+    CHECK_GE(begin, 0);
+    CHECK_LT(begin, kInputSize);
+
+    CHECK_GT(size, 0);
+    CHECK_LE(end, kInputSize);
+
+    if (i + 1 == num_batches) continue;
+    CHECK_EQ(end, batches[i + 1].first);
+  }
+
+  CHECK_EQ(cudaSuccess, cudaFreeHost(inputs));
+  CHECK_EQ(cudaSuccess, cudaFreeHost(outputs));
+}
+
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
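Both the tests above and the stream_-to-DefaultStream() migration in the hunks
below presume ContextImpl members along the following lines (a sketch only:
the exact kNumCudaStreams value and the body of DefaultStream() are
assumptions here, constrained by the static_assert in cuda_streamed_buffer.h
and by the initialization code in context_impl.cc above):

class ContextImpl {
 public:
  // Must be >= CudaStreamedBuffer::kNumBatches; the exact value is assumed.
  static constexpr int kNumCudaStreams = 2;
  // The stream that the cuSOLVER/cuBLAS/cuSPARSE handles are bound to; all
  // former users of the single stream_ member now call DefaultStream().
  cudaStream_t DefaultStream() { return streams_[0]; }
  cudaStream_t streams_[kNumCudaStreams] = {nullptr};
};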
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc index 274a47a..e2143de 100644 --- a/internal/ceres/cuda_vector.cc +++ b/internal/ceres/cuda_vector.cc
@@ -127,7 +127,7 @@ void CudaVector::SetZero() { CHECK(data_.data() != nullptr); - CudaSetZeroFP64(data_.data(), num_rows_, context_->stream_); + CudaSetZeroFP64(data_.data(), num_rows_, context_->DefaultStream()); } void CudaVector::Axpby(double a, const CudaVector& x, double b) { @@ -160,7 +160,7 @@ D.data().data(), x.data().data(), num_rows_, - context_->stream_); + context_->DefaultStream()); } void CudaVector::Scale(double s) {
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc index 93e9f0d..f19ca7e 100644 --- a/internal/ceres/dense_cholesky.cc +++ b/internal/ceres/dense_cholesky.cc
@@ -516,7 +516,7 @@ residual_fp32_.data(), num_cols_ * sizeof(float), cudaMemcpyDeviceToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); if (cusolverDnSpotrs(context_->cusolver_handle_, CUBLAS_FILL_MODE_LOWER, @@ -569,7 +569,7 @@ CudaFP64ToFP32(lhs_fp64_.data(), lhs_fp32_.data(), num_cols * num_cols, - context_->stream_); + context_->DefaultStream()); // Factorize lhs_fp32. factorize_result_ = CudaCholeskyFactorize(message); @@ -592,7 +592,7 @@ residual_fp64_.Reserve(num_cols_); // Initialize x = 0. - CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->stream_); + CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->DefaultStream()); // Initialize residual = rhs. rhs_fp64_.CopyFromCpu(rhs, num_cols_); @@ -603,15 +603,17 @@ CudaFP64ToFP32(residual_fp64_.data(), residual_fp32_.data(), num_cols_, - context_->stream_); + context_->DefaultStream()); // [fp32] c = lhs^-1 * residual. auto result = CudaCholeskySolve(message); if (result != LinearSolverTerminationType::SUCCESS) { return result; } // [fp64] x += c. - CudaDsxpy( - x_fp64_.data(), correction_fp32_.data(), num_cols_, context_->stream_); + CudaDsxpy(x_fp64_.data(), + correction_fp32_.data(), + num_cols_, + context_->DefaultStream()); if (i < max_num_refinement_iterations_) { // [fp64] residual = rhs - lhs * x // This is done in two steps:
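All of these hunks sit inside the mixed-precision refinement loop of the CUDA
dense Cholesky solver: factorize and solve in fp32, accumulate the solution
and recompute residuals in fp64. A self-contained CPU analogue of the same
scheme, using Eigen, may help when reading them (an illustrative sketch of the
numerical method, not Ceres code):

#include <Eigen/Dense>

// Mixed-precision iterative refinement: the expensive factorization and
// triangular solves run in fp32, while the solution x and the residual
// rhs - lhs * x are maintained in fp64.
Eigen::VectorXd MixedPrecisionSolve(const Eigen::MatrixXd& lhs,
                                    const Eigen::VectorXd& rhs,
                                    const int max_num_refinement_iterations) {
  const Eigen::LLT<Eigen::MatrixXf> llt(lhs.cast<float>());
  Eigen::VectorXd x = Eigen::VectorXd::Zero(rhs.size());
  Eigen::VectorXd residual = rhs;
  for (int i = 0; i <= max_num_refinement_iterations; ++i) {
    // [fp32] c = lhs^-1 * residual.
    const Eigen::VectorXf c = llt.solve(residual.cast<float>());
    // [fp64] x += c.
    x += c.cast<double>();
    if (i < max_num_refinement_iterations) {
      // [fp64] residual = rhs - lhs * x.
      residual = rhs - lhs * x;
    }
  }
  return x;
}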
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc index ae1a65d..2ba03ed 100644 --- a/internal/ceres/evaluation_benchmark.cc +++ b/internal/ceres/evaluation_benchmark.cc
@@ -36,6 +36,7 @@
 #include "benchmark/benchmark.h"
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/bundle_adjustment_test_util.h"
+#include "ceres/cuda_block_sparse_crs_view.h"
 #include "ceres/cuda_sparse_matrix.h"
 #include "ceres/cuda_vector.h"
 #include "ceres/evaluator.h"
@@ -427,6 +428,75 @@
 CHECK_GT(y.squaredNorm(), 0.);
 }
 
+static void JacobianToCRS(benchmark::State& state,
+                          BALData* data,
+                          ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  std::unique_ptr<CompressedRowSparseMatrix> matrix;
+  for (auto _ : state) {
+    matrix = jacobian->ToCompressedRowSparseMatrix();
+  }
+  CHECK(matrix != nullptr);
+}
+
+#ifndef CERES_NO_CUDA
+// We want CudaBlockSparseCRSView to be no slower than an explicit conversion
+// to CRS on the CPU
+static void JacobianToCRSView(benchmark::State& state,
+                              BALData* data,
+                              ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  std::unique_ptr<CudaBlockSparseCRSView> matrix;
+  for (auto _ : state) {
+    matrix = std::make_unique<CudaBlockSparseCRSView>(*jacobian, context);
+  }
+  CHECK(matrix != nullptr);
+}
+static void JacobianToCRSMatrix(benchmark::State& state,
+                                BALData* data,
+                                ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  std::unique_ptr<CudaSparseMatrix> matrix;
+  std::unique_ptr<CompressedRowSparseMatrix> matrix_cpu;
+  for (auto _ : state) {
+    matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
+    matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
+  }
+  CHECK(matrix != nullptr);
+}
+// Updating values in CudaBlockSparseCRSView should be roughly as fast as just
+// copying the values (time spent permuting values has to be hidden by the
+// PCIe transfer)
+static void JacobianToCRSViewUpdate(benchmark::State& state,
+                                    BALData* data,
+                                    ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  auto matrix = CudaBlockSparseCRSView(*jacobian, context);
+  for (auto _ : state) {
+    matrix.UpdateValues(*jacobian);
+  }
+}
+static void JacobianToCRSMatrixUpdate(benchmark::State& state,
+                                      BALData* data,
+                                      ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  auto matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
+  auto matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
+  for (auto _ : state) {
+    CHECK_EQ(cudaSuccess,
+             cudaMemcpy(matrix->mutable_values(),
+                        matrix_cpu->values(),
+                        matrix->num_nonzeros() * sizeof(double),
+                        cudaMemcpyHostToDevice));
+  }
+}
+#endif
+
 static void JacobianSquaredColumnNorm(benchmark::State& state,
                                       BALData* data,
                                       ContextImpl* context) {
@@ -772,6 +842,34 @@
         ->Arg(4)
         ->Arg(8)
        ->Arg(16);
+
+    const std::string name_to_crs = "JacobianToCRS<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_to_crs.c_str(), ceres::internal::JacobianToCRS, data, &context);
+#ifndef CERES_NO_CUDA
+    const std::string name_to_crs_view = "JacobianToCRSView<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_to_crs_view.c_str(),
+                                   ceres::internal::JacobianToCRSView,
+                                   data,
+                                   &context);
+    const std::string name_to_crs_matrix = "JacobianToCRSMatrix<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_to_crs_matrix.c_str(),
+                                   ceres::internal::JacobianToCRSMatrix,
+                                   data,
+                                   &context);
+    const std::string name_to_crs_view_update =
+        "JacobianToCRSViewUpdate<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_to_crs_view_update.c_str(),
+                                   ceres::internal::JacobianToCRSViewUpdate,
+                                   data,
+                                   &context);
+    const std::string name_to_crs_matrix_update =
+        "JacobianToCRSMatrixUpdate<" + path + 
">"; + ::benchmark::RegisterBenchmark(name_to_crs_matrix_update.c_str(), + ceres::internal::JacobianToCRSMatrixUpdate, + data, + &context); +#endif } ::benchmark::RunSpecifiedBenchmarks();