internal/ceres/cuda_partitioned_block_sparse_crs_view.h - ceres-solver - Git at Google

 // Ceres Solver - A fast non-linear least squares minimizer
 // Copyright 2023 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // * Redistributions of source code must retain the above copyright notice,
 //   this list of conditions and the following disclaimer.
 // * Redistributions in binary form must reproduce the above copyright notice,
 //   this list of conditions and the following disclaimer in the documentation
 //   and/or other materials provided with the distribution.
 // * Neither the name of Google Inc. nor the names of its contributors may be
 //   used to endorse or promote products derived from this software without
 //   specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
 //

 #ifndef CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
 #define CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_

 #include "ceres/internal/config.h"

 #ifndef CERES_NO_CUDA

 #include <memory>

 #include "ceres/block_sparse_matrix.h"
 #include "ceres/cuda_block_structure.h"
 #include "ceres/cuda_buffer.h"
 #include "ceres/cuda_sparse_matrix.h"
 #include "ceres/cuda_streamed_buffer.h"

 namespace ceres::internal {
 // We use the cuSPARSE library for SpMV operations. However, it supports
 // neither a block-sparse format with varying block sizes nor
 // submatrix-vector products. Thus, we perform the following operations in order
 // to compute products of partitioned block-sparse matrices and dense vectors on
 // gpu:
 //  - Once per block-sparse structure update:
 //    - Compute CRS structures of left and right submatrices from block-sparse
 //    structure
 //    - Check if values of F sub-matrix can be copied without permutation
 //    matrices
 //  - Once per block-sparse values update:
 //    - Copy values of E sub-matrix
 //    - Permute or copy values of F sub-matrix
 //
 // It is assumed that cells of block-sparse matrix are laid out sequentially in
 // both of sub-matrices and there is exactly one cell in row-block of E
 // sub-matrix in the first num_row_blocks_e_ row blocks, and no cells in E
 // sub-matrix below num_row_blocks_e_ row blocks.
 //
 // This class avoids storing both CRS and block-sparse values in GPU memory.
 // Instead, block-sparse values are transferred to gpu memory as a disjoint set
 // of small contiguous segments with simultaneous permutation of the values into
 // the correct order using the block structure.
 class CERES_NO_EXPORT CudaPartitionedBlockSparseCRSView {
  public:
   // Initializes the internal CRS submatrices and the GPU-side copy of the
   // block-sparse structure. The following objects are stored in gpu memory for
   // the whole lifetime of the object:
   //  - matrix_e_: left CRS submatrix
   //  - matrix_f_: right CRS submatrix
   //  - block_structure_: copy of block-sparse structure on GPU
   //  - streamed_buffer_: helper for value updating
   //
   // Parameters:
   //  - bsm: block-sparse matrix the view is constructed from.
   //  - num_col_blocks_e: presumably the number of leading column blocks that
   //    form the E (left) sub-matrix -- TODO confirm against the implementation.
   //  - context: presumably retained in context_ as a non-owning pointer, in
   //    which case it must outlive this object -- TODO confirm.
   //
   // NOTE(review): whether values are uploaded here as well, or only by
   // UpdateValues(), is not visible from this declaration.
   CudaPartitionedBlockSparseCRSView(const BlockSparseMatrix& bsm,
                                     const int num_col_blocks_e,
                                     ContextImpl* context);

   // Update values of CRS submatrices using values of block-sparse matrix.
   // Assumes that bsm has the same block-sparse structure as matrix that was
   // used for construction.
   // NOTE(review): this declaration does not show whether that precondition is
   // verified -- callers should treat it as unchecked until confirmed.
   void UpdateValues(const BlockSparseMatrix& bsm);

   // Accessors for the E (left) and F (right) CRS submatrices. The returned
   // pointers are non-owning: the matrices remain owned by the std::unique_ptr
   // members of this object.
   const CudaSparseMatrix* matrix_e() const { return matrix_e_.get(); }
   const CudaSparseMatrix* matrix_f() const { return matrix_f_.get(); }
   CudaSparseMatrix* mutable_matrix_e() { return matrix_e_.get(); }
   CudaSparseMatrix* mutable_matrix_f() { return matrix_f_.get(); }

  private:
   // Value permutation kernel performs a single element-wise operation per
   // thread, thus performing permutation in blocks of 8 megabytes of
   // block-sparse values seems reasonable.
   // NOTE(review): the constant is 1Mi, which corresponds to 8 megabytes only
   // if it counts doubles (8 bytes each) -- confirm the unit at the use site.
   static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
   // Left (E) CRS submatrix.
   std::unique_ptr<CudaSparseMatrix> matrix_e_;
   // Right (F) CRS submatrix.
   std::unique_ptr<CudaSparseMatrix> matrix_f_;
   // Helper for value updating.
   std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_;
   // Copy of block-sparse structure on GPU.
   std::unique_ptr<CudaBlockSparseStructure> block_structure_;
   // Presumably records whether values of the F sub-matrix can be copied
   // without permutation -- TODO confirm against the implementation.
   bool f_is_crs_compatible_;
   // Number of leading row blocks that contain exactly one cell of the E
   // sub-matrix; row blocks below them are assumed to contain no E cells.
   int num_row_blocks_e_;
   // Non-owning pointer; presumably the context passed to the constructor.
   ContextImpl* context_;
 };

 }  // namespace ceres::internal

 #endif  // CERES_NO_CUDA
 #endif  // CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
	// Ceres Solver - A fast non-linear least squares minimizer
	// Copyright 2023 Google Inc. All rights reserved.
	// http://ceres-solver.org/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice,
	// this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	// * Neither the name of Google Inc. nor the names of its contributors may be
	// used to endorse or promote products derived from this software without
	// specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
	//

	#ifndef CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
	#define CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_

	#include "ceres/internal/config.h"

	#ifndef CERES_NO_CUDA

	#include <memory>

	#include "ceres/block_sparse_matrix.h"
	#include "ceres/cuda_block_structure.h"
	#include "ceres/cuda_buffer.h"
	#include "ceres/cuda_sparse_matrix.h"
	#include "ceres/cuda_streamed_buffer.h"

	namespace ceres::internal {
	// We use the cuSPARSE library for SpMV operations. However, it supports
	// neither a block-sparse format with varying block sizes nor
	// submatrix-vector products. Thus, we perform the following operations in order
	// to compute products of partitioned block-sparse matrices and dense vectors on
	// gpu:
	//  - Once per block-sparse structure update:
	//    - Compute CRS structures of left and right submatrices from block-sparse
	//      structure
	//    - Check if values of F sub-matrix can be copied without permutation
	//      matrices
	//  - Once per block-sparse values update:
	//    - Copy values of E sub-matrix
	//    - Permute or copy values of F sub-matrix
	//
	// It is assumed that cells of block-sparse matrix are laid out sequentially in
	// both of sub-matrices and there is exactly one cell in row-block of E
	// sub-matrix in the first num_row_blocks_e_ row blocks, and no cells in E
	// sub-matrix below num_row_blocks_e_ row blocks.
	//
	// This class avoids storing both CRS and block-sparse values in GPU memory.
	// Instead, block-sparse values are transferred to gpu memory as a disjoint set
	// of small contiguous segments with simultaneous permutation of the values into
	// the correct order using the block structure.
	class CERES_NO_EXPORT CudaPartitionedBlockSparseCRSView {
	public:
	// Initializes the internal CRS submatrices and the GPU-side copy of the
	// block-sparse structure. The following objects are stored in gpu memory for
	// the whole lifetime of the object:
	// - matrix_e_: left CRS submatrix
	// - matrix_f_: right CRS submatrix
	// - block_structure_: copy of block-sparse structure on GPU
	// - streamed_buffer_: helper for value updating
	//
	// Parameters:
	// - bsm: block-sparse matrix the view is constructed from.
	// - num_col_blocks_e: presumably the number of leading column blocks that
	//   form the E (left) sub-matrix -- TODO confirm against the implementation.
	// - context: presumably retained in context_ as a non-owning pointer, in
	//   which case it must outlive this object -- TODO confirm.
	//
	// NOTE(review): whether values are uploaded here as well, or only by
	// UpdateValues(), is not visible from this declaration.
	CudaPartitionedBlockSparseCRSView(const BlockSparseMatrix& bsm,
	const int num_col_blocks_e,
	ContextImpl* context);

	// Update values of CRS submatrices using values of block-sparse matrix.
	// Assumes that bsm has the same block-sparse structure as matrix that was
	// used for construction.
	// NOTE(review): this declaration does not show whether that precondition is
	// verified -- callers should treat it as unchecked until confirmed.
	void UpdateValues(const BlockSparseMatrix& bsm);

	// Accessors for the E (left) and F (right) CRS submatrices. The returned
	// pointers are non-owning: the matrices remain owned by the std::unique_ptr
	// members of this object.
	const CudaSparseMatrix* matrix_e() const { return matrix_e_.get(); }
	const CudaSparseMatrix* matrix_f() const { return matrix_f_.get(); }
	CudaSparseMatrix* mutable_matrix_e() { return matrix_e_.get(); }
	CudaSparseMatrix* mutable_matrix_f() { return matrix_f_.get(); }

	private:
	// Value permutation kernel performs a single element-wise operation per
	// thread, thus performing permutation in blocks of 8 megabytes of
	// block-sparse values seems reasonable.
	// NOTE(review): the constant is 1Mi, which corresponds to 8 megabytes only
	// if it counts doubles (8 bytes each) -- confirm the unit at the use site.
	static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
	// Left (E) CRS submatrix.
	std::unique_ptr<CudaSparseMatrix> matrix_e_;
	// Right (F) CRS submatrix.
	std::unique_ptr<CudaSparseMatrix> matrix_f_;
	// Helper for value updating.
	std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_;
	// Copy of block-sparse structure on GPU.
	std::unique_ptr<CudaBlockSparseStructure> block_structure_;
	// Presumably records whether values of the F sub-matrix can be copied
	// without permutation -- TODO confirm against the implementation.
	bool f_is_crs_compatible_;
	// Number of leading row blocks that contain exactly one cell of the E
	// sub-matrix; row blocks below them are assumed to contain no E cells.
	int num_row_blocks_e_;
	// Non-owning pointer; presumably the context passed to the constructor.
	ContextImpl* context_;
	};

	} // namespace ceres::internal

	#endif // CERES_NO_CUDA
	#endif // CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_