// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_block_structure.h"
#include "ceres/cuda_kernels_bsm_to_crs.h"
namespace ceres::internal {
CudaPartitionedBlockSparseCRSView::CudaPartitionedBlockSparseCRSView(
    const BlockSparseMatrix& bsm,
    const int num_col_blocks_e,
    ContextImpl* context)
    : context_(context) {
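  // Build a device-side copy of the block structure, partitioned into the E
  // submatrix (the first num_col_blocks_e column blocks) and the F submatrix
  // (the remaining column blocks); the CUDA kernels below consume this
  // representation.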
const auto& bs = *bsm.block_structure();
block_structure_ =
std::make_unique<CudaBlockSparseStructure>(bs, num_col_blocks_e, context);
  // Determine the number of non-zeros in the left (E) submatrix. Row blocks
  // are at least one row high, so a temporary array of num_rows entries is
  // large enough for ComputeNonZerosInColumnBlockSubMatrix, and can later be
  // reused for FillCRSStructurePartitioned.
  const int num_rows = bsm.num_rows();
  const int num_nonzeros_e = block_structure_->num_nonzeros_e();
  const int num_nonzeros_f = bsm.num_nonzeros() - num_nonzeros_e;
  const int num_cols_e = num_col_blocks_e < static_cast<int>(bs.cols.size())
                             ? bs.cols[num_col_blocks_e].position
                             : bsm.num_cols();
  const int num_cols_f = bsm.num_cols() - num_cols_e;
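  // Allocate device buffers for the CRS row pointers (num_rows + 1 entries
  // each) and column indices of both submatrices.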
CudaBuffer<int32_t> rows_e(context, num_rows + 1);
CudaBuffer<int32_t> cols_e(context, num_nonzeros_e);
CudaBuffer<int32_t> rows_f(context, num_rows + 1);
CudaBuffer<int32_t> cols_f(context, num_nonzeros_f);
  num_row_blocks_e_ = block_structure_->num_row_blocks_e();
  FillCRSStructurePartitioned(block_structure_->num_row_blocks(),
                              num_rows,
                              num_row_blocks_e_,
                              num_col_blocks_e,
                              num_nonzeros_e,
                              block_structure_->first_cell_in_row_block(),
                              block_structure_->cells(),
                              block_structure_->row_blocks(),
                              block_structure_->col_blocks(),
                              rows_e.data(),
                              cols_e.data(),
                              rows_f.data(),
                              cols_f.data(),
                              context->DefaultStream(),
                              context->is_cuda_memory_pools_supported_);
  f_is_crs_compatible_ = block_structure_->IsCrsCompatible();
  if (f_is_crs_compatible_) {
    block_structure_ = nullptr;
  } else {
    streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
        context, kMaxTemporaryArraySize);
  }
  matrix_e_ = std::make_unique<CudaSparseMatrix>(
      num_cols_e, std::move(rows_e), std::move(cols_e), context);
  matrix_f_ = std::make_unique<CudaSparseMatrix>(
      num_cols_f, std::move(rows_f), std::move(cols_f), context);
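  // Every non-zero of the block-sparse matrix must end up in exactly one of
  // the two submatrices.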
  CHECK_EQ(bsm.num_nonzeros(),
           matrix_e_->num_nonzeros() + matrix_f_->num_nonzeros());
  UpdateValues(bsm);
}

void CudaPartitionedBlockSparseCRSView::UpdateValues(
    const BlockSparseMatrix& bsm) {
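  // Fast path: the block-sparse value array stores the values of E followed
  // by the values of F, both already in CRS order, so each half is copied to
  // the device as-is.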
  if (f_is_crs_compatible_) {
    CHECK_EQ(cudaSuccess,
             cudaMemcpyAsync(matrix_e_->mutable_values(),
                             bsm.values(),
                             matrix_e_->num_nonzeros() * sizeof(double),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()));
    CHECK_EQ(cudaSuccess,
             cudaMemcpyAsync(matrix_f_->mutable_values(),
                             bsm.values() + matrix_e_->num_nonzeros(),
                             matrix_f_->num_nonzeros() * sizeof(double),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()));
    return;
  }
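  // Slow path: the values of F are not in CRS order. Stream the host value
  // array to the device in chunks, permuting each chunk into the CRS layout
  // of F with a kernel as it arrives.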
streamed_buffer_->CopyToGpu(
bsm.values(),
bsm.num_nonzeros(),
[block_structure = block_structure_.get(),
num_nonzeros_e = matrix_e_->num_nonzeros(),
num_row_blocks_e = num_row_blocks_e_,
values_f = matrix_f_->mutable_values(),
rows_f = matrix_f_->rows()](
const double* values, int num_values, int offset, auto stream) {
PermuteToCRSPartitionedF(num_nonzeros_e + offset,
num_values,
block_structure->num_row_blocks(),
num_row_blocks_e,
block_structure->first_cell_in_row_block(),
block_structure->value_offset_row_block_f(),
block_structure->cells(),
block_structure->row_blocks(),
block_structure->col_blocks(),
rows_f,
values,
values_f,
stream);
});
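  // The values of E are contiguous at the start of the value array and
  // already in CRS order, so a direct asynchronous copy suffices even when F
  // needs permutation.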
  CHECK_EQ(cudaSuccess,
           cudaMemcpyAsync(matrix_e_->mutable_values(),
                           bsm.values(),
                           matrix_e_->num_nonzeros() * sizeof(double),
                           cudaMemcpyHostToDevice,
                           context_->DefaultStream()));
}

} // namespace ceres::internal
#endif // CERES_NO_CUDA