internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc - ceres-solver - Git at Google

 // Ceres Solver - A fast non-linear least squares minimizer
 // Copyright 2023 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 // * Redistributions of source code must retain the above copyright notice,
 //   this list of conditions and the following disclaimer.
 // * Redistributions in binary form must reproduce the above copyright notice,
 //   this list of conditions and the following disclaimer in the documentation
 //   and/or other materials provided with the distribution.
 // * Neither the name of Google Inc. nor the names of its contributors may be
 //   used to endorse or promote products derived from this software without
 //   specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
 // Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

 #include "ceres/cuda_partitioned_block_sparse_crs_view.h"

 #include <glog/logging.h>
 #include <gtest/gtest.h>

 #ifndef CERES_NO_CUDA

 namespace ceres::internal {

 namespace {
 struct RandomPartitionedMatrixOptions {
   int num_row_blocks_e;
   int num_row_blocks_f;
   int num_col_blocks_e;
   int num_col_blocks_f;
   int min_row_block_size;
   int max_row_block_size;
   int min_col_block_size;
   int max_col_block_size;
   double empty_f_probability;
   double cell_probability_f;
   int max_cells_f;
 };

 std::unique_ptr<BlockSparseMatrix> CreateRandomPartitionedMatrix(
     const RandomPartitionedMatrixOptions& options, std::mt19937& rng) {
   const int num_row_blocks =
       std::max(options.num_row_blocks_e, options.num_row_blocks_f);
   const int num_col_blocks =
       options.num_col_blocks_e + options.num_col_blocks_f;

   CompressedRowBlockStructure* block_structure =
       new CompressedRowBlockStructure;
   block_structure->cols.reserve(num_col_blocks);
   block_structure->rows.reserve(num_row_blocks);

   // Create column blocks
   std::uniform_int_distribution<int> col_size(options.min_col_block_size,
                                               options.max_col_block_size);
   int num_cols = 0;
   for (int i = 0; i < num_col_blocks; ++i) {
     const int size = col_size(rng);
     block_structure->cols.emplace_back(size, num_cols);
     num_cols += size;
   }

   // Prepare column-block indices of E cells
   std::vector<int> e_col_block_idx;
   e_col_block_idx.reserve(options.num_row_blocks_e);
   std::uniform_int_distribution<int> col_e(0, options.num_col_blocks_e - 1);
   for (int i = 0; i < options.num_row_blocks_e; ++i) {
     e_col_block_idx.emplace_back(col_e(rng));
   }
   std::sort(e_col_block_idx.begin(), e_col_block_idx.end());

   // Prepare cell structure
   std::uniform_int_distribution<int> row_size(options.min_row_block_size,
                                               options.max_row_block_size);
   std::uniform_real_distribution<double> uniform;
   int num_rows = 0;
   for (int i = 0; i < num_row_blocks; ++i) {
     const int size = row_size(rng);
     block_structure->rows.emplace_back();
     auto& row = block_structure->rows.back();
     row.block.size = size;
     row.block.position = num_rows;
     num_rows += size;
     if (i < options.num_row_blocks_e) {
       row.cells.emplace_back(e_col_block_idx[i], -1);
       if (uniform(rng) < options.empty_f_probability) {
         continue;
       }
     }
     if (i >= options.num_row_blocks_f) continue;
     const int cells_before = row.cells.size();
     for (int j = options.num_col_blocks_e; j < num_col_blocks; ++j) {
       if (uniform(rng) > options.cell_probability_f) {
         continue;
       }
       row.cells.emplace_back(j, -1);
     }
     if (row.cells.size() > cells_before + options.max_cells_f) {
       std::shuffle(row.cells.begin() + cells_before, row.cells.end(), rng);
       row.cells.resize(cells_before + options.max_cells_f);
       std::sort(
           row.cells.begin(), row.cells.end(), [](const auto& a, const auto& b) {
             return a.block_id < b.block_id;
           });
     }
   }

   // Fill positions in E sub-matrix
   int num_nonzeros = 0;
   for (int i = 0; i < options.num_row_blocks_e; ++i) {
     CHECK_GE(block_structure->rows[i].cells.size(), 1);
     block_structure->rows[i].cells[0].position = num_nonzeros;
     const int col_block_size =
         block_structure->cols[block_structure->rows[i].cells[0].block_id].size;
     const int row_block_size = block_structure->rows[i].block.size;
     num_nonzeros += row_block_size * col_block_size;
     CHECK_GE(num_nonzeros, 0);
   }
   // Fill positions in F sub-matrix
   for (int i = 0; i < options.num_row_blocks_f; ++i) {
     const int row_block_size = block_structure->rows[i].block.size;
     for (auto& cell : block_structure->rows[i].cells) {
       if (cell.position >= 0) continue;
       cell.position = num_nonzeros;
       const int col_block_size = block_structure->cols[cell.block_id].size;
       num_nonzeros += row_block_size * col_block_size;
       CHECK_GE(num_nonzeros, 0);
     }
   }
   // Populate values
   auto bsm = std::make_unique<BlockSparseMatrix>(block_structure, true);
   for (int i = 0; i < num_nonzeros; ++i) {
     bsm->mutable_values()[i] = i + 1;
   }
   return bsm;
 }
 }  // namespace

 class CudaPartitionedBlockSparseCRSViewTest : public ::testing::Test {
   static constexpr int kNumColBlocksE = 456;

  protected:
   void SetUp() final {
     std::string message;
     CHECK(context_.InitCuda(&message))
         << "InitCuda() failed because: " << message;

     RandomPartitionedMatrixOptions options;
     options.num_row_blocks_f = 123;
     options.num_row_blocks_e = 456;
     options.num_col_blocks_f = 123;
     options.num_col_blocks_e = kNumColBlocksE;
     options.min_row_block_size = 1;
     options.max_row_block_size = 4;
     options.min_col_block_size = 1;
     options.max_col_block_size = 4;
     options.empty_f_probability = .1;
     options.cell_probability_f = .2;
     options.max_cells_f = options.num_col_blocks_f;

     std::mt19937 rng;
     short_f_ = CreateRandomPartitionedMatrix(options, rng);

     options.num_row_blocks_e = 123;
     options.num_row_blocks_f = 456;
     short_e_ = CreateRandomPartitionedMatrix(options, rng);

     options.max_cells_f = 1;
     options.num_row_blocks_e = options.num_row_blocks_f;
     options.num_row_blocks_e = options.num_row_blocks_f;
     f_crs_compatible_ = CreateRandomPartitionedMatrix(options, rng);
   }

   void TestMatrix(const BlockSparseMatrix& A_) {
     const int num_col_blocks_e = 456;
     CudaPartitionedBlockSparseCRSView view(A_, kNumColBlocksE, &context_);

     const int num_rows = A_.num_rows();
     const int num_cols = A_.num_cols();

     const auto& bs = *A_.block_structure();
     const int num_cols_e = bs.cols[num_col_blocks_e].position;
     const int num_cols_f = num_cols - num_cols_e;

     auto matrix_e = view.matrix_e();
     auto matrix_f = view.matrix_f();
     ASSERT_EQ(matrix_e->num_cols(), num_cols_e);
     ASSERT_EQ(matrix_e->num_rows(), num_rows);
     ASSERT_EQ(matrix_f->num_cols(), num_cols_f);
     ASSERT_EQ(matrix_f->num_rows(), num_rows);

     Vector x(num_cols);
     Vector x_left(num_cols_e);
     Vector x_right(num_cols_f);
     Vector y(num_rows);
     CudaVector x_cuda(&context_, num_cols);
     CudaVector x_left_cuda(&context_, num_cols_e);
     CudaVector x_right_cuda(&context_, num_cols_f);
     CudaVector y_cuda(&context_, num_rows);
     Vector y_cuda_host(num_rows);

     for (int i = 0; i < num_cols_e; ++i) {
       x.setZero();
       x_left.setZero();
       y.setZero();
       y_cuda.SetZero();
       x[i] = 1.;
       x_left[i] = 1.;
       x_left_cuda.CopyFromCpu(x_left);
       A_.RightMultiplyAndAccumulate(
           x.data(), y.data(), &context_, std::thread::hardware_concurrency());
       matrix_e->RightMultiplyAndAccumulate(x_left_cuda, &y_cuda);
       y_cuda.CopyTo(&y_cuda_host);
       // There will be up to 1 non-zero product per row, thus we expect an exact
       // match on 32-bit integer indices
       EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
     }
     for (int i = num_cols_e; i < num_cols_f; ++i) {
       x.setZero();
       x_right.setZero();
       y.setZero();
       y_cuda.SetZero();
       x[i] = 1.;
       x_right[i - num_cols_e] = 1.;
       x_right_cuda.CopyFromCpu(x_right);
       A_.RightMultiplyAndAccumulate(
           x.data(), y.data(), &context_, std::thread::hardware_concurrency());
       matrix_f->RightMultiplyAndAccumulate(x_right_cuda, &y_cuda);
       y_cuda.CopyTo(&y_cuda_host);
       // There will be up to 1 non-zero product per row, thus we expect an exact
       // match on 32-bit integer indices
       EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
     }
   }

   // E sub-matrix might have less row-blocks with cells than F sub-matrix. This
   // test matrix checks if this case is handled properly
   std::unique_ptr<BlockSparseMatrix> short_e_;
   // In case of non-crs compatible F matrix, permuting values from block-order
   // to crs order involves binary search over row-blocks of F. Having lots of
   // row-blocks with no F cells is an edge case for this algorithm.
   std::unique_ptr<BlockSparseMatrix> short_f_;
   // With F matrix being CRS-compatible, update of the values of partitioned
   // matrix view reduces to two host->device memcopies, and uses a separate code
   // path
   std::unique_ptr<BlockSparseMatrix> f_crs_compatible_;

   ContextImpl context_;
 };

 TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortE) {
   TestMatrix(*short_e_);
 }

 TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortF) {
   TestMatrix(*short_f_);
 }

 TEST_F(CudaPartitionedBlockSparseCRSViewTest,
        CreateUpdateValuesCrsCompatibleF) {
   TestMatrix(*f_crs_compatible_);
 }
 }  // namespace ceres::internal

 #endif  // CERES_NO_CUDA
	// Ceres Solver - A fast non-linear least squares minimizer
	// Copyright 2023 Google Inc. All rights reserved.
	// http://ceres-solver.org/
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice,
	// this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	// * Neither the name of Google Inc. nor the names of its contributors may be
	// used to endorse or promote products derived from this software without
	// specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

	#include "ceres/cuda_partitioned_block_sparse_crs_view.h"

	#include <glog/logging.h>
	#include <gtest/gtest.h>

	#ifndef CERES_NO_CUDA

	namespace ceres::internal {

	namespace {
	struct RandomPartitionedMatrixOptions {
	int num_row_blocks_e;
	int num_row_blocks_f;
	int num_col_blocks_e;
	int num_col_blocks_f;
	int min_row_block_size;
	int max_row_block_size;
	int min_col_block_size;
	int max_col_block_size;
	double empty_f_probability;
	double cell_probability_f;
	int max_cells_f;
	};

	std::unique_ptr<BlockSparseMatrix> CreateRandomPartitionedMatrix(
	const RandomPartitionedMatrixOptions& options, std::mt19937& rng) {
	const int num_row_blocks =
	std::max(options.num_row_blocks_e, options.num_row_blocks_f);
	const int num_col_blocks =
	options.num_col_blocks_e + options.num_col_blocks_f;

	CompressedRowBlockStructure* block_structure =
	new CompressedRowBlockStructure;
	block_structure->cols.reserve(num_col_blocks);
	block_structure->rows.reserve(num_row_blocks);

	// Create column blocks
	std::uniform_int_distribution<int> col_size(options.min_col_block_size,
	options.max_col_block_size);
	int num_cols = 0;
	for (int i = 0; i < num_col_blocks; ++i) {
	const int size = col_size(rng);
	block_structure->cols.emplace_back(size, num_cols);
	num_cols += size;
	}

	// Prepare column-block indices of E cells
	std::vector<int> e_col_block_idx;
	e_col_block_idx.reserve(options.num_row_blocks_e);
	std::uniform_int_distribution<int> col_e(0, options.num_col_blocks_e - 1);
	for (int i = 0; i < options.num_row_blocks_e; ++i) {
	e_col_block_idx.emplace_back(col_e(rng));
	}
	std::sort(e_col_block_idx.begin(), e_col_block_idx.end());

	// Prepare cell structure
	std::uniform_int_distribution<int> row_size(options.min_row_block_size,
	options.max_row_block_size);
	std::uniform_real_distribution<double> uniform;
	int num_rows = 0;
	for (int i = 0; i < num_row_blocks; ++i) {
	const int size = row_size(rng);
	block_structure->rows.emplace_back();
	auto& row = block_structure->rows.back();
	row.block.size = size;
	row.block.position = num_rows;
	num_rows += size;
	if (i < options.num_row_blocks_e) {
	row.cells.emplace_back(e_col_block_idx[i], -1);
	if (uniform(rng) < options.empty_f_probability) {
	continue;
	}
	}
	if (i >= options.num_row_blocks_f) continue;
	const int cells_before = row.cells.size();
	for (int j = options.num_col_blocks_e; j < num_col_blocks; ++j) {
	if (uniform(rng) > options.cell_probability_f) {
	continue;
	}
	row.cells.emplace_back(j, -1);
	}
	if (row.cells.size() > cells_before + options.max_cells_f) {
	std::shuffle(row.cells.begin() + cells_before, row.cells.end(), rng);
	row.cells.resize(cells_before + options.max_cells_f);
	std::sort(
	row.cells.begin(), row.cells.end(), [](const auto& a, const auto& b) {
	return a.block_id < b.block_id;
	});
	}
	}

	// Fill positions in E sub-matrix
	int num_nonzeros = 0;
	for (int i = 0; i < options.num_row_blocks_e; ++i) {
	CHECK_GE(block_structure->rows[i].cells.size(), 1);
	block_structure->rows[i].cells[0].position = num_nonzeros;
	const int col_block_size =
	block_structure->cols[block_structure->rows[i].cells[0].block_id].size;
	const int row_block_size = block_structure->rows[i].block.size;
	num_nonzeros += row_block_size * col_block_size;
	CHECK_GE(num_nonzeros, 0);
	}
	// Fill positions in F sub-matrix
	for (int i = 0; i < options.num_row_blocks_f; ++i) {
	const int row_block_size = block_structure->rows[i].block.size;
	for (auto& cell : block_structure->rows[i].cells) {
	if (cell.position >= 0) continue;
	cell.position = num_nonzeros;
	const int col_block_size = block_structure->cols[cell.block_id].size;
	num_nonzeros += row_block_size * col_block_size;
	CHECK_GE(num_nonzeros, 0);
	}
	}
	// Populate values
	auto bsm = std::make_unique<BlockSparseMatrix>(block_structure, true);
	for (int i = 0; i < num_nonzeros; ++i) {
	bsm->mutable_values()[i] = i + 1;
	}
	return bsm;
	}
	} // namespace

	class CudaPartitionedBlockSparseCRSViewTest : public ::testing::Test {
	static constexpr int kNumColBlocksE = 456;

	protected:
	void SetUp() final {
	std::string message;
	CHECK(context_.InitCuda(&message))
	<< "InitCuda() failed because: " << message;

	RandomPartitionedMatrixOptions options;
	options.num_row_blocks_f = 123;
	options.num_row_blocks_e = 456;
	options.num_col_blocks_f = 123;
	options.num_col_blocks_e = kNumColBlocksE;
	options.min_row_block_size = 1;
	options.max_row_block_size = 4;
	options.min_col_block_size = 1;
	options.max_col_block_size = 4;
	options.empty_f_probability = .1;
	options.cell_probability_f = .2;
	options.max_cells_f = options.num_col_blocks_f;

	std::mt19937 rng;
	short_f_ = CreateRandomPartitionedMatrix(options, rng);

	options.num_row_blocks_e = 123;
	options.num_row_blocks_f = 456;
	short_e_ = CreateRandomPartitionedMatrix(options, rng);

	options.max_cells_f = 1;
	options.num_row_blocks_e = options.num_row_blocks_f;
	options.num_row_blocks_e = options.num_row_blocks_f;
	f_crs_compatible_ = CreateRandomPartitionedMatrix(options, rng);
	}

	void TestMatrix(const BlockSparseMatrix& A_) {
	const int num_col_blocks_e = 456;
	CudaPartitionedBlockSparseCRSView view(A_, kNumColBlocksE, &context_);

	const int num_rows = A_.num_rows();
	const int num_cols = A_.num_cols();

	const auto& bs = *A_.block_structure();
	const int num_cols_e = bs.cols[num_col_blocks_e].position;
	const int num_cols_f = num_cols - num_cols_e;

	auto matrix_e = view.matrix_e();
	auto matrix_f = view.matrix_f();
	ASSERT_EQ(matrix_e->num_cols(), num_cols_e);
	ASSERT_EQ(matrix_e->num_rows(), num_rows);
	ASSERT_EQ(matrix_f->num_cols(), num_cols_f);
	ASSERT_EQ(matrix_f->num_rows(), num_rows);

	Vector x(num_cols);
	Vector x_left(num_cols_e);
	Vector x_right(num_cols_f);
	Vector y(num_rows);
	CudaVector x_cuda(&context_, num_cols);
	CudaVector x_left_cuda(&context_, num_cols_e);
	CudaVector x_right_cuda(&context_, num_cols_f);
	CudaVector y_cuda(&context_, num_rows);
	Vector y_cuda_host(num_rows);

	for (int i = 0; i < num_cols_e; ++i) {
	x.setZero();
	x_left.setZero();
	y.setZero();
	y_cuda.SetZero();
	x[i] = 1.;
	x_left[i] = 1.;
	x_left_cuda.CopyFromCpu(x_left);
	A_.RightMultiplyAndAccumulate(
	x.data(), y.data(), &context_, std::thread::hardware_concurrency());
	matrix_e->RightMultiplyAndAccumulate(x_left_cuda, &y_cuda);
	y_cuda.CopyTo(&y_cuda_host);
	// There will be up to 1 non-zero product per row, thus we expect an exact
	// match on 32-bit integer indices
	EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
	}
	for (int i = num_cols_e; i < num_cols_f; ++i) {
	x.setZero();
	x_right.setZero();
	y.setZero();
	y_cuda.SetZero();
	x[i] = 1.;
	x_right[i - num_cols_e] = 1.;
	x_right_cuda.CopyFromCpu(x_right);
	A_.RightMultiplyAndAccumulate(
	x.data(), y.data(), &context_, std::thread::hardware_concurrency());
	matrix_f->RightMultiplyAndAccumulate(x_right_cuda, &y_cuda);
	y_cuda.CopyTo(&y_cuda_host);
	// There will be up to 1 non-zero product per row, thus we expect an exact
	// match on 32-bit integer indices
	EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
	}
	}

	// E sub-matrix might have less row-blocks with cells than F sub-matrix. This
	// test matrix checks if this case is handled properly
	std::unique_ptr<BlockSparseMatrix> short_e_;
	// In case of non-crs compatible F matrix, permuting values from block-order
	// to crs order involves binary search over row-blocks of F. Having lots of
	// row-blocks with no F cells is an edge case for this algorithm.
	std::unique_ptr<BlockSparseMatrix> short_f_;
	// With F matrix being CRS-compatible, update of the values of partitioned
	// matrix view reduces to two host->device memcopies, and uses a separate code
	// path
	std::unique_ptr<BlockSparseMatrix> f_crs_compatible_;

	ContextImpl context_;
	};

	TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortE) {
	TestMatrix(*short_e_);
	}

	TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortF) {
	TestMatrix(*short_f_);
	}

	TEST_F(CudaPartitionedBlockSparseCRSViewTest,
	CreateUpdateValuesCrsCompatibleF) {
	TestMatrix(*f_crs_compatible_);
	}
	} // namespace ceres::internal

	#endif // CERES_NO_CUDA