// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include "absl/log/check.h"
#include "gtest/gtest.h"
#ifndef CERES_NO_CUDA
namespace ceres::internal {
namespace {
struct RandomPartitionedMatrixOptions {
// Number of leading row-blocks that contain exactly one E cell.
int num_row_blocks_e;
// Number of leading row-blocks that may contain F cells.
int num_row_blocks_f;
// Number of column blocks in the E and F partitions.
int num_col_blocks_e;
int num_col_blocks_f;
// Inclusive bounds on randomly chosen row- and column-block sizes.
int min_row_block_size;
int max_row_block_size;
int min_col_block_size;
int max_col_block_size;
// Probability that a row-block with an E cell gets no F cells at all.
double empty_f_probability;
// Probability of generating each individual F cell.
double cell_probability_f;
// Maximum number of F cells per row-block.
int max_cells_f;
};
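// Creates a random block-sparse matrix partitioned into an E sub-matrix (the
// first num_col_blocks_e column blocks) and an F sub-matrix (the remaining
// num_col_blocks_f column blocks). The first num_row_blocks_e row-blocks each
// get exactly one E cell; the first num_row_blocks_f row-blocks may get F
// cells. Values are filled with distinct positive integers.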
std::unique_ptr<BlockSparseMatrix> CreateRandomPartitionedMatrix(
const RandomPartitionedMatrixOptions& options, std::mt19937& rng) {
const int num_row_blocks =
std::max(options.num_row_blocks_e, options.num_row_blocks_f);
const int num_col_blocks =
options.num_col_blocks_e + options.num_col_blocks_f;
CompressedRowBlockStructure* block_structure =
new CompressedRowBlockStructure;
block_structure->cols.reserve(num_col_blocks);
block_structure->rows.reserve(num_row_blocks);
// Create column blocks
std::uniform_int_distribution<int> col_size(options.min_col_block_size,
options.max_col_block_size);
int num_cols = 0;
for (int i = 0; i < num_col_blocks; ++i) {
const int size = col_size(rng);
block_structure->cols.emplace_back(size, num_cols);
num_cols += size;
}
// Prepare column-block indices of E cells
std::vector<int> e_col_block_idx;
e_col_block_idx.reserve(options.num_row_blocks_e);
std::uniform_int_distribution<int> col_e(0, options.num_col_blocks_e - 1);
for (int i = 0; i < options.num_row_blocks_e; ++i) {
e_col_block_idx.emplace_back(col_e(rng));
}
std::sort(e_col_block_idx.begin(), e_col_block_idx.end());
// Prepare cell structure
std::uniform_int_distribution<int> row_size(options.min_row_block_size,
options.max_row_block_size);
std::uniform_real_distribution<double> uniform;
int num_rows = 0;
for (int i = 0; i < num_row_blocks; ++i) {
const int size = row_size(rng);
block_structure->rows.emplace_back();
auto& row = block_structure->rows.back();
row.block.size = size;
row.block.position = num_rows;
num_rows += size;
if (i < options.num_row_blocks_e) {
row.cells.emplace_back(e_col_block_idx[i], -1);
if (uniform(rng) < options.empty_f_probability) {
continue;
}
}
if (i >= options.num_row_blocks_f) continue;
const int cells_before = row.cells.size();
for (int j = options.num_col_blocks_e; j < num_col_blocks; ++j) {
if (uniform(rng) > options.cell_probability_f) {
continue;
}
row.cells.emplace_back(j, -1);
}
if (row.cells.size() > cells_before + options.max_cells_f) {
std::shuffle(row.cells.begin() + cells_before, row.cells.end(), rng);
row.cells.resize(cells_before + options.max_cells_f);
std::sort(
row.cells.begin(), row.cells.end(), [](const auto& a, const auto& b) {
return a.block_id < b.block_id;
});
}
}
// Fill positions in E sub-matrix
int num_nonzeros = 0;
for (int i = 0; i < options.num_row_blocks_e; ++i) {
CHECK_GE(block_structure->rows[i].cells.size(), 1);
block_structure->rows[i].cells[0].position = num_nonzeros;
const int col_block_size =
block_structure->cols[block_structure->rows[i].cells[0].block_id].size;
const int row_block_size = block_structure->rows[i].block.size;
num_nonzeros += row_block_size * col_block_size;
CHECK_GE(num_nonzeros, 0);
}
// Fill positions in F sub-matrix
for (int i = 0; i < options.num_row_blocks_f; ++i) {
const int row_block_size = block_structure->rows[i].block.size;
for (auto& cell : block_structure->rows[i].cells) {
if (cell.position >= 0) continue;
cell.position = num_nonzeros;
const int col_block_size = block_structure->cols[cell.block_id].size;
num_nonzeros += row_block_size * col_block_size;
CHECK_GE(num_nonzeros, 0);
}
}
// Populate values with distinct integers 1..num_nonzeros so that any
// misplaced value changes the result of a matrix-vector product.
auto bsm = std::make_unique<BlockSparseMatrix>(block_structure, true);
for (int i = 0; i < num_nonzeros; ++i) {
bsm->mutable_values()[i] = i + 1;
}
return bsm;
}
} // namespace
class CudaPartitionedBlockSparseCRSViewTest : public ::testing::Test {
protected:
static constexpr int kNumColBlocksE = 456;
void SetUp() final {
std::string message;
ASSERT_TRUE(context_.InitCuda(&message))
<< "InitCuda() failed because: " << message;
RandomPartitionedMatrixOptions options;
options.num_row_blocks_f = 123;
options.num_row_blocks_e = 456;
options.num_col_blocks_f = 123;
options.num_col_blocks_e = kNumColBlocksE;
options.min_row_block_size = 1;
options.max_row_block_size = 4;
options.min_col_block_size = 1;
options.max_col_block_size = 4;
options.empty_f_probability = .1;
options.cell_probability_f = .2;
options.max_cells_f = options.num_col_blocks_f;
std::mt19937 rng;
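// short_f_: E cells span 456 row-blocks while F cells are limited to the
// first 123, so many row-blocks have no F cells.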
short_f_ = CreateRandomPartitionedMatrix(options, rng);
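// short_e_: swap the row-block counts so that E cells span fewer row-blocks
// than F cells.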
options.num_row_blocks_e = 123;
options.num_row_blocks_f = 456;
short_e_ = CreateRandomPartitionedMatrix(options, rng);
// A single F cell per row-block makes the F sub-matrix CRS-compatible.
options.max_cells_f = 1;
options.num_row_blocks_e = options.num_row_blocks_f;
f_crs_compatible_ = CreateRandomPartitionedMatrix(options, rng);
}
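// Multiplies A by every unit vector on the CPU using the full block-sparse
// matrix and on the GPU using the corresponding E or F partition of the view,
// and checks that the results match exactly.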
void TestMatrix(const BlockSparseMatrix& A_) {
CudaPartitionedBlockSparseCRSView view(A_, kNumColBlocksE, &context_);
const int num_rows = A_.num_rows();
const int num_cols = A_.num_cols();
const auto& bs = *A_.block_structure();
const int num_cols_e = bs.cols[kNumColBlocksE].position;
const int num_cols_f = num_cols - num_cols_e;
auto matrix_e = view.matrix_e();
auto matrix_f = view.matrix_f();
ASSERT_EQ(matrix_e->num_cols(), num_cols_e);
ASSERT_EQ(matrix_e->num_rows(), num_rows);
ASSERT_EQ(matrix_f->num_cols(), num_cols_f);
ASSERT_EQ(matrix_f->num_rows(), num_rows);
Vector x(num_cols);
Vector x_left(num_cols_e);
Vector x_right(num_cols_f);
Vector y(num_rows);
CudaVector x_cuda(&context_, num_cols);
CudaVector x_left_cuda(&context_, num_cols_e);
CudaVector x_right_cuda(&context_, num_cols_f);
CudaVector y_cuda(&context_, num_rows);
Vector y_cuda_host(num_rows);
for (int i = 0; i < num_cols_e; ++i) {
x.setZero();
x_left.setZero();
y.setZero();
y_cuda.SetZero();
x[i] = 1.;
x_left[i] = 1.;
x_left_cuda.CopyFromCpu(x_left);
A_.RightMultiplyAndAccumulate(
x.data(), y.data(), &context_, std::thread::hardware_concurrency());
matrix_e->RightMultiplyAndAccumulate(x_left_cuda, &y_cuda);
y_cuda.CopyTo(&y_cuda_host);
// The values are distinct integers and x is a unit vector, so each row of the
// product has at most one non-zero term; the result is exactly representable
// in double and an exact match is expected.
EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
}
for (int i = num_cols_e; i < num_cols; ++i) {
x.setZero();
x_right.setZero();
y.setZero();
y_cuda.SetZero();
x[i] = 1.;
x_right[i - num_cols_e] = 1.;
x_right_cuda.CopyFromCpu(x_right);
A_.RightMultiplyAndAccumulate(
x.data(), y.data(), &context_, std::thread::hardware_concurrency());
matrix_f->RightMultiplyAndAccumulate(x_right_cuda, &y_cuda);
y_cuda.CopyTo(&y_cuda_host);
// The values are distinct integers and x is a unit vector, so each row of the
// product has at most one non-zero term; the result is exactly representable
// in double and an exact match is expected.
EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
}
}
// The E sub-matrix might have fewer row-blocks with cells than the F
// sub-matrix. This test matrix checks that this case is handled properly.
std::unique_ptr<BlockSparseMatrix> short_e_;
// In the case of a non-CRS-compatible F matrix, permuting values from block
// order to CRS order involves a binary search over the row-blocks of F.
// Having many row-blocks with no F cells is an edge case for this algorithm.
std::unique_ptr<BlockSparseMatrix> short_f_;
// When the F matrix is CRS-compatible, updating the values of the partitioned
// matrix view reduces to two host->device memcopies and uses a separate code
// path.
std::unique_ptr<BlockSparseMatrix> f_crs_compatible_;
ContextImpl context_;
};
TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortE) {
TestMatrix(*short_e_);
}
TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortF) {
TestMatrix(*short_f_);
}
TEST_F(CudaPartitionedBlockSparseCRSViewTest,
CreateUpdateValuesCrsCompatibleF) {
TestMatrix(*f_crs_compatible_);
}
} // namespace ceres::internal
#endif // CERES_NO_CUDA