Add a specialized schur eliminator. Add a new specialized Schur Eliminator for the case where there is exactly one F block and all dimensions are known at compile time. Benchmark on my macbook shoes ~3x speedup for Eliminate and 4x speedup for BackSubstitute. Similar speedups are observed on other platforms. ------------------------------------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------------------------------------ BM_SchurEliminatorEliminate/10 4176 ns 4132 ns 153522 BM_SchurEliminatorEliminate/64 24392 ns 24234 ns 29993 BM_SchurEliminatorEliminate/512 169632 ns 168709 ns 3872 BM_SchurEliminatorEliminate/4096 1296407 ns 1292873 ns 519 BM_SchurEliminatorEliminate/10000 3159652 ns 3149532 ns 216 BM_SchurEliminatorForOneFBlockEliminate/10 1187 ns 1184 ns 603953 BM_SchurEliminatorForOneFBlockEliminate/64 7917 ns 7898 ns 87558 BM_SchurEliminatorForOneFBlockEliminate/512 57629 ns 57485 ns 12035 BM_SchurEliminatorForOneFBlockEliminate/4096 484680 ns 480757 ns 1459 BM_SchurEliminatorForOneFBlockEliminate/10000 1157237 ns 1150735 ns 566 BM_SchurEliminatorBackSubstitute/10 681 ns 679 ns 1070500 BM_SchurEliminatorBackSubstitute/64 3899 ns 3887 ns 182883 BM_SchurEliminatorBackSubstitute/512 33632 ns 33467 ns 22582 BM_SchurEliminatorBackSubstitute/4096 241117 ns 240257 ns 2884 BM_SchurEliminatorBackSubstitute/10000 631869 ns 626992 ns 1146 BM_SchurEliminatorForOneFBlockBackSubstitute/10 139 ns 138 ns 5021341 BM_SchurEliminatorForOneFBlockBackSubstitute/64 898 ns 892 ns 780396 BM_SchurEliminatorForOneFBlockBackSubstitute/512 7344 ns 7286 ns 87059 BM_SchurEliminatorForOneFBlockBackSubstitute/4096 57634 ns 57345 ns 11642 BM_SchurEliminatorForOneFBlockBackSubstitute/10000 153129 ns 152320 ns 4949 Change-Id: Iae64e96c91beb44b807df0e0572d202df7cdaed8
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 22b5d07..e452f48 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -514,4 +514,7 @@ add_executable(invert_psd_matrix_benchmark invert_psd_matrix_benchmark.cc) add_dependencies_to_benchmark(invert_psd_matrix_benchmark) + + add_executable(schur_eliminator_benchmark schur_eliminator_benchmark.cc) + add_dependencies_to_benchmark(schur_eliminator_benchmark) endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/schur_eliminator.h b/internal/ceres/schur_eliminator.h index 52e3535..7dfc160 100644 --- a/internal/ceres/schur_eliminator.h +++ b/internal/ceres/schur_eliminator.h
@@ -1,5 +1,5 @@ // Ceres Solver - A fast non-linear least squares minimizer -// Copyright 2015 Google Inc. All rights reserved. +// Copyright 2019 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without @@ -36,10 +36,12 @@ #include <mutex> #include <vector> +#include "Eigen/Dense" #include "ceres/block_random_access_matrix.h" #include "ceres/block_sparse_matrix.h" #include "ceres/block_structure.h" #include "ceres/internal/eigen.h" +#include "ceres/internal/port.h" #include "ceres/linear_solver.h" namespace ceres { @@ -220,14 +222,13 @@ // level linear algebra routines. template <int kRowBlockSize = Eigen::Dynamic, int kEBlockSize = Eigen::Dynamic, - int kFBlockSize = Eigen::Dynamic > + int kFBlockSize = Eigen::Dynamic> class SchurEliminator : public SchurEliminatorBase { public: explicit SchurEliminator(const LinearSolver::Options& options) - : num_threads_(options.num_threads), - context_(options.context) { + : num_threads_(options.num_threads), context_(options.context) { CHECK(context_ != nullptr); -} + } // SchurEliminatorBase Interface virtual ~SchurEliminator(); @@ -306,12 +307,11 @@ int row_block_index, BlockRandomAccessMatrix* lhs); - void NoEBlockRowsUpdate(const BlockSparseMatrix* A, - const double* b, - int row_block_counter, - BlockRandomAccessMatrix* lhs, - double* rhs); + const double* b, + int row_block_counter, + BlockRandomAccessMatrix* lhs, + double* rhs); void NoEBlockRowOuterProduct(const BlockSparseMatrix* A, int row_block_index, @@ -357,7 +357,268 @@ // Locks for the blocks in the right hand side of the reduced linear // system. - std::vector<std::mutex*> rhs_locks_; + std::vector<std::mutex*> rhs_locks_; +}; + +// SchurEliminatorForOneFBlock specializes the SchurEliminatorBase interface for +// the case where there is exactly one f-block and all three parameters +// kRowBlockSize, kEBlockSize and KFBlockSize are known at compile time. This is +// the case for some two view bundle adjustment problems which have very +// stringent latency requirements. +// +// Under these assumptions, we can simplify the more general algorithm +// implemented by SchurEliminatorImpl significantly. Two of the major +// contributors to the increased performance are: +// +// 1. Simpler loop structure and less use of dynamic memory. Almost everything +// is on the stack and benefits from aligned memory as well as fixed sized +// vectorization. We are also able to reason about temporaries and control +// their lifetimes better. +// 2. Use of inverse() over llt().solve(Identity). +template <int kRowBlockSize = Eigen::Dynamic, + int kEBlockSize = Eigen::Dynamic, + int kFBlockSize = Eigen::Dynamic> +class SchurEliminatorForOneFBlock : public SchurEliminatorBase { + public: + virtual ~SchurEliminatorForOneFBlock() {} + void Init(int num_eliminate_blocks, + bool assume_full_rank_ete, + const CompressedRowBlockStructure* bs) override { + CHECK_GT(num_eliminate_blocks, 0) + << "SchurComplementSolver cannot be initialized with " + << "num_eliminate_blocks = 0."; + CHECK_EQ(bs->cols.size() - num_eliminate_blocks, 1); + + num_eliminate_blocks_ = num_eliminate_blocks; + const int num_row_blocks = bs->rows.size(); + chunks_.clear(); + int r = 0; + // Iterate over the row blocks of A, and detect the chunks. The + // matrix should already have been ordered so that all rows + // containing the same y block are vertically contiguous. + while (r < num_row_blocks) { + const int e_block_id = bs->rows[r].cells.front().block_id; + if (e_block_id >= num_eliminate_blocks_) { + break; + } + + chunks_.push_back(Chunk()); + Chunk& chunk = chunks_.back(); + chunk.num_rows = 0; + chunk.start = r; + // Add to the chunk until the first block in the row is + // different than the one in the first row for the chunk. + while (r + chunk.num_rows < num_row_blocks) { + const CompressedRow& row = bs->rows[r + chunk.num_rows]; + if (row.cells.front().block_id != e_block_id) { + break; + } + ++chunk.num_rows; + } + r += chunk.num_rows; + } + + const Chunk& last_chunk = chunks_.back(); + uneliminated_row_begins_ = last_chunk.start + last_chunk.num_rows; + e_t_e_inverse_matrices_.resize(kEBlockSize * kEBlockSize * chunks_.size()); + std::fill( + e_t_e_inverse_matrices_.begin(), e_t_e_inverse_matrices_.end(), 0.0); + } + + void Eliminate(const BlockSparseMatrix* A, + const double* b, + const double* D, + BlockRandomAccessMatrix* lhs_bram, + double* rhs_ptr) override { + // Since there is only one f-block, we can call GetCell once, and cache its + // output. + int r, c, row_stride, col_stride; + CellInfo* cell_info = + lhs_bram->GetCell(0, 0, &r, &c, &row_stride, &col_stride); + typename EigenTypes<kFBlockSize, kFBlockSize>::MatrixRef lhs( + cell_info->values, kFBlockSize, kFBlockSize); + typename EigenTypes<kFBlockSize>::VectorRef rhs(rhs_ptr, kFBlockSize); + + lhs.setZero(); + rhs.setZero(); + + const CompressedRowBlockStructure* bs = A->block_structure(); + const double* values = A->values(); + + // Add the diagonal to the schur complement. + if (D != nullptr) { + typename EigenTypes<kFBlockSize>::ConstVectorRef diag( + D + bs->cols[num_eliminate_blocks_].position, kFBlockSize); + lhs.diagonal() = diag.array().square().matrix(); + } + + Eigen::Matrix<double, kEBlockSize, kFBlockSize> tmp; + Eigen::Matrix<double, kEBlockSize, 1> tmp2; + + // The following loop works on a block matrix which looks as follows + // (number of rows can be anything): + // + // [e_1 | f_1] = [b1] + // [e_2 | f_2] = [b2] + // [e_3 | f_3] = [b3] + // [e_4 | f_4] = [b4] + // + // and computes the following + // + // e_t_e = sum_i e_i^T * e_i + // e_t_e_inverse = inverse(e_t_e) + // e_t_f = sum_i e_i^T f_i + // e_t_b = sum_i e_i^T b_i + // f_t_b = sum_i f_i^T b_i + // + // lhs += sum_i f_i^T * f_i - e_t_f^T * e_t_e_inverse * e_t_f + // rhs += f_t_b - e_t_f^T * e_t_e_inverse * e_t_b + for (int i = 0; i < chunks_.size(); ++i) { + const Chunk& chunk = chunks_[i]; + const int e_block_id = bs->rows[chunk.start].cells.front().block_id; + + // Naming covention, e_t_e = e_block.transpose() * e_block; + Eigen::Matrix<double, kEBlockSize, kEBlockSize> e_t_e; + Eigen::Matrix<double, kEBlockSize, kFBlockSize> e_t_f; + Eigen::Matrix<double, kEBlockSize, 1> e_t_b; + Eigen::Matrix<double, kFBlockSize, 1> f_t_b; + + // Add the square of the diagonal to e_t_e. + if (D != NULL) { + const typename EigenTypes<kEBlockSize>::ConstVectorRef diag( + D + bs->cols[e_block_id].position, kEBlockSize); + e_t_e = diag.array().square().matrix().asDiagonal(); + } else { + e_t_e.setZero(); + } + e_t_f.setZero(); + e_t_b.setZero(); + f_t_b.setZero(); + + for (int j = 0; j < chunk.num_rows; ++j) { + const int row_id = chunk.start + j; + const auto& row = bs->rows[row_id]; + const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef + e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize); + const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block( + b + row.block.position, kRowBlockSize); + + e_t_e.noalias() += e_block.transpose() * e_block; + e_t_b.noalias() += e_block.transpose() * b_block; + + if (row.cells.size() == 1) { + // There is no f block, so there is nothing more to do. + continue; + } + + const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef + f_block(values + row.cells[1].position, kRowBlockSize, kFBlockSize); + e_t_f.noalias() += e_block.transpose() * f_block; + lhs.noalias() += f_block.transpose() * f_block; + f_t_b.noalias() += f_block.transpose() * b_block; + } + + // BackSubstitute computes the same inverse, and this is the key workload + // there, so caching these inverses makes BackSubstitute essentially free. + typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse( + &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i], + kEBlockSize, + kEBlockSize); + + // e_t_e is a symmetric positive definite matrix, so the standard way to + // compute its inverse is via the Cholesky factorization by calling + // e_t_e.llt().solve(Identity()). However, the inverse() method even + // though it is not optimized for symmetric matrices is significantly + // faster for small fixed size (up to 4x4) matrices. + // + // https://eigen.tuxfamily.org/dox/group__TutorialLinearAlgebra.html#title3 + e_t_e_inverse.noalias() = e_t_e.inverse(); + + // The use of these two pre-allocated tmp vectors saves temporaries in the + // expressions for lhs and rhs updates below and has a significant impact + // on the performance of this method. + tmp.noalias() = e_t_e_inverse * e_t_f; + tmp2.noalias() = e_t_e_inverse * e_t_b; + + lhs.noalias() -= e_t_f.transpose() * tmp; + rhs.noalias() += f_t_b - e_t_f.transpose() * tmp2; + } + + // The rows without any e-blocks can have arbitrary size but only contain + // the f-block. + // + // lhs += f_i^T f_i + // rhs += f_i^T b_i + for (int row_id = uneliminated_row_begins_; row_id < bs->rows.size(); + ++row_id) { + const auto& row = bs->rows[row_id]; + const auto& cell = row.cells[0]; + const typename EigenTypes<Eigen::Dynamic, kFBlockSize>::ConstMatrixRef + f_block(values + cell.position, row.block.size, kFBlockSize); + const typename EigenTypes<Eigen::Dynamic>::ConstVectorRef b_block( + b + row.block.position, row.block.size); + lhs.noalias() += f_block.transpose() * f_block; + rhs.noalias() += f_block.transpose() * b_block; + } + } + + // This implementation of BackSubstitute depends on Eliminate being called + // before this. SchurComplementSolver always does this. + // + // y_i = e_t_e_inverse * sum_i e_i^T * (b_i - f_i * z); + void BackSubstitute(const BlockSparseMatrix* A, + const double* b, + const double* D, + const double* z_ptr, + double* y) override { + typename EigenTypes<kFBlockSize>::ConstVectorRef z(z_ptr, kFBlockSize); + const CompressedRowBlockStructure* bs = A->block_structure(); + const double* values = A->values(); + Eigen::Matrix<double, kEBlockSize, 1> tmp; + for (int i = 0; i < chunks_.size(); ++i) { + const Chunk& chunk = chunks_[i]; + const int e_block_id = bs->rows[chunk.start].cells.front().block_id; + tmp.setZero(); + for (int j = 0; j < chunk.num_rows; ++j) { + const int row_id = chunk.start + j; + const auto& row = bs->rows[row_id]; + const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef + e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize); + const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block( + b + row.block.position, kRowBlockSize); + + if (row.cells.size() == 1) { + // There is no f block. + tmp += e_block.transpose() * b_block; + } else { + typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef + f_block( + values + row.cells[1].position, kRowBlockSize, kFBlockSize); + tmp += e_block.transpose() * (b_block - f_block * z); + } + } + + typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse( + &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i], + kEBlockSize, + kEBlockSize); + + typename EigenTypes<kEBlockSize>::VectorRef y_block( + y + bs->cols[e_block_id].position, kEBlockSize); + y_block.noalias() = e_t_e_inverse * tmp; + } + } + + private: + struct Chunk { + int start = 0; + int num_rows = 0; + }; + + std::vector<Chunk> chunks_; + int num_eliminate_blocks_; + int uneliminated_row_begins_; + std::vector<double> e_t_e_inverse_matrices_; }; } // namespace internal
diff --git a/internal/ceres/schur_eliminator_benchmark.cc b/internal/ceres/schur_eliminator_benchmark.cc new file mode 100644 index 0000000..9dcc77f --- /dev/null +++ b/internal/ceres/schur_eliminator_benchmark.cc
@@ -0,0 +1,222 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2019 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: sameeragarwal@google.com (Sameer Agarwal) + +#include "Eigen/Dense" +#include "benchmark/benchmark.h" +#include "ceres/block_random_access_dense_matrix.h" +#include "ceres/block_sparse_matrix.h" +#include "ceres/block_structure.h" +#include "ceres/random.h" +#include "ceres/schur_eliminator.h" + +namespace ceres { +namespace internal { + +constexpr int kRowBlockSize = 2; +constexpr int kEBlockSize = 3; +constexpr int kFBlockSize = 6; + +class BenchmarkData { + public: + explicit BenchmarkData(const int num_e_blocks) { + CompressedRowBlockStructure* bs = new CompressedRowBlockStructure; + bs->cols.resize(num_e_blocks + 1); + int col_pos = 0; + for (int i = 0; i < num_e_blocks; ++i) { + bs->cols[i].position = col_pos; + bs->cols[i].size = kEBlockSize; + col_pos += kEBlockSize; + } + bs->cols.back().position = col_pos; + bs->cols.back().size = kFBlockSize; + + bs->rows.resize(2 * num_e_blocks); + int row_pos = 0; + int cell_pos = 0; + for (int i = 0; i < num_e_blocks; ++i) { + { + auto& row = bs->rows[2 * i]; + row.block.position = row_pos; + row.block.size = kRowBlockSize; + row_pos += kRowBlockSize; + auto& cells = row.cells; + cells.resize(2); + cells[0].block_id = i; + cells[0].position = cell_pos; + cell_pos += kRowBlockSize * kEBlockSize; + cells[1].block_id = num_e_blocks; + cells[1].position = cell_pos; + cell_pos += kRowBlockSize * kFBlockSize; + } + { + auto& row = bs->rows[2 * i + 1]; + row.block.position = row_pos; + row.block.size = kRowBlockSize; + row_pos += kRowBlockSize; + auto& cells = row.cells; + cells.resize(1); + cells[0].block_id = i; + cells[0].position = cell_pos; + cell_pos += kRowBlockSize * kEBlockSize; + } + } + + matrix_.reset(new BlockSparseMatrix(bs)); + double* values = matrix_->mutable_values(); + for (int i = 0; i < matrix_->num_nonzeros(); ++i) { + values[i] = RandNormal(); + } + + b_.resize(matrix_->num_rows()); + b_.setRandom(); + + std::vector<int> blocks(1, kFBlockSize); + lhs_.reset(new BlockRandomAccessDenseMatrix(blocks)); + diagonal_.resize(matrix_->num_cols()); + diagonal_.setOnes(); + rhs_.resize(kFBlockSize); + + y_.resize(num_e_blocks * kEBlockSize); + y_.setZero(); + z_.resize(kFBlockSize); + z_.setOnes(); + } + + const BlockSparseMatrix& matrix() const { return *matrix_; } + const Vector& b() const { return b_; } + const Vector& diagonal() const { return diagonal_; } + BlockRandomAccessDenseMatrix* mutable_lhs() { return lhs_.get(); } + Vector* mutable_rhs() { return &rhs_; } + Vector* mutable_y() { return &y_; } + Vector* mutable_z() { return &z_; } + + private: + std::unique_ptr<BlockSparseMatrix> matrix_; + Vector b_; + std::unique_ptr<BlockRandomAccessDenseMatrix> lhs_; + Vector rhs_; + Vector diagonal_; + Vector z_; + Vector y_; +}; + +void BM_SchurEliminatorEliminate(benchmark::State& state) { + const int num_e_blocks = state.range(0); + BenchmarkData data(num_e_blocks); + + ContextImpl context; + LinearSolver::Options linear_solver_options; + linear_solver_options.e_block_size = kEBlockSize; + linear_solver_options.row_block_size = kRowBlockSize; + linear_solver_options.f_block_size = kFBlockSize; + linear_solver_options.context = &context; + std::unique_ptr<SchurEliminatorBase> eliminator( + SchurEliminatorBase::Create(linear_solver_options)); + + eliminator->Init(num_e_blocks, true, data.matrix().block_structure()); + for (auto _ : state) { + eliminator->Eliminate(&data.matrix(), + data.b().data(), + data.diagonal().data(), + data.mutable_lhs(), + data.mutable_rhs()->data()); + } +} + +void BM_SchurEliminatorBackSubstitute(benchmark::State& state) { + const int num_e_blocks = state.range(0); + BenchmarkData data(num_e_blocks); + + ContextImpl context; + LinearSolver::Options linear_solver_options; + linear_solver_options.e_block_size = kEBlockSize; + linear_solver_options.row_block_size = kRowBlockSize; + linear_solver_options.f_block_size = kFBlockSize; + linear_solver_options.context = &context; + std::unique_ptr<SchurEliminatorBase> eliminator( + SchurEliminatorBase::Create(linear_solver_options)); + + eliminator->Init(num_e_blocks, true, data.matrix().block_structure()); + eliminator->Eliminate(&data.matrix(), + data.b().data(), + data.diagonal().data(), + data.mutable_lhs(), + data.mutable_rhs()->data()); + for (auto _ : state) { + eliminator->BackSubstitute(&data.matrix(), + data.b().data(), + data.diagonal().data(), + data.mutable_z()->data(), + data.mutable_y()->data()); + } +} + +void BM_SchurEliminatorForOneFBlockEliminate(benchmark::State& state) { + const int num_e_blocks = state.range(0); + BenchmarkData data(num_e_blocks); + SchurEliminatorForOneFBlock<2, 3, 6> eliminator; + eliminator.Init(num_e_blocks, true, data.matrix().block_structure()); + for (auto _ : state) { + eliminator.Eliminate(&data.matrix(), + data.b().data(), + data.diagonal().data(), + data.mutable_lhs(), + data.mutable_rhs()->data()); + } +} + +void BM_SchurEliminatorForOneFBlockBackSubstitute(benchmark::State& state) { + const int num_e_blocks = state.range(0); + BenchmarkData data(num_e_blocks); + SchurEliminatorForOneFBlock<2, 3, 6> eliminator; + eliminator.Init(num_e_blocks, true, data.matrix().block_structure()); + eliminator.Eliminate(&data.matrix(), + data.b().data(), + data.diagonal().data(), + data.mutable_lhs(), + data.mutable_rhs()->data()); + for (auto _ : state) { + eliminator.BackSubstitute(&data.matrix(), + data.b().data(), + data.diagonal().data(), + data.mutable_z()->data(), + data.mutable_y()->data()); + } +} + +BENCHMARK(BM_SchurEliminatorEliminate)->Range(10, 10000); +BENCHMARK(BM_SchurEliminatorForOneFBlockEliminate)->Range(10, 10000); +BENCHMARK(BM_SchurEliminatorBackSubstitute)->Range(10, 10000); +BENCHMARK(BM_SchurEliminatorForOneFBlockBackSubstitute)->Range(10, 10000); + +} // namespace internal +} // namespace ceres + +BENCHMARK_MAIN();
diff --git a/internal/ceres/schur_eliminator_test.cc b/internal/ceres/schur_eliminator_test.cc index 2e8492f..645b25a 100644 --- a/internal/ceres/schur_eliminator_test.cc +++ b/internal/ceres/schur_eliminator_test.cc
@@ -1,5 +1,5 @@ // Ceres Solver - A fast non-linear least squares minimizer -// Copyright 2015 Google Inc. All rights reserved. +// Copyright 2019 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without @@ -31,14 +31,17 @@ #include "ceres/schur_eliminator.h" #include <memory> + #include "Eigen/Dense" #include "ceres/block_random_access_dense_matrix.h" #include "ceres/block_sparse_matrix.h" +#include "ceres/block_structure.h" #include "ceres/casts.h" #include "ceres/context_impl.h" #include "ceres/detect_structure.h" #include "ceres/internal/eigen.h" #include "ceres/linear_least_squares_problems.h" +#include "ceres/random.h" #include "ceres/test_util.h" #include "ceres/triplet_sparse_matrix.h" #include "ceres/types.h" @@ -226,5 +229,138 @@ EliminateSolveAndCompare(VectorRef(D.get(), A->num_cols()), false, 1e-14); } +TEST(SchurEliminatorForOneFBlock, MatchesSchurEliminator) { + constexpr int kRowBlockSize = 2; + constexpr int kEBlockSize = 3; + constexpr int kFBlockSize = 6; + constexpr int num_e_blocks = 5; + + CompressedRowBlockStructure* bs = new CompressedRowBlockStructure; + bs->cols.resize(num_e_blocks + 1); + int col_pos = 0; + for (int i = 0; i < num_e_blocks; ++i) { + bs->cols[i].position = col_pos; + bs->cols[i].size = kEBlockSize; + col_pos += kEBlockSize; + } + bs->cols.back().position = col_pos; + bs->cols.back().size = kFBlockSize; + + bs->rows.resize(2 * num_e_blocks + 1); + int row_pos = 0; + int cell_pos = 0; + for (int i = 0; i < num_e_blocks; ++i) { + { + auto& row = bs->rows[2 * i]; + row.block.position = row_pos; + row.block.size = kRowBlockSize; + row_pos += kRowBlockSize; + auto& cells = row.cells; + cells.resize(2); + cells[0].block_id = i; + cells[0].position = cell_pos; + cell_pos += kRowBlockSize * kEBlockSize; + cells[1].block_id = num_e_blocks; + cells[1].position = cell_pos; + cell_pos += kRowBlockSize * kFBlockSize; + } + { + auto& row = bs->rows[2 * i + 1]; + row.block.position = row_pos; + row.block.size = kRowBlockSize; + row_pos += kRowBlockSize; + auto& cells = row.cells; + cells.resize(1); + cells[0].block_id = i; + cells[0].position = cell_pos; + cell_pos += kRowBlockSize * kEBlockSize; + } + } + + { + auto& row = bs->rows.back(); + row.block.position = row_pos; + row.block.size = kEBlockSize; + row_pos += kRowBlockSize; + auto& cells = row.cells; + cells.resize(1); + cells[0].block_id = num_e_blocks; + cells[0].position = cell_pos; + cell_pos += kEBlockSize * kEBlockSize; + } + + BlockSparseMatrix matrix(bs); + double* values = matrix.mutable_values(); + for (int i = 0; i < matrix.num_nonzeros(); ++i) { + values[i] = RandNormal(); + } + + Vector b(matrix.num_rows()); + b.setRandom(); + + Vector diagonal(matrix.num_cols()); + diagonal.setOnes(); + + std::vector<int> blocks(1, kFBlockSize); + BlockRandomAccessDenseMatrix actual_lhs(blocks); + BlockRandomAccessDenseMatrix expected_lhs(blocks); + Vector actual_rhs(kFBlockSize); + Vector expected_rhs(kFBlockSize); + + Vector f_sol(kFBlockSize); + f_sol.setRandom(); + Vector actual_e_sol(num_e_blocks * kEBlockSize); + actual_e_sol.setZero(); + Vector expected_e_sol(num_e_blocks * kEBlockSize); + expected_e_sol.setZero(); + + { + ContextImpl context; + LinearSolver::Options linear_solver_options; + linear_solver_options.e_block_size = kEBlockSize; + linear_solver_options.row_block_size = kRowBlockSize; + linear_solver_options.f_block_size = kFBlockSize; + linear_solver_options.context = &context; + std::unique_ptr<SchurEliminatorBase> eliminator( + SchurEliminatorBase::Create(linear_solver_options)); + eliminator->Init(num_e_blocks, true, matrix.block_structure()); + eliminator->Eliminate(&matrix, b.data(), diagonal.data(), &expected_lhs, + expected_rhs.data()); + eliminator->BackSubstitute(&matrix, b.data(), diagonal.data(), f_sol.data(), + actual_e_sol.data()); + } + + { + SchurEliminatorForOneFBlock<2, 3, 6> eliminator; + eliminator.Init(num_e_blocks, true, matrix.block_structure()); + eliminator.Eliminate(&matrix, b.data(), diagonal.data(), &actual_lhs, + actual_rhs.data()); + eliminator.BackSubstitute(&matrix, b.data(), diagonal.data(), f_sol.data(), + expected_e_sol.data()); + } + ConstMatrixRef actual_lhsref(actual_lhs.values(), actual_lhs.num_cols(), + actual_lhs.num_cols()); + ConstMatrixRef expected_lhsref(expected_lhs.values(), actual_lhs.num_cols(), + actual_lhs.num_cols()); + + EXPECT_NEAR((actual_lhsref - expected_lhsref).norm() / expected_lhsref.norm(), + 0.0, 1e-12) + << "expected: \n" + << expected_lhsref << "\nactual: \n" + << actual_lhsref; + + EXPECT_NEAR((actual_rhs - expected_rhs).norm() / expected_rhs.norm(), 0.0, + 1e-12) + << "expected: \n" + << expected_rhs << "\nactual: \n" + << actual_rhs; + + EXPECT_NEAR((actual_e_sol - expected_e_sol).norm() / expected_e_sol.norm(), + 0.0, 1e-12) + << "expected: \n" + << expected_e_sol << "\nactual: \n" + << actual_e_sol; +} + } // namespace internal } // namespace ceres