Add a specialized schur eliminator.
Add a new specialized Schur Eliminator for the case where there is
exactly one F block and all dimensions are known at compile time.
Benchmark on my macbook shoes ~3x speedup for Eliminate and 4x speedup for BackSubstitute.
Similar speedups are observed on other platforms.
------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
------------------------------------------------------------------------------------------
BM_SchurEliminatorEliminate/10 4176 ns 4132 ns 153522
BM_SchurEliminatorEliminate/64 24392 ns 24234 ns 29993
BM_SchurEliminatorEliminate/512 169632 ns 168709 ns 3872
BM_SchurEliminatorEliminate/4096 1296407 ns 1292873 ns 519
BM_SchurEliminatorEliminate/10000 3159652 ns 3149532 ns 216
BM_SchurEliminatorForOneFBlockEliminate/10 1187 ns 1184 ns 603953
BM_SchurEliminatorForOneFBlockEliminate/64 7917 ns 7898 ns 87558
BM_SchurEliminatorForOneFBlockEliminate/512 57629 ns 57485 ns 12035
BM_SchurEliminatorForOneFBlockEliminate/4096 484680 ns 480757 ns 1459
BM_SchurEliminatorForOneFBlockEliminate/10000 1157237 ns 1150735 ns 566
BM_SchurEliminatorBackSubstitute/10 681 ns 679 ns 1070500
BM_SchurEliminatorBackSubstitute/64 3899 ns 3887 ns 182883
BM_SchurEliminatorBackSubstitute/512 33632 ns 33467 ns 22582
BM_SchurEliminatorBackSubstitute/4096 241117 ns 240257 ns 2884
BM_SchurEliminatorBackSubstitute/10000 631869 ns 626992 ns 1146
BM_SchurEliminatorForOneFBlockBackSubstitute/10 139 ns 138 ns 5021341
BM_SchurEliminatorForOneFBlockBackSubstitute/64 898 ns 892 ns 780396
BM_SchurEliminatorForOneFBlockBackSubstitute/512 7344 ns 7286 ns 87059
BM_SchurEliminatorForOneFBlockBackSubstitute/4096 57634 ns 57345 ns 11642
BM_SchurEliminatorForOneFBlockBackSubstitute/10000 153129 ns 152320 ns 4949
Change-Id: Iae64e96c91beb44b807df0e0572d202df7cdaed8
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 22b5d07..e452f48 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -514,4 +514,7 @@
add_executable(invert_psd_matrix_benchmark invert_psd_matrix_benchmark.cc)
add_dependencies_to_benchmark(invert_psd_matrix_benchmark)
+
+ add_executable(schur_eliminator_benchmark schur_eliminator_benchmark.cc)
+ add_dependencies_to_benchmark(schur_eliminator_benchmark)
endif (BUILD_BENCHMARKS)
diff --git a/internal/ceres/schur_eliminator.h b/internal/ceres/schur_eliminator.h
index 52e3535..7dfc160 100644
--- a/internal/ceres/schur_eliminator.h
+++ b/internal/ceres/schur_eliminator.h
@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2015 Google Inc. All rights reserved.
+// Copyright 2019 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -36,10 +36,12 @@
#include <mutex>
#include <vector>
+#include "Eigen/Dense"
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/internal/eigen.h"
+#include "ceres/internal/port.h"
#include "ceres/linear_solver.h"
namespace ceres {
@@ -220,14 +222,13 @@
// level linear algebra routines.
template <int kRowBlockSize = Eigen::Dynamic,
int kEBlockSize = Eigen::Dynamic,
- int kFBlockSize = Eigen::Dynamic >
+ int kFBlockSize = Eigen::Dynamic>
class SchurEliminator : public SchurEliminatorBase {
public:
explicit SchurEliminator(const LinearSolver::Options& options)
- : num_threads_(options.num_threads),
- context_(options.context) {
+ : num_threads_(options.num_threads), context_(options.context) {
CHECK(context_ != nullptr);
-}
+ }
// SchurEliminatorBase Interface
virtual ~SchurEliminator();
@@ -306,12 +307,11 @@
int row_block_index,
BlockRandomAccessMatrix* lhs);
-
void NoEBlockRowsUpdate(const BlockSparseMatrix* A,
- const double* b,
- int row_block_counter,
- BlockRandomAccessMatrix* lhs,
- double* rhs);
+ const double* b,
+ int row_block_counter,
+ BlockRandomAccessMatrix* lhs,
+ double* rhs);
void NoEBlockRowOuterProduct(const BlockSparseMatrix* A,
int row_block_index,
@@ -357,7 +357,268 @@
// Locks for the blocks in the right hand side of the reduced linear
// system.
- std::vector<std::mutex*> rhs_locks_;
+ std::vector<std::mutex*> rhs_locks_;
+};
+
+// SchurEliminatorForOneFBlock specializes the SchurEliminatorBase interface for
+// the case where there is exactly one f-block and all three parameters
+// kRowBlockSize, kEBlockSize and KFBlockSize are known at compile time. This is
+// the case for some two view bundle adjustment problems which have very
+// stringent latency requirements.
+//
+// Under these assumptions, we can simplify the more general algorithm
+// implemented by SchurEliminatorImpl significantly. Two of the major
+// contributors to the increased performance are:
+//
+// 1. Simpler loop structure and less use of dynamic memory. Almost everything
+// is on the stack and benefits from aligned memory as well as fixed sized
+// vectorization. We are also able to reason about temporaries and control
+// their lifetimes better.
+// 2. Use of inverse() over llt().solve(Identity).
+template <int kRowBlockSize = Eigen::Dynamic,
+ int kEBlockSize = Eigen::Dynamic,
+ int kFBlockSize = Eigen::Dynamic>
+class SchurEliminatorForOneFBlock : public SchurEliminatorBase {
+ public:
+ virtual ~SchurEliminatorForOneFBlock() {}
+ void Init(int num_eliminate_blocks,
+ bool assume_full_rank_ete,
+ const CompressedRowBlockStructure* bs) override {
+ CHECK_GT(num_eliminate_blocks, 0)
+ << "SchurComplementSolver cannot be initialized with "
+ << "num_eliminate_blocks = 0.";
+ CHECK_EQ(bs->cols.size() - num_eliminate_blocks, 1);
+
+ num_eliminate_blocks_ = num_eliminate_blocks;
+ const int num_row_blocks = bs->rows.size();
+ chunks_.clear();
+ int r = 0;
+ // Iterate over the row blocks of A, and detect the chunks. The
+ // matrix should already have been ordered so that all rows
+ // containing the same y block are vertically contiguous.
+ while (r < num_row_blocks) {
+ const int e_block_id = bs->rows[r].cells.front().block_id;
+ if (e_block_id >= num_eliminate_blocks_) {
+ break;
+ }
+
+ chunks_.push_back(Chunk());
+ Chunk& chunk = chunks_.back();
+ chunk.num_rows = 0;
+ chunk.start = r;
+ // Add to the chunk until the first block in the row is
+ // different than the one in the first row for the chunk.
+ while (r + chunk.num_rows < num_row_blocks) {
+ const CompressedRow& row = bs->rows[r + chunk.num_rows];
+ if (row.cells.front().block_id != e_block_id) {
+ break;
+ }
+ ++chunk.num_rows;
+ }
+ r += chunk.num_rows;
+ }
+
+ const Chunk& last_chunk = chunks_.back();
+ uneliminated_row_begins_ = last_chunk.start + last_chunk.num_rows;
+ e_t_e_inverse_matrices_.resize(kEBlockSize * kEBlockSize * chunks_.size());
+ std::fill(
+ e_t_e_inverse_matrices_.begin(), e_t_e_inverse_matrices_.end(), 0.0);
+ }
+
+ void Eliminate(const BlockSparseMatrix* A,
+ const double* b,
+ const double* D,
+ BlockRandomAccessMatrix* lhs_bram,
+ double* rhs_ptr) override {
+ // Since there is only one f-block, we can call GetCell once, and cache its
+ // output.
+ int r, c, row_stride, col_stride;
+ CellInfo* cell_info =
+ lhs_bram->GetCell(0, 0, &r, &c, &row_stride, &col_stride);
+ typename EigenTypes<kFBlockSize, kFBlockSize>::MatrixRef lhs(
+ cell_info->values, kFBlockSize, kFBlockSize);
+ typename EigenTypes<kFBlockSize>::VectorRef rhs(rhs_ptr, kFBlockSize);
+
+ lhs.setZero();
+ rhs.setZero();
+
+ const CompressedRowBlockStructure* bs = A->block_structure();
+ const double* values = A->values();
+
+ // Add the diagonal to the schur complement.
+ if (D != nullptr) {
+ typename EigenTypes<kFBlockSize>::ConstVectorRef diag(
+ D + bs->cols[num_eliminate_blocks_].position, kFBlockSize);
+ lhs.diagonal() = diag.array().square().matrix();
+ }
+
+ Eigen::Matrix<double, kEBlockSize, kFBlockSize> tmp;
+ Eigen::Matrix<double, kEBlockSize, 1> tmp2;
+
+ // The following loop works on a block matrix which looks as follows
+ // (number of rows can be anything):
+ //
+ // [e_1 | f_1] = [b1]
+ // [e_2 | f_2] = [b2]
+ // [e_3 | f_3] = [b3]
+ // [e_4 | f_4] = [b4]
+ //
+ // and computes the following
+ //
+ // e_t_e = sum_i e_i^T * e_i
+ // e_t_e_inverse = inverse(e_t_e)
+ // e_t_f = sum_i e_i^T f_i
+ // e_t_b = sum_i e_i^T b_i
+ // f_t_b = sum_i f_i^T b_i
+ //
+ // lhs += sum_i f_i^T * f_i - e_t_f^T * e_t_e_inverse * e_t_f
+ // rhs += f_t_b - e_t_f^T * e_t_e_inverse * e_t_b
+ for (int i = 0; i < chunks_.size(); ++i) {
+ const Chunk& chunk = chunks_[i];
+ const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+
+ // Naming covention, e_t_e = e_block.transpose() * e_block;
+ Eigen::Matrix<double, kEBlockSize, kEBlockSize> e_t_e;
+ Eigen::Matrix<double, kEBlockSize, kFBlockSize> e_t_f;
+ Eigen::Matrix<double, kEBlockSize, 1> e_t_b;
+ Eigen::Matrix<double, kFBlockSize, 1> f_t_b;
+
+ // Add the square of the diagonal to e_t_e.
+ if (D != NULL) {
+ const typename EigenTypes<kEBlockSize>::ConstVectorRef diag(
+ D + bs->cols[e_block_id].position, kEBlockSize);
+ e_t_e = diag.array().square().matrix().asDiagonal();
+ } else {
+ e_t_e.setZero();
+ }
+ e_t_f.setZero();
+ e_t_b.setZero();
+ f_t_b.setZero();
+
+ for (int j = 0; j < chunk.num_rows; ++j) {
+ const int row_id = chunk.start + j;
+ const auto& row = bs->rows[row_id];
+ const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
+ e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
+ const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
+ b + row.block.position, kRowBlockSize);
+
+ e_t_e.noalias() += e_block.transpose() * e_block;
+ e_t_b.noalias() += e_block.transpose() * b_block;
+
+ if (row.cells.size() == 1) {
+ // There is no f block, so there is nothing more to do.
+ continue;
+ }
+
+ const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
+ f_block(values + row.cells[1].position, kRowBlockSize, kFBlockSize);
+ e_t_f.noalias() += e_block.transpose() * f_block;
+ lhs.noalias() += f_block.transpose() * f_block;
+ f_t_b.noalias() += f_block.transpose() * b_block;
+ }
+
+ // BackSubstitute computes the same inverse, and this is the key workload
+ // there, so caching these inverses makes BackSubstitute essentially free.
+ typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
+ &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
+ kEBlockSize,
+ kEBlockSize);
+
+ // e_t_e is a symmetric positive definite matrix, so the standard way to
+ // compute its inverse is via the Cholesky factorization by calling
+ // e_t_e.llt().solve(Identity()). However, the inverse() method even
+ // though it is not optimized for symmetric matrices is significantly
+ // faster for small fixed size (up to 4x4) matrices.
+ //
+ // https://eigen.tuxfamily.org/dox/group__TutorialLinearAlgebra.html#title3
+ e_t_e_inverse.noalias() = e_t_e.inverse();
+
+ // The use of these two pre-allocated tmp vectors saves temporaries in the
+ // expressions for lhs and rhs updates below and has a significant impact
+ // on the performance of this method.
+ tmp.noalias() = e_t_e_inverse * e_t_f;
+ tmp2.noalias() = e_t_e_inverse * e_t_b;
+
+ lhs.noalias() -= e_t_f.transpose() * tmp;
+ rhs.noalias() += f_t_b - e_t_f.transpose() * tmp2;
+ }
+
+ // The rows without any e-blocks can have arbitrary size but only contain
+ // the f-block.
+ //
+ // lhs += f_i^T f_i
+ // rhs += f_i^T b_i
+ for (int row_id = uneliminated_row_begins_; row_id < bs->rows.size();
+ ++row_id) {
+ const auto& row = bs->rows[row_id];
+ const auto& cell = row.cells[0];
+ const typename EigenTypes<Eigen::Dynamic, kFBlockSize>::ConstMatrixRef
+ f_block(values + cell.position, row.block.size, kFBlockSize);
+ const typename EigenTypes<Eigen::Dynamic>::ConstVectorRef b_block(
+ b + row.block.position, row.block.size);
+ lhs.noalias() += f_block.transpose() * f_block;
+ rhs.noalias() += f_block.transpose() * b_block;
+ }
+ }
+
+ // This implementation of BackSubstitute depends on Eliminate being called
+ // before this. SchurComplementSolver always does this.
+ //
+ // y_i = e_t_e_inverse * sum_i e_i^T * (b_i - f_i * z);
+ void BackSubstitute(const BlockSparseMatrix* A,
+ const double* b,
+ const double* D,
+ const double* z_ptr,
+ double* y) override {
+ typename EigenTypes<kFBlockSize>::ConstVectorRef z(z_ptr, kFBlockSize);
+ const CompressedRowBlockStructure* bs = A->block_structure();
+ const double* values = A->values();
+ Eigen::Matrix<double, kEBlockSize, 1> tmp;
+ for (int i = 0; i < chunks_.size(); ++i) {
+ const Chunk& chunk = chunks_[i];
+ const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+ tmp.setZero();
+ for (int j = 0; j < chunk.num_rows; ++j) {
+ const int row_id = chunk.start + j;
+ const auto& row = bs->rows[row_id];
+ const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
+ e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
+ const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
+ b + row.block.position, kRowBlockSize);
+
+ if (row.cells.size() == 1) {
+ // There is no f block.
+ tmp += e_block.transpose() * b_block;
+ } else {
+ typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
+ f_block(
+ values + row.cells[1].position, kRowBlockSize, kFBlockSize);
+ tmp += e_block.transpose() * (b_block - f_block * z);
+ }
+ }
+
+ typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
+ &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
+ kEBlockSize,
+ kEBlockSize);
+
+ typename EigenTypes<kEBlockSize>::VectorRef y_block(
+ y + bs->cols[e_block_id].position, kEBlockSize);
+ y_block.noalias() = e_t_e_inverse * tmp;
+ }
+ }
+
+ private:
+ struct Chunk {
+ int start = 0;
+ int num_rows = 0;
+ };
+
+ std::vector<Chunk> chunks_;
+ int num_eliminate_blocks_;
+ int uneliminated_row_begins_;
+ std::vector<double> e_t_e_inverse_matrices_;
};
} // namespace internal
diff --git a/internal/ceres/schur_eliminator_benchmark.cc b/internal/ceres/schur_eliminator_benchmark.cc
new file mode 100644
index 0000000..9dcc77f
--- /dev/null
+++ b/internal/ceres/schur_eliminator_benchmark.cc
@@ -0,0 +1,222 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2019 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: sameeragarwal@google.com (Sameer Agarwal)
+
+#include "Eigen/Dense"
+#include "benchmark/benchmark.h"
+#include "ceres/block_random_access_dense_matrix.h"
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/block_structure.h"
+#include "ceres/random.h"
+#include "ceres/schur_eliminator.h"
+
+namespace ceres {
+namespace internal {
+
+constexpr int kRowBlockSize = 2;
+constexpr int kEBlockSize = 3;
+constexpr int kFBlockSize = 6;
+
+class BenchmarkData {
+ public:
+ explicit BenchmarkData(const int num_e_blocks) {
+ CompressedRowBlockStructure* bs = new CompressedRowBlockStructure;
+ bs->cols.resize(num_e_blocks + 1);
+ int col_pos = 0;
+ for (int i = 0; i < num_e_blocks; ++i) {
+ bs->cols[i].position = col_pos;
+ bs->cols[i].size = kEBlockSize;
+ col_pos += kEBlockSize;
+ }
+ bs->cols.back().position = col_pos;
+ bs->cols.back().size = kFBlockSize;
+
+ bs->rows.resize(2 * num_e_blocks);
+ int row_pos = 0;
+ int cell_pos = 0;
+ for (int i = 0; i < num_e_blocks; ++i) {
+ {
+ auto& row = bs->rows[2 * i];
+ row.block.position = row_pos;
+ row.block.size = kRowBlockSize;
+ row_pos += kRowBlockSize;
+ auto& cells = row.cells;
+ cells.resize(2);
+ cells[0].block_id = i;
+ cells[0].position = cell_pos;
+ cell_pos += kRowBlockSize * kEBlockSize;
+ cells[1].block_id = num_e_blocks;
+ cells[1].position = cell_pos;
+ cell_pos += kRowBlockSize * kFBlockSize;
+ }
+ {
+ auto& row = bs->rows[2 * i + 1];
+ row.block.position = row_pos;
+ row.block.size = kRowBlockSize;
+ row_pos += kRowBlockSize;
+ auto& cells = row.cells;
+ cells.resize(1);
+ cells[0].block_id = i;
+ cells[0].position = cell_pos;
+ cell_pos += kRowBlockSize * kEBlockSize;
+ }
+ }
+
+ matrix_.reset(new BlockSparseMatrix(bs));
+ double* values = matrix_->mutable_values();
+ for (int i = 0; i < matrix_->num_nonzeros(); ++i) {
+ values[i] = RandNormal();
+ }
+
+ b_.resize(matrix_->num_rows());
+ b_.setRandom();
+
+ std::vector<int> blocks(1, kFBlockSize);
+ lhs_.reset(new BlockRandomAccessDenseMatrix(blocks));
+ diagonal_.resize(matrix_->num_cols());
+ diagonal_.setOnes();
+ rhs_.resize(kFBlockSize);
+
+ y_.resize(num_e_blocks * kEBlockSize);
+ y_.setZero();
+ z_.resize(kFBlockSize);
+ z_.setOnes();
+ }
+
+ const BlockSparseMatrix& matrix() const { return *matrix_; }
+ const Vector& b() const { return b_; }
+ const Vector& diagonal() const { return diagonal_; }
+ BlockRandomAccessDenseMatrix* mutable_lhs() { return lhs_.get(); }
+ Vector* mutable_rhs() { return &rhs_; }
+ Vector* mutable_y() { return &y_; }
+ Vector* mutable_z() { return &z_; }
+
+ private:
+ std::unique_ptr<BlockSparseMatrix> matrix_;
+ Vector b_;
+ std::unique_ptr<BlockRandomAccessDenseMatrix> lhs_;
+ Vector rhs_;
+ Vector diagonal_;
+ Vector z_;
+ Vector y_;
+};
+
+void BM_SchurEliminatorEliminate(benchmark::State& state) {
+ const int num_e_blocks = state.range(0);
+ BenchmarkData data(num_e_blocks);
+
+ ContextImpl context;
+ LinearSolver::Options linear_solver_options;
+ linear_solver_options.e_block_size = kEBlockSize;
+ linear_solver_options.row_block_size = kRowBlockSize;
+ linear_solver_options.f_block_size = kFBlockSize;
+ linear_solver_options.context = &context;
+ std::unique_ptr<SchurEliminatorBase> eliminator(
+ SchurEliminatorBase::Create(linear_solver_options));
+
+ eliminator->Init(num_e_blocks, true, data.matrix().block_structure());
+ for (auto _ : state) {
+ eliminator->Eliminate(&data.matrix(),
+ data.b().data(),
+ data.diagonal().data(),
+ data.mutable_lhs(),
+ data.mutable_rhs()->data());
+ }
+}
+
+void BM_SchurEliminatorBackSubstitute(benchmark::State& state) {
+ const int num_e_blocks = state.range(0);
+ BenchmarkData data(num_e_blocks);
+
+ ContextImpl context;
+ LinearSolver::Options linear_solver_options;
+ linear_solver_options.e_block_size = kEBlockSize;
+ linear_solver_options.row_block_size = kRowBlockSize;
+ linear_solver_options.f_block_size = kFBlockSize;
+ linear_solver_options.context = &context;
+ std::unique_ptr<SchurEliminatorBase> eliminator(
+ SchurEliminatorBase::Create(linear_solver_options));
+
+ eliminator->Init(num_e_blocks, true, data.matrix().block_structure());
+ eliminator->Eliminate(&data.matrix(),
+ data.b().data(),
+ data.diagonal().data(),
+ data.mutable_lhs(),
+ data.mutable_rhs()->data());
+ for (auto _ : state) {
+ eliminator->BackSubstitute(&data.matrix(),
+ data.b().data(),
+ data.diagonal().data(),
+ data.mutable_z()->data(),
+ data.mutable_y()->data());
+ }
+}
+
+void BM_SchurEliminatorForOneFBlockEliminate(benchmark::State& state) {
+ const int num_e_blocks = state.range(0);
+ BenchmarkData data(num_e_blocks);
+ SchurEliminatorForOneFBlock<2, 3, 6> eliminator;
+ eliminator.Init(num_e_blocks, true, data.matrix().block_structure());
+ for (auto _ : state) {
+ eliminator.Eliminate(&data.matrix(),
+ data.b().data(),
+ data.diagonal().data(),
+ data.mutable_lhs(),
+ data.mutable_rhs()->data());
+ }
+}
+
+void BM_SchurEliminatorForOneFBlockBackSubstitute(benchmark::State& state) {
+ const int num_e_blocks = state.range(0);
+ BenchmarkData data(num_e_blocks);
+ SchurEliminatorForOneFBlock<2, 3, 6> eliminator;
+ eliminator.Init(num_e_blocks, true, data.matrix().block_structure());
+ eliminator.Eliminate(&data.matrix(),
+ data.b().data(),
+ data.diagonal().data(),
+ data.mutable_lhs(),
+ data.mutable_rhs()->data());
+ for (auto _ : state) {
+ eliminator.BackSubstitute(&data.matrix(),
+ data.b().data(),
+ data.diagonal().data(),
+ data.mutable_z()->data(),
+ data.mutable_y()->data());
+ }
+}
+
+BENCHMARK(BM_SchurEliminatorEliminate)->Range(10, 10000);
+BENCHMARK(BM_SchurEliminatorForOneFBlockEliminate)->Range(10, 10000);
+BENCHMARK(BM_SchurEliminatorBackSubstitute)->Range(10, 10000);
+BENCHMARK(BM_SchurEliminatorForOneFBlockBackSubstitute)->Range(10, 10000);
+
+} // namespace internal
+} // namespace ceres
+
+BENCHMARK_MAIN();
diff --git a/internal/ceres/schur_eliminator_test.cc b/internal/ceres/schur_eliminator_test.cc
index 2e8492f..645b25a 100644
--- a/internal/ceres/schur_eliminator_test.cc
+++ b/internal/ceres/schur_eliminator_test.cc
@@ -1,5 +1,5 @@
// Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2015 Google Inc. All rights reserved.
+// Copyright 2019 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
@@ -31,14 +31,17 @@
#include "ceres/schur_eliminator.h"
#include <memory>
+
#include "Eigen/Dense"
#include "ceres/block_random_access_dense_matrix.h"
#include "ceres/block_sparse_matrix.h"
+#include "ceres/block_structure.h"
#include "ceres/casts.h"
#include "ceres/context_impl.h"
#include "ceres/detect_structure.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_least_squares_problems.h"
+#include "ceres/random.h"
#include "ceres/test_util.h"
#include "ceres/triplet_sparse_matrix.h"
#include "ceres/types.h"
@@ -226,5 +229,138 @@
EliminateSolveAndCompare(VectorRef(D.get(), A->num_cols()), false, 1e-14);
}
+TEST(SchurEliminatorForOneFBlock, MatchesSchurEliminator) {
+ constexpr int kRowBlockSize = 2;
+ constexpr int kEBlockSize = 3;
+ constexpr int kFBlockSize = 6;
+ constexpr int num_e_blocks = 5;
+
+ CompressedRowBlockStructure* bs = new CompressedRowBlockStructure;
+ bs->cols.resize(num_e_blocks + 1);
+ int col_pos = 0;
+ for (int i = 0; i < num_e_blocks; ++i) {
+ bs->cols[i].position = col_pos;
+ bs->cols[i].size = kEBlockSize;
+ col_pos += kEBlockSize;
+ }
+ bs->cols.back().position = col_pos;
+ bs->cols.back().size = kFBlockSize;
+
+ bs->rows.resize(2 * num_e_blocks + 1);
+ int row_pos = 0;
+ int cell_pos = 0;
+ for (int i = 0; i < num_e_blocks; ++i) {
+ {
+ auto& row = bs->rows[2 * i];
+ row.block.position = row_pos;
+ row.block.size = kRowBlockSize;
+ row_pos += kRowBlockSize;
+ auto& cells = row.cells;
+ cells.resize(2);
+ cells[0].block_id = i;
+ cells[0].position = cell_pos;
+ cell_pos += kRowBlockSize * kEBlockSize;
+ cells[1].block_id = num_e_blocks;
+ cells[1].position = cell_pos;
+ cell_pos += kRowBlockSize * kFBlockSize;
+ }
+ {
+ auto& row = bs->rows[2 * i + 1];
+ row.block.position = row_pos;
+ row.block.size = kRowBlockSize;
+ row_pos += kRowBlockSize;
+ auto& cells = row.cells;
+ cells.resize(1);
+ cells[0].block_id = i;
+ cells[0].position = cell_pos;
+ cell_pos += kRowBlockSize * kEBlockSize;
+ }
+ }
+
+ {
+ auto& row = bs->rows.back();
+ row.block.position = row_pos;
+ row.block.size = kEBlockSize;
+ row_pos += kRowBlockSize;
+ auto& cells = row.cells;
+ cells.resize(1);
+ cells[0].block_id = num_e_blocks;
+ cells[0].position = cell_pos;
+ cell_pos += kEBlockSize * kEBlockSize;
+ }
+
+ BlockSparseMatrix matrix(bs);
+ double* values = matrix.mutable_values();
+ for (int i = 0; i < matrix.num_nonzeros(); ++i) {
+ values[i] = RandNormal();
+ }
+
+ Vector b(matrix.num_rows());
+ b.setRandom();
+
+ Vector diagonal(matrix.num_cols());
+ diagonal.setOnes();
+
+ std::vector<int> blocks(1, kFBlockSize);
+ BlockRandomAccessDenseMatrix actual_lhs(blocks);
+ BlockRandomAccessDenseMatrix expected_lhs(blocks);
+ Vector actual_rhs(kFBlockSize);
+ Vector expected_rhs(kFBlockSize);
+
+ Vector f_sol(kFBlockSize);
+ f_sol.setRandom();
+ Vector actual_e_sol(num_e_blocks * kEBlockSize);
+ actual_e_sol.setZero();
+ Vector expected_e_sol(num_e_blocks * kEBlockSize);
+ expected_e_sol.setZero();
+
+ {
+ ContextImpl context;
+ LinearSolver::Options linear_solver_options;
+ linear_solver_options.e_block_size = kEBlockSize;
+ linear_solver_options.row_block_size = kRowBlockSize;
+ linear_solver_options.f_block_size = kFBlockSize;
+ linear_solver_options.context = &context;
+ std::unique_ptr<SchurEliminatorBase> eliminator(
+ SchurEliminatorBase::Create(linear_solver_options));
+ eliminator->Init(num_e_blocks, true, matrix.block_structure());
+ eliminator->Eliminate(&matrix, b.data(), diagonal.data(), &expected_lhs,
+ expected_rhs.data());
+ eliminator->BackSubstitute(&matrix, b.data(), diagonal.data(), f_sol.data(),
+ actual_e_sol.data());
+ }
+
+ {
+ SchurEliminatorForOneFBlock<2, 3, 6> eliminator;
+ eliminator.Init(num_e_blocks, true, matrix.block_structure());
+ eliminator.Eliminate(&matrix, b.data(), diagonal.data(), &actual_lhs,
+ actual_rhs.data());
+ eliminator.BackSubstitute(&matrix, b.data(), diagonal.data(), f_sol.data(),
+ expected_e_sol.data());
+ }
+ ConstMatrixRef actual_lhsref(actual_lhs.values(), actual_lhs.num_cols(),
+ actual_lhs.num_cols());
+ ConstMatrixRef expected_lhsref(expected_lhs.values(), actual_lhs.num_cols(),
+ actual_lhs.num_cols());
+
+ EXPECT_NEAR((actual_lhsref - expected_lhsref).norm() / expected_lhsref.norm(),
+ 0.0, 1e-12)
+ << "expected: \n"
+ << expected_lhsref << "\nactual: \n"
+ << actual_lhsref;
+
+ EXPECT_NEAR((actual_rhs - expected_rhs).norm() / expected_rhs.norm(), 0.0,
+ 1e-12)
+ << "expected: \n"
+ << expected_rhs << "\nactual: \n"
+ << actual_rhs;
+
+ EXPECT_NEAR((actual_e_sol - expected_e_sol).norm() / expected_e_sol.norm(),
+ 0.0, 1e-12)
+ << "expected: \n"
+ << expected_e_sol << "\nactual: \n"
+ << actual_e_sol;
+}
+
} // namespace internal
} // namespace ceres