blob: 3832fe63e50444319c401b11f281a74035fb0e4c [file] [log] [blame]
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
#ifndef CERES_INTERNAL_SCHUR_ELIMINATOR_H_
#define CERES_INTERNAL_SCHUR_ELIMINATOR_H_
#include <map>
#include <memory>
#include <mutex>
#include <vector>
#include "Eigen/Dense"
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/internal/config.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"
namespace ceres::internal {
// Classes implementing the SchurEliminatorBase interface implement
// variable elimination for linear least squares problems. Assuming
// that the input linear system Ax = b can be partitioned into
//
// E y + F z = b
//
// Where x = [y;z] is a partition of the variables. The partitioning
// of the variables is such that, E'E is a block diagonal matrix. Or
// in other words, the parameter blocks in E form an independent set
// of the graph implied by the block matrix A'A. Then, this class
// provides the functionality to compute the Schur complement system
//
// S z = r
//
// where
//
// S = F'F - F'E (E'E)^{-1} E'F and r = F'b - F'E(E'E)^(-1) E'b
//
// This is the Eliminate operation, i.e., construct the linear system
// obtained by eliminating the variables in E.
//
// The eliminator also provides the reverse functionality, i.e. given
// values for z it can back substitute for the values of y, by solving the
// linear system
//
// Ey = b - F z
//
// which is done by observing that
//
// y = (E'E)^(-1) [E'b - E'F z]
//
// The eliminator has a number of requirements.
//
// The rows of A are ordered so that for every variable block in y,
// all the rows containing that variable block occur as a vertically
// contiguous block. i.e the matrix A looks like
//
//                E                     F               chunk
//  A = [ y1   0   0   0 |  z1   0   0   0  z5 ]   1
//      [ y1   0   0   0 |  z1  z2   0   0   0 ]   1
//      [  0  y2   0   0 |   0   0  z3   0   0 ]   2
//      [  0   0  y3   0 |  z1  z2  z3  z4  z5 ]   3
//      [  0   0  y3   0 |  z1   0   0   0  z5 ]   3
//      [  0   0   0  y4 |   0   0   0   0  z5 ]   4
//      [  0   0   0  y4 |   0  z2   0   0   0 ]   4
//      [  0   0   0  y4 |   0   0   0   0   0 ]   4
//      [  0   0   0   0 |  z1   0   0   0   0 ]   non chunk blocks
//      [  0   0   0   0 |   0   0  z3  z4  z5 ]   non chunk blocks
//
// This structure should be reflected in the corresponding
// CompressedRowBlockStructure object associated with A. The linear
// system Ax = b should either be well posed or the array D below
// should be non-null and the diagonal matrix corresponding to it
// should be non-singular. For simplicity of exposition only the case
// with a null D is described.
//
// The usual way to do the elimination is as follows. Starting with
//
// E y + F z = b
//
// we can form the normal equations,
//
// E'E y + E'F z = E'b
// F'E y + F'F z = F'b
//
// multiplying both sides of the first equation by (E'E)^(-1) and then
// by F'E we get
//
// F'E y + F'E (E'E)^(-1) E'F z = F'E (E'E)^(-1) E'b
// F'E y + F'F z = F'b
//
// now subtracting the two equations we get
//
// [F'F - F'E (E'E)^(-1) E'F] z = F'b - F'E(E'E)^(-1) E'b
//
// Instead of forming the normal equations and operating on them as
// general sparse matrices, the algorithm here deals with one
// parameter block in y at a time. The rows corresponding to a single
// parameter block yi are known as a chunk, and the algorithm operates
// on one chunk at a time. The mathematics remains the same since the
// reduced linear system can be shown to be the sum of the reduced
// linear systems for each chunk. This can be seen by observing two
// things.
//
// 1. E'E is a block diagonal matrix.
//
// 2. When E'F is computed, only the terms within a single chunk
// interact, i.e for y1 column blocks when transposed and multiplied
// with F, the only non-zero contribution comes from the blocks in
// chunk1.
//
// Thus, the reduced linear system
//
// F'F - F'E (E'E)^(-1) E'F
//
// can be re-written as
//
// sum_k F_k'F_k - F_k'E_k (E_k'E_k)^(-1) E_k' F_k
//
// Where the sum is over chunks and E_k'E_k is a dense matrix of size y_k
// x y_k.
//
// Advanced usage. Until now it has been assumed that the user would
// be interested in all of the Schur Complement S. However, it is also
// possible to use this eliminator to obtain an arbitrary submatrix of
// the full Schur complement. When the eliminator is generating the
// blocks of S, it asks the BlockRandomAccessMatrix instance passed to
// it if it has storage for that block. If it does, the eliminator
// computes/updates it, if not it is skipped. This is useful when one
// is interested in constructing a preconditioner based on the Schur
// Complement, e.g., computing the block diagonal of S so that it can
// be used as a preconditioner for an Iterative Substructuring based
// solver [See Agarwal et al, Bundle Adjustment in the Large, ECCV
// 2008 for an example of such use].
//
// Example usage: Please see schur_complement_solver.cc
// Abstract interface for Schur complement based variable elimination.
// Init() receives the block structure of A, Eliminate() builds the reduced
// system S z = r, and BackSubstitute() recovers y from a solution z.
class CERES_NO_EXPORT SchurEliminatorBase {
public:
virtual ~SchurEliminatorBase();
// Initialize the eliminator. It is the user's responsibility to call
// this function before calling Eliminate or BackSubstitute. It is
// also the caller's responsibility to ensure that the
// CompressedRowBlockStructure object passed to this method is the
// same one (or is equivalent to) the one associated with the
// BlockSparseMatrix objects below.
//
// num_eliminate_blocks is the number of leading column blocks of A
// that form E, i.e. the blocks that are eliminated.
//
// assume_full_rank_ete controls how the eliminator inverts the
// diagonal blocks corresponding to e blocks in A'A. If
// assume_full_rank_ete is true, then a Cholesky factorization is
// used to compute the inverse, otherwise a singular value
// decomposition is used to compute the pseudo inverse.
//
// bs is the block structure of A; its rows must be ordered into
// chunks as described in the comment at the top of this file.
virtual void Init(int num_eliminate_blocks,
bool assume_full_rank_ete,
const CompressedRowBlockStructure* bs) = 0;
// Compute the Schur complement system from the augmented linear
// least squares problem [A;D] x = [b;0]. The left hand side and the
// right hand side of the reduced linear system are returned in lhs
// and rhs respectively.
//
// A   - the matrix, with the block structure that was passed to Init.
// b   - right hand side of the least squares problem.
// D   - diagonal of the regularizer; may be null (see the file comment).
// lhs - output, receives (a subset of the blocks of) S.
// rhs - output, receives r.
//
// It is the caller's responsibility to construct and initialize
// lhs. Depending upon the structure of the lhs object passed here,
// the full or a submatrix of the Schur complement will be computed.
//
// Since the Schur complement is a symmetric matrix, only the upper
// triangular part of the Schur complement is computed.
virtual void Eliminate(const BlockSparseMatrixData& A,
const double* b,
const double* D,
BlockRandomAccessMatrix* lhs,
double* rhs) = 0;
// Given values for the variables z in the F block of A, solve for
// the optimal values of the variables y corresponding to the E
// block in A.
//
// A, b and D have the same meaning as in Eliminate. z is the input
// solution of the reduced system and y is the output array that
// receives the eliminated variables.
virtual void BackSubstitute(const BlockSparseMatrixData& A,
const double* b,
const double* D,
const double* z,
double* y) = 0;
// Factory. Returns an eliminator for the given linear solver options.
// The selection logic is defined out of line and is not visible in this
// header.
static std::unique_ptr<SchurEliminatorBase> Create(
const LinearSolver::Options& options);
};
// Templated implementation of the SchurEliminatorBase interface. The
// templating is on the sizes of the row, e and f blocks sizes in the
// input matrix. In many problems, the sizes of one or more of these
// blocks are constant, in that case, its worth passing these
// parameters as template arguments so that they are visible to the
// compiler and can be used for compile time optimization of the low
// level linear algebra routines.
template <int kRowBlockSize = Eigen::Dynamic,
int kEBlockSize = Eigen::Dynamic,
int kFBlockSize = Eigen::Dynamic>
class CERES_NO_EXPORT SchurEliminator final : public SchurEliminatorBase {
public:
// Stores the thread count and the context taken from the solver options.
// options.context must be non-null; this is enforced with CHECK. All other
// members are left untouched by this constructor.
explicit SchurEliminator(const LinearSolver::Options& options)
: num_threads_(options.num_threads), context_(options.context) {
CHECK(context_ != nullptr);
}
// SchurEliminatorBase Interface
// The destructor and the three overrides below are only declared in this
// class body; see SchurEliminatorBase for their contracts.
~SchurEliminator() override;
void Init(int num_eliminate_blocks,
bool assume_full_rank_ete,
const CompressedRowBlockStructure* bs) final;
void Eliminate(const BlockSparseMatrixData& A,
const double* b,
const double* D,
BlockRandomAccessMatrix* lhs,
double* rhs) final;
void BackSubstitute(const BlockSparseMatrixData& A,
const double* b,
const double* D,
const double* z,
double* y) final;
private:
// Chunk objects store combinatorial information needed to
// efficiently eliminate a whole chunk out of the least squares
// problem. Consider the first chunk in the example matrix above.
//
// [ y1 0 0 0 | z1 0 0 0 z5]
// [ y1 0 0 0 | z1 z2 0 0 0]
//
// One of the intermediate quantities that needs to be calculated is
// for each row the product of the y block transposed with the
// non-zero z block, and the sum of these blocks across rows. A
// temporary array "buffer_" is used for computing and storing them
// and the buffer_layout maps the indices of the z-blocks to
// position in the buffer_ array. The size of the chunk is the
// number of row blocks/residual blocks for the particular y block
// being considered.
//
// For the example chunk shown above,
//
// size = 2
//
// The entries of buffer_layout will be filled in the following order.
//
// buffer_layout[z1] = 0
// buffer_layout[z5] = y1 * z1
// buffer_layout[z2] = y1 * z1 + y1 * z5
using BufferLayoutType = std::map<int, int>;
struct Chunk {
// Creates an empty chunk (size 0) whose first row block is `start`.
explicit Chunk(int start) : size(0), start(start) {}
// Number of row blocks in the chunk.
int size;
// Index of the first row block of the chunk.
int start;
BufferLayoutType buffer_layout;
};
// The helpers below are declared here and defined out of line. The
// one-line notes on them are inferred from names and signatures only.
// NOTE(review): confirm each against the implementation.
//
// Presumably accumulates, for one chunk, the E'E diagonal block (eet), the
// gradient contribution E'b (g) and the E'F products (buffer).
void ChunkDiagonalBlockAndGradient(
const Chunk& chunk,
const BlockSparseMatrixData& A,
const double* b,
int row_block_counter,
typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* eet,
double* g,
double* buffer,
BlockRandomAccessMatrix* lhs);
// Presumably adds one chunk's contribution to rhs, given (E'E)^(-1) E'b.
void UpdateRhs(const Chunk& chunk,
const BlockSparseMatrixData& A,
const double* b,
int row_block_counter,
const double* inverse_ete_g,
double* rhs);
// Presumably subtracts F'E (E'E)^(-1) E'F for one chunk from lhs, using
// the per-thread scratch selected by thread_id.
void ChunkOuterProduct(int thread_id,
const CompressedRowBlockStructure* bs,
const Matrix& inverse_eet,
const double* buffer,
const BufferLayoutType& buffer_layout,
BlockRandomAccessMatrix* lhs);
// Presumably adds the F'F contribution of a row block that has an e-block.
void EBlockRowOuterProduct(const BlockSparseMatrixData& A,
int row_block_index,
BlockRandomAccessMatrix* lhs);
// Presumably handles the trailing row blocks that contain no e-block,
// updating both lhs and rhs.
void NoEBlockRowsUpdate(const BlockSparseMatrixData& A,
const double* b,
int row_block_counter,
BlockRandomAccessMatrix* lhs,
double* rhs);
// Presumably adds the F'F contribution of a row block without an e-block.
void NoEBlockRowOuterProduct(const BlockSparseMatrixData& A,
int row_block_index,
BlockRandomAccessMatrix* lhs);
// Copied from options.num_threads by the constructor.
int num_threads_;
// Copied from options.context by the constructor and checked to be
// non-null. NOTE(review): raw pointer, presumably non-owning - confirm.
ContextImpl* context_;
// The two members below are not set by the constructor; presumably they
// are assigned in Init() from its arguments - confirm.
int num_eliminate_blocks_;
bool assume_full_rank_ete_;
// Block layout of the columns of the reduced linear system. Since
// the f blocks can be of varying size, this vector stores the
// position of each f block in the row/col of the reduced linear
// system. Thus lhs_row_layout_[i] is the row/col position of the
// i^th f block.
std::vector<int> lhs_row_layout_;
// Combinatorial structure of the chunks in A. For more information
// see the documentation of the Chunk object above.
std::vector<Chunk> chunks_;
// TODO(sameeragarwal): The following two arrays contain per-thread
// storage. They should be refactored into a per thread struct.
// Buffer to store the products of the y and z blocks generated
// during the elimination phase. buffer_ is of size num_threads *
// buffer_size_. Each thread accesses the chunk
//
// [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ - 1]
//
std::unique_ptr<double[]> buffer_;
// Buffer to store per thread matrix matrix products used by
// ChunkOuterProduct. Like buffer_ it is of size num_threads *
// buffer_size_. Each thread accesses the chunk
//
// [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ - 1]
//
std::unique_ptr<double[]> chunk_outer_product_buffer_;
// Per-thread length of the two scratch buffers above. Not set by the
// constructor.
int buffer_size_;
// Not set by the constructor. NOTE(review): by its name, the index of the
// first row block that has no e-block - confirm against Init().
int uneliminated_row_begins_;
// Locks for the blocks in the right hand side of the reduced linear
// system.
// NOTE(review): raw std::mutex pointers; where they are allocated and
// released is not visible in this header.
std::vector<std::mutex*> rhs_locks_;
};
// SchurEliminatorForOneFBlock specializes the SchurEliminatorBase interface for
// the case where there is exactly one f-block and all three parameters
// kRowBlockSize, kEBlockSize and kFBlockSize are known at compile time. This is
// the case for some two view bundle adjustment problems which have very
// stringent latency requirements.
//
// Under these assumptions, we can simplify the more general algorithm
// implemented by SchurEliminator significantly. Two of the major
// contributors to the increased performance are:
//
// 1. Simpler loop structure and less use of dynamic memory. Almost everything
// is on the stack and benefits from aligned memory as well as fixed sized
// vectorization. We are also able to reason about temporaries and control
// their lifetimes better.
// 2. Use of inverse() over llt().solve(Identity).
template <int kRowBlockSize = Eigen::Dynamic,
          int kEBlockSize = Eigen::Dynamic,
          int kFBlockSize = Eigen::Dynamic>
class CERES_NO_EXPORT SchurEliminatorForOneFBlock final
    : public SchurEliminatorBase {
 public:
  // Scans the row blocks described by bs once and records every chunk, i.e.
  // every maximal run of consecutive row blocks whose first cell refers to
  // the same e-block. Scanning stops at the first row block whose first cell
  // is not an e-block; that row index is remembered as the start of the
  // "uneliminated" rows. Storage for one kEBlockSize x kEBlockSize inverse
  // per chunk is (re)allocated and zeroed.
  //
  // num_eliminate_blocks must be positive and bs must contain exactly one
  // column block in addition to the e-blocks; both are enforced with CHECKs.
  // Every row block is assumed to have at least one cell.
  //
  // TODO(sameeragarwal) Find out why "assume_full_rank_ete" is not used here
  void Init(int num_eliminate_blocks,
            bool /*assume_full_rank_ete*/,
            const CompressedRowBlockStructure* bs) override {
    CHECK_GT(num_eliminate_blocks, 0)
        << "SchurComplementSolver cannot be initialized with "
        << "num_eliminate_blocks = 0.";
    CHECK_EQ(bs->cols.size() - num_eliminate_blocks, 1);

    num_eliminate_blocks_ = num_eliminate_blocks;
    const int num_row_blocks = static_cast<int>(bs->rows.size());
    chunks_.clear();

    // Iterate over the row blocks of A, and detect the chunks. The
    // matrix should already have been ordered so that all rows
    // containing the same y block are vertically contiguous.
    int r = 0;
    while (r < num_row_blocks) {
      const int e_block_id = bs->rows[r].cells.front().block_id;
      if (e_block_id >= num_eliminate_blocks_) {
        break;
      }

      Chunk& chunk = chunks_.emplace_back();
      chunk.start = r;

      // Add to the chunk until the first block in the row is
      // different than the one in the first row for the chunk.
      while (r + chunk.num_rows < num_row_blocks) {
        const CompressedRow& row = bs->rows[r + chunk.num_rows];
        if (row.cells.front().block_id != e_block_id) {
          break;
        }
        ++chunk.num_rows;
      }
      r += chunk.num_rows;
    }

    // At this point r is one past the last row block of the last chunk, or 0
    // when no chunk was found. Using r directly avoids calling back() on an
    // empty chunks_ vector.
    uneliminated_row_begins_ = r;

    // One zero-initialized kEBlockSize x kEBlockSize slot per chunk.
    e_t_e_inverse_matrices_.assign(kEBlockSize * kEBlockSize * chunks_.size(),
                                   0.0);
  }

  // Computes the reduced system for the single f-block.
  //
  // A        - matrix values plus block structure; rows must be laid out as
  //            recorded by Init().
  // b        - right hand side, indexed by row.block.position.
  // D        - optional diagonal; when non-null its squares are added to the
  //            diagonals of e_t_e and of lhs.
  // lhs_bram - output; only cell (0, 0) is requested and overwritten.
  // rhs_ptr  - output; kFBlockSize entries are overwritten.
  //
  // Side effect: the inverse of every chunk's e_t_e is cached for use by
  // BackSubstitute().
  //
  // NOTE(review): the result of GetCell is not checked for null, and the
  // returned offsets/strides are ignored, i.e. the cell is assumed to be a
  // densely stored kFBlockSize x kFBlockSize block starting at
  // cell_info->values - confirm against the lhs type used by callers.
  void Eliminate(const BlockSparseMatrixData& A,
                 const double* b,
                 const double* D,
                 BlockRandomAccessMatrix* lhs_bram,
                 double* rhs_ptr) override {
    // Since there is only one f-block, we can call GetCell once, and cache its
    // output.
    int r, c, row_stride, col_stride;
    CellInfo* cell_info =
        lhs_bram->GetCell(0, 0, &r, &c, &row_stride, &col_stride);
    typename EigenTypes<kFBlockSize, kFBlockSize>::MatrixRef lhs(
        cell_info->values, kFBlockSize, kFBlockSize);
    typename EigenTypes<kFBlockSize>::VectorRef rhs(rhs_ptr, kFBlockSize);
    lhs.setZero();
    rhs.setZero();

    const CompressedRowBlockStructure* bs = A.block_structure();
    const double* values = A.values();

    // Add the diagonal to the Schur complement.
    if (D != nullptr) {
      typename EigenTypes<kFBlockSize>::ConstVectorRef diag(
          D + bs->cols[num_eliminate_blocks_].position, kFBlockSize);
      lhs.diagonal() = diag.array().square().matrix();
    }

    Eigen::Matrix<double, kEBlockSize, kFBlockSize> tmp;
    Eigen::Matrix<double, kEBlockSize, 1> tmp2;

    // The following loop works on a block matrix which looks as follows
    // (number of rows can be anything):
    //
    // [e_1 | f_1] = [b1]
    // [e_2 | f_2] = [b2]
    // [e_3 | f_3] = [b3]
    // [e_4 | f_4] = [b4]
    //
    // and computes the following
    //
    // e_t_e = sum_i e_i^T * e_i
    // e_t_e_inverse = inverse(e_t_e)
    // e_t_f = sum_i e_i^T f_i
    // e_t_b = sum_i e_i^T b_i
    // f_t_b = sum_i f_i^T b_i
    //
    // lhs += sum_i f_i^T * f_i - e_t_f^T * e_t_e_inverse * e_t_f
    // rhs += f_t_b - e_t_f^T * e_t_e_inverse * e_t_b
    const int num_chunks = static_cast<int>(chunks_.size());
    for (int i = 0; i < num_chunks; ++i) {
      const Chunk& chunk = chunks_[i];
      const int e_block_id = bs->rows[chunk.start].cells.front().block_id;

      // Naming convention, e_t_e = e_block.transpose() * e_block;
      Eigen::Matrix<double, kEBlockSize, kEBlockSize> e_t_e;
      Eigen::Matrix<double, kEBlockSize, kFBlockSize> e_t_f;
      Eigen::Matrix<double, kEBlockSize, 1> e_t_b;
      Eigen::Matrix<double, kFBlockSize, 1> f_t_b;

      // Add the square of the diagonal to e_t_e.
      if (D != nullptr) {
        const typename EigenTypes<kEBlockSize>::ConstVectorRef diag(
            D + bs->cols[e_block_id].position, kEBlockSize);
        e_t_e = diag.array().square().matrix().asDiagonal();
      } else {
        e_t_e.setZero();
      }
      e_t_f.setZero();
      e_t_b.setZero();
      f_t_b.setZero();

      for (int j = 0; j < chunk.num_rows; ++j) {
        const int row_id = chunk.start + j;
        const auto& row = bs->rows[row_id];
        // Within a chunk, cells[0] is the e-block and cells[1], when present,
        // is the f-block.
        const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
            e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
        const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
            b + row.block.position, kRowBlockSize);

        e_t_e.noalias() += e_block.transpose() * e_block;
        e_t_b.noalias() += e_block.transpose() * b_block;

        if (row.cells.size() == 1) {
          // There is no f block, so there is nothing more to do.
          continue;
        }

        const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
            f_block(values + row.cells[1].position, kRowBlockSize, kFBlockSize);
        e_t_f.noalias() += e_block.transpose() * f_block;
        lhs.noalias() += f_block.transpose() * f_block;
        f_t_b.noalias() += f_block.transpose() * b_block;
      }

      // BackSubstitute computes the same inverse, and this is the key workload
      // there, so caching these inverses makes BackSubstitute essentially free.
      typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
          &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
          kEBlockSize,
          kEBlockSize);

      // e_t_e is a symmetric positive definite matrix, so the standard way to
      // compute its inverse is via the Cholesky factorization by calling
      // e_t_e.llt().solve(Identity()). However, the inverse() method even
      // though it is not optimized for symmetric matrices is significantly
      // faster for small fixed size (up to 4x4) matrices.
      //
      // https://eigen.tuxfamily.org/dox/group__TutorialLinearAlgebra.html#title3
      e_t_e_inverse.noalias() = e_t_e.inverse();

      // The use of these two pre-allocated tmp vectors saves temporaries in the
      // expressions for lhs and rhs updates below and has a significant impact
      // on the performance of this method.
      tmp.noalias() = e_t_e_inverse * e_t_f;
      tmp2.noalias() = e_t_e_inverse * e_t_b;

      lhs.noalias() -= e_t_f.transpose() * tmp;
      rhs.noalias() += f_t_b - e_t_f.transpose() * tmp2;
    }

    // The rows without any e-blocks can have arbitrary size but only contain
    // the f-block.
    //
    // lhs += f_i^T f_i
    // rhs += f_i^T b_i
    const int num_row_blocks = static_cast<int>(bs->rows.size());
    for (int row_id = uneliminated_row_begins_; row_id < num_row_blocks;
         ++row_id) {
      const auto& row = bs->rows[row_id];
      const auto& cell = row.cells[0];
      const typename EigenTypes<Eigen::Dynamic, kFBlockSize>::ConstMatrixRef
          f_block(values + cell.position, row.block.size, kFBlockSize);
      const typename EigenTypes<Eigen::Dynamic>::ConstVectorRef b_block(
          b + row.block.position, row.block.size);
      lhs.noalias() += f_block.transpose() * f_block;
      rhs.noalias() += f_block.transpose() * b_block;
    }
  }

  // This implementation of BackSubstitute depends on Eliminate being called
  // before this. SchurComplementSolver always does this.
  //
  // y_i = e_t_e_inverse * sum_i e_i^T * (b_i - f_i * z);
  //
  // A, b  - same matrix and right hand side that were given to Eliminate().
  // D     - unused; its effect is already contained in the cached inverses.
  // z_ptr - kFBlockSize values for the single f-block.
  // y     - output; for every chunk the kEBlockSize entries at the position
  //         of its e-block column are overwritten.
  void BackSubstitute(const BlockSparseMatrixData& A,
                      const double* b,
                      const double* /*D*/,
                      const double* z_ptr,
                      double* y) override {
    typename EigenTypes<kFBlockSize>::ConstVectorRef z(z_ptr, kFBlockSize);
    const CompressedRowBlockStructure* bs = A.block_structure();
    const double* values = A.values();
    Eigen::Matrix<double, kEBlockSize, 1> tmp;

    const int num_chunks = static_cast<int>(chunks_.size());
    for (int i = 0; i < num_chunks; ++i) {
      const Chunk& chunk = chunks_[i];
      const int e_block_id = bs->rows[chunk.start].cells.front().block_id;

      // tmp accumulates sum_j e_j^T * (b_j - f_j * z) over the chunk.
      tmp.setZero();
      for (int j = 0; j < chunk.num_rows; ++j) {
        const int row_id = chunk.start + j;
        const auto& row = bs->rows[row_id];
        const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
            e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
        const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
            b + row.block.position, kRowBlockSize);

        if (row.cells.size() == 1) {
          // There is no f block.
          tmp += e_block.transpose() * b_block;
        } else {
          typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
              f_block(
                  values + row.cells[1].position, kRowBlockSize, kFBlockSize);
          tmp += e_block.transpose() * (b_block - f_block * z);
        }
      }

      // Inverse cached by Eliminate() for this chunk.
      typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
          &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
          kEBlockSize,
          kEBlockSize);

      typename EigenTypes<kEBlockSize>::VectorRef y_block(
          y + bs->cols[e_block_id].position, kEBlockSize);
      y_block.noalias() = e_t_e_inverse * tmp;
    }
  }

 private:
  // A run of consecutive row blocks that share the same e-block.
  struct Chunk {
    // Index of the first row block of the chunk.
    int start = 0;
    // Number of row blocks in the chunk.
    int num_rows = 0;
  };

  // Chunks detected by Init(), in row order.
  std::vector<Chunk> chunks_;
  // Number of e-blocks, as passed to Init().
  int num_eliminate_blocks_ = 0;
  // Index of the first row block that does not belong to any chunk.
  int uneliminated_row_begins_ = 0;
  // Chunk i owns the kEBlockSize * kEBlockSize doubles starting at offset
  // kEBlockSize * kEBlockSize * i. Zeroed by Init(), filled by Eliminate(),
  // read by BackSubstitute().
  std::vector<double> e_t_e_inverse_matrices_;
};
} // namespace ceres::internal
#include "ceres/internal/reenable_warnings.h"
#endif // CERES_INTERNAL_SCHUR_ELIMINATOR_H_