// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)

#ifndef CERES_INTERNAL_SCHUR_ELIMINATOR_H_
#define CERES_INTERNAL_SCHUR_ELIMINATOR_H_

#include <map>
#include <memory>
#include <mutex>
#include <vector>

#include "Eigen/Dense"
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/internal/config.h"
#include "ceres/internal/disable_warnings.h"
#include "ceres/internal/eigen.h"
#include "ceres/internal/export.h"
#include "ceres/linear_solver.h"

namespace ceres::internal {

// Classes implementing the SchurEliminatorBase interface implement
// variable elimination for linear least squares problems. Assume
// that the input linear system Ax = b can be partitioned into
//
//  E y + F z = b
//
// where x = [y;z] is a partition of the variables, and the
// partitioning is such that E'E is a block diagonal matrix, or in
// other words, the parameter blocks in E form an independent set of
// the graph implied by the block matrix A'A. Then, this class
// provides the functionality to compute the Schur complement system
//
//   S z = r
//
// where
//
//   S = F'F - F'E (E'E)^(-1) E'F and r = F'b - F'E (E'E)^(-1) E'b
//
// This is the Eliminate operation, i.e., construct the linear system
// obtained by eliminating the variables in E.
//
// The eliminator also provides the reverse functionality, i.e., given
// values for z it can back substitute for the values of y, by solving the
// linear system
//
//  Ey = b - F z
//
// which is done by observing that
//
//  y = (E'E)^(-1) [E'b - E'F z]
//
// The eliminator has a number of requirements.
//
// The rows of A are ordered so that for every variable block in y,
// all the rows containing that variable block occur as a vertically
// contiguous block, i.e., the matrix A looks like
//
//              E                 F                   chunk
//  A = [ y1   0   0   0 |  z1    0    0   0    z5]     1
//      [ y1   0   0   0 |  z1   z2    0   0     0]     1
//      [  0  y2   0   0 |   0    0   z3   0     0]     2
//      [  0   0  y3   0 |  z1   z2   z3  z4    z5]     3
//      [  0   0  y3   0 |  z1    0    0   0    z5]     3
//      [  0   0   0  y4 |   0    0    0   0    z5]     4
//      [  0   0   0  y4 |   0   z2    0   0     0]     4
//      [  0   0   0  y4 |   0    0    0   0     0]     4
//      [  0   0   0   0 |  z1    0    0   0     0] non chunk blocks
//      [  0   0   0   0 |   0    0   z3  z4    z5] non chunk blocks
//
// This structure should be reflected in the corresponding
// CompressedRowBlockStructure object associated with A. The linear
// system Ax = b should either be well posed or the array D below
// should be non-null and the diagonal matrix corresponding to it
// should be non-singular. For simplicity of exposition only the case
// with a null D is described.
//
// The usual way to do the elimination is as follows. Starting with
//
//  E y + F z = b
//
// we can form the normal equations,
//
//  E'E y + E'F z = E'b
//  F'E y + F'F z = F'b
//
// multiplying both sides of the first equation by (E'E)^(-1) and then
// by F'E we get
//
//  F'E y + F'E (E'E)^(-1) E'F z =  F'E (E'E)^(-1) E'b
//  F'E y +                F'F z =  F'b
//
// now subtracting the two equations we get
//
// [F'F - F'E (E'E)^(-1) E'F] z = F'b - F'E (E'E)^(-1) E'b
//
// Instead of forming the normal equations and operating on them as
// general sparse matrices, the algorithm here deals with one
// parameter block in y at a time. The rows corresponding to a single
// parameter block yi are known as a chunk, and the algorithm operates
// on one chunk at a time. The mathematics remains the same since the
// reduced linear system can be shown to be the sum of the reduced
// linear systems for each chunk. This can be seen by observing two
// things.
//
//  1. E'E is a block diagonal matrix.
//
//  2. When E'F is computed, only the terms within a single chunk
//  interact, i.e., when the y1 column blocks are transposed and
//  multiplied with F, the only non-zero contribution comes from the
//  blocks in chunk 1.
//
// Thus, the reduced linear system
//
//  F'F - F'E (E'E)^(-1) E'F
//
// can be re-written as
//
//  sum_k F_k'F_k - F_k'E_k (E_k'E_k)^(-1) E_k' F_k
//
// where the sum is over chunks and E_k'E_k is a dense matrix of size
// y_k x y_k, i.e., the size of the k-th y block.
//
// Advanced usage. Until now it has been assumed that the user would
// be interested in all of the Schur Complement S. However, it is also
// possible to use this eliminator to obtain an arbitrary submatrix of
// the full Schur complement. When the eliminator is generating the
// blocks of S, it asks the BlockRandomAccessMatrix instance passed to
// it if it has storage for that block. If it does, the eliminator
// computes/updates it; if not, the block is skipped. This is useful
// when one is interested in constructing a preconditioner based on
// the Schur Complement, e.g., computing the block diagonal of S so
// that it can be used as a preconditioner for an Iterative
// Substructuring based solver [See Agarwal et al, Bundle Adjustment
// in the Large, ECCV 2010 for an example of such use].
//
// Example usage: Please see schur_complement_solver.cc
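//
// A rough sketch of the expected call sequence is shown below. It is
// illustrative only; the concrete BlockRandomAccessMatrix type used for lhs,
// the construction of BlockSparseMatrixData from the system matrix A, and the
// surrounding variables are assumptions. The authoritative example remains
// schur_complement_solver.cc.
//
//   std::unique_ptr<SchurEliminatorBase> eliminator =
//       SchurEliminatorBase::Create(options);  // options: LinearSolver::Options
//   eliminator->Init(num_eliminate_blocks,
//                    /*assume_full_rank_ete=*/true,
//                    A->block_structure());
//   // lhs is a BlockRandomAccessMatrix sized for the f blocks; rhs is a
//   // double array of matching size.
//   eliminator->Eliminate(BlockSparseMatrixData(*A), b, D, lhs, rhs);
//   // ... solve the reduced system lhs z = rhs with a solver of choice ...
//   eliminator->BackSubstitute(BlockSparseMatrixData(*A), b, D, z, y);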
class CERES_NO_EXPORT SchurEliminatorBase {
 public:
  virtual ~SchurEliminatorBase();

  // Initialize the eliminator. It is the user's responsibility to call
  // this function before calling Eliminate or BackSubstitute. It is
  // also the caller's responsibility to ensure that the
  // CompressedRowBlockStructure object passed to this method is the
  // same as (or equivalent to) the one associated with the
  // BlockSparseMatrix objects below.
  //
  // assume_full_rank_ete controls how the eliminator inverts the
  // diagonal blocks corresponding to the e blocks in A'A. If
  // assume_full_rank_ete is true, then a Cholesky factorization is
  // used to compute the inverse, otherwise a singular value
  // decomposition is used to compute the pseudo inverse.
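  //
  // As a loose illustration of the two inversion strategies (an Eigen
  // sketch, not the code used by the eliminator; ete stands for one of
  // the n x n diagonal blocks of E'E):
  //
  //   // assume_full_rank_ete == true: Cholesky based inverse.
  //   Matrix inverse_ete = ete.llt().solve(Matrix::Identity(n, n));
  //
  //   // assume_full_rank_ete == false: pseudo inverse via an SVD, which
  //   // tolerates rank deficient blocks.
  //   Eigen::JacobiSVD<Matrix> svd(ete,
  //                                Eigen::ComputeThinU | Eigen::ComputeThinV);
  //   Matrix pseudo_inverse_ete = svd.solve(Matrix::Identity(n, n));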
  virtual void Init(int num_eliminate_blocks,
                    bool assume_full_rank_ete,
                    const CompressedRowBlockStructure* bs) = 0;

  // Compute the Schur complement system from the augmented linear
  // least squares problem [A;D] x = [b;0]. The left hand side and the
  // right hand side of the reduced linear system are returned in lhs
  // and rhs respectively.
  //
  // It is the caller's responsibility to construct and initialize
  // lhs. Depending upon the structure of the lhs object passed here,
  // the full or a submatrix of the Schur complement will be computed.
  //
  // Since the Schur complement is a symmetric matrix, only the upper
  // triangular part of the Schur complement is computed.
  virtual void Eliminate(const BlockSparseMatrixData& A,
                         const double* b,
                         const double* D,
                         BlockRandomAccessMatrix* lhs,
                         double* rhs) = 0;

  // Given values for the variables z in the F block of A, solve for
  // the optimal values of the variables y corresponding to the E
  // block in A.
  virtual void BackSubstitute(const BlockSparseMatrixData& A,
                              const double* b,
                              const double* D,
                              const double* z,
                              double* y) = 0;

  // Factory
  static std::unique_ptr<SchurEliminatorBase> Create(
      const LinearSolver::Options& options);
};

// Templated implementation of the SchurEliminatorBase interface. The
// templating is on the sizes of the row, e and f blocks in the input
// matrix. In many problems, the sizes of one or more of these blocks
// are constant; in that case, it is worth passing these sizes as
// template arguments so that they are visible to the compiler and can
// be used for compile time optimization of the low level linear
// algebra routines.
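//
// For example, in a bundle adjustment problem with 2-dimensional image
// residuals, 3-dimensional points and 9-dimensional cameras, the eliminator
// could be instantiated as (the sizes here are illustrative, not
// prescriptive):
//
//   SchurEliminator<2, 3, 9> eliminator(options);
//
// In practice SchurEliminatorBase::Create is expected to pick such a
// specialization automatically based on the block sizes recorded in
// LinearSolver::Options, so direct instantiation is rarely needed.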
template <int kRowBlockSize = Eigen::Dynamic,
          int kEBlockSize = Eigen::Dynamic,
          int kFBlockSize = Eigen::Dynamic>
class CERES_NO_EXPORT SchurEliminator final : public SchurEliminatorBase {
 public:
  explicit SchurEliminator(const LinearSolver::Options& options)
      : num_threads_(options.num_threads), context_(options.context) {
    CHECK(context_ != nullptr);
  }

  // SchurEliminatorBase Interface
  ~SchurEliminator() override;
  void Init(int num_eliminate_blocks,
            bool assume_full_rank_ete,
            const CompressedRowBlockStructure* bs) final;
  void Eliminate(const BlockSparseMatrixData& A,
                 const double* b,
                 const double* D,
                 BlockRandomAccessMatrix* lhs,
                 double* rhs) final;
  void BackSubstitute(const BlockSparseMatrixData& A,
                      const double* b,
                      const double* D,
                      const double* z,
                      double* y) final;
 private:
  // Chunk objects store combinatorial information needed to
  // efficiently eliminate a whole chunk out of the least squares
  // problem. Consider the first chunk in the example matrix above.
  //
  //      [ y1   0   0   0 |  z1    0    0   0    z5]
  //      [ y1   0   0   0 |  z1   z2    0   0     0]
  //
  // One of the intermediate quantities that needs to be calculated is,
  // for each row, the product of the y block transposed with the
  // non-zero z blocks, and the sum of these products across rows. A
  // temporary array "buffer_" is used for computing and storing them,
  // and buffer_layout maps the indices of the z-blocks to positions in
  // the buffer_ array. The size of the chunk is the number of row
  // blocks/residual blocks for the particular y block being
  // considered.
  //
  // For the example chunk shown above,
  //
  // size = 2
  //
  // The entries of buffer_layout will be filled in the following order.
  //
  // buffer_layout[z1] = 0
  // buffer_layout[z5] = y1 * z1
  // buffer_layout[z2] = y1 * z1 + y1 * z5
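  //
  // As a concrete (hypothetical) illustration, if the y1 block has 3 columns
  // and the z1, z5 and z2 blocks have 4, 2 and 5 columns respectively, then
  //
  // buffer_layout[z1] = 0
  // buffer_layout[z5] = 3 * 4      = 12
  // buffer_layout[z2] = 12 + 3 * 2 = 18
  //
  // i.e., each entry is the offset in buffer_ at which the accumulated
  // y1' * z product for that z block begins.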
  using BufferLayoutType = std::map<int, int>;
  struct Chunk {
    explicit Chunk(int start) : size(0), start(start) {}
    int size;
    int start;
    BufferLayoutType buffer_layout;
  };

  void ChunkDiagonalBlockAndGradient(
      const Chunk& chunk,
      const BlockSparseMatrixData& A,
      const double* b,
      int row_block_counter,
      typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* eet,
      double* g,
      double* buffer,
      BlockRandomAccessMatrix* lhs);

  void UpdateRhs(const Chunk& chunk,
                 const BlockSparseMatrixData& A,
                 const double* b,
                 int row_block_counter,
                 const double* inverse_ete_g,
                 double* rhs);

  void ChunkOuterProduct(int thread_id,
                         const CompressedRowBlockStructure* bs,
                         const Matrix& inverse_eet,
                         const double* buffer,
                         const BufferLayoutType& buffer_layout,
                         BlockRandomAccessMatrix* lhs);

  void EBlockRowOuterProduct(const BlockSparseMatrixData& A,
                             int row_block_index,
                             BlockRandomAccessMatrix* lhs);

  void NoEBlockRowsUpdate(const BlockSparseMatrixData& A,
                          const double* b,
                          int row_block_counter,
                          BlockRandomAccessMatrix* lhs,
                          double* rhs);

  void NoEBlockRowOuterProduct(const BlockSparseMatrixData& A,
                               int row_block_index,
                               BlockRandomAccessMatrix* lhs);

  int num_threads_;
  ContextImpl* context_;
  int num_eliminate_blocks_;
  bool assume_full_rank_ete_;

  // Block layout of the columns of the reduced linear system. Since
  // the f blocks can be of varying size, this vector stores the
  // position of each f block in the row/col of the reduced linear
  // system. Thus lhs_row_layout_[i] is the row/col position of the
  // i^th f block.
  std::vector<int> lhs_row_layout_;

  // Combinatorial structure of the chunks in A. For more information
  // see the documentation of the Chunk object above.
  std::vector<Chunk> chunks_;

  // TODO(sameeragarwal): The following two arrays contain per-thread
  // storage. They should be refactored into a per-thread struct.

  // Buffer to store the products of the y and z blocks generated
  // during the elimination phase. buffer_ is of size num_threads *
  // buffer_size_. Each thread accesses the chunk
  //
  //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ - 1]
  //
  std::unique_ptr<double[]> buffer_;

  // Buffer to store per-thread matrix-matrix products used by
  // ChunkOuterProduct. Like buffer_ it is of size num_threads *
  // buffer_size_. Each thread accesses the chunk
  //
  //   [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ - 1]
  //
  std::unique_ptr<double[]> chunk_outer_product_buffer_;

  int buffer_size_;
  int uneliminated_row_begins_;

  // Locks for the blocks in the right hand side of the reduced linear
  // system.
  std::vector<std::mutex*> rhs_locks_;
};

// SchurEliminatorForOneFBlock specializes the SchurEliminatorBase interface
// for the case where there is exactly one f-block and all three parameters
// kRowBlockSize, kEBlockSize and kFBlockSize are known at compile time. This
// is the case for some two-view bundle adjustment problems which have very
// stringent latency requirements.
//
// Under these assumptions, we can simplify the more general algorithm
// implemented by SchurEliminator significantly. Two of the major contributors
// to the increased performance are:
//
// 1. Simpler loop structure and less use of dynamic memory. Almost everything
//    is on the stack and benefits from aligned memory as well as fixed sized
//    vectorization. We are also able to reason about temporaries and control
//    their lifetimes better.
// 2. Use of inverse() over llt().solve(Identity), as sketched below.
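//
// As a rough illustration of point 2 (Eigen calls only, not code from this
// class; m stands for a small symmetric positive definite matrix such as
// e_t_e below):
//
//   Eigen::Matrix3d inv_a = m.llt().solve(Eigen::Matrix3d::Identity());
//   Eigen::Matrix3d inv_b = m.inverse();
//
// For fixed sizes up to 4x4, Eigen's inverse() uses closed-form cofactor
// expressions, which is why it tends to be faster here even though it does
// not exploit symmetry.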
template <int kRowBlockSize = Eigen::Dynamic,
          int kEBlockSize = Eigen::Dynamic,
          int kFBlockSize = Eigen::Dynamic>
class CERES_NO_EXPORT SchurEliminatorForOneFBlock final
    : public SchurEliminatorBase {
 public:
  // TODO(sameeragarwal) Find out why "assume_full_rank_ete" is not used here
  void Init(int num_eliminate_blocks,
            bool /*assume_full_rank_ete*/,
            const CompressedRowBlockStructure* bs) override {
    CHECK_GT(num_eliminate_blocks, 0)
        << "SchurComplementSolver cannot be initialized with "
        << "num_eliminate_blocks = 0.";
    CHECK_EQ(bs->cols.size() - num_eliminate_blocks, 1);

    num_eliminate_blocks_ = num_eliminate_blocks;
    const int num_row_blocks = bs->rows.size();
    chunks_.clear();
    int r = 0;
    // Iterate over the row blocks of A, and detect the chunks. The
    // matrix should already have been ordered so that all rows
    // containing the same y block are vertically contiguous.
    while (r < num_row_blocks) {
      const int e_block_id = bs->rows[r].cells.front().block_id;
      if (e_block_id >= num_eliminate_blocks_) {
        break;
      }

      chunks_.push_back(Chunk());
      Chunk& chunk = chunks_.back();
      chunk.num_rows = 0;
      chunk.start = r;
      // Add to the chunk until the first block in the row is
      // different than the one in the first row for the chunk.
      while (r + chunk.num_rows < num_row_blocks) {
        const CompressedRow& row = bs->rows[r + chunk.num_rows];
        if (row.cells.front().block_id != e_block_id) {
          break;
        }
        ++chunk.num_rows;
      }
      r += chunk.num_rows;
    }

    const Chunk& last_chunk = chunks_.back();
    uneliminated_row_begins_ = last_chunk.start + last_chunk.num_rows;
    e_t_e_inverse_matrices_.resize(kEBlockSize * kEBlockSize * chunks_.size());
    std::fill(
        e_t_e_inverse_matrices_.begin(), e_t_e_inverse_matrices_.end(), 0.0);
  }

  void Eliminate(const BlockSparseMatrixData& A,
                 const double* b,
                 const double* D,
                 BlockRandomAccessMatrix* lhs_bram,
                 double* rhs_ptr) override {
    // Since there is only one f-block, we can call GetCell once, and cache
    // its output.
    int r, c, row_stride, col_stride;
    CellInfo* cell_info =
        lhs_bram->GetCell(0, 0, &r, &c, &row_stride, &col_stride);
    typename EigenTypes<kFBlockSize, kFBlockSize>::MatrixRef lhs(
        cell_info->values, kFBlockSize, kFBlockSize);
    typename EigenTypes<kFBlockSize>::VectorRef rhs(rhs_ptr, kFBlockSize);

    lhs.setZero();
    rhs.setZero();

    const CompressedRowBlockStructure* bs = A.block_structure();
    const double* values = A.values();

    // Add the diagonal to the Schur complement.
    if (D != nullptr) {
      typename EigenTypes<kFBlockSize>::ConstVectorRef diag(
          D + bs->cols[num_eliminate_blocks_].position, kFBlockSize);
      lhs.diagonal() = diag.array().square().matrix();
    }

    Eigen::Matrix<double, kEBlockSize, kFBlockSize> tmp;
    Eigen::Matrix<double, kEBlockSize, 1> tmp2;

    // The following loop works on a block matrix which looks as follows
    // (number of rows can be anything):
    //
    // [e_1 | f_1] = [b1]
    // [e_2 | f_2] = [b2]
    // [e_3 | f_3] = [b3]
    // [e_4 | f_4] = [b4]
    //
    // and computes the following
    //
    // e_t_e = sum_i e_i^T * e_i
    // e_t_e_inverse = inverse(e_t_e)
    // e_t_f = sum_i e_i^T f_i
    // e_t_b = sum_i e_i^T b_i
    // f_t_b = sum_i f_i^T b_i
    //
    // lhs += sum_i f_i^T * f_i - e_t_f^T * e_t_e_inverse * e_t_f
    // rhs += f_t_b - e_t_f^T * e_t_e_inverse * e_t_b
    for (int i = 0; i < chunks_.size(); ++i) {
      const Chunk& chunk = chunks_[i];
      const int e_block_id = bs->rows[chunk.start].cells.front().block_id;

      // Naming convention, e_t_e = e_block.transpose() * e_block;
      Eigen::Matrix<double, kEBlockSize, kEBlockSize> e_t_e;
      Eigen::Matrix<double, kEBlockSize, kFBlockSize> e_t_f;
      Eigen::Matrix<double, kEBlockSize, 1> e_t_b;
      Eigen::Matrix<double, kFBlockSize, 1> f_t_b;

      // Add the square of the diagonal to e_t_e.
      if (D != nullptr) {
        const typename EigenTypes<kEBlockSize>::ConstVectorRef diag(
            D + bs->cols[e_block_id].position, kEBlockSize);
        e_t_e = diag.array().square().matrix().asDiagonal();
      } else {
        e_t_e.setZero();
      }
      e_t_f.setZero();
      e_t_b.setZero();
      f_t_b.setZero();

      for (int j = 0; j < chunk.num_rows; ++j) {
        const int row_id = chunk.start + j;
        const auto& row = bs->rows[row_id];
        const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
            e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
        const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
            b + row.block.position, kRowBlockSize);

        e_t_e.noalias() += e_block.transpose() * e_block;
        e_t_b.noalias() += e_block.transpose() * b_block;

        if (row.cells.size() == 1) {
          // There is no f block, so there is nothing more to do.
          continue;
        }

        const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
            f_block(values + row.cells[1].position, kRowBlockSize, kFBlockSize);
        e_t_f.noalias() += e_block.transpose() * f_block;
        lhs.noalias() += f_block.transpose() * f_block;
        f_t_b.noalias() += f_block.transpose() * b_block;
      }

      // BackSubstitute computes the same inverse, and this is the key
      // workload there, so caching these inverses makes BackSubstitute
      // essentially free.
      typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
          &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
          kEBlockSize,
          kEBlockSize);

      // e_t_e is a symmetric positive definite matrix, so the standard way to
      // compute its inverse is via the Cholesky factorization by calling
      // e_t_e.llt().solve(Identity()). However, the inverse() method, even
      // though it is not optimized for symmetric matrices, is significantly
      // faster for small fixed size (up to 4x4) matrices.
      //
      // https://eigen.tuxfamily.org/dox/group__TutorialLinearAlgebra.html#title3
      e_t_e_inverse.noalias() = e_t_e.inverse();

      // The use of these two pre-allocated buffers avoids creating temporaries
      // in the expressions for the lhs and rhs updates below and has a
      // significant impact on the performance of this method.
      tmp.noalias() = e_t_e_inverse * e_t_f;
      tmp2.noalias() = e_t_e_inverse * e_t_b;

      lhs.noalias() -= e_t_f.transpose() * tmp;
      rhs.noalias() += f_t_b - e_t_f.transpose() * tmp2;
    }

    // The rows without any e-blocks can have arbitrary size but only contain
    // the f-block.
    //
    // lhs += f_i^T f_i
    // rhs += f_i^T b_i
    for (int row_id = uneliminated_row_begins_; row_id < bs->rows.size();
         ++row_id) {
      const auto& row = bs->rows[row_id];
      const auto& cell = row.cells[0];
      const typename EigenTypes<Eigen::Dynamic, kFBlockSize>::ConstMatrixRef
          f_block(values + cell.position, row.block.size, kFBlockSize);
      const typename EigenTypes<Eigen::Dynamic>::ConstVectorRef b_block(
          b + row.block.position, row.block.size);
      lhs.noalias() += f_block.transpose() * f_block;
      rhs.noalias() += f_block.transpose() * b_block;
    }
  }

  // This implementation of BackSubstitute depends on Eliminate being called
  // before this. SchurComplementSolver always does this.
  //
  // For each chunk i,
  //
  //   y_i = e_t_e_inverse * sum_j e_j^T * (b_j - f_j * z)
  //
  // where the sum is over the rows of chunk i.
  void BackSubstitute(const BlockSparseMatrixData& A,
                      const double* b,
                      const double* /*D*/,
                      const double* z_ptr,
                      double* y) override {
    typename EigenTypes<kFBlockSize>::ConstVectorRef z(z_ptr, kFBlockSize);
    const CompressedRowBlockStructure* bs = A.block_structure();
    const double* values = A.values();
    Eigen::Matrix<double, kEBlockSize, 1> tmp;
    for (int i = 0; i < chunks_.size(); ++i) {
      const Chunk& chunk = chunks_[i];
      const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
      tmp.setZero();
      for (int j = 0; j < chunk.num_rows; ++j) {
        const int row_id = chunk.start + j;
        const auto& row = bs->rows[row_id];
        const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
            e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
        const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
            b + row.block.position, kRowBlockSize);

        if (row.cells.size() == 1) {
          // There is no f block.
          tmp += e_block.transpose() * b_block;
        } else {
          typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
              f_block(
                  values + row.cells[1].position, kRowBlockSize, kFBlockSize);
          tmp += e_block.transpose() * (b_block - f_block * z);
        }
      }

      typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
          &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
          kEBlockSize,
          kEBlockSize);

      typename EigenTypes<kEBlockSize>::VectorRef y_block(
          y + bs->cols[e_block_id].position, kEBlockSize);
      y_block.noalias() = e_t_e_inverse * tmp;
    }
  }

 private:
  struct Chunk {
    int start = 0;
    int num_rows = 0;
  };

  std::vector<Chunk> chunks_;
  int num_eliminate_blocks_;
  int uneliminated_row_begins_;
  std::vector<double> e_t_e_inverse_matrices_;
};

}  // namespace ceres::internal

#include "ceres/internal/reenable_warnings.h"

#endif  // CERES_INTERNAL_SCHUR_ELIMINATOR_H_