Minor cleanups.
1. Further BLAS and heap allocation cleanups in schur_eliminator_impl.h
   (the pattern is sketched after this list).
2. Modularize blas.h using macros.
3. Lint cleanups from William Rucklidge.
4. Small changes to jet.h.
5. ResidualBlock now uses blas.h.
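A standalone sketch of the pattern behind items 1 and 5: replace a small,
dynamically sized Eigen temporary (a heap allocation plus expression
temporaries) with stack storage and a fixed-size multiply. SmallMatVec and
RowMajorMatrix below are illustrative stand-ins for the MatrixVectorMultiply
routines in ceres/blas.h and for FixedArray; this is not Ceres code.

  #include <cstdio>
  #include "Eigen/Dense"

  typedef Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic,
                        Eigen::RowMajor> RowMajorMatrix;

  // c += A * b for a row-major num_rows x num_cols block. blas.h does the
  // same with compile-time dimensions so the small loops can be unrolled.
  void SmallMatVec(const double* A, int num_rows, int num_cols,
                   const double* b, double* c) {
    for (int r = 0; r < num_rows; ++r) {
      double sum = 0.0;
      for (int col = 0; col < num_cols; ++col) {
        sum += A[r * num_cols + col] * b[col];
      }
      c[r] += sum;
    }
  }

  int main() {
    const int e_block_size = 3;  // typical e-block: a 3D point.
    RowMajorMatrix inverse_ete =
        RowMajorMatrix::Identity(e_block_size, e_block_size);
    Eigen::VectorXd g = Eigen::VectorXd::Constant(e_block_size, 2.0);

    // Style this change removes: a dynamically sized heap temporary.
    Eigen::VectorXd heap_result = inverse_ete * g;

    // Style this change adopts: a small stack buffer (FixedArray in the
    // diff below) filled by a fixed-size multiply (blas.h in the diff).
    double stack_result[3] = {0.0, 0.0, 0.0};
    SmallMatVec(inverse_ete.data(), e_block_size, e_block_size,
                g.data(), stack_result);

    std::printf("%f %f\n", heap_result[0], stack_result[0]);
    return 0;
  }

The FixedArray<double, 8> used in the diff keeps blocks of up to 8 doubles
on the stack (enough for the typical 3-vector point blocks) and falls back
to the heap only for larger blocks.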
Performance improvements:
For statically and dynamically sized blocks, the performance is largely
unchanged.
-use_quaternions -ordering user -linear_solver sparse_schur
                  master   change
problem: 16-22106
  gcc                3.4      3.3
  clang              2.8      2.7
problem: 49-7776
  gcc                1.7      1.7
  clang              1.4      1.4
problem: 245-198739
  gcc               80.1     79.6
  clang             80.6     76.2
problem: 257-65132
  gcc               12.2     12.0
  clang             10.4     10.2
problem: 356-226730
  gcc               99.0     96.8
  clang             88.9     88.3
problem: 744-543562
  gcc              361.5    356.2
  clang            352.7    343.5
problem: 1024-110968
  gcc               45.9     45.6
  clang             42.6     42.1
However, performance when using local parameterizations improves
significantly because residual_block.cc now uses blas.h (a sketch of the
relevant operation follows the numbers below).
-use_quaternions -use_local_parameterization -ordering user -linear_solver sparse_schur
                  master   change
problem: 16-22106
  gcc                3.6      3.3
  clang              3.5      2.8
problem: 49-7776
  gcc                1.8      1.6
  clang              1.7      1.4
problem: 245-198739
  gcc               79.7     76.1
  clang             79.7     73.0
problem: 257-65132
  gcc               12.8     11.9
  clang             12.3      9.8
problem: 356-226730
  gcc              101.9     93.5
  clang            105.0     86.8
problem: 744-543562
  gcc              367.9    350.5
  clang            355.3    323.1
problem: 1024-110968
  gcc               43.0     40.3
  clang             41.0     37.5
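The residual_block.cc change itself is not shown in the diff below, so the
following is only a sketch of the operation it speeds up: with a local
parameterization, the residual block must form
jacobian_local = jacobian_global * plus_jacobian, e.g. a
(num_residuals x 4) by (4 x 3) product per quaternion block. The function
name ApplyQuaternionPlusJacobian is hypothetical, and MatrixMatrixMultiply
is assumed to share the calling convention of the
MatrixTransposeMatrixMultiply call visible in the diff; treat it as a
stand-in if the real entry point differs.

  #include "ceres/blas.h"
  #include "Eigen/Dense"

  namespace ceres {
  namespace internal {

  // jacobian_local = jacobian_global * plus_jacobian for one quaternion
  // parameter block (global size 4, local size 3). kOperation == 0 means
  // "assign", matching the MatrixVectorMultiply<..., 0> call in the diff.
  void ApplyQuaternionPlusJacobian(const double* jacobian_global,  // n x 4
                                   const double* plus_jacobian,    // 4 x 3
                                   int num_residuals,
                                   double* jacobian_local) {       // n x 3
    MatrixMatrixMultiply<Eigen::Dynamic, 4, 4, 3, 0>(
        jacobian_global, num_residuals, 4,
        plus_jacobian, 4, 3,
        jacobian_local, 0, 0, num_residuals, 3);
  }

  }  // namespace internal
  }  // namespace ceres

Because both factor sizes are known at compile time here, the fixed-size
loops avoid the dynamically sized Eigen products and temporaries previously
used on this path, which is consistent with the gains in the second table.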
Change-Id: I6dcf7476ddaa77cb116558d112a9cf1e832f5fc9
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index b46eab9..5afb2f2 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -51,16 +51,18 @@
#include <algorithm>
#include <map>
-#include "Eigen/Dense"
+
#include "ceres/blas.h"
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/internal/eigen.h"
+#include "ceres/internal/fixed_array.h"
#include "ceres/internal/scoped_ptr.h"
#include "ceres/map_util.h"
#include "ceres/schur_eliminator.h"
#include "ceres/stl_util.h"
+#include "Eigen/Dense"
#include "glog/logging.h"
namespace ceres {
@@ -240,8 +242,9 @@
ete.setZero();
}
- typename EigenTypes<kEBlockSize>::Vector g(e_block_size);
- g.setZero();
+ FixedArray<double, 8> g(e_block_size);
+ typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size);
+ gref.setZero();
// We are going to be computing
//
@@ -256,7 +259,7 @@
// in this chunk (g) and add the outer product of the f_blocks to
// Schur complement (S += F'F).
ChunkDiagonalBlockAndGradient(
- chunk, A, b, chunk.start, &ete, &g, buffer, lhs);
+ chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs);
// Normally one wouldn't compute the inverse explicitly, but
// e_block_size will typically be a small number like 3, in
@@ -273,7 +276,12 @@
// linear system.
//
// rhs = F'b - F'E(E'E)^(-1) E'b
- UpdateRhs(chunk, A, b, chunk.start, inverse_ete * g, rhs);
+
+ FixedArray<double, 8> inverse_ete_g(e_block_size);
+ MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
+ inverse_ete.data(), e_block_size, e_block_size, g.get(), inverse_ete_g.get());
+
+ UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
// S -= F'E(E'E)^{-1}E'F
ChunkOuterProduct(bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
@@ -318,7 +326,9 @@
const Cell& e_cell = row.cells.front();
DCHECK_EQ(e_block_id, e_cell.block_id);
- typename EigenTypes<kRowBlockSize>::Vector sj =
+ FixedArray<double, 8> sj(row.block.size);
+
+ typename EigenTypes<kRowBlockSize>::VectorRef(sj.get(), row.block.size) =
typename EigenTypes<kRowBlockSize>::ConstVectorRef
(b + bs->rows[chunk.start + j].block.position, row.block.size);
@@ -330,16 +340,16 @@
MatrixVectorMultiply<kRowBlockSize, kFBlockSize, -1>(
row_values + row.cells[c].position, row.block.size, f_block_size,
z + lhs_row_layout_[r_block],
- sj.data());
+ sj.get());
}
MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
row_values + e_cell.position, row.block.size, e_block_size,
- sj.data(),
+ sj.get(),
y_ptr);
MatrixTransposeMatrixMultiply
- <kRowBlockSize, kEBlockSize,kRowBlockSize, kEBlockSize, 1>(
+ <kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
row_values + e_cell.position, row.block.size, e_block_size,
row_values + e_cell.position, row.block.size, e_block_size,
ete.data(), 0, 0, e_block_size, e_block_size);
@@ -359,25 +369,25 @@
const BlockSparseMatrixBase* A,
const double* b,
int row_block_counter,
- const Vector& inverse_ete_g,
+ const double* inverse_ete_g,
double* rhs) {
const CompressedRowBlockStructure* bs = A->block_structure();
- const int e_block_size = inverse_ete_g.rows();
+ const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+ const int e_block_size = bs->cols[e_block_id].size;
+
int b_pos = bs->rows[row_block_counter].block.position;
for (int j = 0; j < chunk.size; ++j) {
const CompressedRow& row = bs->rows[row_block_counter + j];
const double *row_values = A->RowBlockValues(row_block_counter + j);
const Cell& e_cell = row.cells.front();
- const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
- e_block(row_values + e_cell.position,
- row.block.size,
- e_block_size);
-
- const typename EigenTypes<kRowBlockSize>::Vector
- sj =
+ typename EigenTypes<kRowBlockSize>::Vector sj =
typename EigenTypes<kRowBlockSize>::ConstVectorRef
- (b + b_pos, row.block.size) - e_block * (inverse_ete_g);
+ (b + b_pos, row.block.size);
+
+ MatrixVectorMultiply<kRowBlockSize, kEBlockSize, -1>(
+ row_values + e_cell.position, row.block.size, e_block_size,
+ inverse_ete_g, sj.data());
for (int c = 1; c < row.cells.size(); ++c) {
const int block_id = row.cells[c].block_id;
@@ -421,7 +431,7 @@
const double* b,
int row_block_counter,
typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* ete,
- typename EigenTypes<kEBlockSize>::Vector* g,
+ double* g,
double* buffer,
BlockRandomAccessMatrix* lhs) {
const CompressedRowBlockStructure* bs = A->block_structure();
@@ -453,7 +463,7 @@
MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
row_values + e_cell.position, row.block.size, e_block_size,
b + b_pos,
- g->data());
+ g);
// buffer = E'F. This computation is done by iterating over the
@@ -550,10 +560,10 @@
const int block_id = row.cells[c].block_id;
const int block_size = bs->cols[block_id].size;
const int block = block_id - num_eliminate_blocks_;
- VectorRef(rhs + lhs_row_layout_[block], block_size).noalias()
- += (ConstMatrixRef(row_values + row.cells[c].position,
- row.block.size, block_size).transpose() *
- ConstVectorRef(b + row.block.position, row.block.size));
+ MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+ row_values + row.cells[c].position, row.block.size, block_size,
+ b + row.block.position,
+ rhs + lhs_row_layout_[block]);
}
NoEBlockRowOuterProduct(A, row_block_counter, lhs);
}
@@ -588,8 +598,6 @@
DCHECK_GE(block1, 0);
const int block1_size = bs->cols[row.cells[i].block_id].size;
- const ConstMatrixRef b1(row_values + row.cells[i].position,
- row.block.size, block1_size);
int r, c, row_stride, col_stride;
CellInfo* cell_info = lhs->GetCell(block1, block1,
&r, &c,