Minor cleanups.
1. Further BLAS and heap allocation cleanups in schur_eliminator_impl.h
2. Modularize blas.h using macros.
3. Lint cleanups from William Rucklidge.
4. Small changes to jet.h
5. ResidualBlock now uses blas.h
Performance improvements:
For static and dynamic sized blocks, the peformance is not changed much.
-use_quaternions -ordering user -linear_solver sparse_schur
master change
problem: 16-22106
gcc 3.4 3.3
clang 2.8 2.7
problem: 49-7776
gcc 1.7 1.7
clang 1.4 1.4
problem: 245-198739
gcc 80.1 79.6
clang 80.6 76.2
problem: 257-65132
gcc 12.2 12.0
clang 10.4 10.2
problem: 356-226730
gcc 99.0 96.8
clang 88.9 88.3
problem: 744-543562
gcc 361.5 356.2
clang 352.7 343.5
problem: 1024-110968
gcc 45.9 45.6
clang 42.6 42.1
However, performance when using local parameterizations is
significantly improved due to residual_block.cc using blas.h
-use_quaternions -use_local_parameterization -ordering user -linear_solver sparse_schur
master change
problem: 16-22106
gcc 3.6 3.3
clang 3.5 2.8
problem: 49-7776
gcc 1.8 1.6
clang 1.7 1.4
problem: 245-198739
gcc 79.7 76.1
clang 79.7 73.0
problem: 257-65132
gcc 12.8 11.9
clang 12.3 9.8
problem: 356-226730
gcc 101.9 93.5
clang 105.0 86.8
problem: 744-543562
gcc 367.9 350.5
clang 355.3 323.1
problem: 1024-110968
gcc 43.0 40.3
clang 41.0 37.5
Change-Id: I6dcf7476ddaa77cb116558d112a9cf1e832f5fc9
diff --git a/include/ceres/jet.h b/include/ceres/jet.h
index 1238123..000bd1c 100644
--- a/include/ceres/jet.h
+++ b/include/ceres/jet.h
@@ -348,8 +348,8 @@
// b + v (b + v)(b - v) b^2
//
// which holds because v*v = 0.
- h.a = f.a / g.a;
- const T g_a_inverse = 1.0 / g.a;
+ const T g_a_inverse = T(1.0) / g.a;
+ h.a = f.a * g_a_inverse;
const T f_a_by_g_a = f.a * g_a_inverse;
for (int i = 0; i < N; ++i) {
h.v[i] = (f.v[i] - f_a_by_g_a * g.v[i]) * g_a_inverse;
@@ -450,7 +450,7 @@
Jet<T, N> sqrt(const Jet<T, N>& f) {
Jet<T, N> g;
g.a = sqrt(f.a);
- const T two_a_inverse = 1.0 / (T(2.0) * g.a);
+ const T two_a_inverse = T(1.0) / (T(2.0) * g.a);
g.v = f.v * two_a_inverse;
return g;
}
diff --git a/internal/ceres/blas.h b/internal/ceres/blas.h
index bacf1ff..9629b3d 100644
--- a/internal/ceres/blas.h
+++ b/internal/ceres/blas.h
@@ -65,20 +65,71 @@
#define CERES_MAYBE_NOALIAS .noalias()
#endif
-// For the matrix-matrix functions below, there are three functions
-// for each functionality. Foo, FooNaive and FooEigen. Foo is the one
-// to be called by the user. FooNaive is a basic loop based
+// The following three macros are used to share code and reduce
+// template junk across the various GEMM variants.
+#define CERES_GEMM_BEGIN(name) \
+ template<int kRowA, int kColA, int kRowB, int kColB, int kOperation> \
+ inline void name(const double* A, \
+ const int num_row_a, \
+ const int num_col_a, \
+ const double* B, \
+ const int num_row_b, \
+ const int num_col_b, \
+ double* C, \
+ const int start_row_c, \
+ const int start_col_c, \
+ const int row_stride_c, \
+ const int col_stride_c)
+
+#define CERES_GEMM_NAIVE_HEADER \
+ DCHECK_GT(num_row_a, 0); \
+ DCHECK_GT(num_col_a, 0); \
+ DCHECK_GT(num_row_b, 0); \
+ DCHECK_GT(num_col_b, 0); \
+ DCHECK_GE(start_row_c, 0); \
+ DCHECK_GE(start_col_c, 0); \
+ DCHECK_GT(row_stride_c, 0); \
+ DCHECK_GT(col_stride_c, 0); \
+ DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a)); \
+ DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a)); \
+ DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b)); \
+ DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b)); \
+ const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); \
+ const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); \
+ const int NUM_ROW_B = (kColB != Eigen::Dynamic ? kRowB : num_row_b); \
+ const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
+
+#define CERES_GEMM_EIGEN_HEADER \
+ const typename EigenTypes<kRowA, kColA>::ConstMatrixRef \
+ Aref(A, num_row_a, num_col_a); \
+ const typename EigenTypes<kRowB, kColB>::ConstMatrixRef \
+ Bref(B, num_row_b, num_col_b); \
+ MatrixRef Cref(C, row_stride_c, col_stride_c); \
+
+#define CERES_CALL_GEMM(name) \
+ name<kRowA, kColA, kRowB, kColB, kOperation>( \
+ A, num_row_a, num_col_a, \
+ B, num_row_b, num_col_b, \
+ C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+
+
+// For the matrix-matrix functions below, there are three variants for
+// each functionality. Foo, FooNaive and FooEigen. Foo is the one to
+// be called by the user. FooNaive is a basic loop based
// implementation and FooEigen uses Eigen's implementation. Foo
// chooses between FooNaive and FooEigen depending on how many of the
// template arguments are fixed at compile time. Currently, FooEigen
-// is called if all matrix dimenions are compile time
+// is called if all matrix dimensions are compile time
// constants. FooNaive is called otherwise. This leads to the best
// performance currently.
//
-// TODO(sameeragarwal): Benchmark and simplify the matrix-vector
-// functions.
-
-// C op A * B;
+// The MatrixMatrixMultiply variants compute:
+//
+// C op A * B;
+//
+// The MatrixTransposeMatrixMultiply variants compute:
+//
+// C op A' * B
//
// where op can be +=, -=, or =.
//
@@ -91,7 +142,7 @@
// kOperation = -1 -> C -= A * B
// kOperation = 0 -> C = A * B
//
-// The function can write into matrices C which are larger than the
+// The functions can write into matrices C which are larger than the
// matrix A * B. This is done by specifying the true size of C via
// row_stride_c and col_stride_c, and then indicating where A * B
// should be written into by start_row_c and start_col_c.
@@ -110,24 +161,11 @@
// ------------
// ------------
//
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixMatrixMultiplyEigen(const double* A,
- const int num_row_a,
- const int num_col_a,
- const double* B,
- const int num_row_b,
- const int num_col_b,
- double* C,
- const int start_row_c,
- const int start_col_c,
- const int row_stride_c,
- const int col_stride_c) {
- const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
- const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b);
- MatrixRef Cref(C, row_stride_c, col_stride_c);
- Eigen::Block<MatrixRef, kRowA, kColB> block(Cref,
- start_row_c, start_col_c,
- num_row_a, num_col_b);
+CERES_GEMM_BEGIN(MatrixMatrixMultiplyEigen) {
+ CERES_GEMM_EIGEN_HEADER
+ Eigen::Block<MatrixRef, kRowA, kColB>
+ block(Cref, start_row_c, start_col_c, num_row_a, num_col_b);
+
if (kOperation > 0) {
block CERES_MAYBE_NOALIAS += Aref * Bref;
} else if (kOperation < 0) {
@@ -137,36 +175,8 @@
}
}
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixMatrixMultiplyNaive(const double* A,
- const int num_row_a,
- const int num_col_a,
- const double* B,
- const int num_row_b,
- const int num_col_b,
- double* C,
- const int start_row_c,
- const int start_col_c,
- const int row_stride_c,
- const int col_stride_c) {
- DCHECK_GT(num_row_a, 0);
- DCHECK_GT(num_col_a, 0);
- DCHECK_GT(num_row_b, 0);
- DCHECK_GT(num_col_b, 0);
- DCHECK_GE(start_row_c, 0);
- DCHECK_GE(start_col_c, 0);
- DCHECK_GT(row_stride_c, 0);
- DCHECK_GT(col_stride_c, 0);
-
- DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
- DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));
- DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b));
- DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b));
-
- const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
- const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
- const int NUM_ROW_B = (kColB != Eigen::Dynamic ? kRowB : num_row_b);
- const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
+CERES_GEMM_BEGIN(MatrixMatrixMultiplyNaive) {
+ CERES_GEMM_NAIVE_HEADER
DCHECK_EQ(NUM_COL_A, NUM_ROW_B);
const int NUM_ROW_C = NUM_ROW_A;
@@ -193,91 +203,26 @@
}
}
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixMatrixMultiply(const double* A,
- const int num_row_a,
- const int num_col_a,
- const double* B,
- const int num_row_b,
- const int num_col_b,
- double* C,
- const int start_row_c,
- const int start_col_c,
- const int row_stride_c,
- const int col_stride_c) {
+CERES_GEMM_BEGIN(MatrixMatrixMultiply) {
#ifdef CERES_NO_CUSTOM_BLAS
- MatrixMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
- A, num_row_a, num_col_a,
- B, num_row_b, num_col_b,
- C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+
+ CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
return;
#else
if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
- MatrixMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
- A, num_row_a, num_col_a,
- B, num_row_b, num_col_b,
- C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ CERES_CALL_GEMM(MatrixMatrixMultiplyEigen)
} else {
- MatrixMatrixMultiplyNaive<kRowA, kColA, kRowB, kColB, kOperation>(
- A, num_row_a, num_col_a,
- B, num_row_b, num_col_b,
- C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ CERES_CALL_GEMM(MatrixMatrixMultiplyNaive)
}
#endif
}
-
-// C op A' * B;
-//
-// where op can be +=, -=, or =.
-//
-// The template parameters (kRowA, kColA, kRowB, kColB) allow
-// specialization of the loop at compile time. If this information is
-// not available, then Eigen::Dynamic should be used as the template
-// argument.
-//
-// kOperation = 1 -> C += A' * B
-// kOperation = -1 -> C -= A' * B
-// kOperation = 0 -> C = A' * B
-//
-// The function can write into matrices C which are larger than the
-// matrix A' * B. This is done by specifying the true size of C via
-// row_stride_c and col_stride_c, and then indicating where A * B
-// should be written into by start_row_c and start_col_c.
-//
-// Graphically if row_stride_c = 10, col_stride_c = 12, start_row_c =
-// 4 and start_col_c = 5, then if A = 2x3 and B = 2x4, we get
-//
-// ------------
-// ------------
-// ------------
-// ------------
-// -----xxxx---
-// -----xxxx---
-// -----xxxx---
-// ------------
-// ------------
-// ------------
-//
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixTransposeMatrixMultiplyEigen(const double* A,
- const int num_row_a,
- const int num_col_a,
- const double* B,
- const int num_row_b,
- const int num_col_b,
- double* C,
- const int start_row_c,
- const int start_col_c,
- const int row_stride_c,
- const int col_stride_c) {
- const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
- const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b);
- MatrixRef Cref(C, row_stride_c, col_stride_c);
+CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyEigen) {
+ CERES_GEMM_EIGEN_HEADER
Eigen::Block<MatrixRef, kColA, kColB> block(Cref,
start_row_c, start_col_c,
num_col_a, num_col_b);
@@ -290,36 +235,8 @@
}
}
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixTransposeMatrixMultiplyNaive(const double* A,
- const int num_row_a,
- const int num_col_a,
- const double* B,
- const int num_row_b,
- const int num_col_b,
- double* C,
- const int start_row_c,
- const int start_col_c,
- const int row_stride_c,
- const int col_stride_c) {
- DCHECK_GT(num_row_a, 0);
- DCHECK_GT(num_col_a, 0);
- DCHECK_GT(num_row_b, 0);
- DCHECK_GT(num_col_b, 0);
- DCHECK_GE(start_row_c, 0);
- DCHECK_GE(start_col_c, 0);
- DCHECK_GT(row_stride_c, 0);
- DCHECK_GT(col_stride_c, 0);
-
- DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a));
- DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a));
- DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b));
- DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b));
-
- const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a);
- const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a);
- const int NUM_ROW_B = (kColB != Eigen::Dynamic ? kRowB : num_row_b);
- const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b);
+CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiplyNaive) {
+ CERES_GEMM_NAIVE_HEADER
DCHECK_EQ(NUM_ROW_A, NUM_ROW_B);
const int NUM_ROW_C = NUM_COL_A;
@@ -346,43 +263,37 @@
}
}
-template<int kRowA, int kColA, int kRowB, int kColB, int kOperation>
-inline void MatrixTransposeMatrixMultiply(const double* A,
- const int num_row_a,
- const int num_col_a,
- const double* B,
- const int num_row_b,
- const int num_col_b,
- double* C,
- const int start_row_c,
- const int start_col_c,
- const int row_stride_c,
- const int col_stride_c) {
+CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiply) {
#ifdef CERES_NO_CUSTOM_BLAS
- MatrixTransposeMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
- A, num_row_a, num_col_a,
- B, num_row_b, num_col_b,
- C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+
+ CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
return;
#else
if (kRowA != Eigen::Dynamic && kColA != Eigen::Dynamic &&
kRowB != Eigen::Dynamic && kColB != Eigen::Dynamic) {
- MatrixTransposeMatrixMultiplyEigen<kRowA, kColA, kRowB, kColB, kOperation>(
- A, num_row_a, num_col_a,
- B, num_row_b, num_col_b,
- C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyEigen)
} else {
- MatrixTransposeMatrixMultiplyNaive<kRowA, kColA, kRowB, kColB, kOperation>(
- A, num_row_a, num_col_a,
- B, num_row_b, num_col_b,
- C, start_row_c, start_col_c, row_stride_c, col_stride_c);
+ CERES_CALL_GEMM(MatrixTransposeMatrixMultiplyNaive)
}
#endif
}
+// Matrix-Vector multiplication
+//
+// c op A * b;
+//
+// where op can be +=, -=, or =.
+//
+// The template parameters (kRowA, kColA) allow specialization of the
+// loop at compile time. If this information is not available, then
+// Eigen::Dynamic should be used as the template argument.
+//
+// kOperation = 1 -> c += A' * b
+// kOperation = -1 -> c -= A' * b
+// kOperation = 0 -> c = A' * b
template<int kRowA, int kColA, int kOperation>
inline void MatrixVectorMultiply(const double* A,
const int num_row_a,
@@ -390,7 +301,8 @@
const double* b,
double* c) {
#ifdef CERES_NO_CUSTOM_BLAS
- const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
+ const typename EigenTypes<kRowA, kColA>::ConstMatrixRef
+ Aref(A, num_row_a, num_col_a);
const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a);
typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a);
@@ -430,17 +342,9 @@
#endif // CERES_NO_CUSTOM_BLAS
}
+// Similar to MatrixVectorMultiply, except that A is transposed, i.e.,
+//
// c op A' * b;
-//
-// where op can be +=, -=, or =.
-//
-// The template parameters (kRowA, kColA) allow specialization of the
-// loop at compile time. If this information is not available, then
-// Eigen::Dynamic should be used as the template argument.
-//
-// kOperation = 1 -> c += A' * b
-// kOperation = -1 -> c -= A' * b
-// kOperation = 0 -> c = A' * b
template<int kRowA, int kColA, int kOperation>
inline void MatrixTransposeVectorMultiply(const double* A,
const int num_row_a,
@@ -448,7 +352,8 @@
const double* b,
double* c) {
#ifdef CERES_NO_CUSTOM_BLAS
- const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a);
+ const typename EigenTypes<kRowA, kColA>::ConstMatrixRef
+ Aref(A, num_row_a, num_col_a);
const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a);
typename EigenTypes<kColA>::VectorRef cref(c, num_col_a);
@@ -488,7 +393,12 @@
#endif // CERES_NO_CUSTOM_BLAS
}
+
#undef CERES_MAYBE_NOALIAS
+#undef CERES_GEMM_BEGIN
+#undef CERES_GEMM_EIGEN_HEADER
+#undef CERES_GEMM_NAIVE_HEADER
+#undef CERES_CALL_GEMM
} // namespace internal
} // namespace ceres
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index ab6fcef..ae36d60 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -147,7 +147,6 @@
values_.get() + cells[j].position, row_block_size, col_block_size,
x + row_block_pos,
y + col_block_pos);
-
}
}
}
diff --git a/internal/ceres/residual_block.cc b/internal/ceres/residual_block.cc
index b91b0ed..4906607 100644
--- a/internal/ceres/residual_block.cc
+++ b/internal/ceres/residual_block.cc
@@ -35,6 +35,7 @@
#include <cstddef>
#include <vector>
+#include "ceres/blas.h"
#include "ceres/corrector.h"
#include "ceres/parameter_block.h"
#include "ceres/residual_block_utils.h"
@@ -139,17 +140,15 @@
// Apply local reparameterization to the jacobians.
if (parameter_block->LocalParameterizationJacobian() != NULL) {
- ConstMatrixRef local_to_global(
+ MatrixMatrixMultiply
+ <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 0>(
+ global_jacobians[i],
+ num_residuals,
+ parameter_block->Size(),
parameter_block->LocalParameterizationJacobian(),
parameter_block->Size(),
- parameter_block->LocalSize());
- MatrixRef global_jacobian(global_jacobians[i],
- num_residuals,
- parameter_block->Size());
- MatrixRef local_jacobian(jacobians[i],
- num_residuals,
- parameter_block->LocalSize());
- local_jacobian.noalias() = global_jacobian * local_to_global;
+ parameter_block->LocalSize(),
+ jacobians[i], 0, 0, num_residuals, parameter_block->LocalSize());
}
}
}
diff --git a/internal/ceres/schur_eliminator.h b/internal/ceres/schur_eliminator.h
index e0c7fda..f2c247a 100644
--- a/internal/ceres/schur_eliminator.h
+++ b/internal/ceres/schur_eliminator.h
@@ -277,7 +277,7 @@
const double* b,
int row_block_counter,
typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* eet,
- typename EigenTypes<kEBlockSize>::Vector* g,
+ double* g,
double* buffer,
BlockRandomAccessMatrix* lhs);
@@ -285,7 +285,7 @@
const BlockSparseMatrixBase* A,
const double* b,
int row_block_counter,
- const Vector& inverse_ete_g,
+ const double* inverse_ete_g,
double* rhs);
void ChunkOuterProduct(const CompressedRowBlockStructure* bs,
@@ -336,7 +336,7 @@
// ChunkOuterProduct. Like buffer_ it is of size num_threads *
// buffer_size_. Each thread accesses the chunk
//
- // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_]
+ // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_ -1]
//
scoped_array<double> chunk_outer_product_buffer_;
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h
index b46eab9..5afb2f2 100644
--- a/internal/ceres/schur_eliminator_impl.h
+++ b/internal/ceres/schur_eliminator_impl.h
@@ -51,16 +51,18 @@
#include <algorithm>
#include <map>
-#include "Eigen/Dense"
+
#include "ceres/blas.h"
#include "ceres/block_random_access_matrix.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/block_structure.h"
#include "ceres/internal/eigen.h"
+#include "ceres/internal/fixed_array.h"
#include "ceres/internal/scoped_ptr.h"
#include "ceres/map_util.h"
#include "ceres/schur_eliminator.h"
#include "ceres/stl_util.h"
+#include "Eigen/Dense"
#include "glog/logging.h"
namespace ceres {
@@ -240,8 +242,9 @@
ete.setZero();
}
- typename EigenTypes<kEBlockSize>::Vector g(e_block_size);
- g.setZero();
+ FixedArray<double, 8> g(e_block_size);
+ typename EigenTypes<kEBlockSize>::VectorRef gref(g.get(), e_block_size);
+ gref.setZero();
// We are going to be computing
//
@@ -256,7 +259,7 @@
// in this chunk (g) and add the outer product of the f_blocks to
// Schur complement (S += F'F).
ChunkDiagonalBlockAndGradient(
- chunk, A, b, chunk.start, &ete, &g, buffer, lhs);
+ chunk, A, b, chunk.start, &ete, g.get(), buffer, lhs);
// Normally one wouldn't compute the inverse explicitly, but
// e_block_size will typically be a small number like 3, in
@@ -273,7 +276,12 @@
// linear system.
//
// rhs = F'b - F'E(E'E)^(-1) E'b
- UpdateRhs(chunk, A, b, chunk.start, inverse_ete * g, rhs);
+
+ FixedArray<double, 8> inverse_ete_g(e_block_size);
+ MatrixVectorMultiply<kEBlockSize, kEBlockSize, 0>(
+ inverse_ete.data(), e_block_size, e_block_size, g.get(), inverse_ete_g.get());
+
+ UpdateRhs(chunk, A, b, chunk.start, inverse_ete_g.get(), rhs);
// S -= F'E(E'E)^{-1}E'F
ChunkOuterProduct(bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
@@ -318,7 +326,9 @@
const Cell& e_cell = row.cells.front();
DCHECK_EQ(e_block_id, e_cell.block_id);
- typename EigenTypes<kRowBlockSize>::Vector sj =
+ FixedArray<double, 8> sj(row.block.size);
+
+ typename EigenTypes<kRowBlockSize>::VectorRef(sj.get(), row.block.size) =
typename EigenTypes<kRowBlockSize>::ConstVectorRef
(b + bs->rows[chunk.start + j].block.position, row.block.size);
@@ -330,16 +340,16 @@
MatrixVectorMultiply<kRowBlockSize, kFBlockSize, -1>(
row_values + row.cells[c].position, row.block.size, f_block_size,
z + lhs_row_layout_[r_block],
- sj.data());
+ sj.get());
}
MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
row_values + e_cell.position, row.block.size, e_block_size,
- sj.data(),
+ sj.get(),
y_ptr);
MatrixTransposeMatrixMultiply
- <kRowBlockSize, kEBlockSize,kRowBlockSize, kEBlockSize, 1>(
+ <kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
row_values + e_cell.position, row.block.size, e_block_size,
row_values + e_cell.position, row.block.size, e_block_size,
ete.data(), 0, 0, e_block_size, e_block_size);
@@ -359,25 +369,25 @@
const BlockSparseMatrixBase* A,
const double* b,
int row_block_counter,
- const Vector& inverse_ete_g,
+ const double* inverse_ete_g,
double* rhs) {
const CompressedRowBlockStructure* bs = A->block_structure();
- const int e_block_size = inverse_ete_g.rows();
+ const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
+ const int e_block_size = bs->cols[e_block_id].size;
+
int b_pos = bs->rows[row_block_counter].block.position;
for (int j = 0; j < chunk.size; ++j) {
const CompressedRow& row = bs->rows[row_block_counter + j];
const double *row_values = A->RowBlockValues(row_block_counter + j);
const Cell& e_cell = row.cells.front();
- const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
- e_block(row_values + e_cell.position,
- row.block.size,
- e_block_size);
-
- const typename EigenTypes<kRowBlockSize>::Vector
- sj =
+ typename EigenTypes<kRowBlockSize>::Vector sj =
typename EigenTypes<kRowBlockSize>::ConstVectorRef
- (b + b_pos, row.block.size) - e_block * (inverse_ete_g);
+ (b + b_pos, row.block.size);
+
+ MatrixVectorMultiply<kRowBlockSize, kEBlockSize, -1>(
+ row_values + e_cell.position, row.block.size, e_block_size,
+ inverse_ete_g, sj.data());
for (int c = 1; c < row.cells.size(); ++c) {
const int block_id = row.cells[c].block_id;
@@ -421,7 +431,7 @@
const double* b,
int row_block_counter,
typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* ete,
- typename EigenTypes<kEBlockSize>::Vector* g,
+ double* g,
double* buffer,
BlockRandomAccessMatrix* lhs) {
const CompressedRowBlockStructure* bs = A->block_structure();
@@ -453,7 +463,7 @@
MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
row_values + e_cell.position, row.block.size, e_block_size,
b + b_pos,
- g->data());
+ g);
// buffer = E'F. This computation is done by iterating over the
@@ -550,10 +560,10 @@
const int block_id = row.cells[c].block_id;
const int block_size = bs->cols[block_id].size;
const int block = block_id - num_eliminate_blocks_;
- VectorRef(rhs + lhs_row_layout_[block], block_size).noalias()
- += (ConstMatrixRef(row_values + row.cells[c].position,
- row.block.size, block_size).transpose() *
- ConstVectorRef(b + row.block.position, row.block.size));
+ MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
+ row_values + row.cells[c].position, row.block.size, block_size,
+ b + row.block.position,
+ rhs + lhs_row_layout_[block]);
}
NoEBlockRowOuterProduct(A, row_block_counter, lhs);
}
@@ -588,8 +598,6 @@
DCHECK_GE(block1, 0);
const int block1_size = bs->cols[row.cells[i].block_id].size;
- const ConstMatrixRef b1(row_values + row.cells[i].position,
- row.block.size, block1_size);
int r, c, row_stride, col_stride;
CellInfo* cell_info = lhs->GetCell(block1, block1,
&r, &c,