Replace Eigen block operations with small GEMM and GEMV loops. 1. Add Matrix-Matrix and Matrix-Vector multiply functions. 2. Replace Eigen usage in SchurEliminator with these custom matrix operations. 3. Save on some memory allocations in ChunkOuterProduct. 4. Replace LDLT with LLT. As a result on problem-16-22106-pre.txt, the linear solver time goes down from 1.2s to 0.64s. Change-Id: I2daa667960e0a1e8834489965a30be31f37fd87f
diff --git a/CMakeLists.txt b/CMakeLists.txt index a7b77fe..1f234c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -451,10 +451,19 @@ ON) IF (NOT ${LINE_SEARCH_MINIMIZER}) - ADD_DEFINITIONS(-DCERES_NO_LINE_SEARCH_MINIMIZER) - MESSAGE("-- Disabling line search minimizer") + ADD_DEFINITIONS(-DCERES_NO_LINE_SEARCH_MINIMIZER) + MESSAGE("-- Disabling line search minimizer") ENDIF (NOT ${LINE_SEARCH_MINIMIZER}) +OPTION(CUSTOM_BLAS + "Use handcoded BLAS routines (usually faster) instead of Eigen." + ON) + +IF (NOT ${CUSTOM_BLAS}) + ADD_DEFINITIONS(-DCERES_NO_CUSTOM_BLAS) + MESSAGE("-- Disabling custom blas") +ENDIF (NOT ${CUSTOM_BLAS}) + # Multithreading using OpenMP OPTION(OPENMP "Enable threaded solving in Ceres (requires OpenMP)"
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 4694ec5..8a90935 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -238,6 +238,7 @@ CERES_TEST(array_utils) CERES_TEST(autodiff) CERES_TEST(autodiff_cost_function) + CERES_TEST(blas) CERES_TEST(block_random_access_dense_matrix) CERES_TEST(block_random_access_sparse_matrix) CERES_TEST(block_sparse_matrix)
diff --git a/internal/ceres/blas.h b/internal/ceres/blas.h new file mode 100644 index 0000000..028b63a --- /dev/null +++ b/internal/ceres/blas.h
@@ -0,0 +1,390 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2013 Google Inc. All rights reserved. +// http://code.google.com/p/ceres-solver/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Author: sameeragarwal@google.com (Sameer Agarwal) +// +// Simple blas functions for use in the Schur Eliminator. These are +// fairly basic implementations which already yield a significant +// speedup in the eliminator performance. 
+ +#ifndef CERES_INTERNAL_BLAS_H_ +#define CERES_INTERNAL_BLAS_H_ + +#include "ceres/internal/eigen.h" +#include "glog/logging.h" + +namespace ceres { +namespace internal { + +// Remove the ".noalias()" annotation from the matrix matrix +// multiplies to produce a correct build with the Android NDK, +// including versions 6, 7, 8, and 8b, when built with STLPort and the +// non-standalone toolchain (i.e. ndk-build). This appears to be a +// compiler bug; if the workaround is not in place, the line +// +// block.noalias() -= A * B; +// +// gets compiled to +// +// block.noalias() += A * B; +// +// which breaks schur elimination. Introducing a temporary by removing the +// .noalias() annotation causes the issue to disappear. Tracking this +// issue down was tricky, since the test suite doesn't run when built with +// the non-standalone toolchain. +// +// TODO(keir): Make a reproduction case for this and send it upstream. +#ifdef CERES_WORK_AROUND_ANDROID_NDK_COMPILER_BUG +#define CERES_MAYBE_NOALIAS +#else +#define CERES_MAYBE_NOALIAS .noalias() +#endif + +// C op A * B; +// +// where op can be +=, -=, or =. +// +// The template parameters (kRowA, kColA, kRowB, kColB) allow +// specialization of the loop at compile time. If this information is +// not available, then Eigen::Dynamic should be used as the template +// argument. +// +// kOperation = 1 -> C += A * B +// kOperation = -1 -> C -= A * B +// kOperation = 0 -> C = A * B +// +// The function can write into matrices C which are larger than the +// matrix A * B. This is done by specifying the true size of C via +// row_stride_c and col_stride_c, and then indicating where A * B +// should be written into by start_row_c and start_col_c. 
+// +// Graphically if row_stride_c = 10, col_stride_c = 12, start_row_c = +// 4 and start_col_c = 5, then if A = 3x2 and B = 2x4, we get +// +// ------------ +// ------------ +// ------------ +// ------------ +// -----xxxx--- +// -----xxxx--- +// -----xxxx--- +// ------------ +// ------------ +// ------------ +// +template<int kRowA, int kColA, int kRowB, int kColB, int kOperation> +inline void MatrixMatrixMultiply(const double* A, + const int num_row_a, + const int num_col_a, + const double* B, + const int num_row_b, + const int num_col_b, + double* C, + const int start_row_c, + const int start_col_c, + const int row_stride_c, + const int col_stride_c) { +#ifdef CERES_NO_CUSTOM_BLAS + const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a); + const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b); + MatrixRef Cref(C, row_stride_c, col_stride_c); + Eigen::Block<MatrixRef, kRowA, kColB> block(Cref, + start_row_c, start_col_c, + num_row_a, num_col_b); + + if (kOperation > 0) { + block CERES_MAYBE_NOALIAS += Aref * Bref; + } else if (kOperation < 0) { + block CERES_MAYBE_NOALIAS -= Aref * Bref; + } else { + block CERES_MAYBE_NOALIAS = Aref * Bref; + } + +#else + + DCHECK_GT(num_row_a, 0); + DCHECK_GT(num_col_a, 0); + DCHECK_GT(num_row_b, 0); + DCHECK_GT(num_col_b, 0); + DCHECK_GE(start_row_c, 0); + DCHECK_GE(start_col_c, 0); + DCHECK_GT(row_stride_c, 0); + DCHECK_GT(col_stride_c, 0); + + DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a)); + DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a)); + DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b)); + DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b)); + + const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); + const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); + const int NUM_ROW_B = (kColB != Eigen::Dynamic ? kRowB : num_row_b); + const int NUM_COL_B = (kColB != Eigen::Dynamic ? 
kColB : num_col_b); + DCHECK_EQ(NUM_COL_A, NUM_ROW_B); + + const int NUM_ROW_C = NUM_ROW_A; + const int NUM_COL_C = NUM_COL_B; + DCHECK_LT(start_row_c + NUM_ROW_C, row_stride_c); + DCHECK_LT(start_col_c + NUM_COL_C, col_stride_c); + + for (int row = 0; row < NUM_ROW_C; ++row) { + for (int col = 0; col < NUM_COL_C; ++col) { + double tmp = 0.0; + for (int k = 0; k < NUM_COL_A; ++k) { + tmp += A[row * NUM_COL_A + k] * B[k * NUM_COL_B + col]; + } + + const int index = (row + start_row_c) * col_stride_c + start_col_c + col; + if (kOperation > 0) { + C[index] += tmp; + } else if (kOperation < 0) { + C[index] -= tmp; + } else { + C[index] = tmp; + } + } + } +#endif // CERES_NO_CUSTOM_BLAS +} + +// C op A' * B; +// +// where op can be +=, -=, or =. +// +// The template parameters (kRowA, kColA, kRowB, kColB) allow +// specialization of the loop at compile time. If this information is +// not available, then Eigen::Dynamic should be used as the template +// argument. +// +// kOperation = 1 -> C += A' * B +// kOperation = -1 -> C -= A' * B +// kOperation = 0 -> C = A' * B +// +// The function can write into matrices C which are larger than the +// matrix A' * B. This is done by specifying the true size of C via +// row_stride_c and col_stride_c, and then indicating where A * B +// should be written into by start_row_c and start_col_c. 
+// +// Graphically if row_stride_c = 10, col_stride_c = 12, start_row_c = +// 4 and start_col_c = 5, then if A = 2x3 and B = 2x4, we get +// +// ------------ +// ------------ +// ------------ +// ------------ +// -----xxxx--- +// -----xxxx--- +// -----xxxx--- +// ------------ +// ------------ +// ------------ +// +template<int kRowA, int kColA, int kRowB, int kColB, int kOperation> +inline void MatrixTransposeMatrixMultiply(const double* A, + const int num_row_a, + const int num_col_a, + const double* B, + const int num_row_b, + const int num_col_b, + double* C, + const int start_row_c, + const int start_col_c, + const int row_stride_c, + const int col_stride_c) { +#ifdef CERES_NO_CUSTOM_BLAS + const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a); + const typename EigenTypes<kRowB, kColB>::ConstMatrixRef Bref(B, num_row_b, num_col_b); + MatrixRef Cref(C, row_stride_c, col_stride_c); + Eigen::Block<MatrixRef, kColA, kColB> block(Cref, + start_row_c, start_col_c, + num_col_a, num_col_b); + + if (kOperation > 0) { + block CERES_MAYBE_NOALIAS += Aref.transpose() * Bref; + } else if (kOperation < 0) { + block CERES_MAYBE_NOALIAS -= Aref.transpose() * Bref; + } else { + block CERES_MAYBE_NOALIAS = Aref.transpose() * Bref; + } + +#else + + DCHECK_GT(num_row_a, 0); + DCHECK_GT(num_col_a, 0); + DCHECK_GT(num_row_b, 0); + DCHECK_GT(num_col_b, 0); + DCHECK_GE(start_row_c, 0); + DCHECK_GE(start_col_c, 0); + DCHECK_GT(row_stride_c, 0); + DCHECK_GT(col_stride_c, 0); + + DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a)); + DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a)); + DCHECK((kRowB == Eigen::Dynamic) || (kRowB == num_row_b)); + DCHECK((kColB == Eigen::Dynamic) || (kColB == num_col_b)); + + const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); + const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); + const int NUM_ROW_B = (kColB != Eigen::Dynamic ? 
kRowB : num_row_b); + const int NUM_COL_B = (kColB != Eigen::Dynamic ? kColB : num_col_b); + DCHECK_EQ(NUM_ROW_A, NUM_ROW_B); + + const int NUM_ROW_C = NUM_COL_A; + const int NUM_COL_C = NUM_COL_B; + DCHECK_LT(start_row_c + NUM_ROW_C, row_stride_c); + DCHECK_LT(start_col_c + NUM_COL_C, col_stride_c); + + for (int row = 0; row < NUM_ROW_C; ++row) { + for (int col = 0; col < NUM_COL_C; ++col) { + double tmp = 0.0; + for (int k = 0; k < NUM_ROW_A; ++k) { + tmp += A[k * NUM_COL_A + row] * B[k * NUM_COL_B + col]; + } + + const int index = (row + start_row_c) * col_stride_c + start_col_c + col; + if (kOperation > 0) { + C[index]+= tmp; + } else if (kOperation < 0) { + C[index]-= tmp; + } else { + C[index]= tmp; + } + } + } +#endif // CERES_NO_CUSTOM_BLAS +} + +template<int kRowA, int kColA, int kOperation> +inline void MatrixVectorMultiply(const double* A, + const int num_row_a, + const int num_col_a, + const double* b, + double* c) { +#ifdef CERES_NO_CUSTOM_BLAS + const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a); + const typename EigenTypes<kColA>::ConstVectorRef bref(b, num_col_a); + typename EigenTypes<kRowA>::VectorRef cref(c, num_row_a); + + if (kOperation > 0) { + cref.noalias() += Aref * bref; + } else if (kOperation < 0) { + cref.noalias() -= Aref * bref; + } else { + cref.noalias() = Aref * bref; + } + +#else + + DCHECK_GT(num_row_a, 0); + DCHECK_GT(num_col_a, 0); + DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a)); + DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a)); + + const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); + const int NUM_COL_A = (kColA != Eigen::Dynamic ? 
kColA : num_col_a); + + for (int row = 0; row < NUM_ROW_A; ++row) { + double tmp = 0.0; + for (int col = 0; col < NUM_COL_A; ++col) { + tmp += A[row * NUM_COL_A + col] * b[col]; + } + + if (kOperation > 0) { + c[row] += tmp; + } else if (kOperation < 0) { + c[row] -= tmp; + } else { + c[row] = tmp; + } + } +#endif // CERES_NO_CUSTOM_BLAS +} + +// c op A' * b; +// +// where op can be +=, -=, or =. +// +// The template parameters (kRowA, kColA) allow specialization of the +// loop at compile time. If this information is not available, then +// Eigen::Dynamic should be used as the template argument. +// +// kOperation = 1 -> c += A' * b +// kOperation = -1 -> c -= A' * b +// kOperation = 0 -> c = A' * b +template<int kRowA, int kColA, int kOperation> +inline void MatrixTransposeVectorMultiply(const double* A, + const int num_row_a, + const int num_col_a, + const double* b, + double* c) { +#ifdef CERES_NO_CUSTOM_BLAS + const typename EigenTypes<kRowA, kColA>::ConstMatrixRef Aref(A, num_row_a, num_col_a); + const typename EigenTypes<kRowA>::ConstVectorRef bref(b, num_row_a); + typename EigenTypes<kColA>::VectorRef cref(c, num_col_a); + + if (kOperation > 0) { + cref.noalias() += Aref.transpose() * bref; + } else if (kOperation < 0) { + cref.noalias() -= Aref.transpose() * bref; + } else { + cref.noalias() = Aref.transpose() * bref; + } + +#else + + DCHECK_GT(num_row_a, 0); + DCHECK_GT(num_col_a, 0); + DCHECK((kRowA == Eigen::Dynamic) || (kRowA == num_row_a)); + DCHECK((kColA == Eigen::Dynamic) || (kColA == num_col_a)); + + const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); + const int NUM_COL_A = (kColA != Eigen::Dynamic ? 
kColA : num_col_a); + + for (int row = 0; row < NUM_COL_A; ++row) { + double tmp = 0.0; + for (int col = 0; col < NUM_ROW_A; ++col) { + tmp += A[col * NUM_COL_A + row] * b[col]; + } + + if (kOperation > 0) { + c[row] += tmp; + } else if (kOperation < 0) { + c[row] -= tmp; + } else { + c[row] = tmp; + } + } +#endif // CERES_NO_CUSTOM_BLAS +} + +#undef CERES_MAYBE_NOALIAS + +} // namespace internal +} // namespace ceres + +#endif // CERES_INTERNAL_BLAS_H_
diff --git a/internal/ceres/blas_test.cc b/internal/ceres/blas_test.cc new file mode 100644 index 0000000..efa7e7b --- /dev/null +++ b/internal/ceres/blas_test.cc
@@ -0,0 +1,303 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2013 Google Inc. All rights reserved. +// http://code.google.com/p/ceres-solver/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// Author: keir@google.com (Keir Mierle) + +#include "ceres/blas.h" + +#include "gtest/gtest.h" +#include "ceres/internal/eigen.h" + +namespace ceres { +namespace internal { + +TEST(BLAS, MatrixMatrixMultiply) { + const double kTolerance = 1e-16; + const int kRowA = 3; + const int kColA = 5; + Matrix A(kRowA, kColA); + A.setOnes(); + + const int kRowB = 5; + const int kColB = 7; + Matrix B(kRowB, kColB); + B.setOnes(); + + for (int row_stride_c = kRowA; row_stride_c < 3 * kRowA; ++row_stride_c) { + for (int col_stride_c = kColB; col_stride_c < 3 * kColB; ++col_stride_c) { + Matrix C(row_stride_c, col_stride_c); + C.setOnes(); + + Matrix C_plus = C; + Matrix C_minus = C; + Matrix C_assign = C; + + Matrix C_plus_ref = C; + Matrix C_minus_ref = C; + Matrix C_assign_ref = C; + for (int start_row_c = 0; start_row_c + kRowA < row_stride_c; ++start_row_c) { + for (int start_col_c = 0; start_col_c + kColB < col_stride_c; ++start_col_c) { + C_plus_ref.block(start_row_c, start_col_c, kRowA, kColB) += + A * B; + + MatrixMatrixMultiply<kRowA, kColA, kRowB, kColB, 1>( + A.data(), kRowA, kColA, + B.data(), kRowB, kColB, + C_plus.data(), start_row_c, start_col_c, row_stride_c, col_stride_c); + + EXPECT_NEAR((C_plus_ref - C_plus).norm(), 0.0, kTolerance) + << "C += A * B \n" + << "row_stride_c : " << row_stride_c << "\n" + << "col_stride_c : " << col_stride_c << "\n" + << "start_row_c : " << start_row_c << "\n" + << "start_col_c : " << start_col_c << "\n" + << "Cref : \n" << C_plus_ref << "\n" + << "C: \n" << C_plus; + + + C_minus_ref.block(start_row_c, start_col_c, kRowA, kColB) -= + A * B; + + MatrixMatrixMultiply<kRowA, kColA, kRowB, kColB, -1>( + A.data(), kRowA, kColA, + B.data(), kRowB, kColB, + C_minus.data(), start_row_c, start_col_c, row_stride_c, col_stride_c); + + EXPECT_NEAR((C_minus_ref - C_minus).norm(), 0.0, kTolerance) + << "C -= A * B \n" + << "row_stride_c : " << row_stride_c << "\n" + << "col_stride_c : " << col_stride_c << "\n" + << "start_row_c : " << 
start_row_c << "\n" + << "start_col_c : " << start_col_c << "\n" + << "Cref : \n" << C_minus_ref << "\n" + << "C: \n" << C_minus; + + C_assign_ref.block(start_row_c, start_col_c, kRowA, kColB) = + A * B; + + MatrixMatrixMultiply<kRowA, kColA, kRowB, kColB, 0>( + A.data(), kRowA, kColA, + B.data(), kRowB, kColB, + C_assign.data(), start_row_c, start_col_c, row_stride_c, col_stride_c); + + EXPECT_NEAR((C_assign_ref - C_assign).norm(), 0.0, kTolerance) + << "C = A * B \n" + << "row_stride_c : " << row_stride_c << "\n" + << "col_stride_c : " << col_stride_c << "\n" + << "start_row_c : " << start_row_c << "\n" + << "start_col_c : " << start_col_c << "\n" + << "Cref : \n" << C_assign_ref << "\n" + << "C: \n" << C_assign; + } + } + } + } +} + +TEST(BLAS, MatrixTransposeMatrixMultiply) { + const double kTolerance = 1e-16; + const int kRowA = 5; + const int kColA = 3; + Matrix A(kRowA, kColA); + A.setOnes(); + + const int kRowB = 5; + const int kColB = 7; + Matrix B(kRowB, kColB); + B.setOnes(); + + for (int row_stride_c = kColA; row_stride_c < 3 * kColA; ++row_stride_c) { + for (int col_stride_c = kColB; col_stride_c < 3 * kColB; ++col_stride_c) { + Matrix C(row_stride_c, col_stride_c); + C.setOnes(); + + Matrix C_plus = C; + Matrix C_minus = C; + Matrix C_assign = C; + + Matrix C_plus_ref = C; + Matrix C_minus_ref = C; + Matrix C_assign_ref = C; + for (int start_row_c = 0; start_row_c + kColA < row_stride_c; ++start_row_c) { + for (int start_col_c = 0; start_col_c + kColB < col_stride_c; ++start_col_c) { + C_plus_ref.block(start_row_c, start_col_c, kColA, kColB) += + A.transpose() * B; + + MatrixTransposeMatrixMultiply<kRowA, kColA, kRowB, kColB, 1>( + A.data(), kRowA, kColA, + B.data(), kRowB, kColB, + C_plus.data(), start_row_c, start_col_c, row_stride_c, col_stride_c); + + EXPECT_NEAR((C_plus_ref - C_plus).norm(), 0.0, kTolerance) + << "C += A' * B \n" + << "row_stride_c : " << row_stride_c << "\n" + << "col_stride_c : " << col_stride_c << "\n" + << "start_row_c : " << 
start_row_c << "\n" + << "start_col_c : " << start_col_c << "\n" + << "Cref : \n" << C_plus_ref << "\n" + << "C: \n" << C_plus; + + C_minus_ref.block(start_row_c, start_col_c, kColA, kColB) -= + A.transpose() * B; + + MatrixTransposeMatrixMultiply<kRowA, kColA, kRowB, kColB, -1>( + A.data(), kRowA, kColA, + B.data(), kRowB, kColB, + C_minus.data(), start_row_c, start_col_c, row_stride_c, col_stride_c); + + EXPECT_NEAR((C_minus_ref - C_minus).norm(), 0.0, kTolerance) + << "C -= A' * B \n" + << "row_stride_c : " << row_stride_c << "\n" + << "col_stride_c : " << col_stride_c << "\n" + << "start_row_c : " << start_row_c << "\n" + << "start_col_c : " << start_col_c << "\n" + << "Cref : \n" << C_minus_ref << "\n" + << "C: \n" << C_minus; + + C_assign_ref.block(start_row_c, start_col_c, kColA, kColB) = + A.transpose() * B; + + MatrixTransposeMatrixMultiply<kRowA, kColA, kRowB, kColB, 0>( + A.data(), kRowA, kColA, + B.data(), kRowB, kColB, + C_assign.data(), start_row_c, start_col_c, row_stride_c, col_stride_c); + + EXPECT_NEAR((C_assign_ref - C_assign).norm(), 0.0, kTolerance) + << "C = A' * B \n" + << "row_stride_c : " << row_stride_c << "\n" + << "col_stride_c : " << col_stride_c << "\n" + << "start_row_c : " << start_row_c << "\n" + << "start_col_c : " << start_col_c << "\n" + << "Cref : \n" << C_assign_ref << "\n" + << "C: \n" << C_assign; + } + } + } + } +} + +TEST(BLAS, MatrixVectorMultiply) { + const double kTolerance = 1e-16; + const int kRowA = 5; + const int kColA = 3; + Matrix A(kRowA, kColA); + A.setOnes(); + + Vector b(kColA); + b.setOnes(); + + Vector c(kRowA); + c.setOnes(); + + Vector c_plus = c; + Vector c_minus = c; + Vector c_assign = c; + + Vector c_plus_ref = c; + Vector c_minus_ref = c; + Vector c_assign_ref = c; + + c_plus_ref += A * b; + MatrixVectorMultiply<kRowA, kColA, 1>(A.data(), kRowA, kColA, + b.data(), + c_plus.data()); + EXPECT_NEAR((c_plus_ref - c_plus).norm(), 0.0, kTolerance) + << "c += A * b \n" + << "c_ref : \n" << c_plus_ref << "\n" 
+ << "c: \n" << c_plus; + + c_minus_ref -= A * b; + MatrixVectorMultiply<kRowA, kColA, -1>(A.data(), kRowA, kColA, + b.data(), + c_minus.data()); + EXPECT_NEAR((c_minus_ref - c_minus).norm(), 0.0, kTolerance) + << "c += A * b \n" + << "c_ref : \n" << c_minus_ref << "\n" + << "c: \n" << c_minus; + + c_assign_ref = A * b; + MatrixVectorMultiply<kRowA, kColA, 0>(A.data(), kRowA, kColA, + b.data(), + c_assign.data()); + EXPECT_NEAR((c_assign_ref - c_assign).norm(), 0.0, kTolerance) + << "c += A * b \n" + << "c_ref : \n" << c_assign_ref << "\n" + << "c: \n" << c_assign; +} + +TEST(BLAS, MatrixTransposeVectorMultiply) { + const double kTolerance = 1e-16; + const int kRowA = 5; + const int kColA = 3; + Matrix A(kRowA, kColA); + A.setRandom(); + + Vector b(kRowA); + b.setRandom(); + + Vector c(kColA); + c.setOnes(); + + Vector c_plus = c; + Vector c_minus = c; + Vector c_assign = c; + + Vector c_plus_ref = c; + Vector c_minus_ref = c; + Vector c_assign_ref = c; + + c_plus_ref += A.transpose() * b; + MatrixTransposeVectorMultiply<kRowA, kColA, 1>(A.data(), kRowA, kColA, + b.data(), + c_plus.data()); + EXPECT_NEAR((c_plus_ref - c_plus).norm(), 0.0, kTolerance) + << "c += A' * b \n" + << "c_ref : \n" << c_plus_ref << "\n" + << "c: \n" << c_plus; + + c_minus_ref -= A.transpose() * b; + MatrixTransposeVectorMultiply<kRowA, kColA, -1>(A.data(), kRowA, kColA, + b.data(), + c_minus.data()); + EXPECT_NEAR((c_minus_ref - c_minus).norm(), 0.0, kTolerance) + << "c += A' * b \n" + << "c_ref : \n" << c_minus_ref << "\n" + << "c: \n" << c_minus; + + c_assign_ref = A.transpose() * b; + MatrixTransposeVectorMultiply<kRowA, kColA, 0>(A.data(), kRowA, kColA, + b.data(), + c_assign.data()); + EXPECT_NEAR((c_assign_ref - c_assign).norm(), 0.0, kTolerance) + << "c += A' * b \n" + << "c_ref : \n" << c_assign_ref << "\n" + << "c: \n" << c_assign; +} + +} // namespace internal +} // namespace ceres
diff --git a/internal/ceres/schur_eliminator.h b/internal/ceres/schur_eliminator.h index 877621b..e0c7fda 100644 --- a/internal/ceres/schur_eliminator.h +++ b/internal/ceres/schur_eliminator.h
@@ -321,9 +321,25 @@ // see the documentation of the Chunk object above. vector<Chunk> chunks_; + // TODO(sameeragarwal): The following two arrays contain per-thread + // storage. They should be refactored into a per thread struct. + // Buffer to store the products of the y and z blocks generated - // during the elimination phase. + // during the elimination phase. buffer_ is of size num_threads * + // buffer_size_. Each thread accesses the chunk + // + // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_] + // scoped_array<double> buffer_; + + // Buffer to store per thread matrix matrix products used by + // ChunkOuterProduct. Like buffer_ it is of size num_threads * + // buffer_size_. Each thread accesses the chunk + // + // [thread_id * buffer_size_ , (thread_id + 1) * buffer_size_] + // + scoped_array<double> chunk_outer_product_buffer_; + int buffer_size_; int num_threads_; int uneliminated_row_begins_;
diff --git a/internal/ceres/schur_eliminator_impl.h b/internal/ceres/schur_eliminator_impl.h index 339c44b..b46eab9 100644 --- a/internal/ceres/schur_eliminator_impl.h +++ b/internal/ceres/schur_eliminator_impl.h
@@ -34,10 +34,6 @@ #ifndef CERES_INTERNAL_SCHUR_ELIMINATOR_IMPL_H_ #define CERES_INTERNAL_SCHUR_ELIMINATOR_IMPL_H_ -#ifdef CERES_USE_OPENMP -#include <omp.h> -#endif - // Eigen has an internal threshold switching between different matrix // multiplication algorithms. In particular for matrices larger than // EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD it uses a cache friendly @@ -46,19 +42,25 @@ // are thin and long, the default choice may not be optimal. This is // the case for us, as the default choice causes a 30% performance // regression when we moved from Eigen2 to Eigen3. + #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 10 +#ifdef CERES_USE_OPENMP +#include <omp.h> +#endif + #include <algorithm> #include <map> #include "Eigen/Dense" +#include "ceres/blas.h" #include "ceres/block_random_access_matrix.h" #include "ceres/block_sparse_matrix.h" #include "ceres/block_structure.h" +#include "ceres/internal/eigen.h" +#include "ceres/internal/scoped_ptr.h" #include "ceres/map_util.h" #include "ceres/schur_eliminator.h" #include "ceres/stl_util.h" -#include "ceres/internal/eigen.h" -#include "ceres/internal/scoped_ptr.h" #include "glog/logging.h" namespace ceres { @@ -149,13 +151,16 @@ buffer_.reset(new double[buffer_size_ * num_threads_]); + // chunk_outer_product_buffer_ only needs to store e_block_size * + // f_block_size, which is always less than buffer_size_, so we just + // allocate buffer_size_ per thread. 
+ chunk_outer_product_buffer_.reset(new double[buffer_size_ * num_threads_]); + STLDeleteElements(&rhs_locks_); rhs_locks_.resize(num_col_blocks - num_eliminate_blocks_); for (int i = 0; i < num_col_blocks - num_eliminate_blocks_; ++i) { rhs_locks_[i] = new Mutex; } - - VLOG(1) << "Eliminator threads: " << num_threads_; } template <int kRowBlockSize, int kEBlockSize, int kFBlockSize> @@ -261,7 +266,7 @@ typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix inverse_ete = ete .template selfadjointView<Eigen::Upper>() - .ldlt() + .llt() .solve(Matrix::Identity(e_block_size, e_block_size)); // For the current chunk compute and update the rhs of the reduced @@ -294,8 +299,8 @@ const int e_block_id = bs->rows[chunk.start].cells.front().block_id; const int e_block_size = bs->cols[e_block_id].size; - typename EigenTypes<kEBlockSize>::VectorRef y_block( - y + bs->cols[e_block_id].position, e_block_size); + double* y_ptr = y + bs->cols[e_block_id].position; + typename EigenTypes<kEBlockSize>::VectorRef y_block(y_ptr, e_block_size); typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix ete(e_block_size, e_block_size); @@ -312,40 +317,35 @@ const double* row_values = A->RowBlockValues(chunk.start + j); const Cell& e_cell = row.cells.front(); DCHECK_EQ(e_block_id, e_cell.block_id); - const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef - e_block(row_values + e_cell.position, - row.block.size, - e_block_size); - typename EigenTypes<kRowBlockSize>::Vector - sj = + typename EigenTypes<kRowBlockSize>::Vector sj = typename EigenTypes<kRowBlockSize>::ConstVectorRef - (b + bs->rows[chunk.start + j].block.position, - row.block.size); + (b + bs->rows[chunk.start + j].block.position, row.block.size); for (int c = 1; c < row.cells.size(); ++c) { const int f_block_id = row.cells[c].block_id; const int f_block_size = bs->cols[f_block_id].size; - const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef - f_block(row_values + row.cells[c].position, - 
row.block.size, f_block_size); const int r_block = f_block_id - num_eliminate_blocks_; - sj -= f_block * - typename EigenTypes<kFBlockSize>::ConstVectorRef - (z + lhs_row_layout_[r_block], f_block_size); + MatrixVectorMultiply<kRowBlockSize, kFBlockSize, -1>( + row_values + row.cells[c].position, row.block.size, f_block_size, + z + lhs_row_layout_[r_block], + sj.data()); } - y_block += e_block.transpose() * sj; - ete.template selfadjointView<Eigen::Upper>() - .rankUpdate(e_block.transpose(), 1.0); + MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>( + row_values + e_cell.position, row.block.size, e_block_size, + sj.data(), + y_ptr); + + MatrixTransposeMatrixMultiply + <kRowBlockSize, kEBlockSize,kRowBlockSize, kEBlockSize, 1>( + row_values + e_cell.position, row.block.size, e_block_size, + row_values + e_cell.position, row.block.size, e_block_size, + ete.data(), 0, 0, e_block_size, e_block_size); } - y_block = - ete - .template selfadjointView<Eigen::Upper>() - .ldlt() - .solve(y_block); + ete.llt().solveInPlace(y_block); } } @@ -382,15 +382,12 @@ for (int c = 1; c < row.cells.size(); ++c) { const int block_id = row.cells[c].block_id; const int block_size = bs->cols[block_id].size; - const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef - b(row_values + row.cells[c].position, - row.block.size, block_size); - const int block = block_id - num_eliminate_blocks_; CeresMutexLock l(rhs_locks_[block]); - typename EigenTypes<kFBlockSize>::VectorRef - (rhs + lhs_row_layout_[block], block_size).noalias() - += b.transpose() * sj; + MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>( + row_values + row.cells[c].position, + row.block.size, block_size, + sj.data(), rhs + lhs_row_layout_[block]); } b_pos += row.block.size; } @@ -446,34 +443,31 @@ // Extract the e_block, ETE += E_i' E_i const Cell& e_cell = row.cells.front(); - const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef - e_block(row_values + e_cell.position, - 
row.block.size, - e_block_size); - - ete->template selfadjointView<Eigen::Upper>() - .rankUpdate(e_block.transpose(), 1.0); + MatrixTransposeMatrixMultiply + <kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>( + row_values + e_cell.position, row.block.size, e_block_size, + row_values + e_cell.position, row.block.size, e_block_size, + ete->data(), 0, 0, e_block_size, e_block_size); // g += E_i' b_i - g->noalias() += e_block.transpose() * - typename EigenTypes<kRowBlockSize>::ConstVectorRef - (b + b_pos, row.block.size); + MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>( + row_values + e_cell.position, row.block.size, e_block_size, + b + b_pos, + g->data()); + // buffer = E'F. This computation is done by iterating over the // f_blocks for each row in the chunk. for (int c = 1; c < row.cells.size(); ++c) { const int f_block_id = row.cells[c].block_id; const int f_block_size = bs->cols[f_block_id].size; - const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef - f_block(row_values + row.cells[c].position, - row.block.size, f_block_size); - double* buffer_ptr = buffer + FindOrDie(chunk.buffer_layout, f_block_id); - - typename EigenTypes<kEBlockSize, kFBlockSize>::MatrixRef - (buffer_ptr, e_block_size, f_block_size).noalias() - += e_block.transpose() * f_block; + MatrixTransposeMatrixMultiply + <kRowBlockSize, kEBlockSize, kRowBlockSize, kFBlockSize, 1>( + row_values + e_cell.position, row.block.size, e_block_size, + row_values + row.cells[c].position, row.block.size, f_block_size, + buffer_ptr, 0, 0, e_block_size, f_block_size); } b_pos += row.block.size; } @@ -497,15 +491,24 @@ // references to the left hand side. 
const int e_block_size = inverse_ete.rows(); BufferLayoutType::const_iterator it1 = buffer_layout.begin(); + +#ifdef CERES_USE_OPENMP + int thread_id = omp_get_thread_num(); +#else + int thread_id = 0; +#endif + double* b1_transpose_inverse_ete = + chunk_outer_product_buffer_.get() + thread_id * buffer_size_; + // S(i,j) -= bi' * ete^{-1} b_j for (; it1 != buffer_layout.end(); ++it1) { const int block1 = it1->first - num_eliminate_blocks_; const int block1_size = bs->cols[it1->first].size; - - const typename EigenTypes<kEBlockSize, kFBlockSize>::ConstMatrixRef - b1(buffer + it1->second, e_block_size, block1_size); - const typename EigenTypes<kFBlockSize, kEBlockSize>::Matrix - b1_transpose_inverse_ete = b1.transpose() * inverse_ete; + MatrixTransposeMatrixMultiply + <kEBlockSize, kFBlockSize, kEBlockSize, kEBlockSize, 0>( + buffer + it1->second, e_block_size, block1_size, + inverse_ete.data(), e_block_size, e_block_size, + b1_transpose_inverse_ete, 0, 0, block1_size, e_block_size); BufferLayoutType::const_iterator it2 = it1; for (; it2 != buffer_layout.end(); ++it2) { @@ -515,46 +518,15 @@ CellInfo* cell_info = lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride); - if (cell_info == NULL) { - continue; + if (cell_info != NULL) { + const int block2_size = bs->cols[it2->first].size; + CeresMutexLock l(&cell_info->m); + MatrixMatrixMultiply + <kFBlockSize, kEBlockSize, kEBlockSize, kFBlockSize, -1>( + b1_transpose_inverse_ete, block1_size, e_block_size, + buffer + it2->second, e_block_size, block2_size, + cell_info->values, r, c, row_stride, col_stride); } - - const int block2_size = bs->cols[it2->first].size; - const typename EigenTypes<kEBlockSize, kFBlockSize>::ConstMatrixRef - b2(buffer + it2->second, e_block_size, block2_size); - - CeresMutexLock l(&cell_info->m); - MatrixRef m(cell_info->values, row_stride, col_stride); - - // We explicitly construct a block object here instead of using - // m.block(), as m.block() variant of the constructor does not - 
// allow mixing of template sizing and runtime sizing parameters - // like the Matrix class does. - Eigen::Block<MatrixRef, kFBlockSize, kFBlockSize> - block(m, r, c, block1_size, block2_size); -#ifdef CERES_WORK_AROUND_ANDROID_NDK_COMPILER_BUG - // Removing the ".noalias()" annotation on the following statement is - // necessary to produce a correct build with the Android NDK, including - // versions 6, 7, 8, and 8b, when built with STLPort and the - // non-standalone toolchain (i.e. ndk-build). This appears to be a - // compiler bug; if the workaround is not in place, the line - // - // block.noalias() -= b1_transpose_inverse_ete * b2; - // - // gets compiled to - // - // block.noalias() += b1_transpose_inverse_ete * b2; - // - // which breaks schur elimination. Introducing a temporary by removing the - // .noalias() annotation causes the issue to disappear. Tracking this - // issue down was tricky, since the test suite doesn't run when built with - // the non-standalone toolchain. - // - // TODO(keir): Make a reproduction case for this and send it upstream. - block -= b1_transpose_inverse_ete * b2; -#else - block.noalias() -= b1_transpose_inverse_ete * b2; -#endif // CERES_WORK_AROUND_ANDROID_NDK_COMPILER_BUG } } } @@ -624,10 +596,13 @@ &row_stride, &col_stride); if (cell_info != NULL) { CeresMutexLock l(&cell_info->m); - MatrixRef m(cell_info->values, row_stride, col_stride); - m.block(r, c, block1_size, block1_size) - .selfadjointView<Eigen::Upper>() - .rankUpdate(b1.transpose(), 1.0); + // This multiply currently ignores the fact that this is a + // symmetric outer product. 
+ MatrixTransposeMatrixMultiply + <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>( + row_values + row.cells[i].position, row.block.size, block1_size, + row_values + row.cells[i].position, row.block.size, block1_size, + cell_info->values, r, c, row_stride, col_stride); } for (int j = i + 1; j < row.cells.size(); ++j) { @@ -638,17 +613,15 @@ CellInfo* cell_info = lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride); - if (cell_info == NULL) { - continue; + if (cell_info != NULL) { + const int block2_size = bs->cols[row.cells[j].block_id].size; + CeresMutexLock l(&cell_info->m); + MatrixTransposeMatrixMultiply + <Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>( + row_values + row.cells[i].position, row.block.size, block1_size, + row_values + row.cells[j].position, row.block.size, block2_size, + cell_info->values, r, c, row_stride, col_stride); } - - const int block2_size = bs->cols[row.cells[j].block_id].size; - CeresMutexLock l(&cell_info->m); - MatrixRef m(cell_info->values, row_stride, col_stride); - m.block(r, c, block1_size, block2_size).noalias() += - b1.transpose() * ConstMatrixRef(row_values + row.cells[j].position, - row.block.size, - block2_size); } } } @@ -670,25 +643,18 @@ DCHECK_GE(block1, 0); const int block1_size = bs->cols[row.cells[i].block_id].size; - const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef - b1(row_values + row.cells[i].position, - row.block.size, block1_size); - { - int r, c, row_stride, col_stride; - CellInfo* cell_info = lhs->GetCell(block1, block1, - &r, &c, - &row_stride, &col_stride); - if (cell_info == NULL) { - continue; - } - + int r, c, row_stride, col_stride; + CellInfo* cell_info = lhs->GetCell(block1, block1, + &r, &c, + &row_stride, &col_stride); + if (cell_info != NULL) { CeresMutexLock l(&cell_info->m); - MatrixRef m(cell_info->values, row_stride, col_stride); - - Eigen::Block<MatrixRef, kFBlockSize, kFBlockSize> - block(m, r, c, block1_size, block1_size); 
- block.template selfadjointView<Eigen::Upper>() - .rankUpdate(b1.transpose(), 1.0); + // block += b1.transpose() * b1; + MatrixTransposeMatrixMultiply + <kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>( + row_values + row.cells[i].position, row.block.size, block1_size, + row_values + row.cells[i].position, row.block.size, block1_size, + cell_info->values, r, c, row_stride, col_stride); } for (int j = i + 1; j < row.cells.size(); ++j) { @@ -700,20 +666,14 @@ CellInfo* cell_info = lhs->GetCell(block1, block2, &r, &c, &row_stride, &col_stride); - if (cell_info == NULL) { - continue; + if (cell_info != NULL) { + // block += b1.transpose() * b2; + MatrixTransposeMatrixMultiply + <kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>( + row_values + row.cells[i].position, row.block.size, block1_size, + row_values + row.cells[j].position, row.block.size, block2_size, + cell_info->values, r, c, row_stride, col_stride); } - - const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef - b2(row_values + row.cells[j].position, - row.block.size, - block2_size); - - CeresMutexLock l(&cell_info->m); - MatrixRef m(cell_info->values, row_stride, col_stride); - Eigen::Block<MatrixRef, kFBlockSize, kFBlockSize> - block(m, r, c, block1_size, block2_size); - block.noalias() += b1.transpose() * b2; } } }