Revert "Optimization for custom small blas multiplication with dynamic" This reverts commit 68cc71ce5d7ce58751343ccf9cb5aec6dcd60f8e. Reason for revert: Breaks a number of tests. Change-Id: I543eaf2103e9903e9026a1fcc2c0bf517d971190
diff --git a/internal/ceres/small_blas.h b/internal/ceres/small_blas.h index 34b4ec7..264ac53 100644 --- a/internal/ceres/small_blas.h +++ b/internal/ceres/small_blas.h
@@ -38,7 +38,6 @@ #include "ceres/internal/port.h" #include "ceres/internal/eigen.h" #include "glog/logging.h" -#include "small_blas_generic.h" namespace ceres { namespace internal { @@ -90,26 +89,6 @@ B, num_row_b, num_col_b, \ C, start_row_c, start_col_c, row_stride_c, col_stride_c); -#define CERES_GEMM_STORE_SINGLE(p, index, value) \ - if (kOperation > 0) { \ - p[index] += value; \ - } else if (kOperation < 0) { \ - p[index] -= value; \ - } else { \ - p[index] = value; \ - } - -#define CERES_GEMM_STORE_PAIR(p, index, v1, v2) \ - if (kOperation > 0) { \ - p[index] += v1; \ - p[index + 1] += v2; \ - } else if (kOperation < 0) { \ - p[index] -= v1; \ - p[index + 1] -= v2; \ - } else { \ - p[index] = v1; \ - p[index + 1] = v2; \ - } // For the matrix-matrix functions below, there are three variants for // each functionality. Foo, FooNaive and FooEigen. Foo is the one to @@ -181,64 +160,24 @@ const int NUM_COL_C = NUM_COL_B; DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c); DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c); - const int span = 4; - // Calculate the remainder part first. - - // Process the last odd column if present. - if (NUM_COL_C & 1) { - int col = NUM_COL_C - 1; - const double* pa = &A[0]; - for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) { - const double* pb = &B[col]; + for (int row = 0; row < NUM_ROW_C; ++row) { + for (int col = 0; col < NUM_COL_C; ++col) { double tmp = 0.0; - for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) { - tmp += pa[k] * pb[0]; + for (int k = 0; k < NUM_COL_A; ++k) { + tmp += A[row * NUM_COL_A + k] * B[k * NUM_COL_B + col]; } const int index = (row + start_row_c) * col_stride_c + start_col_c + col; - CERES_GEMM_STORE_SINGLE(C, index, tmp); - } - - // Return directly for efficiency of extremely small matrix multiply. - if (NUM_COL_C == 1) { - return; - } - } - - // Process the couple columns in remainder if present. - if (NUM_COL_C & 2) { - int col = NUM_COL_C & (int)(~(span - 1)) ; - const double* pa = &A[0]; - for (int row = 0; row < NUM_ROW_C; ++row, pa += NUM_COL_A) { - const double* pb = &B[col]; - double tmp1 = 0.0, tmp2 = 0.0; - for (int k = 0; k < NUM_COL_A; ++k, pb += NUM_COL_B) { - double av = pa[k]; - tmp1 += av * pb[0]; - tmp2 += av * pb[1]; + if (kOperation > 0) { + C[index] += tmp; + } else if (kOperation < 0) { + C[index] -= tmp; + } else { + C[index] = tmp; } - - const int index = (row + start_row_c) * col_stride_c + start_col_c + col; - CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2); - } - - // Return directly for efficiency of extremely small matrix multiply. - if (NUM_COL_C < span) { - return; } } - - // Calculate the main part with multiples of 4. - int col_m = NUM_COL_C & (int)(~(span - 1)); - for (int col = 0; col < col_m; col += span) { - for (int row = 0; row < NUM_ROW_C; ++row) { - const int index = (row + start_row_c) * col_stride_c + start_col_c + col; - MMM_mat1x4(NUM_COL_A, &A[row * NUM_COL_A], - &B[col], NUM_COL_B, &C[index], kOperation); - } - } - } CERES_GEMM_BEGIN(MatrixMatrixMultiply) { @@ -281,68 +220,24 @@ const int NUM_COL_C = NUM_COL_B; DCHECK_LE(start_row_c + NUM_ROW_C, row_stride_c); DCHECK_LE(start_col_c + NUM_COL_C, col_stride_c); - const int span = 4; - // Process the remainder part first. - - // Process the last odd column if present. - if (NUM_COL_C & 1) { - int col = NUM_COL_C - 1; - for (int row = 0; row < NUM_ROW_C; ++row) { - const double* pa = &A[row]; - const double* pb = &B[col]; + for (int row = 0; row < NUM_ROW_C; ++row) { + for (int col = 0; col < NUM_COL_C; ++col) { double tmp = 0.0; for (int k = 0; k < NUM_ROW_A; ++k) { - tmp += pa[0] * pb[0]; - pa += NUM_COL_A; - pb += NUM_COL_B; + tmp += A[k * NUM_COL_A + row] * B[k * NUM_COL_B + col]; } const int index = (row + start_row_c) * col_stride_c + start_col_c + col; - CERES_GEMM_STORE_SINGLE(C, index, tmp); - } - - // Return directly for efficiency of extremely small matrix multiply. - if (NUM_COL_C == 1) { - return; - } - } - - // Process the couple columns in remainder if present. - if (NUM_COL_C & 2) { - int col = NUM_COL_C & (int)(~(span - 1)) ; - for (int row = 0; row < NUM_ROW_C; ++row) { - const double* pa = &A[row]; - const double* pb = &B[col]; - double tmp1 = 0.0, tmp2 = 0.0; - for (int k = 0; k < NUM_ROW_A; ++k) { - double av = *pa; - tmp1 += av * pb[0]; - tmp2 += av * pb[1]; - pa += NUM_COL_A; - pb += NUM_COL_B; + if (kOperation > 0) { + C[index]+= tmp; + } else if (kOperation < 0) { + C[index]-= tmp; + } else { + C[index]= tmp; } - - const int index = (row + start_row_c) * col_stride_c + start_col_c + col; - CERES_GEMM_STORE_PAIR(C, index, tmp1, tmp2); - } - - // Return directly for efficiency of extremely small matrix multiply. - if (NUM_COL_C < span) { - return; } } - - // Process the main part with multiples of 4. - int col_m = NUM_COL_C & (int)(~(span - 1)); - for (int col = 0; col < col_m; col += span) { - for (int row = 0; row < NUM_ROW_C; ++row) { - const int index = (row + start_row_c) * col_stride_c + start_col_c + col; - MTM_mat1x4(NUM_ROW_A, &A[row], NUM_COL_A, - &B[col], NUM_COL_B, &C[index], kOperation); - } - } - } CERES_GEMM_BEGIN(MatrixTransposeMatrixMultiply) { @@ -406,54 +301,21 @@ const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); - const int span = 4; - // Calculate the remainder part first. - - // Process the last odd row if present. - if (NUM_ROW_A & 1) { - int row = NUM_ROW_A - 1; - const double* pa = &A[row * NUM_COL_A]; - const double* pb = &b[0]; + for (int row = 0; row < NUM_ROW_A; ++row) { double tmp = 0.0; for (int col = 0; col < NUM_COL_A; ++col) { - tmp += (*pa++) * (*pb++); + tmp += A[row * NUM_COL_A + col] * b[col]; } - CERES_GEMM_STORE_SINGLE(c, row, tmp); - // Return directly for efficiency of extremely small matrix multiply. - if (NUM_ROW_A == 1) { - return; + if (kOperation > 0) { + c[row] += tmp; + } else if (kOperation < 0) { + c[row] -= tmp; + } else { + c[row] = tmp; } } - - // Process the couple rows in remainder if present. - if (NUM_ROW_A & 2) { - int row = NUM_ROW_A & (int)(~(span - 1)); - const double* pa1 = &A[row * NUM_COL_A]; - const double* pa2 = pa1 + NUM_COL_A; - const double* pb = &b[0]; - double tmp1 = 0.0, tmp2 = 0.0; - for (int col = 0; col < NUM_ROW_A; ++col) { - double bv = *pb++; - tmp1 += *(pa1++) * bv; - tmp2 += *(pa2++) * bv; - } - CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2); - - // Return directly for efficiency of extremely small matrix multiply. - if (NUM_ROW_A < span) { - return; - } - } - - // Calculate the main part with multiples of 4. - int row_m = NUM_ROW_A & (int)(~(span - 1)); - for (int row = 0; row < row_m; row += span) { - MVM_mat4x1(NUM_COL_A, &A[row * NUM_COL_A], NUM_COL_A, - &b[0], &c[row], kOperation); - } - #endif // CERES_NO_CUSTOM_BLAS } @@ -490,55 +352,21 @@ const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); - const int span = 4; - // Calculate the remainder part first. - - // Process the last odd column if present. - if (NUM_COL_A & 1) { - int row = NUM_COL_A - 1; - const double* pa = &A[row]; - const double* pb = &b[0]; + for (int row = 0; row < NUM_COL_A; ++row) { double tmp = 0.0; for (int col = 0; col < NUM_ROW_A; ++col) { - tmp += *pa * (*pb++); - pa += NUM_COL_A; + tmp += A[col * NUM_COL_A + row] * b[col]; } - CERES_GEMM_STORE_SINGLE(c, row, tmp); - // Return directly for efficiency of extremely small matrix multiply. - if (NUM_COL_A == 1) { - return; + if (kOperation > 0) { + c[row] += tmp; + } else if (kOperation < 0) { + c[row] -= tmp; + } else { + c[row] = tmp; } } - - // Process the couple columns in remainder if present. - if (NUM_COL_A & 2) { - int row = NUM_COL_A & (int)(~(span - 1)); - const double* pa = &A[row]; - const double* pb = &b[0]; - double tmp1 = 0.0, tmp2 = 0.0; - for (int col = 0; col < NUM_ROW_A; ++col) { - double bv = *pb++; - tmp1 += *(pa ) * bv; - tmp2 += *(pa + 1) * bv; - pa += NUM_COL_A; - } - CERES_GEMM_STORE_PAIR(c, row, tmp1, tmp2); - - // Return directly for efficiency of extremely small matrix multiply. - if (NUM_COL_A < span) { - return; - } - } - - // Calculate the main part with multiples of 4. - int row_m = NUM_COL_A & (int)(~(span - 1)); - for (int row = 0; row < row_m; row += span) { - MTV_mat4x1(NUM_ROW_A, &A[row], NUM_COL_A, - &b[0], &c[row], kOperation); - } - #endif // CERES_NO_CUSTOM_BLAS } @@ -546,8 +374,6 @@ #undef CERES_GEMM_EIGEN_HEADER #undef CERES_GEMM_NAIVE_HEADER #undef CERES_CALL_GEMM -#undef CERES_GEMM_STORE_SINGLE -#undef CERES_GEMM_STORE_PAIR } // namespace internal } // namespace ceres
diff --git a/internal/ceres/small_blas_generic.h b/internal/ceres/small_blas_generic.h deleted file mode 100644 index 978c5d5..0000000 --- a/internal/ceres/small_blas_generic.h +++ /dev/null
@@ -1,315 +0,0 @@ -// Ceres Solver - A fast non-linear least squares minimizer -// Copyright 2018 Google Inc. All rights reserved. -// http://ceres-solver.org/ -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// * Neither the name of Google Inc. nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// Author: yangfan34@lenovo.com (Lenovo Research Device+ Lab - Shanghai) -// -// Optimization for simple blas functions used in the Schur Eliminator. -// These are fairly basic implementations which already yield a significant -// speedup in the eliminator performance. - -#ifndef CERES_INTERNAL_SMALL_BLAS_GENERIC_H_ -#define CERES_INTERNAL_SMALL_BLAS_GENERIC_H_ - -namespace ceres { -namespace internal { - -// The following macros are used to share code -#define CERES_GEMM_OPT_NAIVE_HEADER \ - double c0 = 0.0; \ - double c1 = 0.0; \ - double c2 = 0.0; \ - double c3 = 0.0; \ - const double* pa = a; \ - const double* pb = b; \ - const int span = 4; \ - int col_r = col_a & (span - 1); \ - int col_m = col_a - col_r; - -#define CERES_GEMM_OPT_STORE_MAT1X4 \ - if (kOperation > 0) { \ - *c++ += c0; \ - *c++ += c1; \ - *c++ += c2; \ - *c++ += c3; \ - } else if (kOperation < 0) { \ - *c++ -= c0; \ - *c++ -= c1; \ - *c++ -= c2; \ - *c++ -= c3; \ - } else { \ - *c++ = c0; \ - *c++ = c1; \ - *c++ = c2; \ - *c++ = c3; \ - } - -// Matrix-Matrix Multiplication -// Figure out 1x4 of Matrix C in one batch -// -// c op a * B; -// where op can be +=, -=, or =, indicated by kOperation. -// -// Matrix C Matrix A Matrix B -// -// C0, C1, C2, C3 op A0, A1, A2, A3, ... * B0, B1, B2, B3 -// B4, B5, B6, B7 -// B8, B9, Ba, Bb -// Bc, Bd, Be, Bf -// . , . , . , . -// . , . , . , . -// . , . , . , . -// -// unroll for loops -// utilize the data resided in cache -// NOTE: col_a means the columns of A -static inline void MMM_mat1x4(const int col_a, - const double* a, - const double* b, - const int col_stride_b, - double* c, - const int kOperation) { - CERES_GEMM_OPT_NAIVE_HEADER - double av = 0.0; - int bi = 0; - -#define CERES_GEMM_OPT_MMM_MAT1X4_MUL \ - av = pa[k]; \ - pb = b + bi; \ - c0 += av * *pb++; \ - c1 += av * *pb++; \ - c2 += av * *pb++; \ - c3 += av * *pb++; \ - bi += col_stride_b; \ - k++; - - for (int k = 0; k < col_m;) { - CERES_GEMM_OPT_MMM_MAT1X4_MUL - CERES_GEMM_OPT_MMM_MAT1X4_MUL - CERES_GEMM_OPT_MMM_MAT1X4_MUL - CERES_GEMM_OPT_MMM_MAT1X4_MUL - } - - for (int k = col_m; k < col_a;) { - CERES_GEMM_OPT_MMM_MAT1X4_MUL - } - - CERES_GEMM_OPT_STORE_MAT1X4 - -#undef CERES_GEMM_OPT_MMM_MAT1X4_MUL -} - -// Matrix Transpose-Matrix multiplication -// Figure out 1x4 of Matrix C in one batch -// -// c op a' * B; -// where op can be +=, -=, or = indicated by kOperation. -// -// Matrix A -// -// A0 -// A1 -// A2 -// A3 -// . -// . -// . -// -// Matrix C Matrix A' Matrix B -// -// C0, C1, C2, C3 op A0, A1, A2, A3, ... * B0, B1, B2, B3 -// B4, B5, B6, B7 -// B8, B9, Ba, Bb -// Bc, Bd, Be, Bf -// . , . , . , . -// . , . , . , . -// . , . , . , . -// -// unroll for loops -// utilize the data resided in cache -// NOTE: col_a means the columns of A' -static inline void MTM_mat1x4(const int col_a, - const double* a, - const int col_stride_a, - const double* b, - const int col_stride_b, - double* c, - const int kOperation) { - CERES_GEMM_OPT_NAIVE_HEADER - double av = 0.0; - int ai = 0; - int bi = 0; - -#define CERES_GEMM_OPT_MTM_MAT1X4_MUL \ - av = pa[ai]; \ - pb = b + bi; \ - c0 += av * *pb++; \ - c1 += av * *pb++; \ - c2 += av * *pb++; \ - c3 += av * *pb++; \ - ai += col_stride_a; \ - bi += col_stride_b; - - for (int k = 0; k < col_m; k += span) { - CERES_GEMM_OPT_MTM_MAT1X4_MUL - CERES_GEMM_OPT_MTM_MAT1X4_MUL - CERES_GEMM_OPT_MTM_MAT1X4_MUL - CERES_GEMM_OPT_MTM_MAT1X4_MUL - } - - for (int k = col_m; k < col_a; k++) { - CERES_GEMM_OPT_MTM_MAT1X4_MUL - } - - CERES_GEMM_OPT_STORE_MAT1X4 - -#undef CERES_GEMM_OPT_MTM_MAT1X4_MUL -} - -// Matrix-Vector Multiplication -// Figure out 4x1 of vector c in one batch -// -// c op A * b; -// where op can be +=, -=, or =, indicated by kOperation. -// -// Vector c Matrix A Vector b -// -// C0 op A0, A1, A2, A3, ... * B0 -// C1 A4, A5, A6, A7, ... B1 -// C2 A8, A9, Aa, Ab, ... B2 -// C3 Ac, Ad, Ae, Af, ... B3 -// . -// . -// . -// -// unroll for loops -// utilize the data resided in cache -// NOTE: col_a means the columns of A -static inline void MVM_mat4x1(const int col_a, - const double* a, - const int col_stride_a, - const double* b, - double* c, - const int kOperation) { - CERES_GEMM_OPT_NAIVE_HEADER - double bv = 0.0; - -#define CERES_GEMM_OPT_MVM_MAT4X1_MUL \ - bv = *pb; \ - c0 += *(pa ) * bv; \ - c1 += *(pa + col_stride_a ) * bv; \ - c2 += *(pa + col_stride_a * 2) * bv; \ - c3 += *(pa + col_stride_a * 3) * bv; \ - pa++; \ - pb++; - - for (int k = 0; k < col_m; k += span) { - CERES_GEMM_OPT_MVM_MAT4X1_MUL - CERES_GEMM_OPT_MVM_MAT4X1_MUL - CERES_GEMM_OPT_MVM_MAT4X1_MUL - CERES_GEMM_OPT_MVM_MAT4X1_MUL - } - - for (int k = col_m; k < col_a; k++) { - CERES_GEMM_OPT_MVM_MAT4X1_MUL - } - - CERES_GEMM_OPT_STORE_MAT1X4 - -#undef CERES_GEMM_OPT_MVM_MAT4X1_MUL -} - -// Matrix Transpose-Vector multiplication -// Figure out 4x1 of vector c in one batch -// -// c op A' * b; -// where op can be +=, -=, or =, indicated by kOperation. -// -// Matrix A -// -// A0, A4, A8, Ac -// A1, A5, A9, Ad -// A2, A6, Aa, Ae -// A3, A7, Ab, Af -// . , . , . , . -// . , . , . , . -// . , . , . , . -// -// Vector c Matrix A' Vector b -// -// C0 op A0, A1, A2, A3, ... * B0 -// C1 A4, A5, A6, A7, ... B1 -// C2 A8, A9, Aa, Ab, ... B2 -// C3 Ac, Ad, Ae, Af, ... B3 -// . -// . -// . -// -// unroll for loops -// utilize the data resided in cache -// NOTE: col_a means the columns of A' -static inline void MTV_mat4x1(const int col_a, - const double* a, - const int col_stride_a, - const double* b, - double* c, - const int kOperation) { - CERES_GEMM_OPT_NAIVE_HEADER - double bv = 0.0; - -#define CERES_GEMM_OPT_MTV_MAT4X1_MUL \ - bv = *pb; \ - c0 += *(pa ) * bv; \ - c1 += *(pa + 1) * bv; \ - c2 += *(pa + 2) * bv; \ - c3 += *(pa + 3) * bv; \ - pa += col_stride_a; \ - pb++; - - for (int k = 0; k < col_m; k += span) { - CERES_GEMM_OPT_MTV_MAT4X1_MUL - CERES_GEMM_OPT_MTV_MAT4X1_MUL - CERES_GEMM_OPT_MTV_MAT4X1_MUL - CERES_GEMM_OPT_MTV_MAT4X1_MUL - } - - for (int k = col_m; k < col_a; k++) { - CERES_GEMM_OPT_MTV_MAT4X1_MUL - } - - CERES_GEMM_OPT_STORE_MAT1X4 - -#undef CERES_GEMM_OPT_MTV_MAT4X1_MUL -} - -#undef CERES_GEMM_OPT_NAIVE_HEADER -#undef CERES_GEMM_OPT_STORE_MAT1X4 - -} // namespace internal -} // namespace ceres - -#endif // CERES_INTERNAL_SMALL_BLAS_GENERIC_H_