Make the code in small_blas_generic.h more compiler-friendly. Instead of keeping four separate scalar accumulators, store them in a single array, since they are always accessed together as a group of four. Change-Id: I773cfc08cf53b66032985c11a4b0ebc06db06083
diff --git a/internal/ceres/small_blas_generic.h b/internal/ceres/small_blas_generic.h index f5aa909..dd21f59 100644 --- a/internal/ceres/small_blas_generic.h +++ b/internal/ceres/small_blas_generic.h
@@ -39,34 +39,32 @@ namespace internal { // The following macros are used to share code -#define CERES_GEMM_OPT_NAIVE_HEADER \ - double c0 = 0.0; \ - double c1 = 0.0; \ - double c2 = 0.0; \ - double c3 = 0.0; \ - const double* pa = a; \ - const double* pb = b; \ - const int span = 4; \ - int col_r = col_a & (span - 1); \ +#define CERES_GEMM_OPT_NAIVE_HEADER \ + double cvec4[4] = {0.0, 0.0, 0.0, 0.0}; \ + const double* pa = a; \ + const double* pb = b; \ + const int span = 4; \ + int col_r = col_a & (span - 1); \ int col_m = col_a - col_r; #define CERES_GEMM_OPT_STORE_MAT1X4 \ if (kOperation > 0) { \ - *c++ += c0; \ - *c++ += c1; \ - *c++ += c2; \ - *c++ += c3; \ + c[0] += cvec4[0]; \ + c[1] += cvec4[1]; \ + c[2] += cvec4[2]; \ + c[3] += cvec4[3]; \ } else if (kOperation < 0) { \ - *c++ -= c0; \ - *c++ -= c1; \ - *c++ -= c2; \ - *c++ -= c3; \ + c[0] -= cvec4[0]; \ + c[1] -= cvec4[1]; \ + c[2] -= cvec4[2]; \ + c[3] -= cvec4[3]; \ } else { \ - *c++ = c0; \ - *c++ = c1; \ - *c++ = c2; \ - *c++ = c3; \ - } + c[0] = cvec4[0]; \ + c[1] = cvec4[1]; \ + c[2] = cvec4[2]; \ + c[3] = cvec4[3]; \ + } \ + c += 4; // Matrix-Matrix Multiplication // Figure out 1x4 of Matrix C in one batch @@ -100,10 +98,10 @@ #define CERES_GEMM_OPT_MMM_MAT1X4_MUL \ av = pa[k]; \ pb = b + bi; \ - c0 += av * pb[0]; \ - c1 += av * pb[1]; \ - c2 += av * pb[2]; \ - c3 += av * pb[3]; \ + cvec4[0] += av * pb[0]; \ + cvec4[1] += av * pb[1]; \ + cvec4[2] += av * pb[2]; \ + cvec4[3] += av * pb[3]; \ pb += 4; \ bi += col_stride_b; \ k++; @@ -168,10 +166,10 @@ #define CERES_GEMM_OPT_MTM_MAT1X4_MUL \ av = pa[ai]; \ pb = b + bi; \ - c0 += av * pb[0]; \ - c1 += av * pb[1]; \ - c2 += av * pb[2]; \ - c3 += av * pb[3]; \ + cvec4[0] += av * pb[0]; \ + cvec4[1] += av * pb[1]; \ + cvec4[2] += av * pb[2]; \ + cvec4[3] += av * pb[3]; \ pb += 4; \ ai += col_stride_a; \ bi += col_stride_b; @@ -221,13 +219,13 @@ double bv = 0.0; // clang-format off -#define CERES_GEMM_OPT_MVM_MAT4X1_MUL \ - bv = *pb; \ - c0 += *(pa ) * bv; 
\ - c1 += *(pa + col_stride_a ) * bv; \ - c2 += *(pa + col_stride_a * 2) * bv; \ - c3 += *(pa + col_stride_a * 3) * bv; \ - pa++; \ +#define CERES_GEMM_OPT_MVM_MAT4X1_MUL \ + bv = *pb; \ + cvec4[0] += *(pa ) * bv; \ + cvec4[1] += *(pa + col_stride_a ) * bv; \ + cvec4[2] += *(pa + col_stride_a * 2) * bv; \ + cvec4[3] += *(pa + col_stride_a * 3) * bv; \ + pa++; \ pb++; // clang-format on @@ -285,16 +283,14 @@ CERES_GEMM_OPT_NAIVE_HEADER double bv = 0.0; - // clang-format off #define CERES_GEMM_OPT_MTV_MAT4X1_MUL \ bv = *pb; \ - c0 += *(pa ) * bv; \ - c1 += *(pa + 1) * bv; \ - c2 += *(pa + 2) * bv; \ - c3 += *(pa + 3) * bv; \ + cvec4[0] += pa[0] * bv; \ + cvec4[1] += pa[1] * bv; \ + cvec4[2] += pa[2] * bv; \ + cvec4[3] += pa[3] * bv; \ pa += col_stride_a; \ pb++; - // clang-format on for (int k = 0; k < col_m; k += span) { CERES_GEMM_OPT_MTV_MAT4X1_MUL