Make the code in small_blas_generic.h more compiler-friendly.
Instead of keeping four separate scalar accumulators, store them in a
four-element array, since they are always accessed together as a group of four.
Change-Id: I773cfc08cf53b66032985c11a4b0ebc06db06083
diff --git a/internal/ceres/small_blas_generic.h b/internal/ceres/small_blas_generic.h
index f5aa909..dd21f59 100644
--- a/internal/ceres/small_blas_generic.h
+++ b/internal/ceres/small_blas_generic.h
@@ -39,34 +39,32 @@
namespace internal {
// The following macros are used to share code
-#define CERES_GEMM_OPT_NAIVE_HEADER \
- double c0 = 0.0; \
- double c1 = 0.0; \
- double c2 = 0.0; \
- double c3 = 0.0; \
- const double* pa = a; \
- const double* pb = b; \
- const int span = 4; \
- int col_r = col_a & (span - 1); \
+#define CERES_GEMM_OPT_NAIVE_HEADER \
+ double cvec4[4] = {0.0, 0.0, 0.0, 0.0}; \
+ const double* pa = a; \
+ const double* pb = b; \
+ const int span = 4; \
+ int col_r = col_a & (span - 1); \
int col_m = col_a - col_r;
#define CERES_GEMM_OPT_STORE_MAT1X4 \
if (kOperation > 0) { \
- *c++ += c0; \
- *c++ += c1; \
- *c++ += c2; \
- *c++ += c3; \
+ c[0] += cvec4[0]; \
+ c[1] += cvec4[1]; \
+ c[2] += cvec4[2]; \
+ c[3] += cvec4[3]; \
} else if (kOperation < 0) { \
- *c++ -= c0; \
- *c++ -= c1; \
- *c++ -= c2; \
- *c++ -= c3; \
+ c[0] -= cvec4[0]; \
+ c[1] -= cvec4[1]; \
+ c[2] -= cvec4[2]; \
+ c[3] -= cvec4[3]; \
} else { \
- *c++ = c0; \
- *c++ = c1; \
- *c++ = c2; \
- *c++ = c3; \
- }
+ c[0] = cvec4[0]; \
+ c[1] = cvec4[1]; \
+ c[2] = cvec4[2]; \
+ c[3] = cvec4[3]; \
+ } \
+ c += 4;
// Matrix-Matrix Multiplication
// Figure out 1x4 of Matrix C in one batch
@@ -100,10 +98,10 @@
#define CERES_GEMM_OPT_MMM_MAT1X4_MUL \
av = pa[k]; \
pb = b + bi; \
- c0 += av * pb[0]; \
- c1 += av * pb[1]; \
- c2 += av * pb[2]; \
- c3 += av * pb[3]; \
+ cvec4[0] += av * pb[0]; \
+ cvec4[1] += av * pb[1]; \
+ cvec4[2] += av * pb[2]; \
+ cvec4[3] += av * pb[3]; \
pb += 4; \
bi += col_stride_b; \
k++;
@@ -168,10 +166,10 @@
#define CERES_GEMM_OPT_MTM_MAT1X4_MUL \
av = pa[ai]; \
pb = b + bi; \
- c0 += av * pb[0]; \
- c1 += av * pb[1]; \
- c2 += av * pb[2]; \
- c3 += av * pb[3]; \
+ cvec4[0] += av * pb[0]; \
+ cvec4[1] += av * pb[1]; \
+ cvec4[2] += av * pb[2]; \
+ cvec4[3] += av * pb[3]; \
pb += 4; \
ai += col_stride_a; \
bi += col_stride_b;
@@ -221,13 +219,13 @@
double bv = 0.0;
// clang-format off
-#define CERES_GEMM_OPT_MVM_MAT4X1_MUL \
- bv = *pb; \
- c0 += *(pa ) * bv; \
- c1 += *(pa + col_stride_a ) * bv; \
- c2 += *(pa + col_stride_a * 2) * bv; \
- c3 += *(pa + col_stride_a * 3) * bv; \
- pa++; \
+#define CERES_GEMM_OPT_MVM_MAT4X1_MUL \
+ bv = *pb; \
+ cvec4[0] += *(pa ) * bv; \
+ cvec4[1] += *(pa + col_stride_a ) * bv; \
+ cvec4[2] += *(pa + col_stride_a * 2) * bv; \
+ cvec4[3] += *(pa + col_stride_a * 3) * bv; \
+ pa++; \
pb++;
// clang-format on
@@ -285,16 +283,14 @@
CERES_GEMM_OPT_NAIVE_HEADER
double bv = 0.0;
- // clang-format off
#define CERES_GEMM_OPT_MTV_MAT4X1_MUL \
bv = *pb; \
- c0 += *(pa ) * bv; \
- c1 += *(pa + 1) * bv; \
- c2 += *(pa + 2) * bv; \
- c3 += *(pa + 3) * bv; \
+ cvec4[0] += pa[0] * bv; \
+ cvec4[1] += pa[1] * bv; \
+ cvec4[2] += pa[2] * bv; \
+ cvec4[3] += pa[3] * bv; \
pa += col_stride_a; \
pb++;
- // clang-format on
for (int k = 0; k < col_m; k += span) {
CERES_GEMM_OPT_MTV_MAT4X1_MUL