Speed up corrector.cc

Remove the Eigen temporary by revealing the columnwise nature
of the computation. This also allows us to get rid of the
special case for nrow = 1.

On problem-356-226730-pre.txt with -robustify evaluation times
change from:

Before:
  Residual Evaluations                  1.015
  Jacobian Evaluations                 18.313

After:
  Residual Evaluations                  1.005
  Jacobian Evaluations                  8.382

To give a sense of the overhead reduction, compare these numbers
when loss functions are disabled.

  Residual Evaluations                  0.955
  Jacobian Evaluations                  7.772

So, this is a 17.5x speedup!

The one dimensional specialization was motivated by denoising.cc.
The evaluation times there are essentially unchanged.

Before:
  Residual Evaluations                  2.774
  Jacobian Evaluations                 20.178

After:
  Residual Evaluations                  2.588
  Jacobian Evaluations                 19.781

Change-Id: Ic0efbaed75fe4489635039f17189ae24b97802c8
diff --git a/internal/ceres/corrector.cc b/internal/ceres/corrector.cc
index c3858ab..60269a6 100644
--- a/internal/ceres/corrector.cc
+++ b/internal/ceres/corrector.cc
@@ -32,7 +32,6 @@
 
 #include <cstddef>
 #include <cmath>
-#include "ceres/internal/eigen.h"
 #include "glog/logging.h"
 
 namespace ceres {
@@ -90,7 +89,7 @@
   // 0.5 *  alpha^2 - alpha - rho'' / rho' *  z'z = 0.
   //
   // Start by calculating the discriminant D.
-  const double D = 1.0 + 2.0 * sq_norm*rho[2] / rho[1];
+  const double D = 1.0 + 2.0 * sq_norm * rho[2] / rho[1];
 
   // Since both rho[1] and rho[2] are guaranteed to be positive at
   // this point, we know that D > 1.0.
@@ -102,29 +101,43 @@
   alpha_sq_norm_ = alpha / sq_norm;
 }
 
-void Corrector::CorrectResiduals(int nrow, double* residuals) {
+void Corrector::CorrectResiduals(int num_rows, double* residuals) {
   DCHECK(residuals != NULL);
-  VectorRef r_ref(residuals, nrow);
   // Equation 11 in BANS.
-  r_ref *= residual_scaling_;
+  for (int r = 0; r < num_rows; ++r) {
+    residuals[r] *= residual_scaling_;
+  }
 }
 
-void Corrector::CorrectJacobian(int nrow, int ncol,
-                                double* residuals, double* jacobian) {
+void Corrector::CorrectJacobian(int num_rows,
+                                int num_cols,
+                                double* residuals,
+                                double* jacobian) {
   DCHECK(residuals != NULL);
   DCHECK(jacobian != NULL);
+  // Equation 11 in BANS.
+  //
+  //  J = sqrt(rho) * (J - alpha^2 r * r' J)
+  //
+  // In days gone by this loop used to be a single Eigen expression of
+  // the form
+  //
+  //  J = sqrt_rho1_ * (J - alpha_sq_norm_ * r* (r.transpose() * J));
+  //
+  // Which turns out to about 17x slower on bal problems. The reason
+  // is that Eigen is unable to figure out that this expression can be
+  // evaluated columnwise and ends up creating a temporary.
+  for (int c = 0; c < num_cols; ++c) {
+    double r_transpose_j = 0.0;
+    for (int r = 0; r < num_rows; ++r) {
+      r_transpose_j += jacobian[r * num_cols + c] * residuals[r];
+    }
 
-  if (nrow == 1) {
-    // Specialization for the case where the residual is a scalar.
-    VectorRef j_ref(jacobian, ncol);
-    j_ref *= sqrt_rho1_ * (1.0 - alpha_sq_norm_ * pow(*residuals, 2));
-  } else {
-    ConstVectorRef r_ref(residuals, nrow);
-    MatrixRef j_ref(jacobian, nrow, ncol);
-
-    // Equation 11 in BANS.
-    j_ref = sqrt_rho1_ * (j_ref - alpha_sq_norm_ *
-                          r_ref * (r_ref.transpose() * j_ref));
+    for (int r = 0; r < num_rows; ++r) {
+      jacobian[r * num_cols + c] = sqrt_rho1_ *
+          (jacobian[r * num_cols + c] -
+           alpha_sq_norm_ * residuals[r] * r_transpose_j);
+    }
   }
 }