Speed up DenseSparseMatrix::SquaredColumnNorm.

Because we store the matrix as a row-major matrix, the
obvious Eigen expression performs rather poorly. A straight
C++ loop speeds things up considerably.

Also replace use of matrix() with direct use of m_.

Change-Id: I3d6166df4765ad8400ab9602a54b65fd21b1d50f
diff --git a/internal/ceres/dense_sparse_matrix.cc b/internal/ceres/dense_sparse_matrix.cc
index cef219f..e71546b 100644
--- a/internal/ceres/dense_sparse_matrix.cc
+++ b/internal/ceres/dense_sparse_matrix.cc
@@ -1,5 +1,5 @@
 // Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2015 Google Inc. All rights reserved.
+// Copyright 2022 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
@@ -60,16 +60,28 @@
 void DenseSparseMatrix::SetZero() { m_.setZero(); }
 
 void DenseSparseMatrix::RightMultiply(const double* x, double* y) const {
-  VectorRef(y, num_rows()) += matrix() * ConstVectorRef(x, num_cols());
+  VectorRef(y, num_rows()).noalias() += m_ * ConstVectorRef(x, num_cols());
 }
 
 void DenseSparseMatrix::LeftMultiply(const double* x, double* y) const {
-  VectorRef(y, num_cols()) +=
-      matrix().transpose() * ConstVectorRef(x, num_rows());
+  VectorRef(y, num_cols()).noalias() +=
+      m_.transpose() * ConstVectorRef(x, num_rows());
 }
 
 void DenseSparseMatrix::SquaredColumnNorm(double* x) const {
-  VectorRef(x, num_cols()) = m_.colwise().squaredNorm();
+  // This implementation is 3x faster than the naive version
+  // x = m_.colwise().squaredNorm(), likely because m_
+  // is a row major matrix.
+
+  const int num_rows = m_.rows();
+  const int num_cols = m_.cols();
+  std::fill_n(x, num_cols, 0.0);
+  const double* m = m_.data();
+  for (int i = 0; i < num_rows; ++i) {
+    for (int j = 0; j < num_cols; ++j, ++m) {
+      x[j] += (*m) * (*m);
+    }
+  }
 }
 
 void DenseSparseMatrix::ScaleColumns(const double* scale) {