Block ordering for SPARSE_SCHUR + CX_SPARSE.

Uptil now only SuiteSparse when used with SPARSE_SCHUR would use
the block structure of the reduced camera matrix to find a fill-reducing
ordering.

This leads to substantial speedup for some bundle adjustment
problems.

Credit for this technique goes to the authors of g2o. I learned
about it from reading their source code.

Change-Id: I5403efefd4d9552c9c6fc6e02a65498bdf171584
diff --git a/internal/ceres/cxsparse.cc b/internal/ceres/cxsparse.cc
index 3fbc271..b7f2520 100644
--- a/internal/ceres/cxsparse.cc
+++ b/internal/ceres/cxsparse.cc
@@ -32,7 +32,10 @@
 
 #include "ceres/cxsparse.h"
 
+#include <vector>
+#include "ceres/compressed_col_sparse_matrix_utils.h"
 #include "ceres/compressed_row_sparse_matrix.h"
+#include "ceres/internal/port.h"
 #include "ceres/triplet_sparse_matrix.h"
 #include "glog/logging.h"
 
@@ -44,24 +47,25 @@
 
 CXSparse::~CXSparse() {
   if (scratch_size_ > 0) {
-    cs_free(scratch_);
+    cs_di_free(scratch_);
   }
 }
 
+
 bool CXSparse::SolveCholesky(cs_di* A,
                              cs_dis* symbolic_factorization,
                              double* b) {
   // Make sure we have enough scratch space available.
   if (scratch_size_ < A->n) {
     if (scratch_size_ > 0) {
-      cs_free(scratch_);
+      cs_di_free(scratch_);
     }
-    scratch_ = reinterpret_cast<CS_ENTRY*>(cs_malloc(A->n, sizeof(CS_ENTRY)));
+    scratch_ = reinterpret_cast<CS_ENTRY*>(cs_di_malloc(A->n, sizeof(CS_ENTRY)));
     scratch_size_ = A->n;
   }
 
   // Solve using Cholesky factorization
-  csn* numeric_factorization = cs_chol(A, symbolic_factorization);
+  csn* numeric_factorization = cs_di_chol(A, symbolic_factorization);
   if (numeric_factorization == NULL) {
     LOG(WARNING) << "Cholesky factorization failed.";
     return false;
@@ -71,19 +75,16 @@
   // succeeded as well. In the comments below, "x" refers to the scratch space.
   //
   // Set x = P * b.
-  cs_ipvec(symbolic_factorization->pinv, b, scratch_, A->n);
-
+  cs_di_ipvec(symbolic_factorization->pinv, b, scratch_, A->n);
   // Set x = L \ x.
-  cs_lsolve(numeric_factorization->L, scratch_);
-
+  cs_di_lsolve(numeric_factorization->L, scratch_);
   // Set x = L' \ x.
-  cs_ltsolve(numeric_factorization->L, scratch_);
-
+  cs_di_ltsolve(numeric_factorization->L, scratch_);
   // Set b = P' * x.
-  cs_pvec(symbolic_factorization->pinv, scratch_, b, A->n);
+  cs_di_pvec(symbolic_factorization->pinv, scratch_, b, A->n);
 
   // Free Cholesky factorization.
-  cs_nfree(numeric_factorization);
+  cs_di_nfree(numeric_factorization);
   return true;
 }
 
@@ -92,6 +93,61 @@
   return cs_schol(1, A);
 }
 
+cs_dis* CXSparse::BlockAnalyzeCholesky(cs_di* A,
+                                       const vector<int>& row_blocks,
+                                       const vector<int>& col_blocks) {
+  const int num_row_blocks = row_blocks.size();
+  const int num_col_blocks = col_blocks.size();
+
+  vector<int> block_rows;
+  vector<int> block_cols;
+  CompressedColumnScalarMatrixToBlockMatrix(A->i,
+                                            A->p,
+                                            row_blocks,
+                                            col_blocks,
+                                            &block_rows,
+                                            &block_cols);
+  cs_di block_matrix;
+  block_matrix.m = num_row_blocks;
+  block_matrix.n = num_col_blocks;
+  block_matrix.nz  = -1;
+  block_matrix.nzmax = block_rows.size();
+  block_matrix.p = &block_cols[0];
+  block_matrix.i = &block_rows[0];
+  block_matrix.x = NULL;
+
+  int* ordering = cs_amd(1, &block_matrix);
+  vector<int> block_ordering(num_row_blocks, -1);
+  copy(ordering, ordering + num_row_blocks, &block_ordering[0]);
+  cs_free(ordering);
+
+  vector<int> scalar_ordering;
+  BlockOrderingToScalarOrdering(row_blocks, block_ordering, &scalar_ordering);
+
+  cs_dis* symbolic_factorization = reinterpret_cast<cs_dis*>(cs_calloc(1, sizeof(cs_dis)));
+  symbolic_factorization->pinv = cs_pinv(&scalar_ordering[0], A->n);
+  cs* permuted_A = cs_symperm(A, symbolic_factorization->pinv, 0);
+
+  symbolic_factorization->parent = cs_etree(permuted_A, 0);
+  int* postordering = cs_post(symbolic_factorization->parent, A->n);
+  int* column_counts = cs_counts(permuted_A, symbolic_factorization->parent, postordering, 0);
+  cs_free(postordering);
+  cs_spfree(permuted_A);
+
+  symbolic_factorization->cp = (int*) cs_malloc(A->n+1, sizeof(int));
+  symbolic_factorization->lnz = cs_cumsum(symbolic_factorization->cp, column_counts, A->n);
+  symbolic_factorization->unz = symbolic_factorization->lnz;
+
+  cs_free(column_counts);
+
+  if (symbolic_factorization->lnz < 0) {
+    cs_sfree(symbolic_factorization);
+    symbolic_factorization = NULL;
+  }
+
+  return symbolic_factorization;
+}
+
 cs_di CXSparse::CreateSparseMatrixTransposeView(CompressedRowSparseMatrix* A) {
   cs_di At;
   At.m = A->num_cols();