Blame - internal/ceres/schur_eliminator_impl.h - ceres-solver

blob: b46eab92c34c4a6f988cad874f47d37ead4c1b73 [file] [log] [blame]

Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	1	// Ceres Solver - A fast non-linear least squares minimizer
				2	// Copyright 2010, 2011, 2012 Google Inc. All rights reserved.
				3	// http://code.google.com/p/ceres-solver/
				4	//
				5	// Redistribution and use in source and binary forms, with or without
				6	// modification, are permitted provided that the following conditions are met:
				7	//
				8	// * Redistributions of source code must retain the above copyright notice,
				9	// this list of conditions and the following disclaimer.
				10	// * Redistributions in binary form must reproduce the above copyright notice,
				11	// this list of conditions and the following disclaimer in the documentation
				12	// and/or other materials provided with the distribution.
				13	// * Neither the name of Google Inc. nor the names of its contributors may be
				14	// used to endorse or promote products derived from this software without
				15	// specific prior written permission.
				16	//
				17	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				18	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				19	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				20	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				21	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				22	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				23	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				24	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				25	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				26	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				27	// POSSIBILITY OF SUCH DAMAGE.
				28	//
				29	// Author: sameeragarwal@google.com (Sameer Agarwal)
				30	//
				31	// TODO(sameeragarwal): row_block_counter can perhaps be replaced by
				32	// Chunk::start ?
				33
				34	#ifndef CERES_INTERNAL_SCHUR_ELIMINATOR_IMPL_H_
				35	#define CERES_INTERNAL_SCHUR_ELIMINATOR_IMPL_H_
				36
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	37	// Eigen has an internal threshold switching between different matrix
				38	// multiplication algorithms. In particular for matrices larger than
				39	// EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD it uses a cache friendly
				40	// matrix matrix product algorithm that has a higher setup cost. For
				41	// matrix sizes close to this threshold, especially when the matrices
				42	// are thin and long, the default choice may not be optimal. This is
				43	// the case for us, as the default choice causes a 30% performance
				44	// regression when we moved from Eigen2 to Eigen3.
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	45
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	46	#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 10
				47
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	48	#ifdef CERES_USE_OPENMP
				49	#include <omp.h>
				50	#endif
				51
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	52	#include <algorithm>
				53	#include <map>
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	54	#include "Eigen/Dense"
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	55	#include "ceres/blas.h"
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	56	#include "ceres/block_random_access_matrix.h"
				57	#include "ceres/block_sparse_matrix.h"
				58	#include "ceres/block_structure.h"
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	59	#include "ceres/internal/eigen.h"
				60	#include "ceres/internal/scoped_ptr.h"
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	61	#include "ceres/map_util.h"
				62	#include "ceres/schur_eliminator.h"
				63	#include "ceres/stl_util.h"
Sameer Agarwal	509f68c	2013-02-20 01:39:03 -0800	[diff] [blame]	64	#include "glog/logging.h"
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	65
				66	namespace ceres {
				67	namespace internal {
				68
				69	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				70	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::~SchurEliminator() {
				71	STLDeleteElements(&rhs_locks_);
				72	}
				73
				74	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				75	void
				76	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
				77	Init(int num_eliminate_blocks, const CompressedRowBlockStructure* bs) {
				78	CHECK_GT(num_eliminate_blocks, 0)
				79	<< "SchurComplementSolver cannot be initialized with "
				80	<< "num_eliminate_blocks = 0.";
				81
				82	num_eliminate_blocks_ = num_eliminate_blocks;
				83
				84	const int num_col_blocks = bs->cols.size();
				85	const int num_row_blocks = bs->rows.size();
				86
				87	buffer_size_ = 1;
				88	chunks_.clear();
				89	lhs_row_layout_.clear();
				90
				91	int lhs_num_rows = 0;
				92	// Add a map object for each block in the reduced linear system
				93	// and build the row/column block structure of the reduced linear
				94	// system.
				95	lhs_row_layout_.resize(num_col_blocks - num_eliminate_blocks_);
				96	for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
				97	lhs_row_layout_[i - num_eliminate_blocks_] = lhs_num_rows;
				98	lhs_num_rows += bs->cols[i].size;
				99	}
				100
				101	int r = 0;
				102	// Iterate over the row blocks of A, and detect the chunks. The
				103	// matrix should already have been ordered so that all rows
				104	// containing the same y block are vertically contiguous. Along
				105	// the way also compute the amount of space each chunk will need
				106	// to perform the elimination.
				107	while (r < num_row_blocks) {
				108	const int chunk_block_id = bs->rows[r].cells.front().block_id;
				109	if (chunk_block_id >= num_eliminate_blocks_) {
				110	break;
				111	}
				112
				113	chunks_.push_back(Chunk());
				114	Chunk& chunk = chunks_.back();
				115	chunk.size = 0;
				116	chunk.start = r;
				117	int buffer_size = 0;
				118	const int e_block_size = bs->cols[chunk_block_id].size;
				119
				120	// Add to the chunk until the first block in the row is
				121	// different than the one in the first row for the chunk.
				122	while (r + chunk.size < num_row_blocks) {
				123	const CompressedRow& row = bs->rows[r + chunk.size];
				124	if (row.cells.front().block_id != chunk_block_id) {
				125	break;
				126	}
				127
				128	// Iterate over the blocks in the row, ignoring the first
				129	// block since it is the one to be eliminated.
				130	for (int c = 1; c < row.cells.size(); ++c) {
				131	const Cell& cell = row.cells[c];
				132	if (InsertIfNotPresent(
				133	&(chunk.buffer_layout), cell.block_id, buffer_size)) {
				134	buffer_size += e_block_size * bs->cols[cell.block_id].size;
				135	}
				136	}
				137
				138	buffer_size_ = max(buffer_size, buffer_size_);
				139	++chunk.size;
				140	}
				141
				142	CHECK_GT(chunk.size, 0);
				143	r += chunk.size;
				144	}
				145	const Chunk& chunk = chunks_.back();
				146
				147	uneliminated_row_begins_ = chunk.start + chunk.size;
				148	if (num_threads_ > 1) {
				149	random_shuffle(chunks_.begin(), chunks_.end());
				150	}
				151
				152	buffer_.reset(new double[buffer_size_ * num_threads_]);
				153
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	154	// chunk_outer_product_buffer_ only needs to store e_block_size *
				155	// f_block_size, which is always less than buffer_size_, so we just
				156	// allocate buffer_size_ per thread.
				157	chunk_outer_product_buffer_.reset(new double[buffer_size_ * num_threads_]);
				158
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	159	STLDeleteElements(&rhs_locks_);
				160	rhs_locks_.resize(num_col_blocks - num_eliminate_blocks_);
				161	for (int i = 0; i < num_col_blocks - num_eliminate_blocks_; ++i) {
				162	rhs_locks_[i] = new Mutex;
				163	}
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	164	}
				165
				166	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				167	void
				168	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
				169	Eliminate(const BlockSparseMatrixBase* A,
				170	const double* b,
				171	const double* D,
				172	BlockRandomAccessMatrix* lhs,
				173	double* rhs) {
				174	if (lhs->num_rows() > 0) {
				175	lhs->SetZero();
				176	VectorRef(rhs, lhs->num_rows()).setZero();
				177	}
				178
				179	const CompressedRowBlockStructure* bs = A->block_structure();
				180	const int num_col_blocks = bs->cols.size();
				181
				182	// Add the diagonal to the schur complement.
				183	if (D != NULL) {
				184	#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
				185	for (int i = num_eliminate_blocks_; i < num_col_blocks; ++i) {
				186	const int block_id = i - num_eliminate_blocks_;
				187	int r, c, row_stride, col_stride;
				188	CellInfo* cell_info = lhs->GetCell(block_id, block_id,
				189	&r, &c,
				190	&row_stride, &col_stride);
				191	if (cell_info != NULL) {
				192	const int block_size = bs->cols[i].size;
				193	typename EigenTypes<kFBlockSize>::ConstVectorRef
				194	diag(D + bs->cols[i].position, block_size);
				195
Keir Mierle	ff71d74	2012-08-10 17:05:15 -0700	[diff] [blame]	196	CeresMutexLock l(&cell_info->m);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	197	MatrixRef m(cell_info->values, row_stride, col_stride);
				198	m.block(r, c, block_size, block_size).diagonal()
				199	+= diag.array().square().matrix();
				200	}
				201	}
				202	}
				203
				204	// Eliminate y blocks one chunk at a time. For each chunk,x3
				205	// compute the entries of the normal equations and the gradient
				206	// vector block corresponding to the y block and then apply
				207	// Gaussian elimination to them. The matrix ete stores the normal
				208	// matrix corresponding to the block being eliminated and array
				209	// buffer_ contains the non-zero blocks in the row corresponding
				210	// to this y block in the normal equations. This computation is
				211	// done in ChunkDiagonalBlockAndGradient. UpdateRhs then applies
				212	// gaussian elimination to the rhs of the normal equations,
				213	// updating the rhs of the reduced linear system by modifying rhs
				214	// blocks for all the z blocks that share a row block/residual
				215	// term with the y block. EliminateRowOuterProduct does the
				216	// corresponding operation for the lhs of the reduced linear
				217	// system.
				218	#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
				219	for (int i = 0; i < chunks_.size(); ++i) {
				220	#ifdef CERES_USE_OPENMP
				221	int thread_id = omp_get_thread_num();
				222	#else
				223	int thread_id = 0;
				224	#endif
				225	double* buffer = buffer_.get() + thread_id * buffer_size_;
				226	const Chunk& chunk = chunks_[i];
				227	const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
				228	const int e_block_size = bs->cols[e_block_id].size;
				229
				230	VectorRef(buffer, buffer_size_).setZero();
				231
				232	typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
				233	ete(e_block_size, e_block_size);
				234
				235	if (D != NULL) {
				236	const typename EigenTypes<kEBlockSize>::ConstVectorRef
				237	diag(D + bs->cols[e_block_id].position, e_block_size);
				238	ete = diag.array().square().matrix().asDiagonal();
				239	} else {
				240	ete.setZero();
				241	}
				242
				243	typename EigenTypes<kEBlockSize>::Vector g(e_block_size);
				244	g.setZero();
				245
				246	// We are going to be computing
				247	//
				248	// S += F'F - F'E(E'E)^{-1}E'F
				249	//
				250	// for each Chunk. The computation is broken down into a number of
				251	// function calls as below.
				252
				253	// Compute the outer product of the e_blocks with themselves (ete
				254	// = E'E). Compute the product of the e_blocks with the
				255	// corresonding f_blocks (buffer = E'F), the gradient of the terms
				256	// in this chunk (g) and add the outer product of the f_blocks to
				257	// Schur complement (S += F'F).
				258	ChunkDiagonalBlockAndGradient(
				259	chunk, A, b, chunk.start, &ete, &g, buffer, lhs);
				260
				261	// Normally one wouldn't compute the inverse explicitly, but
				262	// e_block_size will typically be a small number like 3, in
				263	// which case its much faster to compute the inverse once and
				264	// use it to multiply other matrices/vectors instead of doing a
				265	// Solve call over and over again.
				266	typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix inverse_ete =
				267	ete
				268	.template selfadjointView<Eigen::Upper>()
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	269	.llt()
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	270	.solve(Matrix::Identity(e_block_size, e_block_size));
				271
				272	// For the current chunk compute and update the rhs of the reduced
				273	// linear system.
				274	//
				275	// rhs = F'b - F'E(E'E)^(-1) E'b
				276	UpdateRhs(chunk, A, b, chunk.start, inverse_ete * g, rhs);
				277
				278	// S -= F'E(E'E)^{-1}E'F
				279	ChunkOuterProduct(bs, inverse_ete, buffer, chunk.buffer_layout, lhs);
				280	}
				281
				282	// For rows with no e_blocks, the schur complement update reduces to
				283	// S += F'F.
				284	NoEBlockRowsUpdate(A, b, uneliminated_row_begins_, lhs, rhs);
				285	}
				286
				287	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				288	void
				289	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
				290	BackSubstitute(const BlockSparseMatrixBase* A,
				291	const double* b,
				292	const double* D,
				293	const double* z,
				294	double* y) {
				295	const CompressedRowBlockStructure* bs = A->block_structure();
				296	#pragma omp parallel for num_threads(num_threads_) schedule(dynamic)
				297	for (int i = 0; i < chunks_.size(); ++i) {
				298	const Chunk& chunk = chunks_[i];
				299	const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
				300	const int e_block_size = bs->cols[e_block_id].size;
				301
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	302	double* y_ptr = y + bs->cols[e_block_id].position;
				303	typename EigenTypes<kEBlockSize>::VectorRef y_block(y_ptr, e_block_size);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	304
				305	typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix
				306	ete(e_block_size, e_block_size);
				307	if (D != NULL) {
				308	const typename EigenTypes<kEBlockSize>::ConstVectorRef
				309	diag(D + bs->cols[e_block_id].position, e_block_size);
				310	ete = diag.array().square().matrix().asDiagonal();
				311	} else {
				312	ete.setZero();
				313	}
				314
				315	for (int j = 0; j < chunk.size; ++j) {
				316	const CompressedRow& row = bs->rows[chunk.start + j];
				317	const double* row_values = A->RowBlockValues(chunk.start + j);
				318	const Cell& e_cell = row.cells.front();
				319	DCHECK_EQ(e_block_id, e_cell.block_id);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	320
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	321	typename EigenTypes<kRowBlockSize>::Vector sj =
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	322	typename EigenTypes<kRowBlockSize>::ConstVectorRef
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	323	(b + bs->rows[chunk.start + j].block.position, row.block.size);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	324
				325	for (int c = 1; c < row.cells.size(); ++c) {
				326	const int f_block_id = row.cells[c].block_id;
				327	const int f_block_size = bs->cols[f_block_id].size;
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	328	const int r_block = f_block_id - num_eliminate_blocks_;
				329
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	330	MatrixVectorMultiply<kRowBlockSize, kFBlockSize, -1>(
				331	row_values + row.cells[c].position, row.block.size, f_block_size,
				332	z + lhs_row_layout_[r_block],
				333	sj.data());
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	334	}
				335
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	336	MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
				337	row_values + e_cell.position, row.block.size, e_block_size,
				338	sj.data(),
				339	y_ptr);
				340
				341	MatrixTransposeMatrixMultiply
				342	<kRowBlockSize, kEBlockSize,kRowBlockSize, kEBlockSize, 1>(
				343	row_values + e_cell.position, row.block.size, e_block_size,
				344	row_values + e_cell.position, row.block.size, e_block_size,
				345	ete.data(), 0, 0, e_block_size, e_block_size);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	346	}
				347
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	348	ete.llt().solveInPlace(y_block);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	349	}
				350	}
				351
				352	// Update the rhs of the reduced linear system. Compute
				353	//
				354	// F'b - F'E(E'E)^(-1) E'b
				355	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				356	void
				357	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
				358	UpdateRhs(const Chunk& chunk,
				359	const BlockSparseMatrixBase* A,
				360	const double* b,
				361	int row_block_counter,
				362	const Vector& inverse_ete_g,
				363	double* rhs) {
				364	const CompressedRowBlockStructure* bs = A->block_structure();
				365	const int e_block_size = inverse_ete_g.rows();
				366	int b_pos = bs->rows[row_block_counter].block.position;
				367	for (int j = 0; j < chunk.size; ++j) {
				368	const CompressedRow& row = bs->rows[row_block_counter + j];
				369	const double *row_values = A->RowBlockValues(row_block_counter + j);
				370	const Cell& e_cell = row.cells.front();
				371
				372	const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
				373	e_block(row_values + e_cell.position,
				374	row.block.size,
				375	e_block_size);
				376
				377	const typename EigenTypes<kRowBlockSize>::Vector
				378	sj =
				379	typename EigenTypes<kRowBlockSize>::ConstVectorRef
				380	(b + b_pos, row.block.size) - e_block * (inverse_ete_g);
				381
				382	for (int c = 1; c < row.cells.size(); ++c) {
				383	const int block_id = row.cells[c].block_id;
				384	const int block_size = bs->cols[block_id].size;
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	385	const int block = block_id - num_eliminate_blocks_;
Keir Mierle	ff71d74	2012-08-10 17:05:15 -0700	[diff] [blame]	386	CeresMutexLock l(rhs_locks_[block]);
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	387	MatrixTransposeVectorMultiply<kRowBlockSize, kFBlockSize, 1>(
				388	row_values + row.cells[c].position,
				389	row.block.size, block_size,
				390	sj.data(), rhs + lhs_row_layout_[block]);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	391	}
				392	b_pos += row.block.size;
				393	}
				394	}
				395
				396	// Given a Chunk - set of rows with the same e_block, e.g. in the
				397	// following Chunk with two rows.
				398	//
				399	// E F
				400	// [ y11 0 0 0 \| z11 0 0 0 z51]
				401	// [ y12 0 0 0 \| z12 z22 0 0 0]
				402	//
				403	// this function computes twp matrices. The diagonal block matrix
				404	//
				405	// ete = y11 * y11' + y12 * y12'
				406	//
				407	// and the off diagonal blocks in the Guass Newton Hessian.
				408	//
				409	// buffer = [y11'(z11 + z12), y12' * z22, y11' * z51]
				410	//
				411	// which are zero compressed versions of the block sparse matrices E'E
				412	// and E'F.
				413	//
				414	// and the gradient of the e_block, E'b.
				415	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				416	void
				417	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
				418	ChunkDiagonalBlockAndGradient(
				419	const Chunk& chunk,
				420	const BlockSparseMatrixBase* A,
				421	const double* b,
				422	int row_block_counter,
				423	typename EigenTypes<kEBlockSize, kEBlockSize>::Matrix* ete,
				424	typename EigenTypes<kEBlockSize>::Vector* g,
				425	double* buffer,
				426	BlockRandomAccessMatrix* lhs) {
				427	const CompressedRowBlockStructure* bs = A->block_structure();
				428
				429	int b_pos = bs->rows[row_block_counter].block.position;
				430	const int e_block_size = ete->rows();
				431
				432	// Iterate over the rows in this chunk, for each row, compute the
				433	// contribution of its F blocks to the Schur complement, the
				434	// contribution of its E block to the matrix EE' (ete), and the
				435	// corresponding block in the gradient vector.
				436	for (int j = 0; j < chunk.size; ++j) {
				437	const CompressedRow& row = bs->rows[row_block_counter + j];
				438	const double *row_values = A->RowBlockValues(row_block_counter + j);
				439
				440	if (row.cells.size() > 1) {
				441	EBlockRowOuterProduct(A, row_block_counter + j, lhs);
				442	}
				443
				444	// Extract the e_block, ETE += E_i' E_i
				445	const Cell& e_cell = row.cells.front();
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	446	MatrixTransposeMatrixMultiply
				447	<kRowBlockSize, kEBlockSize, kRowBlockSize, kEBlockSize, 1>(
				448	row_values + e_cell.position, row.block.size, e_block_size,
				449	row_values + e_cell.position, row.block.size, e_block_size,
				450	ete->data(), 0, 0, e_block_size, e_block_size);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	451
				452	// g += E_i' b_i
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	453	MatrixTransposeVectorMultiply<kRowBlockSize, kEBlockSize, 1>(
				454	row_values + e_cell.position, row.block.size, e_block_size,
				455	b + b_pos,
				456	g->data());
				457
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	458
				459	// buffer = E'F. This computation is done by iterating over the
				460	// f_blocks for each row in the chunk.
				461	for (int c = 1; c < row.cells.size(); ++c) {
				462	const int f_block_id = row.cells[c].block_id;
				463	const int f_block_size = bs->cols[f_block_id].size;
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	464	double* buffer_ptr =
				465	buffer + FindOrDie(chunk.buffer_layout, f_block_id);
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	466	MatrixTransposeMatrixMultiply
				467	<kRowBlockSize, kEBlockSize, kRowBlockSize, kFBlockSize, 1>(
				468	row_values + e_cell.position, row.block.size, e_block_size,
				469	row_values + row.cells[c].position, row.block.size, f_block_size,
				470	buffer_ptr, 0, 0, e_block_size, f_block_size);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	471	}
				472	b_pos += row.block.size;
				473	}
				474	}
				475
				476	// Compute the outer product F'E(E'E)^{-1}E'F and subtract it from the
				477	// Schur complement matrix, i.e
				478	//
				479	// S -= F'E(E'E)^{-1}E'F.
				480	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				481	void
				482	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
				483	ChunkOuterProduct(const CompressedRowBlockStructure* bs,
				484	const Matrix& inverse_ete,
				485	const double* buffer,
				486	const BufferLayoutType& buffer_layout,
				487	BlockRandomAccessMatrix* lhs) {
				488	// This is the most computationally expensive part of this
				489	// code. Profiling experiments reveal that the bottleneck is not the
				490	// computation of the right-hand matrix product, but memory
				491	// references to the left hand side.
				492	const int e_block_size = inverse_ete.rows();
				493	BufferLayoutType::const_iterator it1 = buffer_layout.begin();
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	494
				495	#ifdef CERES_USE_OPENMP
				496	int thread_id = omp_get_thread_num();
				497	#else
				498	int thread_id = 0;
				499	#endif
				500	double* b1_transpose_inverse_ete =
				501	chunk_outer_product_buffer_.get() + thread_id * buffer_size_;
				502
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	503	// S(i,j) -= bi' * ete^{-1} b_j
				504	for (; it1 != buffer_layout.end(); ++it1) {
				505	const int block1 = it1->first - num_eliminate_blocks_;
				506	const int block1_size = bs->cols[it1->first].size;
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	507	MatrixTransposeMatrixMultiply
				508	<kEBlockSize, kFBlockSize, kEBlockSize, kEBlockSize, 0>(
				509	buffer + it1->second, e_block_size, block1_size,
				510	inverse_ete.data(), e_block_size, e_block_size,
				511	b1_transpose_inverse_ete, 0, 0, block1_size, e_block_size);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	512
				513	BufferLayoutType::const_iterator it2 = it1;
				514	for (; it2 != buffer_layout.end(); ++it2) {
				515	const int block2 = it2->first - num_eliminate_blocks_;
				516
				517	int r, c, row_stride, col_stride;
				518	CellInfo* cell_info = lhs->GetCell(block1, block2,
				519	&r, &c,
				520	&row_stride, &col_stride);
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	521	if (cell_info != NULL) {
				522	const int block2_size = bs->cols[it2->first].size;
				523	CeresMutexLock l(&cell_info->m);
				524	MatrixMatrixMultiply
				525	<kFBlockSize, kEBlockSize, kEBlockSize, kFBlockSize, -1>(
				526	b1_transpose_inverse_ete, block1_size, e_block_size,
				527	buffer + it2->second, e_block_size, block2_size,
				528	cell_info->values, r, c, row_stride, col_stride);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	529	}
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	530	}
				531	}
				532	}
				533
				534	// For rows with no e_blocks, the schur complement update reduces to S
				535	// += F'F. This function iterates over the rows of A with no e_block,
				536	// and calls NoEBlockRowOuterProduct on each row.
				537	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				538	void
				539	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
				540	NoEBlockRowsUpdate(const BlockSparseMatrixBase* A,
				541	const double* b,
				542	int row_block_counter,
				543	BlockRandomAccessMatrix* lhs,
				544	double* rhs) {
				545	const CompressedRowBlockStructure* bs = A->block_structure();
				546	for (; row_block_counter < bs->rows.size(); ++row_block_counter) {
				547	const CompressedRow& row = bs->rows[row_block_counter];
				548	const double *row_values = A->RowBlockValues(row_block_counter);
				549	for (int c = 0; c < row.cells.size(); ++c) {
				550	const int block_id = row.cells[c].block_id;
				551	const int block_size = bs->cols[block_id].size;
				552	const int block = block_id - num_eliminate_blocks_;
				553	VectorRef(rhs + lhs_row_layout_[block], block_size).noalias()
				554	+= (ConstMatrixRef(row_values + row.cells[c].position,
				555	row.block.size, block_size).transpose() *
				556	ConstVectorRef(b + row.block.position, row.block.size));
				557	}
				558	NoEBlockRowOuterProduct(A, row_block_counter, lhs);
				559	}
				560	}
				561
				562
				563	// A row r of A, which has no e_blocks gets added to the Schur
				564	// Complement as S += r r'. This function is responsible for computing
				565	// the contribution of a single row r to the Schur complement. It is
				566	// very similar in structure to EBlockRowOuterProduct except for
				567	// one difference. It does not use any of the template
				568	// parameters. This is because the algorithm used for detecting the
				569	// static structure of the matrix A only pays attention to rows with
				570	// e_blocks. This is becase rows without e_blocks are rare and
				571	// typically arise from regularization terms in the original
				572	// optimization problem, and have a very different structure than the
				573	// rows with e_blocks. Including them in the static structure
				574	// detection will lead to most template parameters being set to
				575	// dynamic. Since the number of rows without e_blocks is small, the
				576	// lack of templating is not an issue.
				577	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				578	void
				579	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
				580	NoEBlockRowOuterProduct(const BlockSparseMatrixBase* A,
				581	int row_block_index,
				582	BlockRandomAccessMatrix* lhs) {
				583	const CompressedRowBlockStructure* bs = A->block_structure();
				584	const CompressedRow& row = bs->rows[row_block_index];
				585	const double *row_values = A->RowBlockValues(row_block_index);
				586	for (int i = 0; i < row.cells.size(); ++i) {
				587	const int block1 = row.cells[i].block_id - num_eliminate_blocks_;
				588	DCHECK_GE(block1, 0);
				589
				590	const int block1_size = bs->cols[row.cells[i].block_id].size;
				591	const ConstMatrixRef b1(row_values + row.cells[i].position,
				592	row.block.size, block1_size);
				593	int r, c, row_stride, col_stride;
				594	CellInfo* cell_info = lhs->GetCell(block1, block1,
				595	&r, &c,
				596	&row_stride, &col_stride);
				597	if (cell_info != NULL) {
Keir Mierle	ff71d74	2012-08-10 17:05:15 -0700	[diff] [blame]	598	CeresMutexLock l(&cell_info->m);
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	599	// This multiply currently ignores the fact that this is a
				600	// symmetric outer product.
				601	MatrixTransposeMatrixMultiply
				602	<Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
				603	row_values + row.cells[i].position, row.block.size, block1_size,
				604	row_values + row.cells[i].position, row.block.size, block1_size,
				605	cell_info->values, r, c, row_stride, col_stride);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	606	}
				607
				608	for (int j = i + 1; j < row.cells.size(); ++j) {
				609	const int block2 = row.cells[j].block_id - num_eliminate_blocks_;
				610	DCHECK_GE(block2, 0);
				611	DCHECK_LT(block1, block2);
				612	int r, c, row_stride, col_stride;
				613	CellInfo* cell_info = lhs->GetCell(block1, block2,
				614	&r, &c,
				615	&row_stride, &col_stride);
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	616	if (cell_info != NULL) {
				617	const int block2_size = bs->cols[row.cells[j].block_id].size;
				618	CeresMutexLock l(&cell_info->m);
				619	MatrixTransposeMatrixMultiply
				620	<Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1>(
				621	row_values + row.cells[i].position, row.block.size, block1_size,
				622	row_values + row.cells[j].position, row.block.size, block2_size,
				623	cell_info->values, r, c, row_stride, col_stride);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	624	}
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	625	}
				626	}
				627	}
				628
				629	// For a row with an e_block, compute the contribition S += F'F. This
				630	// function has the same structure as NoEBlockRowOuterProduct, except
				631	// that this function uses the template parameters.
				632	template <int kRowBlockSize, int kEBlockSize, int kFBlockSize>
				633	void
				634	SchurEliminator<kRowBlockSize, kEBlockSize, kFBlockSize>::
				635	EBlockRowOuterProduct(const BlockSparseMatrixBase* A,
				636	int row_block_index,
				637	BlockRandomAccessMatrix* lhs) {
				638	const CompressedRowBlockStructure* bs = A->block_structure();
				639	const CompressedRow& row = bs->rows[row_block_index];
				640	const double *row_values = A->RowBlockValues(row_block_index);
				641	for (int i = 1; i < row.cells.size(); ++i) {
				642	const int block1 = row.cells[i].block_id - num_eliminate_blocks_;
				643	DCHECK_GE(block1, 0);
				644
				645	const int block1_size = bs->cols[row.cells[i].block_id].size;
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	646	int r, c, row_stride, col_stride;
				647	CellInfo* cell_info = lhs->GetCell(block1, block1,
				648	&r, &c,
				649	&row_stride, &col_stride);
				650	if (cell_info != NULL) {
Keir Mierle	ff71d74	2012-08-10 17:05:15 -0700	[diff] [blame]	651	CeresMutexLock l(&cell_info->m);
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	652	// block += b1.transpose() * b1;
				653	MatrixTransposeMatrixMultiply
				654	<kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>(
				655	row_values + row.cells[i].position, row.block.size, block1_size,
				656	row_values + row.cells[i].position, row.block.size, block1_size,
				657	cell_info->values, r, c, row_stride, col_stride);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	658	}
				659
				660	for (int j = i + 1; j < row.cells.size(); ++j) {
				661	const int block2 = row.cells[j].block_id - num_eliminate_blocks_;
				662	DCHECK_GE(block2, 0);
				663	DCHECK_LT(block1, block2);
				664	const int block2_size = bs->cols[row.cells[j].block_id].size;
				665	int r, c, row_stride, col_stride;
				666	CellInfo* cell_info = lhs->GetCell(block1, block2,
				667	&r, &c,
				668	&row_stride, &col_stride);
Sameer Agarwal	296fa9b	2013-04-02 09:44:15 -0700	[diff] [blame]	669	if (cell_info != NULL) {
				670	// block += b1.transpose() * b2;
				671	MatrixTransposeMatrixMultiply
				672	<kRowBlockSize, kFBlockSize, kRowBlockSize, kFBlockSize, 1>(
				673	row_values + row.cells[i].position, row.block.size, block1_size,
				674	row_values + row.cells[j].position, row.block.size, block2_size,
				675	cell_info->values, r, c, row_stride, col_stride);
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	676	}
Keir Mierle	8ebb073	2012-04-30 23:09:08 -0700	[diff] [blame]	677	}
				678	}
				679	}
				680
				681	} // namespace internal
				682	} // namespace ceres
				683
				684	#endif // CERES_INTERNAL_SCHUR_ELIMINATOR_IMPL_H_