// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)
#include "ceres/block_sparse_matrix.h"
#include <algorithm>
#include <cstddef>
#include <memory>
#include <numeric>
#include <random>
#include <vector>
#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/strings/str_format.h"
#include "ceres/block_structure.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/eigen.h"
#include "ceres/parallel_for.h"
#include "ceres/parallel_vector_ops.h"
#include "ceres/small_blas.h"
#include "ceres/triplet_sparse_matrix.h"
#ifndef CERES_NO_CUDA
#include "cuda_runtime.h"
#endif
namespace ceres::internal {
namespace {
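// Fills in the cumulative_nnz field of every row block as the prefix sum of
// the per-row-block nnz counts. The cumulative counts are later used as the
// iteration cost for guided parallel-for loops.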
void ComputeCumulativeNumberOfNonZeros(std::vector<CompressedList>& rows) {
if (rows.empty()) {
return;
}
rows[0].cumulative_nnz = rows[0].nnz;
for (int c = 1; c < rows.size(); ++c) {
const int curr_nnz = rows[c].nnz;
rows[c].cumulative_nnz = curr_nnz + rows[c - 1].cumulative_nnz;
}
}
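// Allocates a CompressedRowSparseMatrix of the given dimensions and fills in
// its rows and cols arrays from the block structure; the values are copied
// later by UpdateCompressedRowSparseMatrixImpl. When transpose is true the
// block structure is assumed to be a transposed one, with the per-row-block
// nnz counts already filled in.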
template <bool transpose>
std::unique_ptr<CompressedRowSparseMatrix>
CreateStructureOfCompressedRowSparseMatrix(
int num_rows,
int num_cols,
int num_nonzeros,
const CompressedRowBlockStructure* block_structure) {
auto crs_matrix = std::make_unique<CompressedRowSparseMatrix>(
num_rows, num_cols, num_nonzeros);
auto crs_cols = crs_matrix->mutable_cols();
auto crs_rows = crs_matrix->mutable_rows();
int value_offset = 0;
const int num_row_blocks = block_structure->rows.size();
const auto& cols = block_structure->cols;
*crs_rows++ = 0;
for (int row_block_id = 0; row_block_id < num_row_blocks; ++row_block_id) {
const auto& row_block = block_structure->rows[row_block_id];
// Empty row block: only requires setting row offsets
if (row_block.cells.empty()) {
std::fill(crs_rows, crs_rows + row_block.block.size, value_offset);
crs_rows += row_block.block.size;
continue;
}
int row_nnz = 0;
if constexpr (transpose) {
      // The transposed block structure has the nnz of each row block filled
      // in; every row in the block has the same number of non-zeros.
      row_nnz = row_block.nnz / row_block.block.size;
} else {
      // The nnz field of a non-transposed block structure is not filled in,
      // and its cells may be non-contiguous (consider the Jacobian used by
      // the Schur-complement solver, where the E and F blocks are stored
      // separately).
for (auto& c : row_block.cells) {
row_nnz += cols[c.block_id].size;
}
}
// Row-wise setup of matrix structure
for (int row = 0; row < row_block.block.size; ++row) {
value_offset += row_nnz;
*crs_rows++ = value_offset;
for (auto& c : row_block.cells) {
const int col_block_size = cols[c.block_id].size;
const int col_position = cols[c.block_id].position;
std::iota(crs_cols, crs_cols + col_block_size, col_position);
crs_cols += col_block_size;
}
}
}
return crs_matrix;
}
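// Copies the values of a block-sparse matrix into a CompressedRowSparseMatrix
// whose structure was created by CreateStructureOfCompressedRowSparseMatrix.
// When transpose is true, each cell is transposed as it is copied.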
template <bool transpose>
void UpdateCompressedRowSparseMatrixImpl(
CompressedRowSparseMatrix* crs_matrix,
const double* values,
const CompressedRowBlockStructure* block_structure) {
auto crs_values = crs_matrix->mutable_values();
auto crs_rows = crs_matrix->mutable_rows();
const int num_row_blocks = block_structure->rows.size();
const auto& cols = block_structure->cols;
for (int row_block_id = 0; row_block_id < num_row_blocks; ++row_block_id) {
const auto& row_block = block_structure->rows[row_block_id];
const int row_block_size = row_block.block.size;
const int row_nnz = crs_rows[1] - crs_rows[0];
crs_rows += row_block_size;
if (row_nnz == 0) {
continue;
}
MatrixRef crs_row_block(crs_values, row_block_size, row_nnz);
int col_offset = 0;
for (auto& c : row_block.cells) {
const int col_block_size = cols[c.block_id].size;
auto crs_cell =
crs_row_block.block(0, col_offset, row_block_size, col_block_size);
if constexpr (transpose) {
        // The transposed matrix is filled using the transposed block
        // structure, so each cell is transposed while being copied.
ConstMatrixRef cell(
values + c.position, col_block_size, row_block_size);
crs_cell = cell.transpose();
} else {
ConstMatrixRef cell(
values + c.position, row_block_size, col_block_size);
crs_cell = cell;
}
col_offset += col_block_size;
}
crs_values += row_nnz * row_block_size;
}
}
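// Copies the row and column blocks of the block structure into the
// corresponding row_blocks and col_blocks fields of the
// CompressedRowSparseMatrix.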
void SetBlockStructureOfCompressedRowSparseMatrix(
CompressedRowSparseMatrix* crs_matrix,
CompressedRowBlockStructure* block_structure) {
const int num_row_blocks = block_structure->rows.size();
auto& row_blocks = *crs_matrix->mutable_row_blocks();
row_blocks.resize(num_row_blocks);
for (int i = 0; i < num_row_blocks; ++i) {
row_blocks[i] = block_structure->rows[i].block;
}
auto& col_blocks = *crs_matrix->mutable_col_blocks();
col_blocks = block_structure->cols;
}
} // namespace
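// Takes ownership of the block structure, derives the matrix dimensions and
// the number of non-zeros from it, allocates the values array (optionally as
// page-locked memory for use with CUDA) and builds the transposed block
// structure.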
BlockSparseMatrix::BlockSparseMatrix(
CompressedRowBlockStructure* block_structure, bool use_page_locked_memory)
: use_page_locked_memory_(use_page_locked_memory),
num_rows_(0),
num_cols_(0),
num_nonzeros_(0),
block_structure_(block_structure) {
CHECK(block_structure_ != nullptr);
// Count the number of columns in the matrix.
for (auto& col : block_structure_->cols) {
num_cols_ += col.size;
}
// Count the number of non-zero entries and the number of rows in
// the matrix.
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_size = block_structure_->rows[i].block.size;
num_rows_ += row_block_size;
const std::vector<Cell>& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
num_nonzeros_ += col_block_size * row_block_size;
}
}
CHECK_GE(num_rows_, 0);
CHECK_GE(num_cols_, 0);
CHECK_GE(num_nonzeros_, 0);
VLOG(2) << "Allocating values array with " << num_nonzeros_ * sizeof(double)
<< " bytes."; // NOLINT
values_ = AllocateValues(num_nonzeros_);
max_num_nonzeros_ = num_nonzeros_;
CHECK(values_ != nullptr);
AddTransposeBlockStructure();
}
BlockSparseMatrix::~BlockSparseMatrix() { FreeValues(values_); }
void BlockSparseMatrix::AddTransposeBlockStructure() {
if (transpose_block_structure_ == nullptr) {
transpose_block_structure_ = CreateTranspose(*block_structure_);
}
}
void BlockSparseMatrix::SetZero() {
std::fill(values_, values_ + num_nonzeros_, 0.0);
}
void BlockSparseMatrix::SetZero(ContextImpl* context, int num_threads) {
ParallelSetZero(context, num_threads, values_, num_nonzeros_);
}
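// y += A x. The single-threaded overload delegates to the parallel overload
// with a null context and a single thread; the parallel overload processes
// one row block per parallel-for iteration.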
void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y) const {
RightMultiplyAndAccumulate(x, y, nullptr, 1);
}
void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
const auto values = values_;
const auto block_structure = block_structure_.get();
const auto num_row_blocks = block_structure->rows.size();
ParallelFor(context,
0,
num_row_blocks,
num_threads,
[values, block_structure, x, y](int row_block_id) {
const int row_block_pos =
block_structure->rows[row_block_id].block.position;
const int row_block_size =
block_structure->rows[row_block_id].block.size;
const auto& cells = block_structure->rows[row_block_id].cells;
for (const auto& cell : cells) {
const int col_block_id = cell.block_id;
const int col_block_size =
block_structure->cols[col_block_id].size;
const int col_block_pos =
block_structure->cols[col_block_id].position;
MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values + cell.position,
row_block_size,
col_block_size,
x + col_block_pos,
y + row_block_pos);
}
});
}
// TODO(https://github.com/ceres-solver/ceres-solver/issues/933): This method
// might benefit from caching column-block partition
void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
double* y,
ContextImpl* context,
int num_threads) const {
  // Using the transposed structure makes left-multiplication by a dense
  // vector parallelizable, but it scatters the accesses to the matrix
  // elements. Multiplication via the transposed structure is therefore only
  // worthwhile for parallel execution.
CHECK(x != nullptr);
CHECK(y != nullptr);
if (transpose_block_structure_ == nullptr || num_threads == 1) {
LeftMultiplyAndAccumulate(x, y);
return;
}
auto transpose_bs = transpose_block_structure_.get();
const auto values = values_;
const int num_col_blocks = transpose_bs->rows.size();
if (!num_col_blocks) {
return;
}
// Use non-zero count as iteration cost for guided parallel-for loop
ParallelFor(
context,
0,
num_col_blocks,
num_threads,
[values, transpose_bs, x, y](int row_block_id) {
int row_block_pos = transpose_bs->rows[row_block_id].block.position;
int row_block_size = transpose_bs->rows[row_block_id].block.size;
auto& cells = transpose_bs->rows[row_block_id].cells;
for (auto& cell : cells) {
const int col_block_id = cell.block_id;
const int col_block_size = transpose_bs->cols[col_block_id].size;
const int col_block_pos = transpose_bs->cols[col_block_id].position;
MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values + cell.position,
col_block_size,
row_block_size,
x + col_block_pos,
y + row_block_pos);
}
},
transpose_bs->rows.data(),
[](const CompressedRow& row) { return row.cumulative_nnz; });
}
void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
double* y) const {
CHECK(x != nullptr);
CHECK(y != nullptr);
  // Single-threaded left products are always computed using the
  // non-transposed block structure, because its access pattern over the
  // matrix elements is linear.
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_pos = block_structure_->rows[i].block.position;
int row_block_size = block_structure_->rows[i].block.size;
const auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
values_ + cell.position,
row_block_size,
col_block_size,
x + row_block_pos,
y + col_block_pos);
}
}
}
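// Computes the squared L2 norm of every column of the matrix and writes the
// result to x, which must have num_cols entries.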
void BlockSparseMatrix::SquaredColumnNorm(double* x) const {
CHECK(x != nullptr);
VectorRef(x, num_cols_).setZero();
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_size = block_structure_->rows[i].block.size;
auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
const MatrixRef m(
values_ + cell.position, row_block_size, col_block_size);
VectorRef(x + col_block_pos, col_block_size) += m.colwise().squaredNorm();
}
}
}
// TODO(https://github.com/ceres-solver/ceres-solver/issues/933): This method
// might benefit from caching column-block partition
void BlockSparseMatrix::SquaredColumnNorm(double* x,
ContextImpl* context,
int num_threads) const {
if (transpose_block_structure_ == nullptr || num_threads == 1) {
SquaredColumnNorm(x);
return;
}
CHECK(x != nullptr);
ParallelSetZero(context, num_threads, x, num_cols_);
auto transpose_bs = transpose_block_structure_.get();
const auto values = values_;
const int num_col_blocks = transpose_bs->rows.size();
ParallelFor(
context,
0,
num_col_blocks,
num_threads,
[values, transpose_bs, x](int row_block_id) {
const auto& row = transpose_bs->rows[row_block_id];
for (auto& cell : row.cells) {
const auto& col = transpose_bs->cols[cell.block_id];
const MatrixRef m(values + cell.position, col.size, row.block.size);
VectorRef(x + row.block.position, row.block.size) +=
m.colwise().squaredNorm();
}
},
transpose_bs->rows.data(),
[](const CompressedRow& row) { return row.cumulative_nnz; });
}
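// Scales the j-th column of the matrix by scale[j].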
void BlockSparseMatrix::ScaleColumns(const double* scale) {
CHECK(scale != nullptr);
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_size = block_structure_->rows[i].block.size;
auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
MatrixRef m(values_ + cell.position, row_block_size, col_block_size);
m *= ConstVectorRef(scale + col_block_pos, col_block_size).asDiagonal();
}
}
}
// TODO(https://github.com/ceres-solver/ceres-solver/issues/933): This method
// might benefit from caching column-block partition
void BlockSparseMatrix::ScaleColumns(const double* scale,
ContextImpl* context,
int num_threads) {
if (transpose_block_structure_ == nullptr || num_threads == 1) {
ScaleColumns(scale);
return;
}
CHECK(scale != nullptr);
auto transpose_bs = transpose_block_structure_.get();
auto values = values_;
const int num_col_blocks = transpose_bs->rows.size();
ParallelFor(
context,
0,
num_col_blocks,
num_threads,
[values, transpose_bs, scale](int row_block_id) {
const auto& row = transpose_bs->rows[row_block_id];
for (auto& cell : row.cells) {
const auto& col = transpose_bs->cols[cell.block_id];
MatrixRef m(values + cell.position, col.size, row.block.size);
m *= ConstVectorRef(scale + row.block.position, row.block.size)
.asDiagonal();
}
},
transpose_bs->rows.data(),
[](const CompressedRow& row) { return row.cumulative_nnz; });
}
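// Returns the transpose of this matrix as a CompressedRowSparseMatrix,
// creating both its structure and its values.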
std::unique_ptr<CompressedRowSparseMatrix>
BlockSparseMatrix::ToCompressedRowSparseMatrixTranspose() const {
auto bs = transpose_block_structure_.get();
auto crs_matrix = CreateStructureOfCompressedRowSparseMatrix<true>(
num_cols_, num_rows_, num_nonzeros_, bs);
SetBlockStructureOfCompressedRowSparseMatrix(crs_matrix.get(), bs);
UpdateCompressedRowSparseMatrixTranspose(crs_matrix.get());
return crs_matrix;
}
std::unique_ptr<CompressedRowSparseMatrix>
BlockSparseMatrix::ToCompressedRowSparseMatrix() const {
auto crs_matrix = CreateStructureOfCompressedRowSparseMatrix<false>(
num_rows_, num_cols_, num_nonzeros_, block_structure_.get());
SetBlockStructureOfCompressedRowSparseMatrix(crs_matrix.get(),
block_structure_.get());
UpdateCompressedRowSparseMatrix(crs_matrix.get());
return crs_matrix;
}
void BlockSparseMatrix::UpdateCompressedRowSparseMatrixTranspose(
CompressedRowSparseMatrix* crs_matrix) const {
CHECK(crs_matrix != nullptr);
CHECK_EQ(crs_matrix->num_rows(), num_cols_);
CHECK_EQ(crs_matrix->num_cols(), num_rows_);
CHECK_EQ(crs_matrix->num_nonzeros(), num_nonzeros_);
UpdateCompressedRowSparseMatrixImpl<true>(
crs_matrix, values(), transpose_block_structure_.get());
}
void BlockSparseMatrix::UpdateCompressedRowSparseMatrix(
CompressedRowSparseMatrix* crs_matrix) const {
CHECK(crs_matrix != nullptr);
CHECK_EQ(crs_matrix->num_rows(), num_rows_);
CHECK_EQ(crs_matrix->num_cols(), num_cols_);
CHECK_EQ(crs_matrix->num_nonzeros(), num_nonzeros_);
UpdateCompressedRowSparseMatrixImpl<false>(
crs_matrix, values(), block_structure_.get());
}
void BlockSparseMatrix::ToDenseMatrix(Matrix* dense_matrix) const {
CHECK(dense_matrix != nullptr);
dense_matrix->resize(num_rows_, num_cols_);
dense_matrix->setZero();
Matrix& m = *dense_matrix;
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_pos = block_structure_->rows[i].block.position;
int row_block_size = block_structure_->rows[i].block.size;
auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
int jac_pos = cell.position;
m.block(row_block_pos, col_block_pos, row_block_size, col_block_size) +=
MatrixRef(values_ + jac_pos, row_block_size, col_block_size);
}
}
}
void BlockSparseMatrix::ToTripletSparseMatrix(
TripletSparseMatrix* matrix) const {
CHECK(matrix != nullptr);
matrix->Reserve(num_nonzeros_);
matrix->Resize(num_rows_, num_cols_);
matrix->SetZero();
for (int i = 0; i < block_structure_->rows.size(); ++i) {
int row_block_pos = block_structure_->rows[i].block.position;
int row_block_size = block_structure_->rows[i].block.size;
const auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
int col_block_id = cell.block_id;
int col_block_size = block_structure_->cols[col_block_id].size;
int col_block_pos = block_structure_->cols[col_block_id].position;
int jac_pos = cell.position;
for (int r = 0; r < row_block_size; ++r) {
for (int c = 0; c < col_block_size; ++c, ++jac_pos) {
matrix->mutable_rows()[jac_pos] = row_block_pos + r;
matrix->mutable_cols()[jac_pos] = col_block_pos + c;
matrix->mutable_values()[jac_pos] = values_[jac_pos];
}
}
}
}
matrix->set_num_nonzeros(num_nonzeros_);
}
// Return a pointer to the block structure. We continue to hold
// ownership of the object though.
const CompressedRowBlockStructure* BlockSparseMatrix::block_structure() const {
return block_structure_.get();
}
// Return a pointer to the block structure of matrix transpose. We continue to
// hold ownership of the object though.
const CompressedRowBlockStructure*
BlockSparseMatrix::transpose_block_structure() const {
return transpose_block_structure_.get();
}
void BlockSparseMatrix::ToTextFile(FILE* file) const {
CHECK(file != nullptr);
for (int i = 0; i < block_structure_->rows.size(); ++i) {
const int row_block_pos = block_structure_->rows[i].block.position;
const int row_block_size = block_structure_->rows[i].block.size;
const auto& cells = block_structure_->rows[i].cells;
for (const auto& cell : cells) {
const int col_block_id = cell.block_id;
const int col_block_size = block_structure_->cols[col_block_id].size;
const int col_block_pos = block_structure_->cols[col_block_id].position;
int jac_pos = cell.position;
for (int r = 0; r < row_block_size; ++r) {
for (int c = 0; c < col_block_size; ++c) {
absl::FPrintF(file,
"% 10d % 10d %17f\n",
row_block_pos + r,
col_block_pos + c,
values_[jac_pos++]);
}
}
}
}
}
std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateDiagonalMatrix(
const double* diagonal, const std::vector<Block>& column_blocks) {
// Create the block structure for the diagonal matrix.
auto* bs = new CompressedRowBlockStructure();
bs->cols = column_blocks;
int position = 0;
bs->rows.resize(column_blocks.size(), CompressedRow(1));
for (int i = 0; i < column_blocks.size(); ++i) {
CompressedRow& row = bs->rows[i];
row.block = column_blocks[i];
Cell& cell = row.cells[0];
cell.block_id = i;
cell.position = position;
position += row.block.size * row.block.size;
}
// Create the BlockSparseMatrix with the given block structure.
auto matrix = std::make_unique<BlockSparseMatrix>(bs);
matrix->SetZero();
// Fill the values array of the block sparse matrix.
double* values = matrix->mutable_values();
for (const auto& column_block : column_blocks) {
const int size = column_block.size;
for (int j = 0; j < size; ++j) {
      // j * (size + 1) is a compact way of accessing the (j, j) entry.
values[j * (size + 1)] = diagonal[j];
}
diagonal += size;
values += size * size;
}
return matrix;
}
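// Appends the rows of m below the rows of this matrix. m must have the same
// number of columns and column blocks. The values array is reallocated if the
// combined number of non-zeros exceeds the current capacity, and the
// transposed block structure, if present, is extended to match.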
void BlockSparseMatrix::AppendRows(const BlockSparseMatrix& m) {
CHECK_EQ(m.num_cols(), num_cols());
const CompressedRowBlockStructure* m_bs = m.block_structure();
CHECK_EQ(m_bs->cols.size(), block_structure_->cols.size());
const int old_num_nonzeros = num_nonzeros_;
const int old_num_row_blocks = block_structure_->rows.size();
block_structure_->rows.resize(old_num_row_blocks + m_bs->rows.size());
for (int i = 0; i < m_bs->rows.size(); ++i) {
const CompressedRow& m_row = m_bs->rows[i];
const int row_block_id = old_num_row_blocks + i;
CompressedRow& row = block_structure_->rows[row_block_id];
row.block.size = m_row.block.size;
row.block.position = num_rows_;
num_rows_ += m_row.block.size;
row.cells.resize(m_row.cells.size());
if (transpose_block_structure_) {
transpose_block_structure_->cols.emplace_back(row.block);
}
for (int c = 0; c < m_row.cells.size(); ++c) {
const int block_id = m_row.cells[c].block_id;
row.cells[c].block_id = block_id;
row.cells[c].position = num_nonzeros_;
const int cell_nnz = m_row.block.size * m_bs->cols[block_id].size;
if (transpose_block_structure_) {
transpose_block_structure_->rows[block_id].cells.emplace_back(
row_block_id, num_nonzeros_);
transpose_block_structure_->rows[block_id].nnz += cell_nnz;
}
num_nonzeros_ += cell_nnz;
}
}
if (num_nonzeros_ > max_num_nonzeros_) {
double* old_values = values_;
values_ = AllocateValues(num_nonzeros_);
std::copy_n(old_values, old_num_nonzeros, values_);
max_num_nonzeros_ = num_nonzeros_;
FreeValues(old_values);
}
std::copy(
m.values(), m.values() + m.num_nonzeros(), values_ + old_num_nonzeros);
if (transpose_block_structure_ == nullptr) {
return;
}
ComputeCumulativeNumberOfNonZeros(transpose_block_structure_->rows);
}
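// Removes the last delta_row_blocks row blocks from the matrix. Only the
// block structure, the row and non-zero counts, and (if present) the
// transposed block structure are updated; the values array is not shrunk.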
void BlockSparseMatrix::DeleteRowBlocks(const int delta_row_blocks) {
const int num_row_blocks = block_structure_->rows.size();
const int new_num_row_blocks = num_row_blocks - delta_row_blocks;
int delta_num_nonzeros = 0;
int delta_num_rows = 0;
const std::vector<Block>& column_blocks = block_structure_->cols;
for (int i = 0; i < delta_row_blocks; ++i) {
const CompressedRow& row = block_structure_->rows[num_row_blocks - i - 1];
delta_num_rows += row.block.size;
for (int c = 0; c < row.cells.size(); ++c) {
const Cell& cell = row.cells[c];
delta_num_nonzeros += row.block.size * column_blocks[cell.block_id].size;
if (transpose_block_structure_) {
auto& col_cells = transpose_block_structure_->rows[cell.block_id].cells;
while (!col_cells.empty() &&
col_cells.back().block_id >= new_num_row_blocks) {
const int del_block_id = col_cells.back().block_id;
const int del_block_rows =
block_structure_->rows[del_block_id].block.size;
const int del_block_cols = column_blocks[cell.block_id].size;
const int del_cell_nnz = del_block_rows * del_block_cols;
transpose_block_structure_->rows[cell.block_id].nnz -= del_cell_nnz;
col_cells.pop_back();
}
}
}
}
num_nonzeros_ -= delta_num_nonzeros;
num_rows_ -= delta_num_rows;
block_structure_->rows.resize(new_num_row_blocks);
if (transpose_block_structure_ == nullptr) {
return;
}
for (int i = 0; i < delta_row_blocks; ++i) {
transpose_block_structure_->cols.pop_back();
}
ComputeCumulativeNumberOfNonZeros(transpose_block_structure_->rows);
}
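// Creates a random block-sparse matrix: the column blocks are either taken
// from options.col_blocks or sampled uniformly from the configured size
// range, each cell is present with probability options.block_density (the
// row structure is resampled until at least one cell exists), and the values
// are drawn from a standard normal distribution.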
std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix(
const BlockSparseMatrix::RandomMatrixOptions& options,
std::mt19937& prng,
bool use_page_locked_memory) {
CHECK_GT(options.num_row_blocks, 0);
CHECK_GT(options.min_row_block_size, 0);
CHECK_GT(options.max_row_block_size, 0);
CHECK_LE(options.min_row_block_size, options.max_row_block_size);
CHECK_GT(options.block_density, 0.0);
CHECK_LE(options.block_density, 1.0);
std::uniform_int_distribution<int> col_distribution(
options.min_col_block_size, options.max_col_block_size);
std::uniform_int_distribution<int> row_distribution(
options.min_row_block_size, options.max_row_block_size);
auto bs = std::make_unique<CompressedRowBlockStructure>();
if (options.col_blocks.empty()) {
CHECK_GT(options.num_col_blocks, 0);
CHECK_GT(options.min_col_block_size, 0);
CHECK_GT(options.max_col_block_size, 0);
CHECK_LE(options.min_col_block_size, options.max_col_block_size);
// Generate the col block structure.
int col_block_position = 0;
for (int i = 0; i < options.num_col_blocks; ++i) {
const int col_block_size = col_distribution(prng);
bs->cols.emplace_back(col_block_size, col_block_position);
col_block_position += col_block_size;
}
} else {
bs->cols = options.col_blocks;
}
bool matrix_has_blocks = false;
std::uniform_real_distribution<double> uniform01(0.0, 1.0);
while (!matrix_has_blocks) {
VLOG(1) << "Clearing";
bs->rows.clear();
int row_block_position = 0;
int value_position = 0;
for (int r = 0; r < options.num_row_blocks; ++r) {
const int row_block_size = row_distribution(prng);
bs->rows.emplace_back();
CompressedRow& row = bs->rows.back();
row.block.size = row_block_size;
row.block.position = row_block_position;
row_block_position += row_block_size;
for (int c = 0; c < bs->cols.size(); ++c) {
if (uniform01(prng) > options.block_density) continue;
row.cells.emplace_back();
Cell& cell = row.cells.back();
cell.block_id = c;
cell.position = value_position;
value_position += row_block_size * bs->cols[c].size;
matrix_has_blocks = true;
}
}
}
auto matrix =
std::make_unique<BlockSparseMatrix>(bs.release(), use_page_locked_memory);
double* values = matrix->mutable_values();
std::normal_distribution<double> standard_normal_distribution;
std::generate_n(
values, matrix->num_nonzeros(), [&standard_normal_distribution, &prng] {
return standard_normal_distribution(prng);
});
return matrix;
}
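// Builds the block structure of the transpose of bs: row blocks and column
// blocks swap roles, and the nnz and cumulative_nnz fields of the transposed
// row blocks are filled in.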
std::unique_ptr<CompressedRowBlockStructure> CreateTranspose(
const CompressedRowBlockStructure& bs) {
auto transpose = std::make_unique<CompressedRowBlockStructure>();
transpose->rows.resize(bs.cols.size());
for (int i = 0; i < bs.cols.size(); ++i) {
transpose->rows[i].block = bs.cols[i];
transpose->rows[i].nnz = 0;
}
transpose->cols.resize(bs.rows.size());
for (int i = 0; i < bs.rows.size(); ++i) {
auto& row = bs.rows[i];
transpose->cols[i] = row.block;
const int nrows = row.block.size;
for (auto& cell : row.cells) {
transpose->rows[cell.block_id].cells.emplace_back(i, cell.position);
const int ncols = transpose->rows[cell.block_id].block.size;
transpose->rows[cell.block_id].nnz += nrows * ncols;
}
}
ComputeCumulativeNumberOfNonZeros(transpose->rows);
return transpose;
}
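// Allocates the values array with operator new[] or, if page-locked memory
// was requested, as CUDA pinned host memory via cudaHostAlloc. FreeValues
// releases the array with the matching deallocation routine.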
double* BlockSparseMatrix::AllocateValues(int size) {
if (!use_page_locked_memory_) {
return new double[size];
}
#ifndef CERES_NO_CUDA
double* values = nullptr;
CHECK_EQ(cudaSuccess,
cudaHostAlloc(&values, sizeof(double) * size, cudaHostAllocDefault));
return values;
#else
LOG(FATAL) << "Page locked memory requested when CUDA is not available. "
<< "This is a Ceres bug; please contact the developers!";
return nullptr;
#endif
}
void BlockSparseMatrix::FreeValues(double*& values) {
if (!use_page_locked_memory_) {
delete[] values;
values = nullptr;
return;
}
#ifndef CERES_NO_CUDA
CHECK_EQ(cudaSuccess, cudaFreeHost(values));
values = nullptr;
#else
LOG(FATAL) << "Page locked memory requested when CUDA is not available. "
<< "This is a Ceres bug; please contact the developers!";
#endif
}
} // namespace ceres::internal