CUDA partitioned matrix view

Converts BlockSparseMatrix into two instances of CudaSparseMatrix,
corresponding to the left (E) and right (F) sub-matrices.

Values of the E sub-matrix are always copied as-is, and values of the
F sub-matrix are copied as-is if every row-block of the F sub-matrix
satisfies at least one of the following conditions:
 - There is at most one cell in the row-block
 - The row-block has a height of 1 row
Otherwise, indices of values in CRS order corresponding to value indices
in block-sparse order are computed on-the-fly.
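
A minimal usage sketch (hypothetical surrounding code; bsm, num_col_blocks_e
and context are assumed to be provided by the caller, and the wrapper
function name is illustrative):

    #include "ceres/cuda_partitioned_block_sparse_crs_view.h"

    using ceres::internal::BlockSparseMatrix;
    using ceres::internal::ContextImpl;
    using ceres::internal::CudaPartitionedBlockSparseCRSView;
    using ceres::internal::CudaSparseMatrix;

    // bsm has the cell layout produced by BlockJacobianWriter,
    // num_col_blocks_e is the number of column-blocks of the E sub-matrix,
    // context is an initialized ContextImpl.
    void UsePartitionedView(const BlockSparseMatrix& bsm,
                            int num_col_blocks_e,
                            ContextImpl* context) {
      CudaPartitionedBlockSparseCRSView view(bsm, num_col_blocks_e, context);
      // CRS sub-matrices share the structure computed at construction time.
      CudaSparseMatrix* matrix_e = view.mutable_matrix_e();
      CudaSparseMatrix* matrix_f = view.mutable_matrix_f();
      // After every update of the block-sparse values:
      view.UpdateValues(bsm);
      (void)matrix_e;
      (void)matrix_f;
    }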

Change-Id: I14eee00c36ee74b6b83fc85927907641383abfc7
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index b67b10e..aff218e 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -186,6 +186,7 @@
     covariance.cc
     covariance_impl.cc
     cuda_block_sparse_crs_view.cc
+    cuda_partitioned_block_sparse_crs_view.cc
     cuda_block_structure.cc
     cuda_sparse_matrix.cc
     cuda_vector.cc
@@ -478,6 +479,7 @@
   ceres_test(cost_function_to_functor)
   ceres_test(covariance)
   ceres_test(cubic_interpolation)
+  ceres_test(cuda_partitioned_block_sparse_crs_view)
   ceres_test(cuda_block_sparse_crs_view)
   ceres_test(cuda_block_structure)
   ceres_test(cuda_dense_cholesky)
diff --git a/internal/ceres/block_jacobian_writer.cc b/internal/ceres/block_jacobian_writer.cc
index f74d64d..b009e96 100644
--- a/internal/ceres/block_jacobian_writer.cc
+++ b/internal/ceres/block_jacobian_writer.cc
@@ -54,6 +54,13 @@
 // the first num_eliminate_blocks parameter blocks as indicated by the parameter
 // block ordering. The remaining parameter blocks are the F blocks.
 //
+// In order to simplify handling of block-sparse to CRS conversion, cells
+// within a row-block of a non-partitioned matrix are stored in memory
+// sequentially in the order of increasing column-block id. In the case of
+// partitioned matrices, cells corresponding to the F sub-matrix are stored
+// sequentially in the order of increasing column-block id (with cells
+// corresponding to the E sub-matrix stored separately).
+//
 // TODO(keir): Consider if we should use a boolean for each parameter block
 // instead of num_eliminate_blocks.
 void BuildJacobianLayout(const Program& program,
@@ -95,29 +102,54 @@
 
   int e_block_pos = 0;
   int* jacobian_pos = jacobian_layout_storage->data();
+  std::vector<std::pair<int, int>> active_parameter_blocks;
   for (int i = 0; i < residual_blocks.size(); ++i) {
     const ResidualBlock* residual_block = residual_blocks[i];
     const int num_residuals = residual_block->NumResiduals();
     const int num_parameter_blocks = residual_block->NumParameterBlocks();
 
     (*jacobian_layout)[i] = jacobian_pos;
+    // Cells from the F sub-matrix are to be stored sequentially with
+    // increasing column-block id. For each non-constant parameter block, a
+    // pair of indices (index in the list of active parameter blocks and index
+    // in the list of all parameter blocks) is computed, and the index pairs
+    // are sorted by the corresponding column-block index.
+    active_parameter_blocks.clear();
+    active_parameter_blocks.reserve(num_parameter_blocks);
     for (int j = 0; j < num_parameter_blocks; ++j) {
       ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
-      const int parameter_block_index = parameter_block->index();
       if (parameter_block->IsConstant()) {
         continue;
       }
+      const int k = active_parameter_blocks.size();
+      active_parameter_blocks.emplace_back(k, j);
+    }
+    std::sort(active_parameter_blocks.begin(),
+              active_parameter_blocks.end(),
+              [&residual_block](const std::pair<int, int>& a,
+                                const std::pair<int, int>& b) {
+                return residual_block->parameter_blocks()[a.second]->index() <
+                       residual_block->parameter_blocks()[b.second]->index();
+              });
+    // Cell positions for each active parameter block are filled in the order
+    // of active parameter block indices sorted by column-block index. This
+    // guarantees that cells are laid out sequentially with increasing
+    // column-block indices.
+    for (const auto& indices : active_parameter_blocks) {
+      const auto [k, j] = indices;
+      ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
+      const int parameter_block_index = parameter_block->index();
       const int jacobian_block_size =
           num_residuals * parameter_block->TangentSize();
       if (parameter_block_index < num_eliminate_blocks) {
-        *jacobian_pos = e_block_pos;
+        jacobian_pos[k] = e_block_pos;
         e_block_pos += jacobian_block_size;
       } else {
-        *jacobian_pos = f_block_pos;
+        jacobian_pos[k] = f_block_pos;
         f_block_pos += jacobian_block_size;
       }
-      jacobian_pos++;
     }
+    jacobian_pos += active_parameter_blocks.size();
   }
 }
 
diff --git a/internal/ceres/block_jacobian_writer.h b/internal/ceres/block_jacobian_writer.h
index 61f69b3..30b5f02 100644
--- a/internal/ceres/block_jacobian_writer.h
+++ b/internal/ceres/block_jacobian_writer.h
@@ -53,6 +53,17 @@
 // TODO(sameeragarwal): This class needs documentation.
 class CERES_NO_EXPORT BlockJacobianWriter {
  public:
+  // Pre-computes positions of cells in the block-sparse jacobian.
+  // Two possible memory layouts are implemented:
+  //  - Non-partitioned case
+  //  - Partitioned case (for Schur-type linear solvers)
+  //
+  // In the non-partitioned case, cells are stored sequentially in the
+  // lexicographic order of (row block id, column block id).
+  //
+  // In the partitioned case, cells of each sub-matrix (E and F) are stored
+  // sequentially in the lexicographic order of (row block id, column block
+  // id), and cells of the E sub-matrix precede cells of the F sub-matrix.
   BlockJacobianWriter(const Evaluator::Options& options, Program* program);
 
   // JacobianWriter interface.
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc
index bb267da..6d4c6b0 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -41,8 +41,6 @@
     : context_(context) {
   block_structure_ = std::make_unique<CudaBlockSparseStructure>(
       *bsm.block_structure(), context);
-  // Only block-sparse matrices with sequential layout of cells are supported
-  CHECK(block_structure_->sequential_layout());
   crs_matrix_ = std::make_unique<CudaSparseMatrix>(
       bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context);
   FillCRSStructure(block_structure_->num_row_blocks(),
diff --git a/internal/ceres/cuda_block_structure.cc b/internal/ceres/cuda_block_structure.cc
index 5817cd7..3685775 100644
--- a/internal/ceres/cuda_block_structure.cc
+++ b/internal/ceres/cuda_block_structure.cc
@@ -43,10 +43,16 @@
   return last.size + last.position;
 }
 }  // namespace
-
 CudaBlockSparseStructure::CudaBlockSparseStructure(
     const CompressedRowBlockStructure& block_structure, ContextImpl* context)
+    : CudaBlockSparseStructure(block_structure, 0, context) {}
+
+CudaBlockSparseStructure::CudaBlockSparseStructure(
+    const CompressedRowBlockStructure& block_structure,
+    const int num_col_blocks_e,
+    ContextImpl* context)
     : first_cell_in_row_block_(context),
+      value_offset_row_block_f_(context),
       cells_(context),
       row_blocks_(context),
       col_blocks_(context) {
@@ -57,40 +63,137 @@
 
   // Row block offset is an index of the first cell corresponding to row block
   std::vector<int> first_cell_in_row_block;
+  // For each row-block, offset of the first value of F sub-matrix in this or
+  // subsequent row-blocks
+  std::vector<int> value_offset_row_block_f;
   // Flat array of all cells from all row-blocks
   std::vector<Cell> cells;
 
-  int f_values_offset = 0;
+  int f_values_offset = -1;
+  num_nonzeros_e_ = 0;
   is_crs_compatible_ = true;
   num_row_blocks_ = block_structure.rows.size();
   num_col_blocks_ = col_blocks.size();
 
   row_blocks.reserve(num_row_blocks_);
   first_cell_in_row_block.reserve(num_row_blocks_ + 1);
+  value_offset_row_block_f.reserve(num_row_blocks_ + 1);
   num_nonzeros_ = 0;
-  sequential_layout_ = true;
-  for (const auto& r : block_structure.rows) {
+  // Block-sparse matrices arising from BlockJacobianWriter are expected to
+  // have a sequential layout (for partitioned matrices, it is expected that
+  // both E and F sub-matrices have a sequential layout).
+  bool sequential_layout = true;
+  int row_block_id = 0;
+  num_row_blocks_e_ = 0;
+  for (; row_block_id < num_row_blocks_; ++row_block_id) {
+    const auto& r = block_structure.rows[row_block_id];
     const int row_block_size = r.block.size;
-    if (r.cells.size() > 1 && row_block_size > 1) {
+    const int num_cells = r.cells.size();
+
+    if (num_col_blocks_e == 0 || r.cells.size() == 0 ||
+        r.cells[0].block_id >= num_col_blocks_e) {
+      break;
+    }
+    num_row_blocks_e_ = row_block_id + 1;
+    // In the E sub-matrix there is exactly one cell per row-block. Since E
+    // cells are stored separately from F cells, CRS-compatibility of the F
+    // sub-matrix only breaks if there are more than 2 cells in the row-block
+    // (that is, more than 1 cell in the F sub-matrix).
+    if (num_cells > 2 && row_block_size > 1) {
       is_crs_compatible_ = false;
     }
     row_blocks.emplace_back(r.block);
     first_cell_in_row_block.push_back(cells.size());
+
+    for (int cell_id = 0; cell_id < num_cells; ++cell_id) {
+      const auto& c = r.cells[cell_id];
+      const int col_block_size = col_blocks[c.block_id].size;
+      const int cell_size = col_block_size * row_block_size;
+      cells.push_back(c);
+      if (cell_id == 0) {
+        DCHECK(c.position == num_nonzeros_e_);
+        num_nonzeros_e_ += cell_size;
+      } else {
+        if (f_values_offset == -1) {
+          num_nonzeros_ = c.position;
+          f_values_offset = c.position;
+        }
+        sequential_layout &= c.position == num_nonzeros_;
+        num_nonzeros_ += cell_size;
+        if (cell_id == 1) {
+          // Correct value_offset_row_block_f for empty row-blocks of F
+          // preceding this one
+          for (auto it = value_offset_row_block_f.rbegin();
+               it != value_offset_row_block_f.rend();
+               ++it) {
+            if (*it != -1) break;
+            *it = c.position;
+          }
+          value_offset_row_block_f.push_back(c.position);
+        }
+      }
+    }
+    if (num_cells == 1) {
+      value_offset_row_block_f.push_back(-1);
+    }
+  }
+  for (; row_block_id < num_row_blocks_; ++row_block_id) {
+    const auto& r = block_structure.rows[row_block_id];
+    const int row_block_size = r.block.size;
+    const int num_cells = r.cells.size();
+    // After num_row_blocks_e_ row-blocks, there should be no cells in the E
+    // sub-matrix. Thus CRS-compatibility of the F sub-matrix breaks if there
+    // is more than one cell in the row-block.
+    if (num_cells > 1 && row_block_size > 1) {
+      is_crs_compatible_ = false;
+    }
+    row_blocks.emplace_back(r.block);
+    first_cell_in_row_block.push_back(cells.size());
+
+    if (r.cells.empty()) {
+      value_offset_row_block_f.push_back(-1);
+    } else {
+      for (auto it = value_offset_row_block_f.rbegin();
+           it != value_offset_row_block_f.rend();
+           ++it) {
+        if (*it != -1) break;
+        *it = r.cells[0].position;
+      }
+      value_offset_row_block_f.push_back(r.cells[0].position);
+    }
     for (const auto& c : r.cells) {
       const int col_block_size = col_blocks[c.block_id].size;
       const int cell_size = col_block_size * row_block_size;
       cells.push_back(c);
-      sequential_layout_ &= c.position == num_nonzeros_;
+      DCHECK(c.block_id >= num_col_blocks_e);
+      if (f_values_offset == -1) {
+        num_nonzeros_ = c.position;
+        f_values_offset = c.position;
+      }
+      sequential_layout &= c.position == num_nonzeros_;
       num_nonzeros_ += cell_size;
     }
   }
+
+  if (f_values_offset == -1) {
+    f_values_offset = num_nonzeros_e_;
+    num_nonzeros_ = num_nonzeros_e_;
+  }
+  // Fill value offsets for trailing row-blocks of F sub-matrix without cells
+  for (auto it = value_offset_row_block_f.rbegin();
+       it != value_offset_row_block_f.rend();
+       ++it) {
+    if (*it != -1) break;
+    *it = num_nonzeros_;
+  }
+  value_offset_row_block_f.push_back(num_nonzeros_);
+  CHECK_EQ(num_nonzeros_e_, f_values_offset);
   first_cell_in_row_block.push_back(cells.size());
   num_cells_ = cells.size();
 
   num_rows_ = Dimension(row_blocks);
   num_cols_ = Dimension(col_blocks);
 
-  is_crs_compatible_ &= sequential_layout_;
+  CHECK(sequential_layout);
 
   if (VLOG_IS_ON(3)) {
     const size_t first_cell_in_row_block_size =
@@ -122,6 +225,9 @@
   cells_.CopyFromCpuVector(cells);
   row_blocks_.CopyFromCpuVector(row_blocks);
   col_blocks_.CopyFromCpuVector(col_blocks);
+  if (num_col_blocks_e || num_row_blocks_e_) {
+    value_offset_row_block_f_.CopyFromCpuVector(value_offset_row_block_f);
+  }
 }
 }  // namespace ceres::internal
 
diff --git a/internal/ceres/cuda_block_structure.h b/internal/ceres/cuda_block_structure.h
index 28e36b2..6da6fdd 100644
--- a/internal/ceres/cuda_block_structure.h
+++ b/internal/ceres/cuda_block_structure.h
@@ -56,28 +56,40 @@
   // array of cells on cpu and transfer it to the gpu.
   CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure,
                            ContextImpl* context);
+  // In the case of partitioned matrices, the number of non-zeros in the E
+  // sub-matrix and the value layout of the F sub-matrix are also computed
+  CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure,
+                           const int num_col_blocks_e,
+                           ContextImpl* context);
 
   int num_rows() const { return num_rows_; }
   int num_cols() const { return num_cols_; }
   int num_cells() const { return num_cells_; }
   int num_nonzeros() const { return num_nonzeros_; }
+  // When the partitioned-matrix constructor was used, returns the number of
+  // non-zeros in the E sub-matrix
+  int num_nonzeros_e() const { return num_nonzeros_e_; }
   int num_row_blocks() const { return num_row_blocks_; }
+  int num_row_blocks_e() const { return num_row_blocks_e_; }
   int num_col_blocks() const { return num_col_blocks_; }
 
-  // Returns true if values from block-sparse matrix can be copied to CRS matrix
-  // as-is. This is possible if each row-block is stored in CRS order:
+  // Returns true if values from block-sparse matrix (F sub-matrix in
+  // partitioned case) can be copied to CRS matrix as-is. This is possible if
+  // each row-block is stored in CRS order:
   //  - Row-block consists of a single row
   //  - Row-block contains a single cell
   bool IsCrsCompatible() const { return is_crs_compatible_; }
-  // Returns true if block-sparse structure corresponds to block-sparse matrix
-  // with sequential cell positions
-  bool sequential_layout() const { return sequential_layout_; }
 
   // Device pointer to array of num_row_blocks + 1 indices of the first cell of
   // row block
   const int* first_cell_in_row_block() const {
     return first_cell_in_row_block_.data();
   }
+  // Device pointer to array of num_row_blocks + 1 indices of the first value in
+  // this or subsequent row-blocks of submatrix F
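+  // For example (hypothetical values): with num_nonzeros_e = 4 and three
+  // row-blocks whose first F values are at positions 4, 10 and 10 of the
+  // block-sparse value array (the second row-block having no F cells), the
+  // array is [4, 10, 10, num_nonzeros]; PermuteToCRSPartitionedF performs a
+  // binary search over it to locate the row-block containing a given value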
+  const int* value_offset_row_block_f() const {
+    return value_offset_row_block_f_.data();
+  }
   // Device pointer to array of num_cells cells, sorted by row-block
   const Cell* cells() const { return cells_.data(); }
   // Device pointer to array of row blocks
@@ -90,11 +102,13 @@
   int num_cols_;
   int num_cells_;
   int num_nonzeros_;
+  int num_nonzeros_e_;
   int num_row_blocks_;
+  int num_row_blocks_e_;
   int num_col_blocks_;
   bool is_crs_compatible_;
-  bool sequential_layout_;
   CudaBuffer<int> first_cell_in_row_block_;
+  CudaBuffer<int> value_offset_row_block_f_;
   CudaBuffer<Cell> cells_;
   CudaBuffer<Block> row_blocks_;
   CudaBuffer<Block> col_blocks_;
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
index a1ea6af..0a21c8d 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -93,13 +93,17 @@
 // - row_block_ids: row_block_ids[i] will be set to index of row-block that
 // contains i-th row.
 // Computation is perform row-block-wise
+template <bool partitioned = false>
 __global__ void RowBlockIdAndNNZ(
-    int num_row_blocks,
+    const int num_row_blocks,
+    const int num_col_blocks_e,
+    const int num_row_blocks_e,
     const int* __restrict__ first_cell_in_row_block,
     const Cell* __restrict__ cells,
     const Block* __restrict__ row_blocks,
     const Block* __restrict__ col_blocks,
-    int* __restrict__ rows,
+    int* __restrict__ rows_e,
+    int* __restrict__ rows_f,
     int* __restrict__ row_block_ids) {
   const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x;
   if (row_block_id > num_row_blocks) {
@@ -108,20 +112,32 @@
   }
   if (row_block_id == num_row_blocks) {
     // one extra thread sets the first element
-    rows[0] = 0;
+    rows_f[0] = 0;
+    if constexpr (partitioned) {
+      rows_e[0] = 0;
+    }
     return;
   }
   const auto& row_block = row_blocks[row_block_id];
-  int row_nnz = 0;
-  const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+  auto first_cell = cells + first_cell_in_row_block[row_block_id];
   const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+  int row_nnz_e = 0;
+  if (partitioned && row_block_id < num_row_blocks_e) {
+    // First cell is a cell from E
+    row_nnz_e = col_blocks[first_cell->block_id].size;
+    ++first_cell;
+  }
+  int row_nnz_f = 0;
   for (auto cell = first_cell; cell < last_cell; ++cell) {
-    row_nnz += col_blocks[cell->block_id].size;
+    row_nnz_f += col_blocks[cell->block_id].size;
   }
   const int first_row = row_block.position;
   const int last_row = first_row + row_block.size;
   for (int i = first_row; i < last_row; ++i) {
-    rows[i + 1] = row_nnz;
+    if constexpr (partitioned) {
+      rows_e[i + 1] = row_nnz_e;
+    }
+    rows_f[i + 1] = row_nnz_f;
     row_block_ids[i] = row_block_id;
   }
 }
@@ -139,15 +155,19 @@
 // Outputs:
 // - cols: column-index array of CRS structure
 // Computaion is perform row-wise
-__global__ void ComputeColumnsAndPermutation(
-    int num_rows,
-    const int* __restrict__ first_cell_in_row_block,
-    const Cell* __restrict__ cells,
-    const Block* __restrict__ row_blocks,
-    const Block* __restrict__ col_blocks,
-    const int* __restrict__ row_block_ids,
-    const int* __restrict__ rows,
-    int* __restrict__ cols) {
+template <bool partitioned>
+__global__ void ComputeColumns(const int num_rows,
+                               const int num_row_blocks_e,
+                               const int num_col_blocks_e,
+                               const int* __restrict__ first_cell_in_row_block,
+                               const Cell* __restrict__ cells,
+                               const Block* __restrict__ row_blocks,
+                               const Block* __restrict__ col_blocks,
+                               const int* __restrict__ row_block_ids,
+                               const int* __restrict__ rows_e,
+                               int* __restrict__ cols_e,
+                               const int* __restrict__ rows_f,
+                               int* __restrict__ cols_f) {
   const int row = blockIdx.x * blockDim.x + threadIdx.x;
   if (row >= num_rows) {
     // No synchronization is performed in this kernel, thus it is safe to return
@@ -155,17 +175,30 @@
   }
   const int row_block_id = row_block_ids[row];
   // position in crs matrix
-  int crs_position = rows[row];
-  const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+  auto first_cell = cells + first_cell_in_row_block[row_block_id];
   const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+  const int num_cols_e = col_blocks[num_col_blocks_e].position;
   // For reach cell of row-block only current row is being filled
+  if (partitioned && row_block_id < num_row_blocks_e) {
+    // The first cell is cell from E
+    const auto& col_block = col_blocks[first_cell->block_id];
+    const int col_block_size = col_block.size;
+    int column_idx = col_block.position;
+    int crs_position_e = rows_e[row];
+    // Column indices for each element of row_in_block row of current cell
+    for (int i = 0; i < col_block_size; ++i, ++crs_position_e) {
+      cols_e[crs_position_e] = column_idx++;
+    }
+    ++first_cell;
+  }
+  int crs_position_f = rows_f[row];
   for (auto cell = first_cell; cell < last_cell; ++cell) {
     const auto& col_block = col_blocks[cell->block_id];
     const int col_block_size = col_block.size;
-    int column_idx = col_block.position;
+    int column_idx = col_block.position - num_cols_e;
     // Column indices for each element of row_in_block row of current cell
-    for (int i = 0; i < col_block_size; ++i, ++crs_position) {
-      cols[crs_position] = column_idx++;
+    for (int i = 0; i < col_block_size; ++i, ++crs_position_f) {
+      cols_f[crs_position_f] = column_idx++;
     }
   }
 }
@@ -183,31 +216,93 @@
   // row_block_ids array
   int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
   const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
-  RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
+  RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
       num_row_blocks,
+      0,
+      0,
       first_cell_in_row_block,
       cells,
       row_blocks,
       col_blocks,
+      nullptr,
       rows,
       row_block_ids);
   // Finalize row-index array of CRS strucure by computing prefix sum
   thrust::inclusive_scan(
       ThrustCudaStreamExecutionPolicy(stream), rows, rows + num_rows + 1, rows);
 
-  // Fill cols array of CRS structure and permutation from block-sparse to CRS
+  // Fill cols array of CRS structure
   const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
-  ComputeColumnsAndPermutation<<<num_blocks_rowwise,
-                                 kCudaBlockSize,
-                                 0,
-                                 stream>>>(num_rows,
-                                           first_cell_in_row_block,
-                                           cells,
-                                           row_blocks,
-                                           col_blocks,
-                                           row_block_ids,
-                                           rows,
-                                           cols);
+  ComputeColumns<false><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>(
+      num_rows,
+      0,
+      0,
+      first_cell_in_row_block,
+      cells,
+      row_blocks,
+      col_blocks,
+      row_block_ids,
+      nullptr,
+      nullptr,
+      rows,
+      cols);
+  FreeTemporaryMemory(row_block_ids, stream);
+}
+
+void FillCRSStructurePartitioned(const int num_row_blocks,
+                                 const int num_rows,
+                                 const int num_row_blocks_e,
+                                 const int num_col_blocks_e,
+                                 const int num_nonzeros_e,
+                                 const int* first_cell_in_row_block,
+                                 const Cell* cells,
+                                 const Block* row_blocks,
+                                 const Block* col_blocks,
+                                 int* rows_e,
+                                 int* cols_e,
+                                 int* rows_f,
+                                 int* cols_f,
+                                 cudaStream_t stream) {
+  // Set number of non-zeros per row in rows array and row to row-block map in
+  // row_block_ids array
+  int* row_block_ids = AllocateTemporaryMemory<int>(num_rows, stream);
+  const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
+  RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
+      num_row_blocks,
+      num_col_blocks_e,
+      num_row_blocks_e,
+      first_cell_in_row_block,
+      cells,
+      row_blocks,
+      col_blocks,
+      rows_e,
+      rows_f,
+      row_block_ids);
+  // Finalize row-index arrays of CRS structures by computing prefix sums
+  thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream),
+                         rows_e,
+                         rows_e + num_rows + 1,
+                         rows_e);
+  thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream),
+                         rows_f,
+                         rows_f + num_rows + 1,
+                         rows_f);
+
+  // Fill cols array of CRS structure
+  const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
+  ComputeColumns<true><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>(
+      num_rows,
+      num_row_blocks_e,
+      num_col_blocks_e,
+      first_cell_in_row_block,
+      cells,
+      row_blocks,
+      col_blocks,
+      row_block_ids,
+      rows_e,
+      cols_e,
+      rows_f,
+      cols_f);
   FreeTemporaryMemory(row_block_ids, stream);
 }
 
@@ -231,16 +326,17 @@
 }
 
 // Element-wise reordering of block-sparse values
-// - first_cell_pos - position of the first cell of corresponding sub-matrix;
-// only used if use_first_cell is true; otherwise the first cell in row-block is
-// used
+// - first_cell_in_row_block - index of the first cell of each row-block
 // - block_sparse_values - segment of block-sparse values starting from
 // block_sparse_offset, containing num_values
+template <bool partitioned>
 __global__ void PermuteToCrsKernel(
     const int block_sparse_offset,
     const int num_values,
     const int num_row_blocks,
+    const int num_row_blocks_e,
     const int* __restrict__ first_cell_in_row_block,
+    const int* __restrict__ value_offset_row_block_f,
     const Cell* __restrict__ cells,
     const Block* __restrict__ row_blocks,
     const Block* __restrict__ col_blocks,
@@ -254,21 +350,32 @@
   const int block_sparse_value_id = value_id + block_sparse_offset;
   // Find the corresponding row-block with a binary search
   const int row_block_id =
-      PartitionPoint(
-          first_cell_in_row_block,
-          0,
-          num_row_blocks,
-          [cells,
-           block_sparse_value_id] __device__(const int row_block_offset) {
-            return cells[row_block_offset].position <= block_sparse_value_id;
-          }) -
+      (partitioned
+           ? PartitionPoint(value_offset_row_block_f,
+                            0,
+                            num_row_blocks,
+                            [block_sparse_value_id] __device__(
+                                const int row_block_offset) {
+                              return row_block_offset <= block_sparse_value_id;
+                            })
+           : PartitionPoint(first_cell_in_row_block,
+                            0,
+                            num_row_blocks,
+                            [cells, block_sparse_value_id] __device__(
+                                const int row_block_offset) {
+                              return cells[row_block_offset].position <=
+                                     block_sparse_value_id;
+                            })) -
       1;
   // Find cell and calculate offset within the row with a linear scan
   const auto& row_block = row_blocks[row_block_id];
-  const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+  auto first_cell = cells + first_cell_in_row_block[row_block_id];
   const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
   const int row_block_size = row_block.size;
   int num_cols_before = 0;
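+  // In the partitioned case the first cell of this row-block belongs to the E
+  // sub-matrix; skip it so that the linear scan below walks over F cells only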
+  if (partitioned && row_block_id < num_row_blocks_e) {
+    ++first_cell;
+  }
   for (const Cell* cell = first_cell; cell < last_cell; ++cell) {
     const auto& col_block = col_blocks[cell->block_id];
     const int col_block_size = col_block.size;
@@ -298,11 +405,43 @@
                   double* crs_values,
                   cudaStream_t stream) {
   const int num_blocks_valuewise = NumBlocksInGrid(num_values);
-  PermuteToCrsKernel<<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
+  PermuteToCrsKernel<false>
+      <<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
+          block_sparse_offset,
+          num_values,
+          num_row_blocks,
+          0,
+          first_cell_in_row_block,
+          nullptr,
+          cells,
+          row_blocks,
+          col_blocks,
+          crs_rows,
+          block_sparse_values,
+          crs_values);
+}
+
+void PermuteToCRSPartitionedF(const int block_sparse_offset,
+                              const int num_values,
+                              const int num_row_blocks,
+                              const int num_row_blocks_e,
+                              const int* first_cell_in_row_block,
+                              const int* value_offset_row_block_f,
+                              const Cell* cells,
+                              const Block* row_blocks,
+                              const Block* col_blocks,
+                              const int* crs_rows,
+                              const double* block_sparse_values,
+                              double* crs_values,
+                              cudaStream_t stream) {
+  const int num_blocks_valuewise = NumBlocksInGrid(num_values);
+  PermuteToCrsKernel<true><<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
       block_sparse_offset,
       num_values,
       num_row_blocks,
+      num_row_blocks_e,
       first_cell_in_row_block,
+      value_offset_row_block_f,
       cells,
       row_blocks,
       col_blocks,
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h
index b6da69b..90b4882 100644
--- a/internal/ceres/cuda_kernels_bsm_to_crs.h
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -54,6 +54,23 @@
                       int* cols,
                       cudaStream_t stream);
 
+// Compute structure of partitioned CRS matrix using block-sparse structure.
+// Arrays corresponding to CRS matrices are to be allocated by caller
+void FillCRSStructurePartitioned(const int num_row_blocks,
+                                 const int num_rows,
+                                 const int num_row_blocks_e,
+                                 const int num_col_blocks_e,
+                                 const int num_nonzeros_e,
+                                 const int* first_cell_in_row_block,
+                                 const Cell* cells,
+                                 const Block* row_blocks,
+                                 const Block* col_blocks,
+                                 int* rows_e,
+                                 int* cols_e,
+                                 int* rows_f,
+                                 int* cols_f,
+                                 cudaStream_t stream);
+
 // Permute segment of values from block-sparse matrix with sequential layout to
 // CRS order. Segment starts at block_sparse_offset and has length of num_values
 void PermuteToCRS(const int block_sparse_offset,
@@ -68,6 +85,24 @@
                   double* crs_values,
                   cudaStream_t stream);
 
+// Permutes a segment of values of the F sub-matrix of a partitioned
+// block-sparse matrix with sequential layout into CRS order. The segment
+// starts at block_sparse_offset (which includes the offset induced by values
+// of the E sub-matrix) and has length num_values
+void PermuteToCRSPartitionedF(const int block_sparse_offset,
+                              const int num_values,
+                              const int num_row_blocks,
+                              const int num_row_blocks_e,
+                              const int* first_cell_in_row_block,
+                              const int* value_offset_row_block_f,
+                              const Cell* cells,
+                              const Block* row_blocks,
+                              const Block* col_blocks,
+                              const int* crs_rows,
+                              const double* block_sparse_values,
+                              double* crs_values,
+                              cudaStream_t stream);
+
 }  // namespace internal
 }  // namespace ceres
 
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
new file mode 100644
index 0000000..703be84
--- /dev/null
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.cc
@@ -0,0 +1,145 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "ceres/cuda_block_structure.h"
+#include "ceres/cuda_kernels_bsm_to_crs.h"
+
+namespace ceres::internal {
+
+CudaPartitionedBlockSparseCRSView::CudaPartitionedBlockSparseCRSView(
+    const BlockSparseMatrix& bsm,
+    const int num_col_blocks_e,
+    ContextImpl* context)
+    : context_(context) {
+  const auto& bs = *bsm.block_structure();
+  block_structure_ =
+      std::make_unique<CudaBlockSparseStructure>(bs, num_col_blocks_e, context);
+  // Determine the number of non-zeros in the left (E) sub-matrix; it is
+  // computed by the partitioned block-sparse structure constructor above
+  const int num_rows = bsm.num_rows();
+  const int num_nonzeros_e = block_structure_->num_nonzeros_e();
+  const int num_nonzeros_f = bsm.num_nonzeros() - num_nonzeros_e;
+
+  const int num_cols_e = num_col_blocks_e < bs.cols.size()
+                             ? bs.cols[num_col_blocks_e].position
+                             : bsm.num_cols();
+  const int num_cols_f = bsm.num_cols() - num_cols_e;
+  matrix_e_ = std::make_unique<CudaSparseMatrix>(
+      num_rows, num_cols_e, num_nonzeros_e, context);
+  matrix_f_ = std::make_unique<CudaSparseMatrix>(
+      num_rows, num_cols_f, num_nonzeros_f, context);
+
+  num_row_blocks_e_ = block_structure_->num_row_blocks_e();
+  FillCRSStructurePartitioned(block_structure_->num_row_blocks(),
+                              num_rows,
+                              num_row_blocks_e_,
+                              num_col_blocks_e,
+                              num_nonzeros_e,
+                              block_structure_->first_cell_in_row_block(),
+                              block_structure_->cells(),
+                              block_structure_->row_blocks(),
+                              block_structure_->col_blocks(),
+                              matrix_e_->mutable_rows(),
+                              matrix_e_->mutable_cols(),
+                              matrix_f_->mutable_rows(),
+                              matrix_f_->mutable_cols(),
+                              context->DefaultStream());
+  f_is_crs_compatible_ = block_structure_->IsCrsCompatible();
+  if (f_is_crs_compatible_) {
+    block_structure_ = nullptr;
+  } else {
+    streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
+        context, kMaxTemporaryArraySize);
+  }
+  CHECK_EQ(bsm.num_nonzeros(),
+           matrix_e_->num_nonzeros() + matrix_f_->num_nonzeros());
+
+  UpdateValues(bsm);
+}
+
+void CudaPartitionedBlockSparseCRSView::UpdateValues(
+    const BlockSparseMatrix& bsm) {
+  if (f_is_crs_compatible_) {
+    CHECK_EQ(cudaSuccess,
+             cudaMemcpyAsync(matrix_e_->mutable_values(),
+                             bsm.values(),
+                             matrix_e_->num_nonzeros() * sizeof(double),
+                             cudaMemcpyHostToDevice,
+                             context_->DefaultStream()));
+
+    CHECK_EQ(cudaSuccess,
+             cudaMemcpyAsync(matrix_f_->mutable_values(),
+                             bsm.values() + matrix_e_->num_nonzeros(),
+                             matrix_f_->num_nonzeros() * sizeof(double),
+                             cudaMemcpyHostToDevice,
+                             context_->DefaultStream()));
+    return;
+  }
+  streamed_buffer_->CopyToGpu(
+      bsm.values(),
+      bsm.num_nonzeros(),
+      [block_structure = block_structure_.get(),
+       num_nonzeros_e = matrix_e_->num_nonzeros(),
+       num_row_blocks_e = num_row_blocks_e_,
+       values_f = matrix_f_->mutable_values(),
+       rows_f = matrix_f_->rows()](
+          const double* values, int num_values, int offset, auto stream) {
+        PermuteToCRSPartitionedF(num_nonzeros_e + offset,
+                                 num_values,
+                                 block_structure->num_row_blocks(),
+                                 num_row_blocks_e,
+                                 block_structure->first_cell_in_row_block(),
+                                 block_structure->value_offset_row_block_f(),
+                                 block_structure->cells(),
+                                 block_structure->row_blocks(),
+                                 block_structure->col_blocks(),
+                                 rows_f,
+                                 values,
+                                 values_f,
+                                 stream);
+      });
+  CHECK_EQ(cudaSuccess,
+           cudaMemcpyAsync(matrix_e_->mutable_values(),
+                           bsm.values(),
+                           matrix_e_->num_nonzeros() * sizeof(double),
+                           cudaMemcpyHostToDevice,
+                           context_->DefaultStream()));
+}
+
+}  // namespace ceres::internal
+#endif  // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view.h b/internal/ceres/cuda_partitioned_block_sparse_crs_view.h
new file mode 100644
index 0000000..3072dea
--- /dev/null
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view.h
@@ -0,0 +1,111 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+//
+
+#ifndef CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
+#define CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <memory>
+
+#include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_block_structure.h"
+#include "ceres/cuda_buffer.h"
+#include "ceres/cuda_sparse_matrix.h"
+#include "ceres/cuda_streamed_buffer.h"
+
+namespace ceres::internal {
+// We use the cuSPARSE library for SpMV operations. However, it supports
+// neither the block-sparse format with varying size of the blocks nor
+// sub-matrix-vector products. Thus, we perform the following operations in
+// order to compute products of partitioned block-sparse matrices and dense
+// vectors on gpu:
+//  - Once per block-sparse structure update:
+//    - Compute CRS structures of left and right submatrices from block-sparse
+//    structure
+//    - Check if values of the F sub-matrix can be copied without permutation
+//  - Once per block-sparse values update:
+//    - Copy values of E sub-matrix
+//    - Permute or copy values of F sub-matrix
+//
+// It is assumed that the cells of the block-sparse matrix are laid out
+// sequentially in both sub-matrices, that each of the first num_row_blocks_e_
+// row-blocks contains exactly one cell of the E sub-matrix, and that the
+// row-blocks below num_row_blocks_e_ contain no cells of the E sub-matrix.
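+//
+// A minimal illustration (hypothetical sizes), with num_row_blocks_e_ = 2:
+//   row-block 0: [E cell][F cell]
+//   row-block 1: [E cell]
+//   row-block 2: [F cell][F cell]
+// Values of the two E cells occupy the first num_nonzeros_e positions of the
+// block-sparse value array, followed by the values of all F cells in
+// row-block order.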
+//
+// This class avoids storing both CRS and block-sparse values in GPU memory.
+// Instead, block-sparse values are transferred to gpu memory as a disjoint set
+// of small contiguous segments with simultaneous permutation of the values
+// into the correct order using the block structure.
+class CERES_NO_EXPORT CudaPartitionedBlockSparseCRSView {
+ public:
+  // Initializes internal CRS sub-matrices and block-sparse structure on the
+  // GPU side. The following objects are stored in gpu memory for the whole
+  // lifetime of the object
+  //  - matrix_e_: left CRS submatrix
+  //  - matrix_f_: right CRS submatrix
+  //  - block_structure_: copy of block-sparse structure on GPU
+  //  - streamed_buffer_: helper for value updating
+  CudaPartitionedBlockSparseCRSView(const BlockSparseMatrix& bsm,
+                                    const int num_col_blocks_e,
+                                    ContextImpl* context);
+
+  // Update values of CRS submatrices using values of block-sparse matrix.
+  // Assumes that bsm has the same block-sparse structure as matrix that was
+  // used for construction.
+  void UpdateValues(const BlockSparseMatrix& bsm);
+
+  const CudaSparseMatrix* matrix_e() const { return matrix_e_.get(); }
+  const CudaSparseMatrix* matrix_f() const { return matrix_f_.get(); }
+  CudaSparseMatrix* mutable_matrix_e() { return matrix_e_.get(); }
+  CudaSparseMatrix* mutable_matrix_f() { return matrix_f_.get(); }
+
+ private:
+  // The value permutation kernel performs a single element-wise operation per
+  // thread, thus performing the permutation in blocks of 8 megabytes of
+  // block-sparse values seems reasonable
+  static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
+  std::unique_ptr<CudaSparseMatrix> matrix_e_;
+  std::unique_ptr<CudaSparseMatrix> matrix_f_;
+  std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_;
+  std::unique_ptr<CudaBlockSparseStructure> block_structure_;
+  bool f_is_crs_compatible_;
+  int num_row_blocks_e_;
+  ContextImpl* context_;
+};
+
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
+#endif  // CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
diff --git a/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
new file mode 100644
index 0000000..5090b6a
--- /dev/null
+++ b/internal/ceres/cuda_partitioned_block_sparse_crs_view_test.cc
@@ -0,0 +1,282 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#ifndef CERES_NO_CUDA
+
+namespace ceres::internal {
+
+namespace {
+struct RandomPartitionedMatrixOptions {
+  int num_row_blocks_e;
+  int num_row_blocks_f;
+  int num_col_blocks_e;
+  int num_col_blocks_f;
+  int min_row_block_size;
+  int max_row_block_size;
+  int min_col_block_size;
+  int max_col_block_size;
+  double empty_f_probability;
+  double cell_probability_f;
+  int max_cells_f;
+};
+
+std::unique_ptr<BlockSparseMatrix> CreateRandomPartitionedMatrix(
+    const RandomPartitionedMatrixOptions& options, std::mt19937& rng) {
+  const int num_row_blocks =
+      std::max(options.num_row_blocks_e, options.num_row_blocks_f);
+  const int num_col_blocks =
+      options.num_col_blocks_e + options.num_col_blocks_f;
+
+  CompressedRowBlockStructure* block_structure =
+      new CompressedRowBlockStructure;
+  block_structure->cols.reserve(num_col_blocks);
+  block_structure->rows.reserve(num_row_blocks);
+
+  // Create column blocks
+  std::uniform_int_distribution<int> col_size(options.min_col_block_size,
+                                              options.max_col_block_size);
+  int num_cols = 0;
+  for (int i = 0; i < num_col_blocks; ++i) {
+    const int size = col_size(rng);
+    block_structure->cols.emplace_back(size, num_cols);
+    num_cols += size;
+  }
+
+  // Prepare column-block indices of E cells
+  std::vector<int> e_col_block_idx;
+  e_col_block_idx.reserve(options.num_row_blocks_e);
+  std::uniform_int_distribution<int> col_e(0, options.num_col_blocks_e - 1);
+  for (int i = 0; i < options.num_row_blocks_e; ++i) {
+    e_col_block_idx.emplace_back(col_e(rng));
+  }
+  std::sort(e_col_block_idx.begin(), e_col_block_idx.end());
+
+  // Prepare cell structure
+  std::uniform_int_distribution<int> row_size(options.min_row_block_size,
+                                              options.max_row_block_size);
+  std::uniform_real_distribution<double> uniform;
+  int num_rows = 0;
+  for (int i = 0; i < num_row_blocks; ++i) {
+    const int size = row_size(rng);
+    block_structure->rows.emplace_back();
+    auto& row = block_structure->rows.back();
+    row.block.size = size;
+    row.block.position = num_rows;
+    num_rows += size;
+    if (i < options.num_row_blocks_e) {
+      row.cells.emplace_back(e_col_block_idx[i], -1);
+      if (uniform(rng) < options.empty_f_probability) {
+        continue;
+      }
+    }
+    if (i >= options.num_row_blocks_f) continue;
+    const int cells_before = row.cells.size();
+    for (int j = options.num_col_blocks_e; j < num_col_blocks; ++j) {
+      if (uniform(rng) > options.cell_probability_f) {
+        continue;
+      }
+      row.cells.emplace_back(j, -1);
+    }
+    if (row.cells.size() > cells_before + options.max_cells_f) {
+      std::shuffle(row.cells.begin() + cells_before, row.cells.end(), rng);
+      row.cells.resize(cells_before + options.max_cells_f);
+      std::sort(
+          row.cells.begin(), row.cells.end(), [](const auto& a, const auto& b) {
+            return a.block_id < b.block_id;
+          });
+    }
+  }
+
+  // Fill positions in E sub-matrix
+  int num_nonzeros = 0;
+  for (int i = 0; i < options.num_row_blocks_e; ++i) {
+    CHECK_GE(block_structure->rows[i].cells.size(), 1);
+    block_structure->rows[i].cells[0].position = num_nonzeros;
+    const int col_block_size =
+        block_structure->cols[block_structure->rows[i].cells[0].block_id].size;
+    const int row_block_size = block_structure->rows[i].block.size;
+    num_nonzeros += row_block_size * col_block_size;
+    CHECK_GE(num_nonzeros, 0);
+  }
+  // Fill positions in F sub-matrix
+  for (int i = 0; i < options.num_row_blocks_f; ++i) {
+    const int row_block_size = block_structure->rows[i].block.size;
+    for (auto& cell : block_structure->rows[i].cells) {
+      if (cell.position >= 0) continue;
+      cell.position = num_nonzeros;
+      const int col_block_size = block_structure->cols[cell.block_id].size;
+      num_nonzeros += row_block_size * col_block_size;
+      CHECK_GE(num_nonzeros, 0);
+    }
+  }
+  // Populate values
+  auto bsm = std::make_unique<BlockSparseMatrix>(block_structure, true);
+  for (int i = 0; i < num_nonzeros; ++i) {
+    bsm->mutable_values()[i] = i + 1;
+  }
+  return bsm;
+}
+}  // namespace
+
+class CudaPartitionedBlockSparseCRSViewTest : public ::testing::Test {
+  static constexpr int kNumColBlocksE = 456;
+
+ protected:
+  void SetUp() final {
+    std::string message;
+    CHECK(context_.InitCuda(&message))
+        << "InitCuda() failed because: " << message;
+
+    RandomPartitionedMatrixOptions options;
+    options.num_row_blocks_f = 123;
+    options.num_row_blocks_e = 456;
+    options.num_col_blocks_f = 123;
+    options.num_col_blocks_e = kNumColBlocksE;
+    options.min_row_block_size = 1;
+    options.max_row_block_size = 4;
+    options.min_col_block_size = 1;
+    options.max_col_block_size = 4;
+    options.empty_f_probability = .1;
+    options.cell_probability_f = .2;
+    options.max_cells_f = options.num_col_blocks_f;
+
+    std::mt19937 rng;
+    short_f_ = CreateRandomPartitionedMatrix(options, rng);
+
+    options.num_row_blocks_e = 123;
+    options.num_row_blocks_f = 456;
+    short_e_ = CreateRandomPartitionedMatrix(options, rng);
+
+    options.max_cells_f = 1;
+    options.num_row_blocks_e = options.num_row_blocks_f;
+    f_crs_compatible_ = CreateRandomPartitionedMatrix(options, rng);
+  }
+
+  void TestMatrix(const BlockSparseMatrix& A_) {
+    const int num_col_blocks_e = kNumColBlocksE;
+    CudaPartitionedBlockSparseCRSView view(A_, kNumColBlocksE, &context_);
+
+    const int num_rows = A_.num_rows();
+    const int num_cols = A_.num_cols();
+
+    const auto& bs = *A_.block_structure();
+    const int num_cols_e = bs.cols[num_col_blocks_e].position;
+    const int num_cols_f = num_cols - num_cols_e;
+
+    // TODO: we definitely would like to use matrix() here, but
+    // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because
+    // it might allocate additional storage by request of cuSPARSE
+    auto matrix_e = view.mutable_matrix_e();
+    auto matrix_f = view.mutable_matrix_f();
+    ASSERT_EQ(matrix_e->num_cols(), num_cols_e);
+    ASSERT_EQ(matrix_e->num_rows(), num_rows);
+    ASSERT_EQ(matrix_f->num_cols(), num_cols_f);
+    ASSERT_EQ(matrix_f->num_rows(), num_rows);
+
+    Vector x(num_cols);
+    Vector x_left(num_cols_e);
+    Vector x_right(num_cols_f);
+    Vector y(num_rows);
+    CudaVector x_cuda(&context_, num_cols);
+    CudaVector x_left_cuda(&context_, num_cols_e);
+    CudaVector x_right_cuda(&context_, num_cols_f);
+    CudaVector y_cuda(&context_, num_rows);
+    Vector y_cuda_host(num_rows);
+
+    for (int i = 0; i < num_cols_e; ++i) {
+      x.setZero();
+      x_left.setZero();
+      y.setZero();
+      y_cuda.SetZero();
+      x[i] = 1.;
+      x_left[i] = 1.;
+      x_left_cuda.CopyFromCpu(x_left);
+      A_.RightMultiplyAndAccumulate(
+          x.data(), y.data(), &context_, std::thread::hardware_concurrency());
+      matrix_e->RightMultiplyAndAccumulate(x_left_cuda, &y_cuda);
+      y_cuda.CopyTo(&y_cuda_host);
+      // There will be at most one non-zero product per row, and the values
+      // are small integers exactly representable as doubles, thus we expect
+      // an exact match
+      EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
+    }
+    for (int i = num_cols_e; i < num_cols; ++i) {
+      x.setZero();
+      x_right.setZero();
+      y.setZero();
+      y_cuda.SetZero();
+      x[i] = 1.;
+      x_right[i - num_cols_e] = 1.;
+      x_right_cuda.CopyFromCpu(x_right);
+      A_.RightMultiplyAndAccumulate(
+          x.data(), y.data(), &context_, std::thread::hardware_concurrency());
+      matrix_f->RightMultiplyAndAccumulate(x_right_cuda, &y_cuda);
+      y_cuda.CopyTo(&y_cuda_host);
+      // There will be at most one non-zero product per row, and the values
+      // are small integers exactly representable as doubles, thus we expect
+      // an exact match
+      EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
+    }
+  }
+
+  // E sub-matrix might have fewer row-blocks with cells than F sub-matrix.
+  // This test matrix checks if this case is handled properly
+  std::unique_ptr<BlockSparseMatrix> short_e_;
+  // In the case of a non-CRS-compatible F matrix, permuting values from
+  // block-sparse order to CRS order involves a binary search over row-blocks
+  // of F. Having lots of row-blocks with no F cells is an edge case for this
+  // algorithm.
+  std::unique_ptr<BlockSparseMatrix> short_f_;
+  // With F matrix being CRS-compatible, update of the values of partitioned
+  // matrix view reduces to two host->device memcopies, and uses a separate code
+  // path
+  std::unique_ptr<BlockSparseMatrix> f_crs_compatible_;
+
+  ContextImpl context_;
+};
+
+TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortE) {
+  TestMatrix(*short_e_);
+}
+
+TEST_F(CudaPartitionedBlockSparseCRSViewTest, CreateUpdateValuesShortF) {
+  TestMatrix(*short_f_);
+}
+
+TEST_F(CudaPartitionedBlockSparseCRSViewTest,
+       CreateUpdateValuesCrsCompatibleF) {
+  TestMatrix(*f_crs_compatible_);
+}
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index c23c881..28a8aff 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -37,6 +37,7 @@
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/bundle_adjustment_test_util.h"
 #include "ceres/cuda_block_sparse_crs_view.h"
+#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
 #include "ceres/cuda_sparse_matrix.h"
 #include "ceres/cuda_vector.h"
 #include "ceres/evaluator.h"
@@ -476,6 +477,110 @@
 }
 
 #ifndef CERES_NO_CUDA
+static void PMVRightMultiplyAndAccumulateFCuda(benchmark::State& state,
+                                               BALData* data,
+                                               ContextImpl* context) {
+  LinearSolver::Options options;
+  options.elimination_groups.push_back(data->bal_problem->num_points());
+  options.context = context;
+  options.num_threads = 1;
+  auto jacobian = data->PartitionedMatrixViewJacobian(options);
+  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
+  CudaPartitionedBlockSparseCRSView view(
+      *underlying_matrix, jacobian->num_col_blocks_e(), context);
+
+  Vector x = Vector::Random(jacobian->num_cols_f());
+  CudaVector cuda_x(context, x.size());
+  CudaVector cuda_y(context, jacobian->num_rows());
+
+  cuda_x.CopyFromCpu(x);
+  cuda_y.SetZero();
+
+  auto matrix = view.mutable_matrix_f();
+  for (auto _ : state) {
+    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
+  }
+  CHECK_GT(cuda_y.Norm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateFCuda(benchmark::State& state,
+                                              BALData* data,
+                                              ContextImpl* context) {
+  LinearSolver::Options options;
+  options.elimination_groups.push_back(data->bal_problem->num_points());
+  options.context = context;
+  options.num_threads = 1;
+  auto jacobian = data->PartitionedMatrixViewJacobian(options);
+  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
+  CudaPartitionedBlockSparseCRSView view(
+      *underlying_matrix, jacobian->num_col_blocks_e(), context);
+
+  Vector x = Vector::Random(jacobian->num_rows());
+  CudaVector cuda_x(context, x.size());
+  CudaVector cuda_y(context, jacobian->num_cols_f());
+
+  cuda_x.CopyFromCpu(x);
+  cuda_y.SetZero();
+
+  auto matrix = view.mutable_matrix_f();
+  for (auto _ : state) {
+    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
+  }
+  CHECK_GT(cuda_y.Norm(), 0.);
+}
+
+static void PMVRightMultiplyAndAccumulateECuda(benchmark::State& state,
+                                               BALData* data,
+                                               ContextImpl* context) {
+  LinearSolver::Options options;
+  options.elimination_groups.push_back(data->bal_problem->num_points());
+  options.context = context;
+  options.num_threads = 1;
+  auto jacobian = data->PartitionedMatrixViewJacobian(options);
+  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
+  CudaPartitionedBlockSparseCRSView view(
+      *underlying_matrix, jacobian->num_col_blocks_e(), context);
+
+  Vector x = Vector::Random(jacobian->num_cols_e());
+  CudaVector cuda_x(context, x.size());
+  CudaVector cuda_y(context, jacobian->num_rows());
+
+  cuda_x.CopyFromCpu(x);
+  cuda_y.SetZero();
+
+  auto matrix = view.mutable_matrix_e();
+  for (auto _ : state) {
+    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
+  }
+  CHECK_GT(cuda_y.Norm(), 0.);
+}
+
+static void PMVLeftMultiplyAndAccumulateECuda(benchmark::State& state,
+                                              BALData* data,
+                                              ContextImpl* context) {
+  LinearSolver::Options options;
+  options.elimination_groups.push_back(data->bal_problem->num_points());
+  options.context = context;
+  options.num_threads = 1;
+  auto jacobian = data->PartitionedMatrixViewJacobian(options);
+  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
+  CudaPartitionedBlockSparseCRSView view(
+      *underlying_matrix, jacobian->num_col_blocks_e(), context);
+
+  Vector x = Vector::Random(jacobian->num_rows());
+  CudaVector cuda_x(context, x.size());
+  CudaVector cuda_y(context, jacobian->num_cols_e());
+
+  cuda_x.CopyFromCpu(x);
+  cuda_y.SetZero();
+
+  auto matrix = view.mutable_matrix_e();
+  for (auto _ : state) {
+    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
+  }
+  CHECK_GT(cuda_y.Norm(), 0.);
+}
+
 // We want CudaBlockSparseCRSView to be not slower than explicit conversion to
 // CRS on CPU
 static void JacobianToCRSView(benchmark::State& state,
@@ -734,6 +839,16 @@
         ->Arg(8)
         ->Arg(16);
 
+#ifndef CERES_NO_CUDA
+    const std::string name_right_product_partitioned_f_cuda =
+        "PMVRightMultiplyAndAccumulateFCuda<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_right_product_partitioned_f_cuda.c_str(),
+        ceres::internal::PMVRightMultiplyAndAccumulateFCuda,
+        data,
+        &context);
+#endif
+
     const std::string name_right_product_partitioned_e =
         "PMVRightMultiplyAndAccumulateE<" + path + ">";
     ::benchmark::RegisterBenchmark(
@@ -747,6 +862,16 @@
         ->Arg(8)
         ->Arg(16);
 
+#ifndef CERES_NO_CUDA
+    const std::string name_right_product_partitioned_e_cuda =
+        "PMVRightMultiplyAndAccumulateECuda<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_right_product_partitioned_e_cuda.c_str(),
+        ceres::internal::PMVRightMultiplyAndAccumulateECuda,
+        data,
+        &context);
+#endif
+
     const std::string name_update_block_diagonal_ftf =
         "PMVUpdateBlockDiagonalFtF<" + path + ">";
     ::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(),
@@ -831,6 +956,16 @@
         ->Arg(8)
         ->Arg(16);
 
+#ifndef CERES_NO_CUDA
+    const std::string name_left_product_partitioned_f_cuda =
+        "PMVLeftMultiplyAndAccumulateFCuda<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_left_product_partitioned_f_cuda.c_str(),
+        ceres::internal::PMVLeftMultiplyAndAccumulateFCuda,
+        data,
+        &context);
+#endif
+
     const std::string name_left_product_partitioned_e =
         "PMVLeftMultiplyAndAccumulateE<" + path + ">";
     ::benchmark::RegisterBenchmark(
@@ -845,6 +980,16 @@
         ->Arg(16);
 
 #ifndef CERES_NO_CUDA
+    const std::string name_left_product_partitioned_e_cuda =
+        "PMVLeftMultiplyAndAccumulateECuda<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_left_product_partitioned_e_cuda.c_str(),
+        ceres::internal::PMVLeftMultiplyAndAccumulateECuda,
+        data,
+        &context);
+#endif
+
+#ifndef CERES_NO_CUDA
     const std::string name_left_product_cuda =
         "JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
     ::benchmark::RegisterBenchmark(