Block-sparse to CRS conversion using block-structure

Instead of pre-computing a permutation from block-sparse to CRS value
order, the index of each value in the CRS matrix is now computed on the
fly from the block-sparse structure while updating the values.
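
For example, given a matrix with the block structure

  [ 1 2 | | 6 7 ]
  [ 3 4 | | 8 9 ]
  [-----+-+-----]
  [     |5|     ]

block-sparse values are stored as [1, 2, 3, 4, 5, 6, 7, 8, 9], while the
CRS value order is [1, 2, 6, 7, 3, 4, 8, 9, 5]. The first row-block spans
two rows and contains two cells, so its values cannot be copied as-is and
have to be permuted.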

When the block-sparse value layout already matches CRS order (cell
positions are sequential, and every row-block either spans a single row or
contains a single cell), values are updated via a simple host-to-device
copy, and the block-sparse structure on the GPU is discarded after the CRS
structure has been computed.

Computing the index on the fly is significantly slower than using a
pre-computed permutation, but the cost is still hidden by the
host-to-device transfer.
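
A minimal host-side sketch of that index computation (hypothetical helper
name, mirroring the new PermuteToCrsKernel):

  int BlockSparseToCrsIndex(int value_id,
                            int num_row_blocks,
                            const int* first_cell_in_row_block,
                            const Cell* cells,
                            const Block* row_blocks,
                            const Block* col_blocks,
                            const int* crs_rows) {
    // Binary search for the last row-block whose first cell starts at or
    // before value_id (cell positions are sequential)
    int lo = 0, hi = num_row_blocks;
    while (hi - lo > 1) {
      const int mid = lo + (hi - lo) / 2;
      if (cells[first_cell_in_row_block[mid]].position <= value_id) {
        lo = mid;
      } else {
        hi = mid;
      }
    }
    const Block& row_block = row_blocks[lo];
    // Linear scan over the cells of the row-block to find the cell that
    // contains value_id, accumulating the width of the cells to its left
    int num_cols_before = 0;
    for (int c = first_cell_in_row_block[lo];
         c != first_cell_in_row_block[lo + 1];
         ++c) {
      const int col_block_size = col_blocks[cells[c].block_id].size;
      if (cells[c].position + row_block.size * col_block_size > value_id) {
        const int pos_in_cell = value_id - cells[c].position;
        const int row = row_block.position + pos_in_cell / col_block_size;
        return crs_rows[row] + num_cols_before + pos_in_cell % col_block_size;
      }
      num_cols_before += col_block_size;
    }
    return -1;  // Unreachable for a valid sequential layout
  }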

On problems from the BAL dataset this reduces the extra GPU memory
consumption from 33% (permutation stored as 32-bit indices) to ~10% (for
storing the block-sparse structure).
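
For reference: a permutation stored as 32-bit indices adds 4 bytes per
non-zero on top of the 12 bytes per non-zero of the CRS matrix itself (an
8-byte value plus a 4-byte column index), i.e. 4 / 12 = 33%, while the
block-sparse structure only stores a few words per cell and per block.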

Benchmark results:

======================= CUDA Device Properties ======================
Cuda version         : 11.8
Device ID            : 0
Device name          : NVIDIA GeForce RTX 2080 Ti
Total GPU memory     :  11012 MiB
GPU memory available :  10852 MiB
Compute capability   : 7.5
Warp size            : 32
Max threads per block: 1024
Max threads per dim  : 1024 1024 64
Max grid size        : 2147483647 65535 65535
Multiprocessor count : 68
====================================================================
Running ./bin/evaluation_benchmark
Run on (112 X 3200 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x56)
  L1 Instruction 32 KiB (x56)
  L2 Unified 1024 KiB (x56)
  L3 Unified 39424 KiB (x2)
Load Average: 24.58, 11.75, 8.52

-----------------------------------------------------------------------
Benchmark                                                          Time
-----------------------------------------------------------------------
Using on-the-fly computation of the CRS index corresponding to a
block-sparse index:

JacobianToCRS<g/final/problem-4585-1324582-pre.txt>             1607 ms
JacobianToCRSView<g/final/problem-4585-1324582-pre.txt>          564 ms
JacobianToCRSMatrix<g/final/problem-4585-1324582-pre.txt>       2226 ms
JacobianToCRSViewUpdate<g/final/problem-4585-1324582-pre.txt>    228 ms
JacobianToCRSMatrixUpdate<g/final/problem-4585-1324582-pre.txt>  400 ms

Using precomputed permutation:
JacobianToCRS</final/problem-4585-1324582-pre.txt>              1656 ms
JacobianToCRSView</final/problem-4585-1324582-pre.txt>           553 ms
JacobianToCRSMatrix</final/problem-4585-1324582-pre.txt>        2255 ms
JacobianToCRSViewUpdate</final/problem-4585-1324582-pre.txt>     228 ms
JacobianToCRSMatrixUpdate</final/problem-4585-1324582-pre.txt>   406 ms

Performance of JacobianToCRSViewUpdate is still limited by the
host-to-device transfer, and JacobianToCRSView remains faster than
computing the CRS structure on the CPU.

Change-Id: Ifb6910fb01ae6071400d36c277846fadc5857964
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index e4b124d..be4bad1 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -49,20 +49,23 @@
             libsuitesparse-dev \
             ninja-build \
             wget
-      - name: Setup CUDA toolkit (system repositories)
+      # nvidia cuda toolkit shipped with 20.04 LTS does not support stream-ordered allocations
+      - name: Setup CUDA toolkit repositories (20.04)
         if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:20.04'
         run: |
-          apt-get install -y \
-            nvidia-cuda-dev \
-            nvidia-cuda-toolkit
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
+          dpkg -i cuda-keyring_1.0-1_all.deb
       # nvidia cuda toolkit + gcc combo shipped with 22.04LTS is broken
       # and is not able to compile code that uses thrust
       # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1006962
-      - name: Setup CUDA toolkit (nvidia repositories)
+      - name: Setup CUDA toolkit repositories (22.04)
         if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:22.04'
         run: |
           wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
           dpkg -i cuda-keyring_1.0-1_all.deb
+      - name: Setup CUDA toolkit
+        if: matrix.gpu == 'cuda'
+        run: |
           apt-get update
           apt-get install -y cuda
           echo "CUDACXX=/usr/local/cuda/bin/nvcc" >> $GITHUB_ENV
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index 9cb99b4..b67b10e 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -140,9 +140,12 @@
         COMMAND ${CUDAToolkit_BIN_DIR}/cuda-memcheck --leak-check full
             $<TARGET_FILE:cuda_dense_cholesky_test>)
   endif (BUILD_TESTING AND GFLAGS)
-  set_source_files_properties(cuda_kernels.cu.cc PROPERTIES LANGUAGE CUDA)
-  add_library(ceres_cuda_kernels STATIC cuda_kernels.cu.cc)
+  set_source_files_properties(cuda_kernels_vector_ops.cu.cc PROPERTIES LANGUAGE CUDA)
+  set_source_files_properties(cuda_kernels_bsm_to_crs.cu.cc PROPERTIES LANGUAGE CUDA)
+  add_library(ceres_cuda_kernels STATIC cuda_kernels_vector_ops.cu.cc cuda_kernels_bsm_to_crs.cu.cc)
   target_compile_features(ceres_cuda_kernels PRIVATE cxx_std_14)
+  # Enable __host__ / __device__ annotations in lambda declarations
+  target_compile_options(ceres_cuda_kernels PRIVATE --extended-lambda)
   target_include_directories(ceres_cuda_kernels PRIVATE  ${Ceres_SOURCE_DIR}/internal  ${Ceres_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR})
   list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ceres_cuda_kernels)
 endif (USE_CUDA)
@@ -479,7 +482,7 @@
   ceres_test(cuda_block_structure)
   ceres_test(cuda_dense_cholesky)
   ceres_test(cuda_dense_qr)
-  ceres_test(cuda_kernels)
+  ceres_test(cuda_kernels_vector_ops)
   ceres_test(cuda_sparse_matrix)
   ceres_test(cuda_streamed_buffer)
   ceres_test(cuda_vector)
diff --git a/internal/ceres/block_sparse_matrix.cc b/internal/ceres/block_sparse_matrix.cc
index ab1d746..01dbfbe 100644
--- a/internal/ceres/block_sparse_matrix.cc
+++ b/internal/ceres/block_sparse_matrix.cc
@@ -709,7 +709,9 @@
 }
 
 std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix(
-    const BlockSparseMatrix::RandomMatrixOptions& options, std::mt19937& prng) {
+    const BlockSparseMatrix::RandomMatrixOptions& options,
+    std::mt19937& prng,
+    bool use_page_locked_memory) {
   CHECK_GT(options.num_row_blocks, 0);
   CHECK_GT(options.min_row_block_size, 0);
   CHECK_GT(options.max_row_block_size, 0);
@@ -766,7 +768,8 @@
     }
   }
 
-  auto matrix = std::make_unique<BlockSparseMatrix>(bs.release());
+  auto matrix =
+      std::make_unique<BlockSparseMatrix>(bs.release(), use_page_locked_memory);
   double* values = matrix->mutable_values();
   std::normal_distribution<double> standard_normal_distribution;
   std::generate_n(
diff --git a/internal/ceres/block_sparse_matrix.h b/internal/ceres/block_sparse_matrix.h
index 0d99e15..cd13b06 100644
--- a/internal/ceres/block_sparse_matrix.h
+++ b/internal/ceres/block_sparse_matrix.h
@@ -157,7 +157,9 @@
   // distributed and whose structure is determined by
   // RandomMatrixOptions.
   static std::unique_ptr<BlockSparseMatrix> CreateRandomMatrix(
-      const RandomMatrixOptions& options, std::mt19937& prng);
+      const RandomMatrixOptions& options,
+      std::mt19937& prng,
+      bool use_page_locked_memory = false);
 
  private:
   double* AllocateValues(int size);
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc
index a03d01c..bb267da 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -32,41 +32,69 @@
 
 #ifndef CERES_NO_CUDA
 
-#include "ceres/cuda_block_structure.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_bsm_to_crs.h"
 
 namespace ceres::internal {
 
 CudaBlockSparseCRSView::CudaBlockSparseCRSView(const BlockSparseMatrix& bsm,
                                                ContextImpl* context)
-    : permutation_(context, bsm.num_nonzeros()),
-      streamed_buffer_(context, kMaxTemporaryArraySize) {
-  CudaBlockSparseStructure block_structure(*bsm.block_structure(), context);
+    : context_(context) {
+  block_structure_ = std::make_unique<CudaBlockSparseStructure>(
+      *bsm.block_structure(), context);
+  // Only block-sparse matrices with sequential layout of cells are supported
+  CHECK(block_structure_->sequential_layout());
   crs_matrix_ = std::make_unique<CudaSparseMatrix>(
       bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context);
-  CudaBuffer<int> temp_rows(context, bsm.num_rows());
-  FillCRSStructure(block_structure.num_row_blocks(),
+  FillCRSStructure(block_structure_->num_row_blocks(),
                    bsm.num_rows(),
-                   block_structure.row_block_offsets(),
-                   block_structure.cells(),
-                   block_structure.row_blocks(),
-                   block_structure.col_blocks(),
+                   block_structure_->first_cell_in_row_block(),
+                   block_structure_->cells(),
+                   block_structure_->row_blocks(),
+                   block_structure_->col_blocks(),
                    crs_matrix_->mutable_rows(),
                    crs_matrix_->mutable_cols(),
-                   temp_rows.data(),
-                   permutation_.data(),
                    context->DefaultStream());
+  is_crs_compatible_ = block_structure_->IsCrsCompatible();
+  // If the matrix is CRS-compatible, we can drop the block-structure and do
+  // not need streamed_buffer_
+  if (is_crs_compatible_) {
+    VLOG(3) << "Block-sparse matrix is compatible with CRS, discarding "
+               "block-structure";
+    block_structure_ = nullptr;
+  } else {
+    streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
+        context_, kMaxTemporaryArraySize);
+  }
   UpdateValues(bsm);
 }
+
 void CudaBlockSparseCRSView::UpdateValues(const BlockSparseMatrix& bsm) {
-  streamed_buffer_.CopyToGpu(
+  if (is_crs_compatible_) {
+    // Values of CRS-compatible matrices can be copied as-is
+    CHECK_EQ(cudaSuccess,
+             cudaMemcpyAsync(crs_matrix_->mutable_values(),
+                             bsm.values(),
+                             bsm.num_nonzeros() * sizeof(double),
+                             cudaMemcpyHostToDevice,
+                             context_->DefaultStream()));
+    return;
+  }
+  streamed_buffer_->CopyToGpu(
       bsm.values(),
       bsm.num_nonzeros(),
-      [permutation = permutation_.data(),
-       values_to = crs_matrix_->mutable_values()](
+      [bs = block_structure_.get(), crs = crs_matrix_.get()](
           const double* values, int num_values, int offset, auto stream) {
-        PermuteValues(
-            offset, num_values, permutation, values, values_to, stream);
+        PermuteToCRS(offset,
+                     num_values,
+                     bs->num_row_blocks(),
+                     bs->first_cell_in_row_block(),
+                     bs->cells(),
+                     bs->row_blocks(),
+                     bs->col_blocks(),
+                     crs->rows(),
+                     values,
+                     crs->mutable_values(),
+                     stream);
       });
 }
 
diff --git a/internal/ceres/cuda_block_sparse_crs_view.h b/internal/ceres/cuda_block_sparse_crs_view.h
index 3ea8498..2ae8721 100644
--- a/internal/ceres/cuda_block_sparse_crs_view.h
+++ b/internal/ceres/cuda_block_sparse_crs_view.h
@@ -39,6 +39,7 @@
 #include <memory>
 
 #include "ceres/block_sparse_matrix.h"
+#include "ceres/cuda_block_structure.h"
 #include "ceres/cuda_buffer.h"
 #include "ceres/cuda_sparse_matrix.h"
 #include "ceres/cuda_streamed_buffer.h"
@@ -49,42 +50,23 @@
 // following operations in order to compute products of block-sparse matrices
 // and dense vectors on gpu:
 //  - Once per block-sparse structure update:
-//    - Compute CRS structure from block-sparse structure
-//    - Compute permutation from block-sparse values to CRS values
+//    - Compute CRS structure from block-sparse structure and check if values of
+//    block-sparse matrix would have the same order as values of CRS matrix
 //  - Once per block-sparse values update:
 //    - Update values in CRS matrix with values of block-sparse matrix
 //
-// Since there are no constraints on positions of cells in value array of
-// block-sparse matrix, a permutation from block-sparse values to CRS
-// values is stored explicitly.
+// Only block-sparse matrices with sequential order of cells are supported.
 //
-// Example: given matrix with the following block-structure
-//  [ 1 2 | | 6 7 ]
-//  [ 3 4 | | 8 9 ]
-//  [-----+-+-----]
-//  [     |5|     ]
-// with values stored as values_block_sparse = [1, 2, 3, 4, 5, 6, 7, 8, 9],
-// permutation from block-sparse to CRS is p = [0, 1, 4, 5, 8, 2, 3, 6, 7];
-// and i-th block-sparse value has index p[i] in CRS values array
-//
-// This allows to avoid storing both CRS and block-sparse values in GPU memory.
-// Instead, block-sparse values are transferred to gpu memory as a disjoint set
-// of small continuous segments with simultaneous permutation of the values into
-// correct order
+// UpdateValues updates the values of the CRS matrix:
+//  - With a single host-to-device copy for matrices with CRS-compatible value
+//  layout
+//  - By simultaneously transferring and permuting values using
+//  CudaStreamedBuffer otherwise
 class CERES_NO_EXPORT CudaBlockSparseCRSView {
  public:
-  // Initializes internal CRS matrix and permutation from block-sparse to CRS
-  // values. The following objects are stored in gpu memory for the whole
-  // lifetime of the object
-  //  - crs_matrix_: CRS matrix
-  //  - permutation_: permutation from block-sparse to CRS value order
-  //  (num_nonzeros integer values)
-  //  - streamed_buffer_: helper for value updating
-  // The following objects are created temporarily during construction:
-  //  - CudaBlockSparseStructure: block-sparse structure of block-sparse matrix
-  //  - num_rows integer values: row to row-block map
-  //  If copy_values flag is set to false, only structure of block-sparse matrix
-  //  bsm is captured, and values are left uninitialized
+  // Initializes internal CRS matrix using the structure and values of the
+  // block-sparse matrix. For block-sparse matrices with a value layout
+  // different from CRS, the block-sparse structure is stored for value updates
   CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, ContextImpl* context);
 
   const CudaSparseMatrix* crs_matrix() const { return crs_matrix_.get(); }
@@ -95,16 +77,21 @@
   // used for construction.
   void UpdateValues(const BlockSparseMatrix& bsm);
 
+  // Returns true if the block-sparse matrix has a CRS-compatible value layout
+  bool IsCrsCompatible() const { return is_crs_compatible_; }
+
  private:
   // Value permutation kernel performs a single element-wise operation per
   // thread, thus performing permutation in blocks of 8 megabytes of
   // block-sparse  values seems reasonable
   static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
   std::unique_ptr<CudaSparseMatrix> crs_matrix_;
-  // Permutation from block-sparse to CRS value order.
-  // permutation_[i] = index of i-th block-sparse value in CRS values
-  CudaBuffer<int> permutation_;
-  CudaStreamedBuffer<double> streamed_buffer_;
+  // Only created if block-sparse matrix has non-CRS value layout
+  std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_;
+  // Only stored if block-sparse matrix has non-CRS value layout
+  std::unique_ptr<CudaBlockSparseStructure> block_structure_;
+  bool is_crs_compatible_;
+  ContextImpl* context_;
 };
 
 }  // namespace ceres::internal
diff --git a/internal/ceres/cuda_block_sparse_crs_view_test.cc b/internal/ceres/cuda_block_sparse_crs_view_test.cc
index f8d1cdf..e80d026 100644
--- a/internal/ceres/cuda_block_sparse_crs_view_test.cc
+++ b/internal/ceres/cuda_block_sparse_crs_view_test.cc
@@ -52,48 +52,110 @@
     options.max_col_block_size = 10;
     options.block_density = 0.2;
     std::mt19937 rng;
-    A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng);
-    std::iota(
-        A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1);
+
+    // Block-sparse matrix with order of values different from CRS
+    block_sparse_non_crs_compatible_ =
+        BlockSparseMatrix::CreateRandomMatrix(options, rng, true);
+    std::iota(block_sparse_non_crs_compatible_->mutable_values(),
+              block_sparse_non_crs_compatible_->mutable_values() +
+                  block_sparse_non_crs_compatible_->num_nonzeros(),
+              1);
+
+    options.max_row_block_size = 1;
+    // Block-sparse matrix with CRS order of values (row-blocks are rows)
+    block_sparse_crs_compatible_rows_ =
+        BlockSparseMatrix::CreateRandomMatrix(options, rng, true);
+    std::iota(block_sparse_crs_compatible_rows_->mutable_values(),
+              block_sparse_crs_compatible_rows_->mutable_values() +
+                  block_sparse_crs_compatible_rows_->num_nonzeros(),
+              1);
+    // Block-sparse matrix with CRS order of values (single cell per row-block)
+    auto bs = std::make_unique<CompressedRowBlockStructure>(
+        *block_sparse_non_crs_compatible_->block_structure());
+
+    int num_nonzeros = 0;
+    for (auto& r : bs->rows) {
+      const int num_cells = r.cells.size();
+      if (num_cells > 1) {
+        std::uniform_int_distribution<int> uniform_cell(0, num_cells - 1);
+        const int selected_cell = uniform_cell(rng);
+        std::swap(r.cells[0], r.cells[selected_cell]);
+        r.cells.resize(1);
+      }
+      const int row_block_size = r.block.size;
+      for (auto& c : r.cells) {
+        c.position = num_nonzeros;
+        const int col_block_size = bs->cols[c.block_id].size;
+        num_nonzeros += col_block_size * row_block_size;
+      }
+    }
+    block_sparse_crs_compatible_single_cell_ =
+        std::make_unique<BlockSparseMatrix>(bs.release());
+    std::iota(block_sparse_crs_compatible_single_cell_->mutable_values(),
+              block_sparse_crs_compatible_single_cell_->mutable_values() +
+                  block_sparse_crs_compatible_single_cell_->num_nonzeros(),
+              1);
   }
 
-  std::unique_ptr<BlockSparseMatrix> A_;
+  void Compare(const BlockSparseMatrix& bsm, CudaSparseMatrix& csm) {
+    ASSERT_EQ(csm.num_cols(), bsm.num_cols());
+    ASSERT_EQ(csm.num_rows(), bsm.num_rows());
+    ASSERT_EQ(csm.num_nonzeros(), bsm.num_nonzeros());
+    const int num_rows = bsm.num_rows();
+    const int num_cols = bsm.num_cols();
+    Vector x(num_cols);
+    Vector y(num_rows);
+    CudaVector x_cuda(&context_, num_cols);
+    CudaVector y_cuda(&context_, num_rows);
+    Vector y_cuda_host(num_rows);
+
+    for (int i = 0; i < num_cols; ++i) {
+      x.setZero();
+      y.setZero();
+      y_cuda.SetZero();
+      x[i] = 1.;
+      x_cuda.CopyFromCpu(x);
+      csm.RightMultiplyAndAccumulate(x_cuda, &y_cuda);
+      bsm.RightMultiplyAndAccumulate(
+          x.data(), y.data(), &context_, std::thread::hardware_concurrency());
+      y_cuda.CopyTo(&y_cuda_host);
+      // There will be up to 1 non-zero product per row, thus we expect an exact
+      // match on 32-bit integer indices
+      EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
+    }
+  }
+
+  std::unique_ptr<BlockSparseMatrix> block_sparse_non_crs_compatible_;
+  std::unique_ptr<BlockSparseMatrix> block_sparse_crs_compatible_rows_;
+  std::unique_ptr<BlockSparseMatrix> block_sparse_crs_compatible_single_cell_;
   ContextImpl context_;
 };
 
-TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValues) {
-  auto view = CudaBlockSparseCRSView(*A_, &context_);
+TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesNonCompatible) {
+  auto view =
+      CudaBlockSparseCRSView(*block_sparse_non_crs_compatible_, &context_);
+  ASSERT_EQ(view.IsCrsCompatible(), false);
 
-  // TODO: we definitely would like to use crs_matrix() here, but
-  // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because
-  // it might allocate additional storage by request of cuSPARSE
   auto matrix = view.mutable_crs_matrix();
-  ASSERT_EQ(matrix->num_cols(), A_->num_cols());
-  ASSERT_EQ(matrix->num_rows(), A_->num_rows());
-  ASSERT_EQ(matrix->num_nonzeros(), A_->num_nonzeros());
+  Compare(*block_sparse_non_crs_compatible_, *matrix);
+}
 
-  const int num_rows = A_->num_rows();
-  const int num_cols = A_->num_cols();
-  Vector x(num_cols);
-  Vector y(num_rows);
-  CudaVector x_cuda(&context_, num_cols);
-  CudaVector y_cuda(&context_, num_rows);
-  Vector y_cuda_host(num_rows);
+TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesCompatibleRows) {
+  auto view =
+      CudaBlockSparseCRSView(*block_sparse_crs_compatible_rows_, &context_);
+  ASSERT_EQ(view.IsCrsCompatible(), true);
 
-  for (int i = 0; i < num_cols; ++i) {
-    x.setZero();
-    y.setZero();
-    y_cuda.SetZero();
-    x[i] = 1.;
-    x_cuda.CopyFromCpu(x);
-    A_->RightMultiplyAndAccumulate(
-        x.data(), y.data(), &context_, std::thread::hardware_concurrency());
-    matrix->RightMultiplyAndAccumulate(x_cuda, &y_cuda);
-    y_cuda.CopyTo(&y_cuda_host);
-    // There will be up to 1 non-zero product per row, thus we expect an exact
-    // match on 32-bit integer indices
-    EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.);
-  }
+  auto matrix = view.mutable_crs_matrix();
+  Compare(*block_sparse_crs_compatible_rows_, *matrix);
+}
+
+TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValuesCompatibleSingleCell) {
+  auto view = CudaBlockSparseCRSView(*block_sparse_crs_compatible_single_cell_,
+                                     &context_);
+  ASSERT_EQ(view.IsCrsCompatible(), true);
+
+  auto matrix = view.mutable_crs_matrix();
+  Compare(*block_sparse_crs_compatible_single_cell_, *matrix);
 }
 }  // namespace ceres::internal
 
diff --git a/internal/ceres/cuda_block_structure.cc b/internal/ceres/cuda_block_structure.cc
index 8e549e1..5817cd7 100644
--- a/internal/ceres/cuda_block_structure.cc
+++ b/internal/ceres/cuda_block_structure.cc
@@ -46,7 +46,7 @@
 
 CudaBlockSparseStructure::CudaBlockSparseStructure(
     const CompressedRowBlockStructure& block_structure, ContextImpl* context)
-    : row_block_offsets_(context),
+    : first_cell_in_row_block_(context),
       cells_(context),
       row_blocks_(context),
       col_blocks_(context) {
@@ -56,44 +56,56 @@
   const auto& col_blocks = block_structure.cols;
 
   // Row block offset is an index of the first cell corresponding to row block
-  std::vector<int> row_block_offsets;
+  std::vector<int> first_cell_in_row_block;
   // Flat array of all cells from all row-blocks
   std::vector<Cell> cells;
 
+  int f_values_offset = 0;
+  is_crs_compatible_ = true;
   num_row_blocks_ = block_structure.rows.size();
   num_col_blocks_ = col_blocks.size();
 
   row_blocks.reserve(num_row_blocks_);
-  row_block_offsets.reserve(num_row_blocks_ + 1);
+  first_cell_in_row_block.reserve(num_row_blocks_ + 1);
   num_nonzeros_ = 0;
-  num_cells_ = 0;
+  sequential_layout_ = true;
   for (const auto& r : block_structure.rows) {
     const int row_block_size = r.block.size;
+    if (r.cells.size() > 1 && row_block_size > 1) {
+      is_crs_compatible_ = false;
+    }
     row_blocks.emplace_back(r.block);
-    row_block_offsets.push_back(num_cells_);
+    first_cell_in_row_block.push_back(cells.size());
     for (const auto& c : r.cells) {
-      cells.emplace_back(c);
       const int col_block_size = col_blocks[c.block_id].size;
-      num_nonzeros_ += col_block_size * row_block_size;
-      ++num_cells_;
+      const int cell_size = col_block_size * row_block_size;
+      cells.push_back(c);
+      sequential_layout_ &= c.position == num_nonzeros_;
+      num_nonzeros_ += cell_size;
     }
   }
-  row_block_offsets.push_back(num_cells_);
+  first_cell_in_row_block.push_back(cells.size());
+  num_cells_ = cells.size();
 
   num_rows_ = Dimension(row_blocks);
   num_cols_ = Dimension(col_blocks);
 
+  is_crs_compatible_ &= sequential_layout_;
+
   if (VLOG_IS_ON(3)) {
-    const size_t row_block_offsets_size =
-        row_block_offsets.size() * sizeof(int);
+    const size_t first_cell_in_row_block_size =
+        first_cell_in_row_block.size() * sizeof(int);
     const size_t cells_size = cells.size() * sizeof(Cell);
     const size_t row_blocks_size = row_blocks.size() * sizeof(Block);
     const size_t col_blocks_size = col_blocks.size() * sizeof(Block);
-    const size_t total_size =
-        row_block_offsets_size + cells_size + col_blocks_size + row_blocks_size;
+    const size_t total_size = first_cell_in_row_block_size + cells_size +
+                              col_blocks_size + row_blocks_size;
+    const double ratio =
+        (100. * total_size) / (num_nonzeros_ * (sizeof(int) + sizeof(double)) +
+                               num_rows_ * sizeof(int));
     VLOG(3) << "\nCudaBlockSparseStructure:\n"
                "\tRow block offsets: "
-            << row_block_offsets_size
+            << first_cell_in_row_block_size
             << " bytes\n"
                "\tColumn blocks: "
             << col_blocks_size
@@ -102,13 +114,11 @@
             << row_blocks_size
             << " bytes\n"
                "\tCells: "
-            << cells_size
-            << " bytes\n"
-               "\tTotal: "
-            << total_size << " bytes of GPU memory";
+            << cells_size << " bytes\n\tTotal: " << total_size
+            << " bytes of GPU memory (" << ratio << "% of CRS matrix size)";
   }
 
-  row_block_offsets_.CopyFromCpuVector(row_block_offsets);
+  first_cell_in_row_block_.CopyFromCpuVector(first_cell_in_row_block);
   cells_.CopyFromCpuVector(cells);
   row_blocks_.CopyFromCpuVector(row_blocks);
   col_blocks_.CopyFromCpuVector(col_blocks);
diff --git a/internal/ceres/cuda_block_structure.h b/internal/ceres/cuda_block_structure.h
index a41d0b3..28e36b2 100644
--- a/internal/ceres/cuda_block_structure.h
+++ b/internal/ceres/cuda_block_structure.h
@@ -44,12 +44,11 @@
 // This class stores a read-only block-sparse structure in gpu memory.
 // Invariants are the same as those of CompressedRowBlockStructure.
 // In order to simplify allocation and copying data to gpu, cells from all
-// row-blocks are stored in a single array sequentially. Additional array
-// row_block_offsets of size num_row_blocks + 1 allows to identify range of
-// cells corresponding to a row-block.
-// Cells corresponding to i-th row-block are stored in sub-array
-// cells[row_block_offsets[i]; ... row_block_offsets[i + 1] - 1], and their
-// order is preserved.
+// row-blocks are stored in a single array sequentially. Array
+// first_cell_in_row_block of size num_row_blocks + 1 identifies the range of
+// cells corresponding to a row-block. Cells corresponding to the i-th row-block
+// are stored in sub-array cells[first_cell_in_row_block[i]; ...
+// first_cell_in_row_block[i + 1] - 1], and their order is preserved.
 class CERES_NO_EXPORT CudaBlockSparseStructure {
  public:
   // CompressedRowBlockStructure is contains a vector of CompressedLists, with
@@ -65,9 +64,20 @@
   int num_row_blocks() const { return num_row_blocks_; }
   int num_col_blocks() const { return num_col_blocks_; }
 
+  // Returns true if values from block-sparse matrix can be copied to CRS matrix
+  // as-is. This is possible if each row-block is stored in CRS order:
+  //  - Row-block consists of a single row
+  //  - Row-block contains a single cell
+  bool IsCrsCompatible() const { return is_crs_compatible_; }
+  // Returns true if block-sparse structure corresponds to block-sparse matrix
+  // with sequential cell positions
+  bool sequential_layout() const { return sequential_layout_; }
+
   // Device pointer to array of num_row_blocks + 1 indices of the first cell of
   // row block
-  const int* row_block_offsets() const { return row_block_offsets_.data(); }
+  const int* first_cell_in_row_block() const {
+    return first_cell_in_row_block_.data();
+  }
   // Device pointer to array of num_cells cells, sorted by row-block
   const Cell* cells() const { return cells_.data(); }
   // Device pointer to array of row blocks
@@ -82,7 +92,9 @@
   int num_nonzeros_;
   int num_row_blocks_;
   int num_col_blocks_;
-  CudaBuffer<int> row_block_offsets_;
+  bool is_crs_compatible_;
+  bool sequential_layout_;
+  CudaBuffer<int> first_cell_in_row_block_;
   CudaBuffer<Cell> cells_;
   CudaBuffer<Block> row_blocks_;
   CudaBuffer<Block> col_blocks_;
diff --git a/internal/ceres/cuda_block_structure_test.cc b/internal/ceres/cuda_block_structure_test.cc
index 95cb26b..000b6bb 100644
--- a/internal/ceres/cuda_block_structure_test.cc
+++ b/internal/ceres/cuda_block_structure_test.cc
@@ -83,10 +83,10 @@
   }
   std::vector<int> GetRowBlockOffsets(
       const CudaBlockSparseStructure& structure) {
-    const auto& cuda_buffer = structure.row_block_offsets_;
-    std::vector<int> row_block_offsets(cuda_buffer.size());
-    cuda_buffer.CopyToCpu(row_block_offsets.data(), row_block_offsets.size());
-    return row_block_offsets;
+    const auto& cuda_buffer = structure.first_cell_in_row_block_;
+    std::vector<int> first_cell_in_row_block(cuda_buffer.size());
+    cuda_buffer.CopyToCpu(first_cell_in_row_block.data(), first_cell_in_row_block.size());
+    return first_cell_in_row_block;
   }
 
   std::unique_ptr<BlockSparseMatrix> A_;
@@ -114,19 +114,19 @@
   }
 
   std::vector<Cell> cells = GetCells(cuda_block_structure);
-  std::vector<int> row_block_offsets = GetRowBlockOffsets(cuda_block_structure);
+  std::vector<int> first_cell_in_row_block = GetRowBlockOffsets(cuda_block_structure);
   blocks = GetRowBlocks(cuda_block_structure);
 
   ASSERT_EQ(blocks.size(), num_row_blocks);
-  ASSERT_EQ(row_block_offsets.size(), num_row_blocks + 1);
-  ASSERT_EQ(row_block_offsets.back(), cells.size());
+  ASSERT_EQ(first_cell_in_row_block.size(), num_row_blocks + 1);
+  ASSERT_EQ(first_cell_in_row_block.back(), cells.size());
 
   for (int i = 0; i < num_row_blocks; ++i) {
     const int num_cells = block_structure->rows[i].cells.size();
     EXPECT_EQ(blocks[i].position, block_structure->rows[i].block.position);
     EXPECT_EQ(blocks[i].size, block_structure->rows[i].block.size);
-    const int first_cell = row_block_offsets[i];
-    const int last_cell = row_block_offsets[i + 1];
+    const int first_cell = first_cell_in_row_block[i];
+    const int last_cell = first_cell_in_row_block[i + 1];
     ASSERT_EQ(last_cell - first_cell, num_cells);
     for (int j = 0; j < num_cells; ++j) {
       EXPECT_EQ(cells[first_cell + j].block_id,
diff --git a/internal/ceres/cuda_kernels.cu.cc b/internal/ceres/cuda_kernels.cu.cc
deleted file mode 100644
index 3d93ae8..0000000
--- a/internal/ceres/cuda_kernels.cu.cc
+++ /dev/null
@@ -1,304 +0,0 @@
-// Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2022 Google Inc. All rights reserved.
-// http://ceres-solver.org/
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// * Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-// * Neither the name of Google Inc. nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
-
-//#include "ceres/cuda_kernels.h"
-
-#include <thrust/execution_policy.h>
-#include <thrust/scan.h>
-
-#include "block_structure.h"
-#include "cuda_runtime.h"
-
-namespace ceres {
-namespace internal {
-
-// As the CUDA Toolkit documentation says, "although arbitrary in this case, is
-// a common choice". This is determined by the warp size, max block size, and
-// multiprocessor sizes of recent GPUs. For complex kernels with significant
-// register usage and unusual memory patterns, the occupancy calculator API
-// might provide better performance. See "Occupancy Calculator" under the CUDA
-// toolkit documentation.
-constexpr int kCudaBlockSize = 256;
-inline int NumBlocks(int size) {
-  return (size + kCudaBlockSize - 1) / kCudaBlockSize;
-}
-
-template <typename SrcType, typename DstType>
-__global__ void TypeConversionKernel(const SrcType* __restrict__ input,
-                                     DstType* __restrict__ output,
-                                     const int size) {
-  const int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i < size) {
-    output[i] = static_cast<DstType>(input[i]);
-  }
-}
-
-void CudaFP64ToFP32(const double* input,
-                    float* output,
-                    const int size,
-                    cudaStream_t stream) {
-  const int num_blocks = NumBlocks(size);
-  TypeConversionKernel<double, float>
-      <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
-}
-
-void CudaFP32ToFP64(const float* input,
-                    double* output,
-                    const int size,
-                    cudaStream_t stream) {
-  const int num_blocks = NumBlocks(size);
-  TypeConversionKernel<float, double>
-      <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
-}
-
-template <typename T>
-__global__ void SetZeroKernel(T* __restrict__ output, const int size) {
-  const int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i < size) {
-    output[i] = T(0.0);
-  }
-}
-
-void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) {
-  const int num_blocks = NumBlocks(size);
-  SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
-}
-
-void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
-  const int num_blocks = NumBlocks(size);
-  SetZeroKernel<double>
-      <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
-}
-
-template <typename SrcType, typename DstType>
-__global__ void XPlusEqualsYKernel(DstType* __restrict__ x,
-                                   const SrcType* __restrict__ y,
-                                   const int size) {
-  const int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i < size) {
-    x[i] = x[i] + DstType(y[i]);
-  }
-}
-
-void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) {
-  const int num_blocks = NumBlocks(size);
-  XPlusEqualsYKernel<float, double>
-      <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
-}
-
-__global__ void CudaDtDxpyKernel(double* __restrict__ y,
-                                 const double* D,
-                                 const double* __restrict__ x,
-                                 const int size) {
-  const int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i < size) {
-    y[i] = y[i] + D[i] * D[i] * x[i];
-  }
-}
-
-void CudaDtDxpy(double* y,
-                const double* D,
-                const double* x,
-                const int size,
-                cudaStream_t stream) {
-  const int num_blocks = NumBlocks(size);
-  CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size);
-}
-
-// Fill row block id and nnz for each row using block-sparse structure
-// represented by a set of flat arrays.
-// Inputs:
-// - num_row_blocks: number of row-blocks in block-sparse structure
-// - row_block_offsets: index of the first cell of the row-block; size:
-// num_row_blocks + 1
-// - cells: cells of block-sparse structure as a continuous array
-// - row_blocks: row blocks of block-sparse structure stored sequentially
-// - col_blocks: column blocks of block-sparse structure stored sequentially
-// Outputs:
-// - rows: rows[i + 1] will contain number of non-zeros in i-th row, rows[0]
-// will be set to 0; rows are filled with a shift by one element in order
-// to obtain row-index array of CRS matrix with a inclusive scan afterwards
-// - row_block_ids: row_block_ids[i] will be set to index of row-block that
-// contains i-th row.
-// Computation is perform row-block-wise
-__global__ void RowBlockIdAndNNZ(int num_row_blocks,
-                                 const int* __restrict__ row_block_offsets,
-                                 const Cell* __restrict__ cells,
-                                 const Block* __restrict__ row_blocks,
-                                 const Block* __restrict__ col_blocks,
-                                 int* __restrict__ rows,
-                                 int* __restrict__ row_block_ids) {
-  const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row_block_id > num_row_blocks) {
-    // No synchronization is performed in this kernel, thus it is safe to return
-    return;
-  }
-  if (row_block_id == num_row_blocks) {
-    // one extra thread sets the first element
-    rows[0] = 0;
-    return;
-  }
-  const auto& row_block = row_blocks[row_block_id];
-  int row_nnz = 0;
-  const auto first_cell = cells + row_block_offsets[row_block_id];
-  const auto last_cell = cells + row_block_offsets[row_block_id + 1];
-  for (auto cell = first_cell; cell < last_cell; ++cell) {
-    row_nnz += col_blocks[cell->block_id].size;
-  }
-  const int first_row = row_block.position;
-  const int last_row = first_row + row_block.size;
-  for (int i = first_row; i < last_row; ++i) {
-    rows[i + 1] = row_nnz;
-    row_block_ids[i] = row_block_id;
-  }
-}
-
-// Row-wise creation of CRS structure
-// Inputs:
-// - num_rows: number of rows in matrix
-// - row_block_offsets: index of the first cell of the row-block; size:
-// num_row_blocks + 1
-// - cells: cells of block-sparse structure as a continuous array
-// - row_blocks: row blocks of block-sparse structure stored sequentially
-// - col_blocks: column blocks of block-sparse structure stored sequentially
-// - row_block_ids: index of row-block that corresponds to row
-// - rows: row-index array of CRS structure
-// Outputs:
-// - cols: column-index array of CRS structure
-// - permutation: permutation from block-sparse to crs order
-// Computaion is perform row-wise
-__global__ void ComputeColumnsAndPermutation(
-    int num_rows,
-    const int* __restrict__ row_block_offsets,
-    const Cell* __restrict__ cells,
-    const Block* __restrict__ row_blocks,
-    const Block* __restrict__ col_blocks,
-    const int* __restrict__ row_block_ids,
-    const int* __restrict__ rows,
-    int* __restrict__ cols,
-    int* __restrict__ permutation) {
-  const int row = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row >= num_rows) {
-    // No synchronization is performed in this kernel, thus it is safe to return
-    return;
-  }
-  const int row_block_id = row_block_ids[row];
-  // zero-based index of row in row-block
-  const int row_in_block = row - row_blocks[row_block_id].position;
-  // position in crs matrix
-  int crs_position = rows[row];
-  const auto first_cell = cells + row_block_offsets[row_block_id];
-  const auto last_cell = cells + row_block_offsets[row_block_id + 1];
-  // For reach cell of row-block only current row is being filled
-  for (auto cell = first_cell; cell < last_cell; ++cell) {
-    const auto& col_block = col_blocks[cell->block_id];
-    const int col_block_size = col_block.size;
-    int column_idx = col_block.position;
-    int bs_position = cell->position + row_in_block * col_block_size;
-    // Fill permutation and column indices for each element of row_in_block
-    // row of current cell
-    for (int i = 0; i < col_block_size; ++i, ++crs_position) {
-      permutation[bs_position++] = crs_position;
-      cols[crs_position] = column_idx++;
-    }
-  }
-}
-
-void FillCRSStructure(const int num_row_blocks,
-                      const int num_rows,
-                      const int* row_block_offsets,
-                      const Cell* cells,
-                      const Block* row_blocks,
-                      const Block* col_blocks,
-                      int* rows,
-                      int* cols,
-                      int* row_block_ids,
-                      int* permutation,
-                      cudaStream_t stream) {
-  // Set number of non-zeros per row in rows array and row to row-block map in
-  // row_block_ids array
-  const int num_blocks_blockwise = NumBlocks(num_row_blocks + 1);
-  RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
-      num_row_blocks,
-      row_block_offsets,
-      cells,
-      row_blocks,
-      col_blocks,
-      rows,
-      row_block_ids);
-  // Finalize row-index array of CRS strucure by computing prefix sum
-  thrust::inclusive_scan(
-      thrust::cuda::par.on(stream), rows, rows + num_rows + 1, rows);
-
-  // Fill cols array of CRS structure and permutation from block-sparse to CRS
-  const int num_blocks_rowwise = NumBlocks(num_rows);
-  ComputeColumnsAndPermutation<<<num_blocks_rowwise,
-                                 kCudaBlockSize,
-                                 0,
-                                 stream>>>(num_rows,
-                                           row_block_offsets,
-                                           cells,
-                                           row_blocks,
-                                           col_blocks,
-                                           row_block_ids,
-                                           rows,
-                                           cols,
-                                           permutation);
-}
-
-// Updates values of CRS matrix with a continuous block of values of
-// block-sparse matrix. With permutation[i] being an index of i-th value of
-// block-sparse matrix in values of CRS matrix updating values is quite
-// efficient when performed element-wise (reads of permutation and values arrays
-// are coalesced when offset is divisable by 16)
-__global__ void PermuteValuesKernel(const int offset,
-                                    const int num_values,
-                                    const int* permutation,
-                                    const double* block_sparse_values,
-                                    double* crs_values) {
-  const int value_id = blockIdx.x * blockDim.x + threadIdx.x;
-  if (value_id < num_values) {
-    crs_values[permutation[offset + value_id]] = block_sparse_values[value_id];
-  }
-}
-
-void PermuteValues(const int offset,
-                   const int num_values,
-                   const int* permutation,
-                   const double* block_sparse_values,
-                   double* crs_values,
-                   cudaStream_t stream) {
-  const int num_blocks = NumBlocks(num_values);
-  PermuteValuesKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(
-      offset, num_values, permutation, block_sparse_values, crs_values);
-}
-
-}  // namespace internal
-}  // namespace ceres
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
new file mode 100644
index 0000000..8be3553
--- /dev/null
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.cu.cc
@@ -0,0 +1,279 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/cuda_kernels_bsm_to_crs.h"
+
+#include <cuda_runtime.h>
+#include <thrust/execution_policy.h>
+#include <thrust/scan.h>
+
+#include "ceres/block_structure.h"
+#include "ceres/cuda_kernels_utils.h"
+
+namespace ceres {
+namespace internal {
+
+// Fill row block id and nnz for each row using block-sparse structure
+// represented by a set of flat arrays.
+// Inputs:
+// - num_row_blocks: number of row-blocks in block-sparse structure
+// - first_cell_in_row_block: index of the first cell of the row-block; size:
+// num_row_blocks + 1
+// - cells: cells of block-sparse structure as a continuous array
+// - row_blocks: row blocks of block-sparse structure stored sequentially
+// - col_blocks: column blocks of block-sparse structure stored sequentially
+// Outputs:
+// - rows: rows[i + 1] will contain number of non-zeros in i-th row, rows[0]
+// will be set to 0; rows are filled with a shift by one element in order
+// to obtain row-index array of CRS matrix with an inclusive scan afterwards
+// - row_block_ids: row_block_ids[i] will be set to index of row-block that
+// contains i-th row.
+// Computation is performed row-block-wise
+__global__ void RowBlockIdAndNNZ(
+    int num_row_blocks,
+    const int* __restrict__ first_cell_in_row_block,
+    const Cell* __restrict__ cells,
+    const Block* __restrict__ row_blocks,
+    const Block* __restrict__ col_blocks,
+    int* __restrict__ rows,
+    int* __restrict__ row_block_ids) {
+  const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (row_block_id > num_row_blocks) {
+    // No synchronization is performed in this kernel, thus it is safe to return
+    return;
+  }
+  if (row_block_id == num_row_blocks) {
+    // one extra thread sets the first element
+    rows[0] = 0;
+    return;
+  }
+  const auto& row_block = row_blocks[row_block_id];
+  int row_nnz = 0;
+  const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+  const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+  for (auto cell = first_cell; cell < last_cell; ++cell) {
+    row_nnz += col_blocks[cell->block_id].size;
+  }
+  const int first_row = row_block.position;
+  const int last_row = first_row + row_block.size;
+  for (int i = first_row; i < last_row; ++i) {
+    rows[i + 1] = row_nnz;
+    row_block_ids[i] = row_block_id;
+  }
+}
+
+// Row-wise creation of CRS structure
+// Inputs:
+// - num_rows: number of rows in matrix
+// - first_cell_in_row_block: index of the first cell of the row-block; size:
+// num_row_blocks + 1
+// - cells: cells of block-sparse structure as a continuous array
+// - row_blocks: row blocks of block-sparse structure stored sequentially
+// - col_blocks: column blocks of block-sparse structure stored sequentially
+// - row_block_ids: index of row-block that corresponds to row
+// - rows: row-index array of CRS structure
+// Outputs:
+// - cols: column-index array of CRS structure
+// Computation is performed row-wise
+__global__ void ComputeColumnsAndPermutation(
+    int num_rows,
+    const int* __restrict__ first_cell_in_row_block,
+    const Cell* __restrict__ cells,
+    const Block* __restrict__ row_blocks,
+    const Block* __restrict__ col_blocks,
+    const int* __restrict__ row_block_ids,
+    const int* __restrict__ rows,
+    int* __restrict__ cols) {
+  const int row = blockIdx.x * blockDim.x + threadIdx.x;
+  if (row >= num_rows) {
+    // No synchronization is performed in this kernel, thus it is safe to return
+    return;
+  }
+  const int row_block_id = row_block_ids[row];
+  // position in crs matrix
+  int crs_position = rows[row];
+  const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+  const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+  // For each cell of the row-block, only the current row is filled
+  for (auto cell = first_cell; cell < last_cell; ++cell) {
+    const auto& col_block = col_blocks[cell->block_id];
+    const int col_block_size = col_block.size;
+    int column_idx = col_block.position;
+    // Fill column indices for each element of the current row of the cell
+    for (int i = 0; i < col_block_size; ++i, ++crs_position) {
+      cols[crs_position] = column_idx++;
+    }
+  }
+}
+
+void FillCRSStructure(const int num_row_blocks,
+                      const int num_rows,
+                      const int* first_cell_in_row_block,
+                      const Cell* cells,
+                      const Block* row_blocks,
+                      const Block* col_blocks,
+                      int* rows,
+                      int* cols,
+                      cudaStream_t stream) {
+  // Set number of non-zeros per row in rows array and row to row-block map in
+  // row_block_ids array
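+  // row_block_ids is a temporary array; it is allocated with stream-ordered
+  // allocation (cudaMallocAsync, available since CUDA 11.2) and released via
+  // cudaFreeAsync at the end of this function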
+  int* row_block_ids;
+  cudaMallocAsync(&row_block_ids, sizeof(int) * num_rows, stream);
+  const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
+  RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
+      num_row_blocks,
+      first_cell_in_row_block,
+      cells,
+      row_blocks,
+      col_blocks,
+      rows,
+      row_block_ids);
+  // Finalize row-index array of CRS structure by computing prefix sum
+  thrust::inclusive_scan(
+      thrust::cuda::par_nosync.on(stream), rows, rows + num_rows + 1, rows);
+
+  // Fill cols array of CRS structure
+  const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
+  ComputeColumnsAndPermutation<<<num_blocks_rowwise,
+                                 kCudaBlockSize,
+                                 0,
+                                 stream>>>(num_rows,
+                                           first_cell_in_row_block,
+                                           cells,
+                                           row_blocks,
+                                           col_blocks,
+                                           row_block_ids,
+                                           rows,
+                                           cols);
+  cudaFreeAsync(row_block_ids, stream);
+}
+
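+// Returns the partition point of the range [first, last): the first index at
+// which predicate is false, assuming predicate is true on some prefix of the
+// range and false afterwards (analogous to std::partition_point)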
+template <typename T, typename Predicate>
+__device__ int PartitionPoint(const T* data,
+                              int first,
+                              int last,
+                              Predicate&& predicate) {
+  if (!predicate(data[first])) {
+    return first;
+  }
+  while (last - first > 1) {
+    const auto midpoint = first + (last - first) / 2;
+    if (predicate(data[midpoint])) {
+      first = midpoint;
+    } else {
+      last = midpoint;
+    }
+  }
+  return last;
+}
+
+// Element-wise reordering of block-sparse values
+// - block_sparse_offset - offset of the first value of the transferred
+// segment relative to the beginning of the value array of the block-sparse
+// matrix
+// - block_sparse_values - segment of block-sparse values starting from
+// block_sparse_offset, containing num_values
+__global__ void PermuteToCrsKernel(
+    const int block_sparse_offset,
+    const int num_values,
+    const int num_row_blocks,
+    const int* __restrict__ first_cell_in_row_block,
+    const Cell* __restrict__ cells,
+    const Block* __restrict__ row_blocks,
+    const Block* __restrict__ col_blocks,
+    const int* __restrict__ crs_rows,
+    const double* __restrict__ block_sparse_values,
+    double* __restrict__ crs_values) {
+  const int value_id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (value_id >= num_values) {
+    return;
+  }
+  const int block_sparse_value_id = value_id + block_sparse_offset;
+  // Find the corresponding row-block with a binary search
+  const int row_block_id =
+      PartitionPoint(
+          first_cell_in_row_block,
+          0,
+          num_row_blocks,
+          [cells,
+           block_sparse_value_id] __device__(const int row_block_offset) {
+            return cells[row_block_offset].position <= block_sparse_value_id;
+          }) -
+      1;
+  // Find cell and calculate offset within the row with a linear scan
+  const auto& row_block = row_blocks[row_block_id];
+  const auto first_cell = cells + first_cell_in_row_block[row_block_id];
+  const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
+  const int row_block_size = row_block.size;
+  int num_cols_before = 0;
+  for (const Cell* cell = first_cell; cell < last_cell; ++cell) {
+    const auto& col_block = col_blocks[cell->block_id];
+    const int col_block_size = col_block.size;
+    const int cell_size = row_block_size * col_block_size;
+    if (cell->position + cell_size > block_sparse_value_id) {
+      const int pos_in_cell = block_sparse_value_id - cell->position;
+      const int row_in_cell = pos_in_cell / col_block_size;
+      const int col_in_cell = pos_in_cell % col_block_size;
+      const int row = row_in_cell + row_block.position;
+      crs_values[crs_rows[row] + num_cols_before + col_in_cell] =
+          block_sparse_values[value_id];
+      break;
+    }
+    num_cols_before += col_block_size;
+  }
+}
+
+void PermuteToCRS(const int block_sparse_offset,
+                  const int num_values,
+                  const int num_row_blocks,
+                  const int* first_cell_in_row_block,
+                  const Cell* cells,
+                  const Block* row_blocks,
+                  const Block* col_blocks,
+                  const int* crs_rows,
+                  const double* block_sparse_values,
+                  double* crs_values,
+                  cudaStream_t stream) {
+  const int num_blocks_valuewise = NumBlocksInGrid(num_values);
+  PermuteToCrsKernel<<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
+      block_sparse_offset,
+      num_values,
+      num_row_blocks,
+      first_cell_in_row_block,
+      cells,
+      row_blocks,
+      col_blocks,
+      crs_rows,
+      block_sparse_values,
+      crs_values);
+}
+
+}  // namespace internal
+}  // namespace ceres
diff --git a/internal/ceres/cuda_kernels_bsm_to_crs.h b/internal/ceres/cuda_kernels_bsm_to_crs.h
new file mode 100644
index 0000000..b6da69b
--- /dev/null
+++ b/internal/ceres/cuda_kernels_bsm_to_crs.h
@@ -0,0 +1,76 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#ifndef CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
+#define CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "cuda_runtime.h"
+
+namespace ceres {
+namespace internal {
+struct Block;
+struct Cell;
+
+// Compute the structure of a CRS matrix using the block-sparse structure.
+// Arrays corresponding to the CRS matrix are to be allocated by the caller.
+void FillCRSStructure(const int num_row_blocks,
+                      const int num_rows,
+                      const int* first_cell_in_row_block,
+                      const Cell* cells,
+                      const Block* row_blocks,
+                      const Block* col_blocks,
+                      int* rows,
+                      int* cols,
+                      cudaStream_t stream);
+
+// Permute a segment of values from a block-sparse matrix with sequential
+// layout into CRS order. The segment starts at block_sparse_offset and has a
+// length of num_values.
+void PermuteToCRS(const int block_sparse_offset,
+                  const int num_values,
+                  const int num_row_blocks,
+                  const int* first_cell_in_row_block,
+                  const Cell* cells,
+                  const Block* row_blocks,
+                  const Block* col_blocks,
+                  const int* crs_rows,
+                  const double* block_sparse_values,
+                  double* crs_values,
+                  cudaStream_t stream);
+
+}  // namespace internal
+}  // namespace ceres
+
+#endif  // CERES_NO_CUDA
+
+#endif  // CERES_INTERNAL_CUDA_KERNELS_BSM_TO_CRS_H_
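
A hypothetical call sequence for the two entry points (function and buffer
names below are illustrative, not from this patch): compute the CRS structure
once with FillCRSStructure, then stream value updates through a device staging
buffer, permuting each chunk into CRS order as it arrives:

  #include <algorithm>  // std::min
  #include <cuda_runtime.h>
  #include "ceres/cuda_kernels_bsm_to_crs.h"

  // All d_* pointers are device allocations prepared by the caller; h_values
  // is assumed to be pinned host memory so the copy is truly asynchronous.
  void UpdateCrsValues(const double* h_values,
                       const int num_values,
                       const int num_row_blocks,
                       const int* d_first_cell_in_row_block,
                       const ceres::internal::Cell* d_cells,
                       const ceres::internal::Block* d_row_blocks,
                       const ceres::internal::Block* d_col_blocks,
                       const int* d_crs_rows,
                       double* d_staging,  // at least kChunk doubles
                       double* d_crs_values,
                       cudaStream_t stream) {
    constexpr int kChunk = 1 << 20;
    for (int offset = 0; offset < num_values; offset += kChunk) {
      const int n = std::min(kChunk, num_values - offset);
      // Copy the next chunk of block-sparse values to the device...
      cudaMemcpyAsync(d_staging, h_values + offset, n * sizeof(double),
                      cudaMemcpyHostToDevice, stream);
      // ...then scatter it into CRS order; the kernel recomputes each value's
      // CRS index on the fly from the block-sparse structure.
      ceres::internal::PermuteToCRS(offset, n, num_row_blocks,
                                    d_first_cell_in_row_block, d_cells,
                                    d_row_blocks, d_col_blocks, d_crs_rows,
                                    d_staging, d_crs_values, stream);
    }
  }

With a single stream the copy and kernel of successive chunks serialize;
overlapping them, so that the index computation hides behind the transfer,
would take double-buffered staging on separate streams.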
diff --git a/internal/ceres/cuda_kernels_utils.h b/internal/ceres/cuda_kernels_utils.h
new file mode 100644
index 0000000..45d5d3b
--- /dev/null
+++ b/internal/ceres/cuda_kernels_utils.h
@@ -0,0 +1,56 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#ifndef CERES_INTERNAL_CUDA_KERNELS_UTILS_H_
+#define CERES_INTERNAL_CUDA_KERNELS_UTILS_H_
+
+namespace ceres {
+namespace internal {
+
+// Parallel execution on a CUDA device requires splitting a job into blocks of
+// a fixed size. We use a block size of kCudaBlockSize for all kernels that do
+// not require a specific block size. As the CUDA Toolkit documentation notes,
+// a block size of 256, "although arbitrary in this case, is a common choice".
+// It is informed by the warp size, maximum block size, and multiprocessor
+// sizes of recent GPUs. For complex kernels with significant register usage
+// and unusual memory patterns, the occupancy calculator API might provide
+// better performance. See "Occupancy Calculator" in the CUDA Toolkit
+// documentation.
+constexpr int kCudaBlockSize = 256;
+
+// Compute the number of blocks of kCudaBlockSize needed to span a 1-D grid of
+// the given size. Note that a 1-D grid dimension is limited to 2^31 - 1 in
+// CUDA, so a signed int suffices as the argument.
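+// Example: size == 1000 yields (1000 + 255) / 256 == 4 blocks (1024 threads);
+// the 24 excess threads are masked by the `i < size` guard in each kernel.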
+inline int NumBlocksInGrid(int size) {
+  return (size + kCudaBlockSize - 1) / kCudaBlockSize;
+}
+}  // namespace internal
+}  // namespace ceres
+
+#endif  // CERES_INTERNAL_CUDA_KERNELS_UTILS_H_
diff --git a/internal/ceres/cuda_kernels_vector_ops.cu.cc b/internal/ceres/cuda_kernels_vector_ops.cu.cc
new file mode 100644
index 0000000..b5a8e40
--- /dev/null
+++ b/internal/ceres/cuda_kernels_vector_ops.cu.cc
@@ -0,0 +1,123 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include "ceres/cuda_kernels_vector_ops.h"
+
+#include <cuda_runtime.h>
+
+#include "ceres/cuda_kernels_utils.h"
+
+namespace ceres {
+namespace internal {
+
+template <typename SrcType, typename DstType>
+__global__ void TypeConversionKernel(const SrcType* __restrict__ input,
+                                     DstType* __restrict__ output,
+                                     const int size) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    output[i] = static_cast<DstType>(input[i]);
+  }
+}
+
+void CudaFP64ToFP32(const double* input,
+                    float* output,
+                    const int size,
+                    cudaStream_t stream) {
+  const int num_blocks = NumBlocksInGrid(size);
+  TypeConversionKernel<double, float>
+      <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
+}
+
+void CudaFP32ToFP64(const float* input,
+                    double* output,
+                    const int size,
+                    cudaStream_t stream) {
+  const int num_blocks = NumBlocksInGrid(size);
+  TypeConversionKernel<float, double>
+      <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
+}
+
+template <typename T>
+__global__ void SetZeroKernel(T* __restrict__ output, const int size) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    output[i] = T(0.0);
+  }
+}
+
+void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) {
+  const int num_blocks = NumBlocksInGrid(size);
+  SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
+}
+
+void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
+  const int num_blocks = NumBlocksInGrid(size);
+  SetZeroKernel<double>
+      <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
+}
+
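+// Elementwise x[i] += static_cast<DstType>(y[i]); SrcType may be narrower
+// than DstType (e.g. accumulating float increments into a double vector).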
+template <typename SrcType, typename DstType>
+__global__ void XPlusEqualsYKernel(DstType* __restrict__ x,
+                                   const SrcType* __restrict__ y,
+                                   const int size) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    x[i] = x[i] + DstType(y[i]);
+  }
+}
+
+void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) {
+  const int num_blocks = NumBlocksInGrid(size);
+  XPlusEqualsYKernel<float, double>
+      <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
+}
+
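+// Elementwise y[i] += D[i] * D[i] * x[i], i.e. y += D^T * D * x for the
+// diagonal matrix stored as the vector D.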
+__global__ void CudaDtDxpyKernel(double* __restrict__ y,
+                                 const double* D,
+                                 const double* __restrict__ x,
+                                 const int size) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < size) {
+    y[i] = y[i] + D[i] * D[i] * x[i];
+  }
+}
+
+void CudaDtDxpy(double* y,
+                const double* D,
+                const double* x,
+                const int size,
+                cudaStream_t stream) {
+  const int num_blocks = NumBlocksInGrid(size);
+  CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size);
+}
+
+}  // namespace internal
+}  // namespace ceres
diff --git a/internal/ceres/cuda_kernels.h b/internal/ceres/cuda_kernels_vector_ops.h
similarity index 66%
rename from internal/ceres/cuda_kernels.h
rename to internal/ceres/cuda_kernels_vector_ops.h
index 61f945a..b179201 100644
--- a/internal/ceres/cuda_kernels.h
+++ b/internal/ceres/cuda_kernels_vector_ops.h
@@ -28,8 +28,8 @@
 //
 // Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
 
-#ifndef CERES_INTERNAL_CUDA_KERNELS_H_
-#define CERES_INTERNAL_CUDA_KERNELS_H_
+#ifndef CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_
+#define CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_
 
 #include "ceres/internal/config.h"
 
@@ -75,39 +75,9 @@
                 const int size,
                 cudaStream_t stream);
 
-// Compute structure of CRS matrix and permutation of values using block-sparse
-// structure and temporary array row_block_ids. Array row_block_ids of size
-// num_rows will be filled with indices of row-blocks corresponding to rows of
-// CRS matrix. Arrays corresponding to CRS matrix, permutation and row_block_ids
-// arrays are to be allocated by caller
-void FillCRSStructure(const int num_row_blocks,
-                      const int num_rows,
-                      const int* row_block_offsets,
-                      const Cell* cells,
-                      const Block* row_blocks,
-                      const Block* col_blocks,
-                      int* rows,
-                      int* cols,
-                      int* row_block_ids,
-                      int* permutation,
-                      cudaStream_t stream);
-
-// Permute block of block-sparse values using permutation
-// Pointer block_sparse_values corresponds to a block of num_values values from
-// block-sparse matrix at the offset from begining. Pointer output corresponds
-// to values of CRS matrix. Array permutation stores permutation from
-// block-sparse to CRS matrix with permutation[i] being an index of i-th value
-// of block-sparse matrix in values of CRS matrix
-void PermuteValues(const int offset,
-                   const int num_values,
-                   const int* permutation,
-                   const double* block_sparse_values,
-                   double* crs_values,
-                   cudaStream_t stream);
-
 }  // namespace internal
 }  // namespace ceres
 
 #endif  // CERES_NO_CUDA
 
-#endif  // CERES_INTERNAL_CUDA_KERNELS_H_
+#endif  // CERES_INTERNAL_CUDA_KERNELS_VECTOR_OPS_H_
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_vector_ops_test.cc
similarity index 99%
rename from internal/ceres/cuda_kernels_test.cc
rename to internal/ceres/cuda_kernels_vector_ops_test.cc
index 41036a3..7f6e229 100644
--- a/internal/ceres/cuda_kernels_test.cc
+++ b/internal/ceres/cuda_kernels_vector_ops_test.cc
@@ -28,7 +28,7 @@
 //
 // Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
 
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
 
 #include <math.h>
 
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index 0ae8c5d..e9ca9cc 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -52,7 +52,7 @@
 #ifndef CERES_NO_CUDA
 
 #include "ceres/cuda_buffer.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
 #include "ceres/cuda_vector.h"
 #include "cuda_runtime_api.h"
 #include "cusparse.h"
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
index ad55170..f5fcb91 100644
--- a/internal/ceres/cuda_sparse_matrix.h
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -82,6 +82,10 @@
   int num_cols() const { return num_cols_; }
   int num_nonzeros() const { return num_nonzeros_; }
 
+  const int32_t* rows() const { return rows_.data(); }
+  const int32_t* cols() const { return cols_.data(); }
+  const double* values() const { return values_.data(); }
+
   int32_t* mutable_rows() { return rows_.data(); }
   int32_t* mutable_cols() { return cols_.data(); }
   double* mutable_values() { return values_.data(); }
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index e2143de..d434f36 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -44,7 +44,7 @@
 #ifndef CERES_NO_CUDA
 
 #include "ceres/cuda_buffer.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
 #include "ceres/cuda_vector.h"
 #include "cublas_v2.h"
 
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h
index 34f3947..46661cf 100644
--- a/internal/ceres/cuda_vector.h
+++ b/internal/ceres/cuda_vector.h
@@ -50,7 +50,7 @@
 #ifndef CERES_NO_CUDA
 
 #include "ceres/cuda_buffer.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
 #include "ceres/internal/eigen.h"
 #include "cublas_v2.h"
 #include "cusparse.h"
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc
index f19ca7e..fb95536 100644
--- a/internal/ceres/dense_cholesky.cc
+++ b/internal/ceres/dense_cholesky.cc
@@ -41,7 +41,7 @@
 
 #ifndef CERES_NO_CUDA
 #include "ceres/context_impl.h"
-#include "ceres/cuda_kernels.h"
+#include "ceres/cuda_kernels_vector_ops.h"
 #include "cuda_runtime.h"
 #include "cusolverDn.h"
 #endif  // CERES_NO_CUDA
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc
index 2ba03ed..c23c881 100644
--- a/internal/ceres/evaluation_benchmark.cc
+++ b/internal/ceres/evaluation_benchmark.cc
@@ -96,7 +96,7 @@
   }
 
   std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
-      ContextImpl* context) {
+      ContextImpl* context, bool sequential) {
     auto problem = bal_problem->mutable_problem();
     auto problem_impl = problem->mutable_impl();
     CHECK(problem_impl != nullptr);
@@ -116,6 +116,30 @@
     auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
     CHECK(block_sparse != nullptr);
 
+    if (sequential) {
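+      // Re-linearize cell positions so values are stored row-block by
+      // row-block, cell by cell; each cell's position becomes the running
+      // count of nonzeros before it (the sequential layout expected by the
+      // block-sparse to CRS conversion).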
+      auto block_structure_sequential =
+          std::make_unique<CompressedRowBlockStructure>(
+              *block_sparse->block_structure());
+      int num_nonzeros = 0;
+      for (auto& row_block : block_structure_sequential->rows) {
+        const int row_block_size = row_block.block.size;
+        for (auto& cell : row_block.cells) {
+          const int col_block_size =
+              block_structure_sequential->cols[cell.block_id].size;
+          cell.position = num_nonzeros;
+          num_nonzeros += col_block_size * row_block_size;
+        }
+      }
+      block_sparse = std::make_unique<BlockSparseMatrix>(
+          block_structure_sequential.release(),
+#ifndef CERES_NO_CUDA
+          true
+#else
+          false
+#endif
+      );
+    }
+
     std::mt19937 rng;
     std::normal_distribution<double> rnorm;
     const int nnz = block_sparse->num_nonzeros();
@@ -123,6 +147,7 @@
     for (int i = 0; i < nnz; ++i) {
       values[i] = rnorm(rng);
     }
+
     return block_sparse;
   }
 
@@ -134,11 +159,20 @@
 
   const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) {
     if (!block_sparse_jacobian) {
-      block_sparse_jacobian = CreateBlockSparseJacobian(context);
+      block_sparse_jacobian = CreateBlockSparseJacobian(context, true);
     }
     return block_sparse_jacobian.get();
   }
 
+  const BlockSparseMatrix* BlockSparseJacobianPartitioned(
+      ContextImpl* context) {
+    if (!block_sparse_jacobian_partitioned) {
+      block_sparse_jacobian_partitioned =
+          CreateBlockSparseJacobian(context, false);
+    }
+    return block_sparse_jacobian_partitioned.get();
+  }
+
   const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
       ContextImpl* context) {
     if (!crs_jacobian) {
@@ -149,7 +183,7 @@
 
   std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
       const LinearSolver::Options& options) {
-    auto block_sparse = BlockSparseJacobian(options.context);
+    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
     return std::make_unique<PartitionedView>(options, *block_sparse);
   }
 
@@ -171,7 +205,7 @@
 
   const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
       const LinearSolver::Options& options) {
-    auto block_sparse = BlockSparseJacobian(options.context);
+    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
     implicit_schur_complement =
         std::make_unique<ImplicitSchurComplement>(options);
     implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
@@ -180,7 +214,7 @@
 
   const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
       const LinearSolver::Options& options) {
-    auto block_sparse = BlockSparseJacobian(options.context);
+    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
     implicit_schur_complement_diag =
         std::make_unique<ImplicitSchurComplement>(options);
     implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
@@ -192,6 +226,7 @@
   Vector b;
   std::unique_ptr<BundleAdjustmentProblem> bal_problem;
   std::unique_ptr<PreprocessedProblem> preprocessed_problem;
+  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_partitioned;
   std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
   std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
   std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;