Permutation-based conversion from block-sparse to CRS Change-Id: Ic33a6476c033187dff61886deb6d1761524943f0
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 682c27f..e4b124d 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml
@@ -47,15 +47,25 @@ liblapack-dev \ libmetis-dev \ libsuitesparse-dev \ - ninja-build - - - name: Setup CUDA toolkit - if: matrix.gpu == 'cuda' + ninja-build \ + wget + - name: Setup CUDA toolkit (system repositories) + if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:20.04' run: | apt-get install -y \ nvidia-cuda-dev \ nvidia-cuda-toolkit - + # The nvidia-cuda-toolkit + gcc combo shipped with Ubuntu 22.04 LTS is + # broken and is not able to compile code that uses Thrust: + # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1006962 + - name: Setup CUDA toolkit (nvidia repositories) + if: matrix.gpu == 'cuda' && matrix.os == 'ubuntu:22.04' + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb + dpkg -i cuda-keyring_1.0-1_all.deb + apt-get update + apt-get install -y cuda + echo "CUDACXX=/usr/local/cuda/bin/nvcc" >> $GITHUB_ENV - name: Cache Build id: cache-build
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt index 9729ef6..9cb99b4 100644 --- a/internal/ceres/CMakeLists.txt +++ b/internal/ceres/CMakeLists.txt
@@ -141,8 +141,9 @@ $<TARGET_FILE:cuda_dense_cholesky_test>) endif (BUILD_TESTING AND GFLAGS) set_source_files_properties(cuda_kernels.cu.cc PROPERTIES LANGUAGE CUDA) - add_library(ceres_cuda_kernels cuda_kernels.cu.cc) + add_library(ceres_cuda_kernels STATIC cuda_kernels.cu.cc) target_compile_features(ceres_cuda_kernels PRIVATE cxx_std_14) + target_include_directories(ceres_cuda_kernels PRIVATE ${Ceres_SOURCE_DIR}/internal ${Ceres_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR}) list(APPEND CERES_LIBRARY_PRIVATE_DEPENDENCIES ceres_cuda_kernels) endif (USE_CUDA) @@ -181,6 +182,8 @@ cost_function.cc covariance.cc covariance_impl.cc + cuda_block_sparse_crs_view.cc + cuda_block_structure.cc cuda_sparse_matrix.cc cuda_vector.cc dense_cholesky.cc @@ -472,10 +475,13 @@ ceres_test(cost_function_to_functor) ceres_test(covariance) ceres_test(cubic_interpolation) + ceres_test(cuda_block_sparse_crs_view) + ceres_test(cuda_block_structure) ceres_test(cuda_dense_cholesky) ceres_test(cuda_dense_qr) ceres_test(cuda_kernels) ceres_test(cuda_sparse_matrix) + ceres_test(cuda_streamed_buffer) ceres_test(cuda_vector) ceres_test(dense_linear_solver) ceres_test(dense_cholesky)
diff --git a/internal/ceres/block_structure.h b/internal/ceres/block_structure.h index aaa4080..bccfd8b 100644 --- a/internal/ceres/block_structure.h +++ b/internal/ceres/block_structure.h
@@ -43,7 +43,11 @@ #include "ceres/internal/export.h" -namespace ceres::internal { +// This file is included into source files that are compiled with nvcc. +// The nvcc shipped with Ubuntu 20.04 does not support some C++17 features, +// including nested namespace definitions. +namespace ceres { +namespace internal { using BlockSize = int32_t; @@ -189,6 +193,7 @@ std::vector<Block> Tail(const std::vector<Block>& blocks, int n); int SumSquaredSizes(const std::vector<Block>& blocks); -} // namespace ceres::internal +} // namespace internal +} // namespace ceres #endif // CERES_INTERNAL_BLOCK_STRUCTURE_H_
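For reference, a minimal illustration of the C++17 feature being avoided (editorial sketch, not part of the change):

    // C++17 nested namespace definition, rejected by the nvcc shipped with
    // Ubuntu 20.04:
    namespace ceres::internal {}

    // Pre-C++17 spelling used in this change instead:
    namespace ceres {
    namespace internal {}
    }  // namespace ceres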
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc index cc01b54..ea3016b 100644 --- a/internal/ceres/context_impl.cc +++ b/internal/ceres/context_impl.cc
@@ -60,9 +60,11 @@ cusparseDestroy(cusparse_handle_); cusparse_handle_ = nullptr; } - if (stream_ != nullptr) { - cudaStreamDestroy(stream_); - stream_ = nullptr; + for (auto& s : streams_) { + if (s != nullptr) { + cudaStreamDestroy(s); + s = nullptr; + } } is_cuda_initialized_ = false; } @@ -143,19 +145,22 @@ return false; } event_logger.AddEvent("cusparseCreate"); - if (cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) != - cudaSuccess) { - *message = - "CUDA initialization failed because CUDA::cudaStreamCreateWithFlags " - "failed."; - TearDown(); - return false; + for (auto& s : streams_) { + if (cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking) != cudaSuccess) { + *message = + "CUDA initialization failed because CUDA::cudaStreamCreateWithFlags " + "failed."; + TearDown(); + return false; + } } event_logger.AddEvent("cudaStreamCreateWithFlags"); - if (cusolverDnSetStream(cusolver_handle_, stream_) != + if (cusolverDnSetStream(cusolver_handle_, DefaultStream()) != CUSOLVER_STATUS_SUCCESS || - cublasSetStream(cublas_handle_, stream_) != CUBLAS_STATUS_SUCCESS || - cusparseSetStream(cusparse_handle_, stream_) != CUSPARSE_STATUS_SUCCESS) { + cublasSetStream(cublas_handle_, DefaultStream()) != + CUBLAS_STATUS_SUCCESS || + cusparseSetStream(cusparse_handle_, DefaultStream()) != + CUSPARSE_STATUS_SUCCESS) { *message = "CUDA initialization failed because SetStream failed."; TearDown(); return false;
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h index a18d638..776ca31 100644 --- a/internal/ceres/context_impl.h +++ b/internal/ceres/context_impl.h
@@ -78,11 +78,32 @@ // Ceres, Ceres will use that GPU. // Note on Ceres' use of CUDA Streams: - // All operations on the GPU are performed using a single stream. This ensures - // that the order of operations are stream-ordered, but we do not need to - // explicitly synchronize the stream at the end of every operation. Stream - // synchronization occurs only before GPU to CPU transfers, and is handled by - // CudaBuffer. + // Most operations on the GPU are performed using a single stream. In + // those cases DefaultStream() should be used. This ensures that operations + // are stream-ordered, and may overlap with CPU processing with no + // additional effort. + // + // a. Single-stream workloads + // - Only use the default stream + // - Return control to the caller without synchronization whenever possible + // - Stream synchronization occurs only after GPU to CPU transfers, and is + // handled by CudaBuffer + // + // b. Multi-stream workloads + // Multi-stream workloads are more restricted in order to make race + // conditions harder to introduce. + // - Should always synchronize the default stream on entry + // - Should always synchronize all utilized streams on exit + // - Should not assume that any particular entry of streams_[] is the + // default stream + // + // With those rules in place: + // - All single-stream asynchronous workloads are serialized using the + // default stream + // - Multi-stream workloads always wait for single-stream workloads to + // finish and leave no running computations on exit. + // This slightly penalizes multi-stream workloads, but makes it easier to + // avoid race conditions when a multi-stream workload depends on the results + // of any preceding GPU computations. // Initializes cuBLAS, cuSOLVER, and cuSPARSE contexts, creates an // asynchronous CUDA stream, and associates the stream with the contexts. @@ -101,7 +122,14 @@ cusolverDnHandle_t cusolver_handle_ = nullptr; cublasHandle_t cublas_handle_ = nullptr; - cudaStream_t stream_ = nullptr; + + // Default stream. + // Kernel invocations and memory copies on this stream can be left without + // synchronization. + cudaStream_t DefaultStream() { return streams_[0]; } + static constexpr int kNumCudaStreams = 2; + cudaStream_t streams_[kNumCudaStreams] = {0}; + cusparseHandle_t cusparse_handle_ = nullptr; bool is_cuda_initialized_ = false; int gpu_device_id_in_use_ = -1;
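A minimal sketch of a multi-stream workload that follows the rules above (editorial illustration, not part of this change; DoWorkOnStream is a hypothetical helper that only launches asynchronous work on the given stream):

    // Editorial sketch: obeys the multi-stream rules documented in
    // context_impl.h.
    void MultiStreamWorkload(ceres::internal::ContextImpl* context) {
      // Synchronize the default stream on entry, so that this workload
      // observes the results of all preceding single-stream work.
      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context->DefaultStream()));
      for (auto& stream : context->streams_) {
        DoWorkOnStream(stream);  // async kernel launches / copies only
      }
      // Synchronize all utilized streams on exit, leaving no running
      // computations behind.
      for (auto& stream : context->streams_) {
        CHECK_EQ(cudaSuccess, cudaStreamSynchronize(stream));
      }
    }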
diff --git a/internal/ceres/cuda_block_sparse_crs_view.cc b/internal/ceres/cuda_block_sparse_crs_view.cc new file mode 100644 index 0000000..a03d01c --- /dev/null +++ b/internal/ceres/cuda_block_sparse_crs_view.cc
@@ -0,0 +1,74 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/cuda_block_sparse_crs_view.h" + +#ifndef CERES_NO_CUDA + +#include "ceres/cuda_block_structure.h" +#include "ceres/cuda_kernels.h" + +namespace ceres::internal { + +CudaBlockSparseCRSView::CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, + ContextImpl* context) + : permutation_(context, bsm.num_nonzeros()), + streamed_buffer_(context, kMaxTemporaryArraySize) { + CudaBlockSparseStructure block_structure(*bsm.block_structure(), context); + crs_matrix_ = std::make_unique<CudaSparseMatrix>( + bsm.num_rows(), bsm.num_cols(), bsm.num_nonzeros(), context); + CudaBuffer<int> temp_rows(context, bsm.num_rows()); + FillCRSStructure(block_structure.num_row_blocks(), + bsm.num_rows(), + block_structure.row_block_offsets(), + block_structure.cells(), + block_structure.row_blocks(), + block_structure.col_blocks(), + crs_matrix_->mutable_rows(), + crs_matrix_->mutable_cols(), + temp_rows.data(), + permutation_.data(), + context->DefaultStream()); + UpdateValues(bsm); +} +void CudaBlockSparseCRSView::UpdateValues(const BlockSparseMatrix& bsm) { + streamed_buffer_.CopyToGpu( + bsm.values(), + bsm.num_nonzeros(), + [permutation = permutation_.data(), + values_to = crs_matrix_->mutable_values()]( + const double* values, int num_values, int offset, auto stream) { + PermuteValues( + offset, num_values, permutation, values, values_to, stream); + }); +} + +} // namespace ceres::internal +#endif // CERES_NO_CUDA
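A hypothetical usage sketch of the new view (editorial, not part of this change; assumes an existing BlockSparseMatrix* bsm whose values change between iterations while its structure stays fixed):

    // Editorial sketch: construct the CRS view once per sparsity structure,
    // then refresh only the values.
    ceres::internal::ContextImpl context;
    std::string message;
    CHECK(context.InitCuda(&message)) << message;

    ceres::internal::CudaBlockSparseCRSView view(*bsm, &context);
    // ... bsm->mutable_values() updated elsewhere, same structure ...
    view.UpdateValues(*bsm);
    ceres::internal::CudaSparseMatrix* crs = view.mutable_crs_matrix();
    // crs is now ready for SpMV via RightMultiplyAndAccumulate().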
diff --git a/internal/ceres/cuda_block_sparse_crs_view.h b/internal/ceres/cuda_block_sparse_crs_view.h new file mode 100644 index 0000000..3ea8498 --- /dev/null +++ b/internal/ceres/cuda_block_sparse_crs_view.h
@@ -0,0 +1,113 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) +// + +#ifndef CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_ +#define CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_ + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA + +#include <memory> + +#include "ceres/block_sparse_matrix.h" +#include "ceres/cuda_buffer.h" +#include "ceres/cuda_sparse_matrix.h" +#include "ceres/cuda_streamed_buffer.h" + +namespace ceres::internal { +// We use the cuSPARSE library for SpMV operations. However, it does not +// support the block-sparse format with varying block sizes. Thus, we perform +// the following operations in order to compute products of block-sparse +// matrices and dense vectors on the GPU: +// - Once per block-sparse structure update: +// - Compute CRS structure from block-sparse structure +// - Compute permutation from block-sparse values to CRS values +// - Once per block-sparse values update: +// - Update values in CRS matrix with values of block-sparse matrix +// +// Since there are no constraints on the positions of cells in the value array +// of the block-sparse matrix, a permutation from block-sparse values to CRS +// values is stored explicitly. +// +// Example: given a matrix with the following block structure +// [ 1 2 | | 6 7 ] +// [ 3 4 | | 8 9 ] +// [-----+-+-----] +// [ |5| ] +// with values stored as values_block_sparse = [1, 2, 3, 4, 5, 6, 7, 8, 9], +// the permutation from block-sparse to CRS is p = [0, 1, 4, 5, 8, 2, 3, 6, 7], +// and the i-th block-sparse value has index p[i] in the CRS values array. +// +// This makes it possible to avoid storing both CRS and block-sparse values in +// GPU memory.
+// Instead, block-sparse values are transferred to GPU memory as a disjoint +// set of small contiguous segments, and are simultaneously permuted into the +// correct order. +class CERES_NO_EXPORT CudaBlockSparseCRSView { + public: + // Initializes the internal CRS matrix and the permutation from block-sparse + // to CRS values. The following objects are stored in GPU memory for the + // whole lifetime of the object: + // - crs_matrix_: CRS matrix + // - permutation_: permutation from block-sparse to CRS value order + // (num_nonzeros integer values) + // - streamed_buffer_: helper for value updating + // The following objects are created temporarily during construction: + // - CudaBlockSparseStructure: block-sparse structure of block-sparse matrix + // - num_rows integer values: row to row-block map + CudaBlockSparseCRSView(const BlockSparseMatrix& bsm, ContextImpl* context); + + const CudaSparseMatrix* crs_matrix() const { return crs_matrix_.get(); } + CudaSparseMatrix* mutable_crs_matrix() { return crs_matrix_.get(); } + + // Update values of crs_matrix_ using values of the block-sparse matrix. + // Assumes that bsm has the same block-sparse structure as the matrix that + // was used for construction. + void UpdateValues(const BlockSparseMatrix& bsm); + + private: + // The value-permutation kernel performs a single element-wise operation per + // thread, thus performing the permutation in blocks of 8 megabytes (1M + // doubles) of block-sparse values seems reasonable + static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024; + std::unique_ptr<CudaSparseMatrix> crs_matrix_; + // Permutation from block-sparse to CRS value order. + // permutation_[i] = index of i-th block-sparse value in CRS values + CudaBuffer<int> permutation_; + CudaStreamedBuffer<double> streamed_buffer_; +}; + +} // namespace ceres::internal + +#endif // CERES_NO_CUDA +#endif // CERES_INTERNAL_CUDA_BLOCK_SPARSE_CRS_VIEW_H_
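The permutation in the example above can be verified with a few lines of standalone CPU code (editorial illustration, not part of this change):

    #include <cassert>

    int main() {
      const double bs[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // block-sparse order
      const int p[] = {0, 1, 4, 5, 8, 2, 3, 6, 7};      // block-sparse -> CRS
      double crs[9];
      for (int i = 0; i < 9; ++i) crs[p[i]] = bs[i];
      // CRS stores each row contiguously: [1 2 6 7], [3 4 8 9], [5].
      const double expected[] = {1, 2, 6, 7, 3, 4, 8, 9, 5};
      for (int i = 0; i < 9; ++i) assert(crs[i] == expected[i]);
      return 0;
    }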
diff --git a/internal/ceres/cuda_block_sparse_crs_view_test.cc b/internal/ceres/cuda_block_sparse_crs_view_test.cc new file mode 100644 index 0000000..f8d1cdf --- /dev/null +++ b/internal/ceres/cuda_block_sparse_crs_view_test.cc
@@ -0,0 +1,106 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/cuda_block_sparse_crs_view.h" + +#include <glog/logging.h> +#include <gtest/gtest.h> + +#include <numeric> +#include <random> +#include <thread> + +#ifndef CERES_NO_CUDA + +#include "ceres/cuda_vector.h" + +namespace ceres::internal { +class CudaBlockSparseCRSViewTest : public ::testing::Test { + protected: + void SetUp() final { + std::string message; + CHECK(context_.InitCuda(&message)) + << "InitCuda() failed because: " << message; + + BlockSparseMatrix::RandomMatrixOptions options; + options.num_row_blocks = 1234; + options.min_row_block_size = 1; + options.max_row_block_size = 10; + options.num_col_blocks = 567; + options.min_col_block_size = 1; + options.max_col_block_size = 10; + options.block_density = 0.2; + std::mt19937 rng; + A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng); + std::iota( + A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1); + } + + std::unique_ptr<BlockSparseMatrix> A_; + ContextImpl context_; +}; + +TEST_F(CudaBlockSparseCRSViewTest, CreateUpdateValues) { + auto view = CudaBlockSparseCRSView(*A_, &context_); + + // TODO: we definitely would like to use crs_matrix() here, but + // CudaSparseMatrix::RightMultiplyAndAccumulate is defined non-const because + // it might allocate additional storage at the request of cuSPARSE + auto matrix = view.mutable_crs_matrix(); + ASSERT_EQ(matrix->num_cols(), A_->num_cols()); + ASSERT_EQ(matrix->num_rows(), A_->num_rows()); + ASSERT_EQ(matrix->num_nonzeros(), A_->num_nonzeros()); + + const int num_rows = A_->num_rows(); + const int num_cols = A_->num_cols(); + Vector x(num_cols); + Vector y(num_rows); + CudaVector x_cuda(&context_, num_cols); + CudaVector y_cuda(&context_, num_rows); + Vector y_cuda_host(num_rows); + + for (int i = 0; i < num_cols; ++i) { + x.setZero(); + y.setZero(); + y_cuda.SetZero(); + x[i] = 1.; + x_cuda.CopyFromCpu(x); + A_->RightMultiplyAndAccumulate(
x.data(), y.data(), &context_, std::thread::hardware_concurrency()); + matrix->RightMultiplyAndAccumulate(x_cuda, &y_cuda); + y_cuda.CopyTo(&y_cuda_host); + // There is at most one non-zero product per row, and the values are small + // integers that are exactly representable, thus we expect an exact match + EXPECT_EQ((y - y_cuda_host).squaredNorm(), 0.); + } +} +} // namespace ceres::internal + +#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_block_structure.cc b/internal/ceres/cuda_block_structure.cc new file mode 100644 index 0000000..8e549e1 --- /dev/null +++ b/internal/ceres/cuda_block_structure.cc
@@ -0,0 +1,118 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/cuda_block_structure.h" + +#ifndef CERES_NO_CUDA + +namespace ceres::internal { +namespace { +// Dimension of a sorted array of blocks +inline int Dimension(const std::vector<Block>& blocks) { + if (blocks.empty()) { + return 0; + } + const auto& last = blocks.back(); + return last.size + last.position; +} +} // namespace + +CudaBlockSparseStructure::CudaBlockSparseStructure( + const CompressedRowBlockStructure& block_structure, ContextImpl* context) + : row_block_offsets_(context), + cells_(context), + row_blocks_(context), + col_blocks_(context) { + // Row blocks extracted from CompressedRowBlockStructure::rows + std::vector<Block> row_blocks; + // Column blocks can be reused as-is + const auto& col_blocks = block_structure.cols; + + // Row-block offset is the index of the first cell corresponding to the + // row-block + std::vector<int> row_block_offsets; + // Flat array of all cells from all row-blocks + std::vector<Cell> cells; + + num_row_blocks_ = block_structure.rows.size(); + num_col_blocks_ = col_blocks.size(); + + row_blocks.reserve(num_row_blocks_); + row_block_offsets.reserve(num_row_blocks_ + 1); + num_nonzeros_ = 0; + num_cells_ = 0; + for (const auto& r : block_structure.rows) { + const int row_block_size = r.block.size; + row_blocks.emplace_back(r.block); + row_block_offsets.push_back(num_cells_); + for (const auto& c : r.cells) { + cells.emplace_back(c); + const int col_block_size = col_blocks[c.block_id].size; + num_nonzeros_ += col_block_size * row_block_size; + ++num_cells_; + } + } + row_block_offsets.push_back(num_cells_); + + num_rows_ = Dimension(row_blocks); + num_cols_ = Dimension(col_blocks); + + if (VLOG_IS_ON(3)) { + const size_t row_block_offsets_size = + row_block_offsets.size() * sizeof(int); + const size_t cells_size = cells.size() * sizeof(Cell);
+ const size_t row_blocks_size = row_blocks.size() * sizeof(Block); + const size_t col_blocks_size = col_blocks.size() * sizeof(Block); + const size_t total_size = + row_block_offsets_size + cells_size + col_blocks_size + row_blocks_size; + VLOG(3) << "\nCudaBlockSparseStructure:\n" + "\tRow block offsets: " + << row_block_offsets_size + << " bytes\n" + "\tColumn blocks: " + << col_blocks_size + << " bytes\n" + "\tRow blocks: " + << row_blocks_size + << " bytes\n" + "\tCells: " + << cells_size + << " bytes\n" + "\tTotal: " + << total_size << " bytes of GPU memory"; + } + + row_block_offsets_.CopyFromCpuVector(row_block_offsets); + cells_.CopyFromCpuVector(cells); + row_blocks_.CopyFromCpuVector(row_blocks); + col_blocks_.CopyFromCpuVector(col_blocks); +} +} // namespace ceres::internal + +#endif // CERES_NO_CUDA
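A worked example may make the flat representation easier to follow (editorial note, not part of this change). For the matrix used in the cuda_block_sparse_crs_view.h comment, with block-sparse values [1..9] laid out as cell(row-block 0, col-block 0) at value position 0, cell(row-block 1, col-block 1) at position 4, and cell(row-block 0, col-block 2) at position 5, this constructor builds:

    // Blocks are (size, position); cells are (block_id, position in values).
    row_blocks        = {Block(2, 0), Block(1, 2)};
    col_blocks        = {Block(2, 0), Block(1, 2), Block(2, 3)};
    row_block_offsets = {0, 2, 3};  // row-block 0 owns cells[0..1]
    cells             = {Cell(0, 0), Cell(2, 5), Cell(1, 4)};
    // num_cells_ = 3, num_nonzeros_ = 4 + 4 + 1 = 9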
diff --git a/internal/ceres/cuda_block_structure.h b/internal/ceres/cuda_block_structure.h new file mode 100644 index 0000000..a41d0b3 --- /dev/null +++ b/internal/ceres/cuda_block_structure.h
@@ -0,0 +1,94 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#ifndef CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_ +#define CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_ + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA + +#include "ceres/block_structure.h" +#include "ceres/cuda_buffer.h" + +namespace ceres::internal { +class CudaBlockStructureTest; + +// This class stores a read-only block-sparse structure in GPU memory. +// Invariants are the same as those of CompressedRowBlockStructure. +// In order to simplify allocation and copying data to the GPU, cells from all +// row-blocks are stored in a single array sequentially. An additional array +// row_block_offsets of size num_row_blocks + 1 makes it possible to identify +// the range of cells corresponding to a row-block. +// Cells corresponding to the i-th row-block are stored in the sub-array +// cells[row_block_offsets[i]; ... row_block_offsets[i + 1] - 1], and their +// order is preserved. +class CERES_NO_EXPORT CudaBlockSparseStructure { + public: + // CompressedRowBlockStructure contains a vector of CompressedLists, with + // each CompressedList containing a vector of Cells. We precompute a flat + // array of cells on the CPU and transfer it to the GPU.
+ CudaBlockSparseStructure(const CompressedRowBlockStructure& block_structure, + ContextImpl* context); + + int num_rows() const { return num_rows_; } + int num_cols() const { return num_cols_; } + int num_cells() const { return num_cells_; } + int num_nonzeros() const { return num_nonzeros_; } + int num_row_blocks() const { return num_row_blocks_; } + int num_col_blocks() const { return num_col_blocks_; } + + // Device pointer to an array of num_row_blocks + 1 indices of the first + // cell of each row-block + const int* row_block_offsets() const { return row_block_offsets_.data(); } + // Device pointer to an array of num_cells cells, sorted by row-block + const Cell* cells() const { return cells_.data(); } + // Device pointer to the array of row blocks + const Block* row_blocks() const { return row_blocks_.data(); } + // Device pointer to the array of column blocks + const Block* col_blocks() const { return col_blocks_.data(); } + + private: + int num_rows_; + int num_cols_; + int num_cells_; + int num_nonzeros_; + int num_row_blocks_; + int num_col_blocks_; + CudaBuffer<int> row_block_offsets_; + CudaBuffer<Cell> cells_; + CudaBuffer<Block> row_blocks_; + CudaBuffer<Block> col_blocks_; + friend class CudaBlockStructureTest; +}; +} // namespace ceres::internal + +#endif // CERES_NO_CUDA +#endif // CERES_INTERNAL_CUDA_BLOCK_STRUCTURE_H_
diff --git a/internal/ceres/cuda_block_structure_test.cc b/internal/ceres/cuda_block_structure_test.cc new file mode 100644 index 0000000..95cb26b --- /dev/null +++ b/internal/ceres/cuda_block_structure_test.cc
@@ -0,0 +1,143 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA + +#include <glog/logging.h> +#include <gtest/gtest.h> + +#include <numeric> +#include <random> + +#include "ceres/block_sparse_matrix.h" +#include "ceres/cuda_block_structure.h" + +namespace ceres::internal { + +class CudaBlockStructureTest : public ::testing::Test { + protected: + void SetUp() final { + std::string message; + CHECK(context_.InitCuda(&message)) + << "InitCuda() failed because: " << message; + + BlockSparseMatrix::RandomMatrixOptions options; + options.num_row_blocks = 1234; + options.min_row_block_size = 1; + options.max_row_block_size = 10; + options.num_col_blocks = 567; + options.min_col_block_size = 1; + options.max_col_block_size = 10; + options.block_density = 0.2; + std::mt19937 rng; + A_ = BlockSparseMatrix::CreateRandomMatrix(options, rng); + std::iota( + A_->mutable_values(), A_->mutable_values() + A_->num_nonzeros(), 1); + } + + std::vector<Cell> GetCells(const CudaBlockSparseStructure& structure) { + const auto& cuda_buffer = structure.cells_; + std::vector<Cell> cells(cuda_buffer.size()); + cuda_buffer.CopyToCpu(cells.data(), cells.size()); + return cells; + } + std::vector<Block> GetRowBlocks(const CudaBlockSparseStructure& structure) { + const auto& cuda_buffer = structure.row_blocks_; + std::vector<Block> blocks(cuda_buffer.size()); + cuda_buffer.CopyToCpu(blocks.data(), blocks.size()); + return blocks; + } + std::vector<Block> GetColBlocks(const CudaBlockSparseStructure& structure) { + const auto& cuda_buffer = structure.col_blocks_; + std::vector<Block> blocks(cuda_buffer.size()); + cuda_buffer.CopyToCpu(blocks.data(), blocks.size()); + return blocks; + } + std::vector<int> GetRowBlockOffsets( + const CudaBlockSparseStructure& structure) { + const auto& cuda_buffer = structure.row_block_offsets_; +
std::vector<int> row_block_offsets(cuda_buffer.size()); + cuda_buffer.CopyToCpu(row_block_offsets.data(), row_block_offsets.size()); + return row_block_offsets; + } + + std::unique_ptr<BlockSparseMatrix> A_; + ContextImpl context_; +}; + +TEST_F(CudaBlockStructureTest, StructureIdentity) { + auto block_structure = A_->block_structure(); + const int num_row_blocks = block_structure->rows.size(); + const int num_col_blocks = block_structure->cols.size(); + + CudaBlockSparseStructure cuda_block_structure(*block_structure, &context_); + + ASSERT_EQ(cuda_block_structure.num_rows(), A_->num_rows()); + ASSERT_EQ(cuda_block_structure.num_cols(), A_->num_cols()); + ASSERT_EQ(cuda_block_structure.num_nonzeros(), A_->num_nonzeros()); + ASSERT_EQ(cuda_block_structure.num_row_blocks(), num_row_blocks); + ASSERT_EQ(cuda_block_structure.num_col_blocks(), num_col_blocks); + + std::vector<Block> blocks = GetColBlocks(cuda_block_structure); + ASSERT_EQ(blocks.size(), num_col_blocks); + for (int i = 0; i < num_col_blocks; ++i) { + EXPECT_EQ(block_structure->cols[i].position, blocks[i].position); + EXPECT_EQ(block_structure->cols[i].size, blocks[i].size); + } + + std::vector<Cell> cells = GetCells(cuda_block_structure); + std::vector<int> row_block_offsets = GetRowBlockOffsets(cuda_block_structure); + blocks = GetRowBlocks(cuda_block_structure); + + ASSERT_EQ(blocks.size(), num_row_blocks); + ASSERT_EQ(row_block_offsets.size(), num_row_blocks + 1); + ASSERT_EQ(row_block_offsets.back(), cells.size()); + + for (int i = 0; i < num_row_blocks; ++i) { + const int num_cells = block_structure->rows[i].cells.size(); + EXPECT_EQ(blocks[i].position, block_structure->rows[i].block.position); + EXPECT_EQ(blocks[i].size, block_structure->rows[i].block.size); + const int first_cell = row_block_offsets[i]; + const int last_cell = row_block_offsets[i + 1]; + ASSERT_EQ(last_cell - first_cell, num_cells); + for (int j = 0; j < num_cells; ++j) { + EXPECT_EQ(cells[first_cell + j].block_id, + block_structure->rows[i].cells[j].block_id); + EXPECT_EQ(cells[first_cell + j].position, + block_structure->rows[i].cells[j].position); + } + } +} + +} // namespace ceres::internal + +#endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_buffer.h b/internal/ceres/cuda_buffer.h index 97d126c..d6abb15 100644 --- a/internal/ceres/cuda_buffer.h +++ b/internal/ceres/cuda_buffer.h
@@ -86,7 +86,7 @@ data, size * sizeof(T), cudaMemcpyHostToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); } @@ -98,7 +98,7 @@ data.data(), data.size() * sizeof(T), cudaMemcpyHostToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); } @@ -110,7 +110,7 @@ data, size * sizeof(T), cudaMemcpyDeviceToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); } @@ -126,9 +126,9 @@ data_, size * sizeof(T), cudaMemcpyDeviceToHost, - context_->stream_), + context_->DefaultStream()), cudaSuccess); - CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess); + CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess); } // Copy N items from another GPU memory array to the GPU memory managed by @@ -142,7 +142,7 @@ other.data_, size_ * sizeof(T), cudaMemcpyDeviceToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); }
diff --git a/internal/ceres/cuda_kernels.cu.cc b/internal/ceres/cuda_kernels.cu.cc index c0f5453..3d93ae8 100644 --- a/internal/ceres/cuda_kernels.cu.cc +++ b/internal/ceres/cuda_kernels.cu.cc
@@ -28,6 +28,12 @@ // // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) +#include "ceres/cuda_kernels.h" + +#include <thrust/execution_policy.h> +#include <thrust/scan.h> + +#include "block_structure.h" #include "cuda_runtime.h" namespace ceres { @@ -40,6 +46,9 @@ // might provide better performance. See "Occupancy Calculator" under the CUDA // toolkit documentation. constexpr int kCudaBlockSize = 256; +inline int NumBlocks(int size) { + return (size + kCudaBlockSize - 1) / kCudaBlockSize; +} template <typename SrcType, typename DstType> __global__ void TypeConversionKernel(const SrcType* __restrict__ input, @@ -55,7 +64,7 @@ float* output, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); TypeConversionKernel<double, float> <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size); } @@ -64,7 +73,7 @@ double* output, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); TypeConversionKernel<float, double> <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size); } @@ -78,12 +87,12 @@ } void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size); } void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); SetZeroKernel<double> <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size); } @@ -99,7 +108,7 @@ } void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); XPlusEqualsYKernel<float, double> <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size); } @@ -119,9 +128,177 @@ const double* x, const int size, cudaStream_t stream) { - const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize; + const int num_blocks = NumBlocks(size); CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size); } +// Fill row block id and nnz for each row using a block-sparse structure +// represented by a set of flat arrays. +// Inputs: +// - num_row_blocks: number of row-blocks in block-sparse structure +// - row_block_offsets: index of the first cell of the row-block; size: +// num_row_blocks + 1 +// - cells: cells of block-sparse structure as a contiguous array +// - row_blocks: row blocks of block-sparse structure stored sequentially +// - col_blocks: column blocks of block-sparse structure stored sequentially +// Outputs: +// - rows: rows[i + 1] will contain the number of non-zeros in the i-th row, +// and rows[0] will be set to 0; rows are filled with a shift by one element +// in order to obtain the row-index array of the CRS matrix with an +// inclusive scan afterwards +// - row_block_ids: row_block_ids[i] will be set to the index of the +// row-block that contains the i-th row.
+// Computation is performed row-block-wise +__global__ void RowBlockIdAndNNZ(int num_row_blocks, + const int* __restrict__ row_block_offsets, + const Cell* __restrict__ cells, + const Block* __restrict__ row_blocks, + const Block* __restrict__ col_blocks, + int* __restrict__ rows, + int* __restrict__ row_block_ids) { + const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x; + if (row_block_id > num_row_blocks) { + // No synchronization is performed in this kernel, thus it is safe to return + return; + } + if (row_block_id == num_row_blocks) { + // one extra thread sets the first element + rows[0] = 0; + return; + } + const auto& row_block = row_blocks[row_block_id]; + int row_nnz = 0; + const auto first_cell = cells + row_block_offsets[row_block_id]; + const auto last_cell = cells + row_block_offsets[row_block_id + 1]; + for (auto cell = first_cell; cell < last_cell; ++cell) { + row_nnz += col_blocks[cell->block_id].size; + } + const int first_row = row_block.position; + const int last_row = first_row + row_block.size; + for (int i = first_row; i < last_row; ++i) { + rows[i + 1] = row_nnz; + row_block_ids[i] = row_block_id; + } +} + +// Row-wise creation of CRS structure +// Inputs: +// - num_rows: number of rows in matrix +// - row_block_offsets: index of the first cell of the row-block; size: +// num_row_blocks + 1 +// - cells: cells of block-sparse structure as a contiguous array +// - row_blocks: row blocks of block-sparse structure stored sequentially +// - col_blocks: column blocks of block-sparse structure stored sequentially +// - row_block_ids: index of the row-block that corresponds to each row +// - rows: row-index array of CRS structure +// Outputs: +// - cols: column-index array of CRS structure +// - permutation: permutation from block-sparse to CRS order +// Computation is performed row-wise +__global__ void ComputeColumnsAndPermutation( + int num_rows, + const int* __restrict__ row_block_offsets, + const Cell* __restrict__ cells, + const Block* __restrict__ row_blocks, + const Block* __restrict__ col_blocks, + const int* __restrict__ row_block_ids, + const int* __restrict__ rows, + int* __restrict__ cols, + int* __restrict__ permutation) { + const int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= num_rows) { + // No synchronization is performed in this kernel, thus it is safe to return + return; + } + const int row_block_id = row_block_ids[row]; + // zero-based index of row in row-block + const int row_in_block = row - row_blocks[row_block_id].position; + // position in CRS matrix + int crs_position = rows[row]; + const auto first_cell = cells + row_block_offsets[row_block_id]; + const auto last_cell = cells + row_block_offsets[row_block_id + 1]; + // For each cell of the row-block, only the current row is filled + for (auto cell = first_cell; cell < last_cell; ++cell) { + const auto& col_block = col_blocks[cell->block_id]; + const int col_block_size = col_block.size; + int column_idx = col_block.position; + int bs_position = cell->position + row_in_block * col_block_size; + // Fill permutation and column indices for each element of the + // row_in_block-th row of the current cell + for (int i = 0; i < col_block_size; ++i, ++crs_position) { + permutation[bs_position++] = crs_position; + cols[crs_position] = column_idx++; + } + } +} + +void FillCRSStructure(const int num_row_blocks, + const int num_rows, + const int* row_block_offsets, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + int* rows, + int* cols, + int* row_block_ids, + int* permutation, +
cudaStream_t stream) { + // Set the number of non-zeros per row in the rows array and the row to + // row-block map in the row_block_ids array + const int num_blocks_blockwise = NumBlocks(num_row_blocks + 1); + RowBlockIdAndNNZ<<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>( + num_row_blocks, + row_block_offsets, + cells, + row_blocks, + col_blocks, + rows, + row_block_ids); + // Finalize the row-index array of the CRS structure by computing the + // inclusive prefix sum + thrust::inclusive_scan( + thrust::cuda::par.on(stream), rows, rows + num_rows + 1, rows); + + // Fill the cols array of the CRS structure and the permutation from + // block-sparse to CRS + const int num_blocks_rowwise = NumBlocks(num_rows); + ComputeColumnsAndPermutation<<<num_blocks_rowwise, + kCudaBlockSize, + 0, + stream>>>(num_rows, + row_block_offsets, + cells, + row_blocks, + col_blocks, + row_block_ids, + rows, + cols, + permutation); +} + +// Updates values of the CRS matrix with a contiguous block of values of the +// block-sparse matrix. Since permutation[i] is the index of the i-th +// block-sparse value in the CRS values array, updating values is quite +// efficient when performed element-wise (reads of the permutation and values +// arrays are coalesced when offset is divisible by 16) +__global__ void PermuteValuesKernel(const int offset, + const int num_values, + const int* permutation, + const double* block_sparse_values, + double* crs_values) { + const int value_id = blockIdx.x * blockDim.x + threadIdx.x; + if (value_id < num_values) { + crs_values[permutation[offset + value_id]] = block_sparse_values[value_id]; + } +} + +void PermuteValues(const int offset, + const int num_values, + const int* permutation, + const double* block_sparse_values, + double* crs_values, + cudaStream_t stream) { + const int num_blocks = NumBlocks(num_values); + PermuteValuesKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>( + offset, num_values, permutation, block_sparse_values, crs_values); +} + } // namespace internal } // namespace ceres
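To make the two-kernel pipeline easier to follow, here is a serial CPU sketch of what FillCRSStructure computes (editorial illustration, not part of this change; it folds both kernels and the inclusive scan into plain loops and omits the row_block_ids scratch array, which the GPU version needs only to parallelize the second phase by row):

    // Editorial CPU reference for FillCRSStructure.
    void FillCRSStructureOnCpu(int num_row_blocks, int num_rows,
                               const int* row_block_offsets, const Cell* cells,
                               const Block* row_blocks, const Block* col_blocks,
                               int* rows, int* cols, int* permutation) {
      // Phase 1 (RowBlockIdAndNNZ): per-row non-zero counts, shifted by one.
      rows[0] = 0;
      for (int rb = 0; rb < num_row_blocks; ++rb) {
        int row_nnz = 0;
        for (int c = row_block_offsets[rb]; c < row_block_offsets[rb + 1]; ++c) {
          row_nnz += col_blocks[cells[c].block_id].size;
        }
        for (int i = 0; i < row_blocks[rb].size; ++i) {
          rows[row_blocks[rb].position + i + 1] = row_nnz;
        }
      }
      // Inclusive scan turns the shifted counts into CRS row offsets.
      for (int i = 1; i <= num_rows; ++i) rows[i] += rows[i - 1];
      // Phase 2 (ComputeColumnsAndPermutation): column indices and the
      // block-sparse -> CRS value permutation, row by row.
      for (int rb = 0; rb < num_row_blocks; ++rb) {
        for (int r = 0; r < row_blocks[rb].size; ++r) {
          int crs_position = rows[row_blocks[rb].position + r];
          for (int c = row_block_offsets[rb]; c < row_block_offsets[rb + 1];
               ++c) {
            const Block& col_block = col_blocks[cells[c].block_id];
            int bs_position = cells[c].position + r * col_block.size;
            for (int i = 0; i < col_block.size; ++i, ++crs_position) {
              permutation[bs_position++] = crs_position;
              cols[crs_position] = col_block.position + i;
            }
          }
        }
      }
    }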
diff --git a/internal/ceres/cuda_kernels.h b/internal/ceres/cuda_kernels.h index d347c20..61f945a 100644 --- a/internal/ceres/cuda_kernels.h +++ b/internal/ceres/cuda_kernels.h
@@ -37,7 +37,10 @@ #include "cuda_runtime.h" -namespace ceres::internal { +namespace ceres { +namespace internal { +struct Block; +struct Cell; // Convert an array of double (FP64) values to float (FP32). Both arrays must // already be on GPU memory. @@ -72,7 +75,38 @@ const int size, cudaStream_t stream); -} // namespace ceres::internal +// Compute the structure of the CRS matrix and the permutation of values using +// the block-sparse structure and the temporary array row_block_ids. The array +// row_block_ids of size num_rows will be filled with indices of the +// row-blocks corresponding to the rows of the CRS matrix. The arrays backing +// the CRS matrix, the permutation, and row_block_ids are to be allocated by +// the caller +void FillCRSStructure(const int num_row_blocks, + const int num_rows, + const int* row_block_offsets, + const Cell* cells, + const Block* row_blocks, + const Block* col_blocks, + int* rows, + int* cols, + int* row_block_ids, + int* permutation, + cudaStream_t stream); + +// Permute a block of block-sparse values using the permutation. +// Pointer block_sparse_values corresponds to a block of num_values values of +// the block-sparse matrix at the given offset from the beginning. Pointer +// crs_values corresponds to the values of the CRS matrix. The array +// permutation stores the permutation from block-sparse to CRS order, with +// permutation[i] being the index of the i-th block-sparse value in the CRS +// values array +void PermuteValues(const int offset, + const int num_values, + const int* permutation, + const double* block_sparse_values, + double* crs_values, + cudaStream_t stream); + +} // namespace internal +} // namespace ceres #endif // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc index 32ff55a..41036a3 100644 --- a/internal/ceres/cuda_kernels_test.cc +++ b/internal/ceres/cuda_kernels_test.cc
@@ -57,8 +57,10 @@ fp64_gpu.CopyFromCpuVector(fp64_cpu); CudaBuffer<float> fp32_gpu(&context); fp32_gpu.Reserve(fp64_cpu.size()); - CudaFP64ToFP32( - fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), context.stream_); + CudaFP64ToFP32(fp64_gpu.data(), + fp32_gpu.data(), + fp64_cpu.size(), + context.DefaultStream()); std::vector<float> fp32_cpu(fp64_cpu.size()); fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size()); for (int i = 0; i < fp32_cpu.size(); ++i) { @@ -82,8 +84,10 @@ fp64_gpu.CopyFromCpuVector(fp64_cpu); CudaBuffer<float> fp32_gpu(&context); fp32_gpu.Reserve(fp64_cpu.size()); - CudaFP64ToFP32( - fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), context.stream_); + CudaFP64ToFP32(fp64_gpu.data(), + fp32_gpu.data(), + fp64_cpu.size(), + context.DefaultStream()); std::vector<float> fp32_cpu(fp64_cpu.size()); fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size()); EXPECT_EQ(fp32_cpu[0], 0.0f); @@ -101,8 +105,10 @@ fp32_gpu.CopyFromCpuVector(fp32_cpu); CudaBuffer<double> fp64_gpu(&context); fp64_gpu.Reserve(fp32_cpu.size()); - CudaFP32ToFP64( - fp32_gpu.data(), fp64_gpu.data(), fp32_cpu.size(), context.stream_); + CudaFP32ToFP64(fp32_gpu.data(), + fp64_gpu.data(), + fp32_cpu.size(), + context.DefaultStream()); std::vector<double> fp64_cpu(fp32_cpu.size()); fp64_gpu.CopyToCpu(fp64_cpu.data(), fp64_cpu.size()); for (int i = 0; i < fp64_cpu.size(); ++i) { @@ -117,7 +123,7 @@ std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; CudaBuffer<float> fp32_gpu(&context); fp32_gpu.CopyFromCpuVector(fp32_cpu); - CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), context.stream_); + CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), context.DefaultStream()); std::vector<float> fp32_cpu_zero(fp32_cpu.size()); fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size()); for (int i = 0; i < fp32_cpu_zero.size(); ++i) { @@ -132,7 +138,7 @@ std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; CudaBuffer<double> fp64_gpu(&context); fp64_gpu.CopyFromCpuVector(fp64_cpu); - CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), context.stream_); + CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), context.DefaultStream()); std::vector<double> fp64_cpu_zero(fp64_cpu.size()); fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size()); for (int i = 0; i < fp64_cpu_zero.size(); ++i) { @@ -151,8 +157,10 @@ fp32_gpu_a.CopyFromCpuVector(fp32_cpu_a); CudaBuffer<double> fp64_gpu_b(&context); fp64_gpu_b.CopyFromCpuVector(fp64_cpu_b); - CudaDsxpy( - fp64_gpu_b.data(), fp32_gpu_a.data(), fp32_gpu_a.size(), context.stream_); + CudaDsxpy(fp64_gpu_b.data(), + fp32_gpu_a.data(), + fp32_gpu_a.size(), + context.DefaultStream()); fp64_gpu_b.CopyToCpu(fp64_cpu_b.data(), fp64_cpu_b.size()); for (int i = 0; i < fp64_cpu_b.size(); ++i) { EXPECT_DOUBLE_EQ(fp64_cpu_b[i], 2.0 * fp32_cpu_a[i]); @@ -172,8 +180,11 @@ y_gpu.CopyFromCpuVector(y_cpu); CudaBuffer<double> d_gpu(&context); d_gpu.CopyFromCpuVector(d_cpu); - CudaDtDxpy( - y_gpu.data(), d_gpu.data(), x_gpu.data(), y_gpu.size(), context.stream_); + CudaDtDxpy(y_gpu.data(), + d_gpu.data(), + x_gpu.data(), + y_gpu.size(), + context.DefaultStream()); y_gpu.CopyToCpu(y_cpu.data(), y_cpu.size()); EXPECT_DOUBLE_EQ(y_cpu[0], 4.0 + 10.0 * 10.0 * 1.0); EXPECT_DOUBLE_EQ(y_cpu[1], 3.0 + 20.0 * 20.0 * 2.0);
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc index 7ee1761..0ae8c5d 100644 --- a/internal/ceres/cuda_sparse_matrix.cc +++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -59,6 +59,31 @@ namespace ceres::internal { +CudaSparseMatrix::CudaSparseMatrix(int num_rows, + int num_cols, + int num_nonzeros, + ContextImpl* context) + : context_(context), + rows_(context, num_rows + 1), + cols_(context, num_nonzeros), + values_(context, num_nonzeros), + spmv_buffer_(context), + num_rows_(num_rows), + num_cols_(num_cols), + num_nonzeros_(num_nonzeros) { + cusparseCreateCsr(&descr_, + num_rows_, + num_cols_, + num_nonzeros_, + rows_.data(), + cols_.data(), + values_.data(), + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} + CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) : context_(context),
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h index d5f3332..ad55170 100644 --- a/internal/ceres/cuda_sparse_matrix.h +++ b/internal/ceres/cuda_sparse_matrix.h
@@ -64,6 +64,13 @@ CudaSparseMatrix(ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix); + // Creates a "blank" matrix with an appropriate amount of memory allocated. + // The object is inconsistent until rows, cols, and values are filled in. + CudaSparseMatrix(int num_rows, + int num_cols, + int num_nonzeros, + ContextImpl* context); + ~CudaSparseMatrix(); // y = y + Ax; @@ -75,6 +82,10 @@ int num_cols() const { return num_cols_; } int num_nonzeros() const { return num_nonzeros_; } + int32_t* mutable_rows() { return rows_.data(); } + int32_t* mutable_cols() { return cols_.data(); } + double* mutable_values() { return values_.data(); } + // If subsequent uses of this matrix involve only numerical changes and no // structural changes, then this method can be used to copy the updated // non-zero values -- the row and column index arrays are kept the same. It
diff --git a/internal/ceres/cuda_streamed_buffer.h b/internal/ceres/cuda_streamed_buffer.h new file mode 100644 index 0000000..8761ef4 --- /dev/null +++ b/internal/ceres/cuda_streamed_buffer.h
@@ -0,0 +1,335 @@ +// Ceres Solver - A fast non-linear least squares minimizer +// Copyright 2023 Google Inc. All rights reserved. +// http://ceres-solver.org/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Google Inc. nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin) + +#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_ +#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_ + +#include "ceres/internal/config.h" + +#ifndef CERES_NO_CUDA +#include "ceres/cuda_buffer.h" + +namespace ceres::internal { + +// Most contemporary CUDA devices are capable of simultaneous code execution +// and host-to-device transfers. This class copies batches of data to GPU +// memory and executes processing of the copied data in parallel +// (asynchronously). Data is copied to a fixed-size buffer on the GPU +// (containing at most max_buffer_size values), and this memory is re-used once +// the previous batch of values has been processed by the user-provided +// callback. The host-to-device copy uses a temporary buffer if required. Each +// batch of values has a size of kValuesPerBatch, except possibly the last one. +template <typename T> +class CERES_NO_EXPORT CudaStreamedBuffer { + public: + // If the hardware supports only one concurrent host-to-device copy, or a + // single host-to-device copy is able to reach peak bandwidth, two streams + // are sufficient to reach maximum efficiency: + // - If transferring a batch of values takes more time than processing it on + // the GPU, then at every moment one of the streams will be transferring + // data and the other stream will be either processing data or idle; the + // whole process will be bounded by host-to-device copies. + // - If transferring a batch of values takes less time than processing it on + // the GPU, then at every moment one of the streams will be processing data + // and the other stream will be either performing computations or + // transferring data; the whole process will be bounded by computations. + static constexpr int kNumBatches = 2; + // max_buffer_size is the maximal size (in elements of type T) of the array + // to be pre-allocated in GPU memory.
The size of the array determines the + // size of the batch of values for simultaneous copying and processing. It + // should be large enough to allow highly-parallel execution of user kernels; + // making it too large increases latency. + CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size) + : kValuesPerBatch(max_buffer_size / kNumBatches), + context_(context), + values_gpu_(context, kValuesPerBatch * kNumBatches) { + static_assert(ContextImpl::kNumCudaStreams >= kNumBatches); + CHECK_GE(max_buffer_size, kNumBatches); + // Pre-allocate a buffer of page-locked memory for transfers from regular + // CPU memory. Because we will only be writing into that buffer from the + // CPU, the memory is allocated with the cudaHostAllocWriteCombined flag. + CHECK_EQ(cudaSuccess, + cudaHostAlloc(&values_cpu_pinned_, + sizeof(T) * kValuesPerBatch * kNumBatches, + cudaHostAllocWriteCombined)); + for (auto& e : copy_finished_) { + CHECK_EQ(cudaSuccess, + cudaEventCreateWithFlags(&e, cudaEventDisableTiming)); + } + } + + CudaStreamedBuffer(const CudaStreamedBuffer&) = delete; + + ~CudaStreamedBuffer() { + CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_)); + for (auto& e : copy_finished_) { + CHECK_EQ(cudaSuccess, cudaEventDestroy(e)); + } + } + + // Transfer num_values from the host-memory pointer from, calling + // callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use) + // after scheduling the transfer of each batch of data. The user-provided + // callback should process the data at device_pointer only in the + // stream_to_use stream (device_pointer will be re-used in the next + // callback invocation with the same stream). + // + // The two diagrams below describe operation in the two possible scenarios, + // depending on whether the input data is stored in page-locked memory. In + // this example we will have max_buffer_size = 2 * K, num_values = N * K and + // callback scheduling a single asynchronous launch of + // Kernel<<..., stream_to_use>>(device_pointer, + // size_of_batch, + // offset_of_batch) + // + // a. Copying from page-locked memory + // In this case no host-side copy is necessary, and this method just + // schedules a bunch of interleaved memory copies and callback invocations: + // + // cudaStreamSynchronize(context->DefaultStream()); + // - Iteration #0: + // - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0) + // - callback(values_gpu_, K, 0, stream_0) + // - Iteration #1: + // - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D, + // stream_1) + // - callback(values_gpu_ + K, K, K, stream_1) + // - Iteration #2: + // - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D, + // stream_0) + // - callback(values_gpu_, K, 2 * K, stream_0) + // - Iteration #3: + // - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D, + // stream_1) + // - callback(values_gpu_ + K, K, 3 * K, stream_1) + // ... + // - Iteration #i: + // - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K, K * + // sizeof(T), H->D, stream_(i % 2)) + // - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2)) + // ...
+  //  cudaStreamSynchronize(stream_0)
+  //  cudaStreamSynchronize(stream_1)
+  //
+  // This sequence of calls results in the following activity on the gpu
+  // (assuming that the kernel invoked by the callback takes less time than the
+  // host-to-device copy):
+  //  +-------------------+-------------------+
+  //  |     Stream #0     |     Stream #1     |
+  //  +-------------------+-------------------+
+  //  | Copy host->device |                   |
+  //  |                   |                   |
+  //  |                   |                   |
+  //  +-------------------+-------------------+
+  //  | Kernel            | Copy host->device |
+  //  +-------------------+                   |
+  //  |                   |                   |
+  //  +-------------------+-------------------+
+  //  | Copy host->device | Kernel            |
+  //  |                   +-------------------+
+  //  |                   |                   |
+  //  +-------------------+-------------------+
+  //  | Kernel            | Copy host->device |
+  //  |                  ...                  |
+  //  +---------------------------------------+
+  //
+  // b. Copying from regular memory
+  // In this case a copy from regular memory to page-locked memory is required
+  // in order to get asynchronous operation. Because the pinned memory on the
+  // host side is reused, additional synchronization is required. On each
+  // iteration the method performs the following actions:
+  //  - Wait until the previous copy operation in the stream is completed
+  //  - Copy a batch of values from the input array into pinned memory
+  //  - Asynchronously launch the host-to-device copy
+  //  - Set up an event for synchronization on copy completion
+  //  - Invoke the callback (which launches the kernel asynchronously)
+  //
+  // Invocations are performed with the following arguments:
+  //  cudaStreamSynchronize(context->DefaultStream());
+  //  - Iteration #0:
+  //    - cudaEventSynchronize(copy_finished_0)
+  //    - std::copy_n(from, K, values_cpu_pinned_)
+  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
+  //    stream_0)
+  //    - cudaEventRecord(copy_finished_0, stream_0)
+  //    - callback(values_gpu_, K, 0, stream_0)
+  //  - Iteration #1:
+  //    - cudaEventSynchronize(copy_finished_1)
+  //    - std::copy_n(from + K, K, values_cpu_pinned_ + K)
+  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
+  //    sizeof(T), H->D, stream_1)
+  //    - cudaEventRecord(copy_finished_1, stream_1)
+  //    - callback(values_gpu_ + K, K, K, stream_1)
+  //  - Iteration #2:
+  //    - cudaEventSynchronize(copy_finished_0)
+  //    - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
+  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
+  //    stream_0)
+  //    - cudaEventRecord(copy_finished_0, stream_0)
+  //    - callback(values_gpu_, K, 2 * K, stream_0)
+  //  - Iteration #3:
+  //    - cudaEventSynchronize(copy_finished_1)
+  //    - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
+  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
+  //    sizeof(T), H->D, stream_1)
+  //    - cudaEventRecord(copy_finished_1, stream_1)
+  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
+  //  ...
+  //  - Iteration #i:
+  //    - cudaEventSynchronize(copy_finished_(i % 2))
+  //    - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
+  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, values_cpu_pinned_ + (i %
+  //    2) * K, K * sizeof(T), H->D, stream_(i % 2))
+  //    - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
+  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
+  //  ... 
+  //  cudaStreamSynchronize(stream_0)
+  //  cudaStreamSynchronize(stream_1)
+  //
+  // This sequence of calls results in the following activity on the cpu and
+  // gpu (assuming that the kernel invoked by the callback takes less time than
+  // the host-to-device copy and the copy within cpu memory, and that the copy
+  // within cpu memory is faster than the host-to-device copy):
+  //  +----------------------------+-------------------+-------------------+
+  //  |         CPU thread         |     Stream #0     |     Stream #1     |
+  //  +----------------------------+-------------------+-------------------+
+  //  | Copy to pinned memory      |                   |                   |
+  //  |                            |                   |                   |
+  //  +----------------------------+-------------------|                   |
+  //  | Copy to pinned memory      | Copy host->device |                   |
+  //  |                            |                   |                   |
+  //  +----------------------------+                   |                   |
+  //  | Wait for prev. h->d copy   |                   |                   |
+  //  +----------------------------+-------------------+-------------------+
+  //  | Copy to pinned memory      | Kernel            | Copy host->device |
+  //  |                            +-------------------+                   |
+  //  +----------------------------+                   |                   |
+  //  | Wait for prev. h->d copy   |                   |                   |
+  //  +----------------------------+-------------------+-------------------+
+  //  | Copy to pinned memory      | Copy host->device | Kernel            |
+  //  |                            |                   +-------------------+
+  //  |    ...                                    ...                      |
+  //  +----------------------------+---------------------------------------+
+  //
+  template <typename Fun>
+  void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
+    // This synchronization is not required in some cases, but we perform it
+    // in order to avoid a situation where the user callback depends on data
+    // that has not yet been computed in the default stream.
+    CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));
+
+    // If the pointer to the input data does not correspond to page-locked
+    // memory, the host-to-device memory copy might be executed synchronously
+    // (with a copy to pinned memory happening inside the driver). In that case
+    // we perform a copy to a pre-allocated array of page-locked memory.
+    const bool copy_to_pinned_memory =
+        MemoryTypeResultsInSynchronousCopy(from);
+    T* batch_values_gpu[kNumBatches];
+    T* batch_values_cpu[kNumBatches];
+    auto streams = context_->streams_;
+    for (int i = 0; i < kNumBatches; ++i) {
+      batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
+      batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
+    }
+    int batch_id = 0;
+    for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
+      const int num_values_batch =
+          std::min(num_values - offset, kValuesPerBatch);
+      const T* batch_from = from + offset;
+      T* batch_to = batch_values_gpu[batch_id];
+      auto stream = streams[batch_id];
+      auto copy_finished = copy_finished_[batch_id];
+
+      if (copy_to_pinned_memory) {
+        // Copying values to the temporary buffer should start only after the
+        // previous copy from the temporary buffer to the device is completed.
+        CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
+        std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
+        batch_from = batch_values_cpu[batch_id];
+      }
+      CHECK_EQ(cudaSuccess,
+               cudaMemcpyAsync(batch_to,
+                               batch_from,
+                               sizeof(T) * num_values_batch,
+                               cudaMemcpyHostToDevice,
+                               stream));
+      if (copy_to_pinned_memory) {
+        // The next copy to the temporary buffer can start straight after the
+        // asynchronous copy is completed (and might be started before kernels
+        // asynchronously executed in the stream by the user-supplied callback
+        // are completed). 
+        // No explicit synchronization is required when copying directly from
+        // page-locked memory, because the memory copy and the user kernel
+        // execution touching the corresponding part of the values_gpu_ array
+        // are serialized via the stream.
+        CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
+      }
+      callback(batch_to, num_values_batch, offset, stream);
+      batch_id = (batch_id + 1) % kNumBatches;
+    }
+    // Explicitly synchronize on all CUDA streams that were utilized.
+    for (int i = 0; i < kNumBatches; ++i) {
+      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
+    }
+  }
+
+ private:
+  // All host-to-device copies need to be completely asynchronous. This
+  // requires the source memory to be allocated in page-locked memory.
+  static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
+    cudaPointerAttributes attributes;
+    auto status = cudaPointerGetAttributes(&attributes, ptr);
+#if CUDART_VERSION < 11000
+    // In CUDA versions prior to 11, a call to cudaPointerGetAttributes with a
+    // host pointer returns cudaErrorInvalidValue
+    if (status == cudaErrorInvalidValue) {
+      return true;
+    }
+#endif
+    CHECK_EQ(status, cudaSuccess);
+    // This class only supports cpu memory as a source
+    CHECK_NE(attributes.type, cudaMemoryTypeDevice);
+    // If the host memory was allocated (or registered) with the CUDA API, or
+    // is managed memory, then the call to cudaMemcpyAsync will be
+    // asynchronous. In the case of managed memory it might be slightly better
+    // to perform a single invocation of the user-provided callback (and hope
+    // that page migration provides similar throughput with zero effort on our
+    // side).
+    return attributes.type == cudaMemoryTypeUnregistered;
+  }
+
+  const int kValuesPerBatch;
+  ContextImpl* context_ = nullptr;
+  CudaBuffer<T> values_gpu_;
+  T* values_cpu_pinned_ = nullptr;
+  cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
+};
+
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
+#endif  // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
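For illustration, a minimal usage sketch of the class above (not part of this
change; ScaleKernel, ScaleOnGpu and the chosen buffer size are hypothetical,
and the file would need to be compiled as CUDA):

#include "ceres/context_impl.h"
#include "ceres/cuda_streamed_buffer.h"

namespace ceres::internal {

// Hypothetical kernel: scales a batch of values in-place.
__global__ void ScaleKernel(double* values, int size, double scale) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size) values[i] *= scale;
}

// Streams n values from host memory through the double-buffer and scales them
// on the GPU. When CopyToGpu returns, all streams have been synchronized.
void ScaleOnGpu(ContextImpl* context, const double* host_values, int n) {
  CudaStreamedBuffer<double> buffer(context, /* max_buffer_size = */ 1 << 20);
  buffer.CopyToGpu(
      host_values,
      n,
      [](double* device_pointer, int size, int /* offset */,
         cudaStream_t stream) {
        // Launch work only in the stream supplied to the callback.
        constexpr int kThreadsPerBlock = 256;
        const int num_blocks = (size + kThreadsPerBlock - 1) / kThreadsPerBlock;
        ScaleKernel<<<num_blocks, kThreadsPerBlock, 0, stream>>>(
            device_pointer, size, 2.0);
      });
}

}  // namespace ceres::internal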
diff --git a/internal/ceres/cuda_streamed_buffer_test.cc b/internal/ceres/cuda_streamed_buffer_test.cc new file mode 100644 index 0000000..4837005 --- /dev/null +++ b/internal/ceres/cuda_streamed_buffer_test.cc
@@ -0,0 +1,169 @@ +// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2023 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
+
+#include "ceres/internal/config.h"
+
+#ifndef CERES_NO_CUDA
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <numeric>
+
+#include "ceres/cuda_streamed_buffer.h"
+
+namespace ceres::internal {
+
+TEST(CudaStreamedBufferTest, IntegerCopy) {
+  // Offsets and sizes of batches supplied to the callback
+  std::vector<std::pair<int, int>> batches;
+  const int kMaxTemporaryArraySize = 16;
+  const int kInputSize = kMaxTemporaryArraySize * 7 + 3;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
+
+  std::vector<int> inputs(kInputSize);
+  std::vector<int> outputs(kInputSize, -1);
+  std::iota(inputs.begin(), inputs.end(), 0);
+
+  CudaStreamedBuffer<int> streamed_buffer(&context, kMaxTemporaryArraySize);
+  streamed_buffer.CopyToGpu(inputs.data(),
+                            kInputSize,
+                            [&outputs, &batches](const int* device_pointer,
+                                                 int size,
+                                                 int offset,
+                                                 cudaStream_t stream) {
+                              batches.emplace_back(offset, size);
+                              CHECK_EQ(cudaSuccess,
+                                       cudaMemcpyAsync(outputs.data() + offset,
+                                                       device_pointer,
+                                                       sizeof(int) * size,
+                                                       cudaMemcpyDeviceToHost,
+                                                       stream));
+                            });
+  // All operations in all streams should be completed when CopyToGpu returns
+  // control to the caller
+  for (int i = 0; i < ContextImpl::kNumCudaStreams; ++i) {
+    CHECK_EQ(cudaSuccess, cudaStreamQuery(context.streams_[i]));
+  }
+
+  // Check that every element was visited
+  for (int i = 0; i < kInputSize; ++i) {
+    CHECK_EQ(outputs[i], i);
+  }
+
+  // Check that there is no overlap between batches
+  std::sort(batches.begin(), batches.end());
+  const int num_batches = batches.size();
+  for (int i = 0; i < num_batches; ++i) {
+    const auto [begin, size] = batches[i];
+    const int end = begin + size;
+    CHECK_GE(begin, 0);
+    CHECK_LT(begin, kInputSize);
+
+    CHECK_GT(size, 0);
+    
CHECK_LE(end, kInputSize);
+
+    if (i + 1 == num_batches) continue;
+    CHECK_EQ(end, batches[i + 1].first);
+  }
+}
+
+TEST(CudaStreamedBufferTest, IntegerNoCopy) {
+  // Offsets and sizes of batches supplied to the callback
+  std::vector<std::pair<int, int>> batches;
+  const int kMaxTemporaryArraySize = 16;
+  const int kInputSize = kMaxTemporaryArraySize * 7 + 3;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCuda(&message)) << "InitCuda() failed because: " << message;
+
+  int* inputs;
+  int* outputs;
+  CHECK_EQ(cudaSuccess,
+           cudaHostAlloc(
+               &inputs, sizeof(int) * kInputSize, cudaHostAllocWriteCombined));
+  CHECK_EQ(
+      cudaSuccess,
+      cudaHostAlloc(&outputs, sizeof(int) * kInputSize, cudaHostAllocDefault));
+
+  std::fill(outputs, outputs + kInputSize, -1);
+  std::iota(inputs, inputs + kInputSize, 0);
+
+  CudaStreamedBuffer<int> streamed_buffer(&context, kMaxTemporaryArraySize);
+  streamed_buffer.CopyToGpu(inputs,
+                            kInputSize,
+                            [outputs, &batches](const int* device_pointer,
+                                                int size,
+                                                int offset,
+                                                cudaStream_t stream) {
+                              batches.emplace_back(offset, size);
+                              CHECK_EQ(cudaSuccess,
+                                       cudaMemcpyAsync(outputs + offset,
+                                                       device_pointer,
+                                                       sizeof(int) * size,
+                                                       cudaMemcpyDeviceToHost,
+                                                       stream));
+                            });
+  // All operations in all streams should be completed when CopyToGpu returns
+  // control to the caller
+  for (int i = 0; i < ContextImpl::kNumCudaStreams; ++i) {
+    CHECK_EQ(cudaSuccess, cudaStreamQuery(context.streams_[i]));
+  }
+
+  // Check that every element was visited
+  for (int i = 0; i < kInputSize; ++i) {
+    CHECK_EQ(outputs[i], i);
+  }
+
+  // Check that there is no overlap between batches
+  std::sort(batches.begin(), batches.end());
+  const int num_batches = batches.size();
+  for (int i = 0; i < num_batches; ++i) {
+    const auto [begin, size] = batches[i];
+    const int end = begin + size;
+    CHECK_GE(begin, 0);
+    CHECK_LT(begin, kInputSize);
+
+    CHECK_GT(size, 0);
+    CHECK_LE(end, kInputSize);
+
+    if (i + 1 == num_batches) continue;
+    CHECK_EQ(end, batches[i + 1].first);
+  }
+
+  CHECK_EQ(cudaSuccess, cudaFreeHost(inputs));
+  CHECK_EQ(cudaSuccess, cudaFreeHost(outputs));
+}
+
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
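Both the tests above and the stream_-to-DefaultStream() migration in the hunks
below presume ContextImpl members along the following lines (a sketch only:
the exact kNumCudaStreams value and the body of DefaultStream() are
assumptions here, constrained by the static_assert in cuda_streamed_buffer.h
and by the initialization code in context_impl.cc above):

class ContextImpl {
 public:
  // Must be >= CudaStreamedBuffer::kNumBatches; the exact value is assumed.
  static constexpr int kNumCudaStreams = 2;
  // The stream that the cuSOLVER/cuBLAS/cuSPARSE handles are bound to; all
  // former users of the single stream_ member now call DefaultStream().
  cudaStream_t DefaultStream() { return streams_[0]; }
  cudaStream_t streams_[kNumCudaStreams] = {nullptr};
};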
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc index 274a47a..e2143de 100644 --- a/internal/ceres/cuda_vector.cc +++ b/internal/ceres/cuda_vector.cc
@@ -127,7 +127,7 @@ void CudaVector::SetZero() { CHECK(data_.data() != nullptr); - CudaSetZeroFP64(data_.data(), num_rows_, context_->stream_); + CudaSetZeroFP64(data_.data(), num_rows_, context_->DefaultStream()); } void CudaVector::Axpby(double a, const CudaVector& x, double b) { @@ -160,7 +160,7 @@ D.data().data(), x.data().data(), num_rows_, - context_->stream_); + context_->DefaultStream()); } void CudaVector::Scale(double s) {
diff --git a/internal/ceres/dense_cholesky.cc b/internal/ceres/dense_cholesky.cc index 93e9f0d..f19ca7e 100644 --- a/internal/ceres/dense_cholesky.cc +++ b/internal/ceres/dense_cholesky.cc
@@ -516,7 +516,7 @@ residual_fp32_.data(), num_cols_ * sizeof(float), cudaMemcpyDeviceToDevice, - context_->stream_), + context_->DefaultStream()), cudaSuccess); if (cusolverDnSpotrs(context_->cusolver_handle_, CUBLAS_FILL_MODE_LOWER, @@ -569,7 +569,7 @@ CudaFP64ToFP32(lhs_fp64_.data(), lhs_fp32_.data(), num_cols * num_cols, - context_->stream_); + context_->DefaultStream()); // Factorize lhs_fp32. factorize_result_ = CudaCholeskyFactorize(message); @@ -592,7 +592,7 @@ residual_fp64_.Reserve(num_cols_); // Initialize x = 0. - CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->stream_); + CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->DefaultStream()); // Initialize residual = rhs. rhs_fp64_.CopyFromCpu(rhs, num_cols_); @@ -603,15 +603,17 @@ CudaFP64ToFP32(residual_fp64_.data(), residual_fp32_.data(), num_cols_, - context_->stream_); + context_->DefaultStream()); // [fp32] c = lhs^-1 * residual. auto result = CudaCholeskySolve(message); if (result != LinearSolverTerminationType::SUCCESS) { return result; } // [fp64] x += c. - CudaDsxpy( - x_fp64_.data(), correction_fp32_.data(), num_cols_, context_->stream_); + CudaDsxpy(x_fp64_.data(), + correction_fp32_.data(), + num_cols_, + context_->DefaultStream()); if (i < max_num_refinement_iterations_) { // [fp64] residual = rhs - lhs * x // This is done in two steps:
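All of these hunks sit inside the mixed-precision refinement loop of the CUDA
dense Cholesky solver: factorize and solve in fp32, accumulate the solution
and recompute residuals in fp64. A self-contained CPU analogue of the same
scheme, using Eigen, may help when reading them (an illustrative sketch of the
numerical method, not Ceres code):

#include <Eigen/Dense>

// Mixed-precision iterative refinement: the expensive factorization and
// triangular solves run in fp32, while the solution x and the residual
// rhs - lhs * x are maintained in fp64.
Eigen::VectorXd MixedPrecisionSolve(const Eigen::MatrixXd& lhs,
                                    const Eigen::VectorXd& rhs,
                                    const int max_num_refinement_iterations) {
  const Eigen::LLT<Eigen::MatrixXf> llt(lhs.cast<float>());
  Eigen::VectorXd x = Eigen::VectorXd::Zero(rhs.size());
  Eigen::VectorXd residual = rhs;
  for (int i = 0; i <= max_num_refinement_iterations; ++i) {
    // [fp32] c = lhs^-1 * residual.
    const Eigen::VectorXf c = llt.solve(residual.cast<float>());
    // [fp64] x += c.
    x += c.cast<double>();
    if (i < max_num_refinement_iterations) {
      // [fp64] residual = rhs - lhs * x.
      residual = rhs - lhs * x;
    }
  }
  return x;
}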
diff --git a/internal/ceres/evaluation_benchmark.cc b/internal/ceres/evaluation_benchmark.cc index ae1a65d..2ba03ed 100644 --- a/internal/ceres/evaluation_benchmark.cc +++ b/internal/ceres/evaluation_benchmark.cc
@@ -36,6 +36,7 @@
 #include "benchmark/benchmark.h"
 #include "ceres/block_sparse_matrix.h"
 #include "ceres/bundle_adjustment_test_util.h"
+#include "ceres/cuda_block_sparse_crs_view.h"
 #include "ceres/cuda_sparse_matrix.h"
 #include "ceres/cuda_vector.h"
 #include "ceres/evaluator.h"
@@ -427,6 +428,75 @@
 CHECK_GT(y.squaredNorm(), 0.);
 }
 
+static void JacobianToCRS(benchmark::State& state,
+                          BALData* data,
+                          ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  std::unique_ptr<CompressedRowSparseMatrix> matrix;
+  for (auto _ : state) {
+    matrix = jacobian->ToCompressedRowSparseMatrix();
+  }
+  CHECK(matrix != nullptr);
+}
+
+#ifndef CERES_NO_CUDA
+// We want CudaBlockSparseCRSView to be no slower than an explicit conversion
+// to CRS on the CPU
+static void JacobianToCRSView(benchmark::State& state,
+                              BALData* data,
+                              ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  std::unique_ptr<CudaBlockSparseCRSView> matrix;
+  for (auto _ : state) {
+    matrix = std::make_unique<CudaBlockSparseCRSView>(*jacobian, context);
+  }
+  CHECK(matrix != nullptr);
+}
+static void JacobianToCRSMatrix(benchmark::State& state,
+                                BALData* data,
+                                ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  std::unique_ptr<CudaSparseMatrix> matrix;
+  std::unique_ptr<CompressedRowSparseMatrix> matrix_cpu;
+  for (auto _ : state) {
+    matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
+    matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
+  }
+  CHECK(matrix != nullptr);
+}
+// Updating values in CudaBlockSparseCRSView should be roughly as fast as just
+// copying the values (time spent permuting values has to be hidden by the
+// PCIe transfer)
+static void JacobianToCRSViewUpdate(benchmark::State& state,
+                                    BALData* data,
+                                    ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  auto matrix = CudaBlockSparseCRSView(*jacobian, context);
+  for (auto _ : state) {
+    matrix.UpdateValues(*jacobian);
+  }
+}
+static void JacobianToCRSMatrixUpdate(benchmark::State& state,
+                                      BALData* data,
+                                      ContextImpl* context) {
+  auto jacobian = data->BlockSparseJacobian(context);
+
+  auto matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
+  auto matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
+  for (auto _ : state) {
+    CHECK_EQ(cudaSuccess,
+             cudaMemcpy(matrix->mutable_values(),
+                        matrix_cpu->values(),
+                        matrix->num_nonzeros() * sizeof(double),
+                        cudaMemcpyHostToDevice));
+  }
+}
+#endif
+
 static void JacobianSquaredColumnNorm(benchmark::State& state,
                                       BALData* data,
                                       ContextImpl* context) {
@@ -772,6 +842,34 @@
         ->Arg(4)
         ->Arg(8)
        ->Arg(16);
+
+    const std::string name_to_crs = "JacobianToCRS<" + path + ">";
+    ::benchmark::RegisterBenchmark(
+        name_to_crs.c_str(), ceres::internal::JacobianToCRS, data, &context);
+#ifndef CERES_NO_CUDA
+    const std::string name_to_crs_view = "JacobianToCRSView<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_to_crs_view.c_str(),
+                                   ceres::internal::JacobianToCRSView,
+                                   data,
+                                   &context);
+    const std::string name_to_crs_matrix = "JacobianToCRSMatrix<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_to_crs_matrix.c_str(),
+                                   ceres::internal::JacobianToCRSMatrix,
+                                   data,
+                                   &context);
+    const std::string name_to_crs_view_update =
+        "JacobianToCRSViewUpdate<" + path + ">";
+    ::benchmark::RegisterBenchmark(name_to_crs_view_update.c_str(),
+                                   ceres::internal::JacobianToCRSViewUpdate,
+                                   data,
+                                   &context);
+    const std::string name_to_crs_matrix_update =
+        "JacobianToCRSMatrixUpdate<" + path + 
">"; + ::benchmark::RegisterBenchmark(name_to_crs_matrix_update.c_str(), + ceres::internal::JacobianToCRSMatrixUpdate, + data, + &context); +#endif } ::benchmark::RunSpecifiedBenchmarks();