// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_

#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
namespace ceres::internal {

// Most contemporary CUDA devices are capable of executing code and performing
// host-to-device transfers simultaneously. This class copies batches of data
// to GPU memory and processes the copied data in parallel (asynchronously).
// Data is copied into a fixed-size buffer on the GPU (containing at most
// max_buffer_size values), and this memory is re-used once the previous
// batch of values has been processed by the user-provided callback.
// The host-to-device copy uses a temporary buffer if required. Each batch of
// values has a size of kValuesPerBatch, except possibly the last one.
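//
// A minimal construction sketch (illustrative only; a valid ContextImpl* is
// assumed). With max_buffer_size = 1024 * 1024 the device buffer is split
// into kNumBatches = 2 batches of 512 * 1024 values each:
//
//   CudaStreamedBuffer<double> streamed_buffer(context, 1024 * 1024);
//
// See CopyToGpu() below for how batches are handed to a user callback.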
template <typename T>
class CERES_NO_EXPORT CudaStreamedBuffer {
 public:
  // If the hardware supports only one concurrent host-to-device copy, or a
  // single host-to-device copy is able to reach peak bandwidth, two streams
  // are sufficient to reach maximum efficiency:
  //  - If transferring a batch of values takes more time than processing it
  //  on the GPU, then at every moment of time one of the streams will be
  //  transferring data and the other stream will be either processing data or
  //  idle; the whole process will be bounded by the host-to-device copy.
  //  - If transferring a batch of values takes less time than processing it
  //  on the GPU, then at every moment of time one of the streams will be
  //  processing data and the other stream will be either performing
  //  computations or transferring data; the whole process will be bounded by
  //  the computations.
  static constexpr int kNumBatches = 2;
  // max_buffer_size is the maximal size (in elements of type T) of the array
  // to be pre-allocated in GPU memory. The size of this array determines the
  // size of the batches used for simultaneous copying and processing. It
  // should be large enough to allow highly-parallel execution of user
  // kernels; making it too large increases latency.
  CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
      : kValuesPerBatch(max_buffer_size / kNumBatches),
        context_(context),
        values_gpu_(context, kValuesPerBatch * kNumBatches) {
    static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
    CHECK_GE(max_buffer_size, kNumBatches);
    // Pre-allocate a buffer of page-locked memory for transfers from regular
    // CPU memory. Because we will only be writing into that buffer from the
    // CPU, the memory is allocated with the cudaHostAllocWriteCombined flag.
    CHECK_EQ(cudaSuccess,
             cudaHostAlloc(&values_cpu_pinned_,
                           sizeof(T) * kValuesPerBatch * kNumBatches,
                           cudaHostAllocWriteCombined));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess,
               cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
    }
  }

  CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;

  ~CudaStreamedBuffer() {
    CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
    }
  }

  // Transfers num_values values starting at host-memory pointer from, calling
  // callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use)
  // after scheduling the transfer of each batch of data. The user-provided
  // callback should process the data at device_pointer only in the
  // stream_to_use stream (device_pointer will be re-used in the next
  // callback invocation with the same stream); a usage sketch follows the
  // diagrams below.
  //
  // The two diagrams below describe operation in the two possible scenarios,
  // depending on whether the input data is stored in page-locked memory. In
  // this example we will have max_buffer_size = 2 * K, num_values = N * K and
  // a callback scheduling a single asynchronous launch of
  // Kernel<<..., stream_to_use>>(device_pointer,
  //                              size_of_batch,
  //                              offset_of_batch)
  //
  // a. Copying from page-locked memory
  // In this case no host-side copy is necessary, and this method just
  // schedules a sequence of interleaved memory copies and callback
  // invocations:
  //
  //  cudaStreamSynchronize(context->DefaultStream());
  //  - Iteration #0:
  //    - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0)
  //    - callback(values_gpu_, K, 0, stream_0)
  //  - Iteration #1:
  //    - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D,
  //    stream_1)
  //    - callback(values_gpu_ + K, K, K, stream_1)
  //  - Iteration #2:
  //    - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D,
  //    stream_0)
  //    - callback(values_gpu_, K, 2 * K, stream_0)
  //  - Iteration #3:
  //    - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D,
  //    stream_1)
  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //  ...
  //  - Iteration #i:
  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K,
  //      K * sizeof(T), H->D, stream_(i % 2))
  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //  ...
  //  cudaStreamSynchronize(stream_0)
  //  cudaStreamSynchronize(stream_1)
  //
  //  This sequence of calls results in the following activity on the GPU
  //  (assuming that the kernel invoked by the callback takes less time than
  //  the host-to-device copy):
  //  +-------------------+-------------------+
  //  | Stream #0         | Stream #1         |
  //  +-------------------+-------------------+
  //  | Copy host->device |                   |
  //  |                   |                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  +-------------------+                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Copy host->device | Kernel            |
  //  |                   +-------------------+
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  |                  ...                  |
  //  +---------------------------------------+
  //
  // b. Copying from regular memory
  // In this case a copy from regular memory to page-locked memory is required
  // in order to get asynchronous operation. Because the pinned memory on the
  // host side is re-used, additional synchronization is required. On each
  // iteration the method performs the following actions:
  //  - Wait until the previous copy operation in the stream has completed
  //  - Copy a batch of values from the input array into pinned memory
  //  - Asynchronously launch the host-to-device copy
  //  - Set up an event for synchronization on copy completion
  //  - Invoke the callback (which launches the kernel asynchronously)
  //
  //  The invocations are performed with the following arguments:
  //  cudaStreamSynchronize(context->DefaultStream());
  //  - Iteration #0:
  //    - cudaEventSynchronize(copy_finished_0)
  //    - std::copy_n(from, K, values_cpu_pinned_)
  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
  //    stream_0)
  //    - cudaEventRecord(copy_finished_0, stream_0)
  //    - callback(values_gpu_, K, 0, stream_0)
  //  - Iteration #1:
  //    - cudaEventSynchronize(copy_finished_1)
  //    - std::copy_n(from + K, K, values_cpu_pinned_ + K)
  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
  //    sizeof(T), H->D, stream_1)
  //    - cudaEventRecord(copy_finished_1, stream_1)
  //    - callback(values_gpu_ + K, K, K, stream_1)
  //  - Iteration #2:
  //    - cudaEventSynchronize(copy_finished_0)
  //    - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
  //    stream_0)
  //    - cudaEventRecord(copy_finished_0, stream_0)
  //    - callback(values_gpu_, K, 2 * K, stream_0)
  //  - Iteration #3:
  //    - cudaEventSynchronize(copy_finished_1)
  //    - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
  //    sizeof(T), H->D, stream_1)
  //    - cudaEventRecord(copy_finished_1, stream_1)
  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //  ...
  //  - Iteration #i:
  //    - cudaEventSynchronize(copy_finished_(i % 2))
  //    - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, values_cpu_pinned_ + (i %
  //    2) * K, K * sizeof(T), H->D, stream_(i % 2))
  //    - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //  ...
  //  cudaStreamSynchronize(stream_0)
  //  cudaStreamSynchronize(stream_1)
  //
  //  This sequence of calls results in the following activity on the CPU and
  //  GPU (assuming that the kernel invoked by the callback takes less time
  //  than the host-to-device copy and the copy in CPU memory, and that the
  //  copy in CPU memory is faster than the host-to-device copy):
  //  +----------------------------+-------------------+-------------------+
  //  | CPU                        | Stream #0         | Stream #1         |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      |                   |                   |
  //  |                            |                   |                   |
  //  +----------------------------+-------------------|                   |
  //  | Copy to pinned memory      | Copy host->device |                   |
  //  |                            |                   |                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Kernel            | Copy host->device |
  //  |                            +-------------------+                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Copy host->device | Kernel            |
  //  |                            |                   +-------------------+
  //  |                           ...                 ...                  |
  //  +----------------------------+---------------------------------------+
  //
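  //  A usage sketch corresponding to the diagrams above (illustrative only;
  //  streamed_buffer, ScaleKernel, host_values and num_values are
  //  hypothetical names, not part of this API):
  //
  //    streamed_buffer.CopyToGpu(
  //        host_values, num_values,
  //        [](const double* device_pointer,
  //           int size_of_batch,
  //           int offset_of_batch,
  //           cudaStream_t stream_to_use) {
  //          // Only touch this batch, and only in stream_to_use.
  //          const int num_blocks = (size_of_batch + 255) / 256;
  //          ScaleKernel<<<num_blocks, 256, 0, stream_to_use>>>(
  //              device_pointer, size_of_batch, offset_of_batch);
  //        });
  //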
  template <typename Fun>
  void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
    // This synchronization is not required in some cases, but we perform it
    // in order to avoid the situation where the user callback depends on data
    // that is still being computed in the default stream.
    CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));

    // If the pointer to the input data does not correspond to page-locked
    // memory, the host-to-device memory copy might be executed synchronously
    // (with a copy to pinned memory happening inside the driver). In that
    // case we first copy the data into a pre-allocated array of page-locked
    // memory.
    const bool copy_to_pinned_memory =
        MemoryTypeResultsInSynchronousCopy(from);
    T* batch_values_gpu[kNumBatches];
    T* batch_values_cpu[kNumBatches];
    auto streams = context_->streams_;
    for (int i = 0; i < kNumBatches; ++i) {
      batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
      batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
    }
    int batch_id = 0;
    for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
      const int num_values_batch =
          std::min(num_values - offset, kValuesPerBatch);
      const T* batch_from = from + offset;
      T* batch_to = batch_values_gpu[batch_id];
      auto stream = streams[batch_id];
      auto copy_finished = copy_finished_[batch_id];

      if (copy_to_pinned_memory) {
        // Copying values into the temporary buffer should only start after
        // the previous copy from the temporary buffer to the device has
        // completed.
        CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
        std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
        batch_from = batch_values_cpu[batch_id];
      }
      CHECK_EQ(cudaSuccess,
               cudaMemcpyAsync(batch_to,
                               batch_from,
                               sizeof(T) * num_values_batch,
                               cudaMemcpyHostToDevice,
                               stream));
      if (copy_to_pinned_memory) {
        // The next copy into the temporary buffer can start right after the
        // asynchronous copy has completed (and might start before the kernels
        // asynchronously executed in the stream by the user-supplied callback
        // have completed).
        // When copying directly from page-locked memory no explicit
        // synchronization is required, because the memory copy and the user
        // kernel execution on the corresponding part of the values_gpu_ array
        // are serialized by the stream.
        CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
      }
      callback(batch_to, num_values_batch, offset, stream);
      batch_id = (batch_id + 1) % kNumBatches;
    }
    // Explicitly synchronize on all CUDA streams that were utilized.
    for (int i = 0; i < kNumBatches; ++i) {
      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
    }
  }

 private:
  // All host-to-device copies need to be completely asynchronous, which
  // requires the source memory to be page-locked.
  static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
    cudaPointerAttributes attributes;
    auto status = cudaPointerGetAttributes(&attributes, ptr);
#if CUDART_VERSION < 11000
    // In CUDA versions prior to 11, a call to cudaPointerGetAttributes with a
    // host pointer returns cudaErrorInvalidValue.
    if (status == cudaErrorInvalidValue) {
      return true;
    }
#endif
    CHECK_EQ(status, cudaSuccess);
    // This class only supports CPU memory as a source.
    CHECK_NE(attributes.type, cudaMemoryTypeDevice);
    // If the host memory was allocated (or registered) with the CUDA API, or
    // is managed memory, then the call to cudaMemcpyAsync will be
    // asynchronous. In the case of managed memory it might be slightly better
    // to perform a single invocation of the user-provided callback (and hope
    // that page migration provides similar throughput with zero effort on our
    // side).
    return attributes.type == cudaMemoryTypeUnregistered;
  }
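  // Note: a caller that repeatedly streams from the same regular (pageable)
  // host allocation could, in principle, avoid the staging copy in
  // CopyToGpu() by page-locking that allocation itself; after registration,
  // cudaPointerGetAttributes should no longer report the memory as
  // cudaMemoryTypeUnregistered. An illustrative sketch (host_ptr and
  // num_values are hypothetical):
  //
  //   CHECK_EQ(cudaSuccess,
  //            cudaHostRegister(host_ptr, num_values * sizeof(T),
  //                             cudaHostRegisterDefault));
  //   // ... CopyToGpu() now takes the page-locked path ...
  //   CHECK_EQ(cudaSuccess, cudaHostUnregister(host_ptr));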

  const int kValuesPerBatch;
  ContextImpl* context_ = nullptr;
  CudaBuffer<T> values_gpu_;
  T* values_cpu_pinned_ = nullptr;
  cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
};

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA
#endif  // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_