// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_

#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
namespace ceres::internal {

// Most contemporary CUDA devices are capable of executing code and performing
// host-to-device transfers simultaneously. This class copies batches of data
// to GPU memory and processes the copied data in parallel (asynchronously).
// Data is copied into a fixed-size buffer on the GPU (containing at most
// max_buffer_size values), and this memory is re-used once the previous
// batch of values has been processed by the user-provided callback.
// The host-to-device copy uses a temporary buffer if required. Each batch of
// values has a size of kValuesPerBatch, except possibly the last one.
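//
// A minimal construction sketch (illustrative only; a valid ContextImpl* is
// assumed). With max_buffer_size = 1024 * 1024 the device buffer is split
// into kNumBatches = 2 batches of 512 * 1024 values each:
//
//   CudaStreamedBuffer<double> streamed_buffer(context, 1024 * 1024);
//
// See CopyToGpu() below for how batches are handed to a user callback.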
template <typename T>
class CERES_NO_EXPORT CudaStreamedBuffer {
 public:
  // If the hardware supports only one concurrent host-to-device copy, or a
  // single host-to-device copy is able to reach peak bandwidth, two streams
  // are sufficient to reach maximum efficiency:
  //  - If transferring a batch of values takes more time than processing it
  //  on the GPU, then at every moment of time one of the streams will be
  //  transferring data and the other stream will be either processing data or
  //  idle; the whole process will be bounded by the host-to-device copy.
  //  - If transferring a batch of values takes less time than processing it
  //  on the GPU, then at every moment of time one of the streams will be
  //  processing data and the other stream will be either performing
  //  computations or transferring data; the whole process will be bounded by
  //  the computations.
  static constexpr int kNumBatches = 2;
  // max_buffer_size is the maximal size (in elements of type T) of the array
  // to be pre-allocated in GPU memory. The size of this array determines the
  // size of the batches used for simultaneous copying and processing. It
  // should be large enough to allow highly-parallel execution of user
  // kernels; making it too large increases latency.
  CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
      : kValuesPerBatch(max_buffer_size / kNumBatches),
        context_(context),
        values_gpu_(context, kValuesPerBatch * kNumBatches) {
    static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
    CHECK_GE(max_buffer_size, kNumBatches);
    // Pre-allocate a buffer of page-locked memory for transfers from regular
    // CPU memory. Because we will only be writing into that buffer from the
    // CPU, the memory is allocated with the cudaHostAllocWriteCombined flag.
    CHECK_EQ(cudaSuccess,
             cudaHostAlloc(&values_cpu_pinned_,
                           sizeof(T) * kValuesPerBatch * kNumBatches,
                           cudaHostAllocWriteCombined));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess,
               cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
    }
  }

  CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;

  ~CudaStreamedBuffer() {
    CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
    for (auto& e : copy_finished_) {
      CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
    }
  }

  // Transfers num_values values starting at host-memory pointer from, calling
  // callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use)
  // after scheduling the transfer of each batch of data. The user-provided
  // callback should process the data at device_pointer only in the
  // stream_to_use stream (device_pointer will be re-used in the next
  // callback invocation with the same stream); a usage sketch follows the
  // diagrams below.
  //
  // The two diagrams below describe operation in the two possible scenarios,
  // depending on whether the input data is stored in page-locked memory. In
  // this example we will have max_buffer_size = 2 * K, num_values = N * K and
  // a callback scheduling a single asynchronous launch of
  // Kernel<<..., stream_to_use>>(device_pointer,
  //                              size_of_batch,
  //                              offset_of_batch)
  //
  // a. Copying from page-locked memory
  // In this case no host-side copy is necessary, and this method just
  // schedules a sequence of interleaved memory copies and callback
  // invocations:
  //
  //  cudaStreamSynchronize(context->DefaultStream());
  //  - Iteration #0:
  //    - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0)
  //    - callback(values_gpu_, K, 0, stream_0)
  //  - Iteration #1:
  //    - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D,
  //    stream_1)
  //    - callback(values_gpu_ + K, K, K, stream_1)
  //  - Iteration #2:
  //    - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D,
  //    stream_0)
  //    - callback(values_gpu_, K, 2 * K, stream_0)
  //  - Iteration #3:
  //    - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D,
  //    stream_1)
  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //  ...
  //  - Iteration #i:
  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K,
  //      K * sizeof(T), H->D, stream_(i % 2))
  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //  ...
  //  cudaStreamSynchronize(stream_0)
  //  cudaStreamSynchronize(stream_1)
  //
  //  This sequence of calls results in the following activity on the GPU
  //  (assuming that the kernel invoked by the callback takes less time than
  //  the host-to-device copy):
  //  +-------------------+-------------------+
  //  | Stream #0         | Stream #1         |
  //  +-------------------+-------------------+
  //  | Copy host->device |                   |
  //  |                   |                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  +-------------------+                   |
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Copy host->device | Kernel            |
  //  |                   +-------------------+
  //  |                   |                   |
  //  +-------------------+-------------------+
  //  | Kernel            | Copy host->device |
  //  |                  ...                  |
  //  +---------------------------------------+
  //
  // b. Copying from regular memory
  // In this case a copy from regular memory to page-locked memory is required
  // in order to get asynchronous operation. Because the pinned memory on the
  // host side is re-used, additional synchronization is required. On each
  // iteration the method performs the following actions:
  //  - Wait until the previous copy operation in the stream has completed
  //  - Copy a batch of values from the input array into pinned memory
  //  - Asynchronously launch the host-to-device copy
  //  - Set up an event for synchronization on copy completion
  //  - Invoke the callback (which launches the kernel asynchronously)
  //
  //  The invocations are performed with the following arguments:
  //  cudaStreamSynchronize(context->DefaultStream());
  //  - Iteration #0:
  //    - cudaEventSynchronize(copy_finished_0)
  //    - std::copy_n(from, K, values_cpu_pinned_)
  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
  //    stream_0)
  //    - cudaEventRecord(copy_finished_0, stream_0)
  //    - callback(values_gpu_, K, 0, stream_0)
  //  - Iteration #1:
  //    - cudaEventSynchronize(copy_finished_1)
  //    - std::copy_n(from + K, K, values_cpu_pinned_ + K)
  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
  //    sizeof(T), H->D, stream_1)
  //    - cudaEventRecord(copy_finished_1, stream_1)
  //    - callback(values_gpu_ + K, K, K, stream_1)
  //  - Iteration #2:
  //    - cudaEventSynchronize(copy_finished_0)
  //    - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
  //    - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
  //    stream_0)
  //    - cudaEventRecord(copy_finished_0, stream_0)
  //    - callback(values_gpu_, K, 2 * K, stream_0)
  //  - Iteration #3:
  //    - cudaEventSynchronize(copy_finished_1)
  //    - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
  //    - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
  //    sizeof(T), H->D, stream_1)
  //    - cudaEventRecord(copy_finished_1, stream_1)
  //    - callback(values_gpu_ + K, K, 3 * K, stream_1)
  //  ...
  //  - Iteration #i:
  //    - cudaEventSynchronize(copy_finished_(i % 2))
  //    - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
  //    - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, values_cpu_pinned_ + (i %
  //    2) * K, K * sizeof(T), H->D, stream_(i % 2))
  //    - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
  //    - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
  //  ...
  //  cudaStreamSynchronize(stream_0)
  //  cudaStreamSynchronize(stream_1)
  //
  //  This sequence of calls results in the following activity on the CPU and
  //  GPU (assuming that the kernel invoked by the callback takes less time
  //  than the host-to-device copy and the copy in CPU memory, and that the
  //  copy in CPU memory is faster than the host-to-device copy):
  //  +----------------------------+-------------------+-------------------+
  //  | CPU                        | Stream #0         | Stream #1         |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      |                   |                   |
  //  |                            |                   |                   |
  //  +----------------------------+-------------------|                   |
  //  | Copy to pinned memory      | Copy host->device |                   |
  //  |                            |                   |                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Kernel            | Copy host->device |
  //  |                            +-------------------+                   |
  //  +----------------------------+                   |                   |
  //  | Waiting previous h->d copy |                   |                   |
  //  +----------------------------+-------------------+-------------------+
  //  | Copy to pinned memory      | Copy host->device | Kernel            |
  //  |                            |                   +-------------------+
  //  |                           ...                 ...                  |
  //  +----------------------------+---------------------------------------+
  //
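  //  A usage sketch corresponding to the diagrams above (illustrative only;
  //  streamed_buffer, ScaleKernel, host_values and num_values are
  //  hypothetical names, not part of this API):
  //
  //    streamed_buffer.CopyToGpu(
  //        host_values, num_values,
  //        [](const double* device_pointer,
  //           int size_of_batch,
  //           int offset_of_batch,
  //           cudaStream_t stream_to_use) {
  //          // Only touch this batch, and only in stream_to_use.
  //          const int num_blocks = (size_of_batch + 255) / 256;
  //          ScaleKernel<<<num_blocks, 256, 0, stream_to_use>>>(
  //              device_pointer, size_of_batch, offset_of_batch);
  //        });
  //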
  template <typename Fun>
  void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
    // This synchronization is not required in some cases, but we perform it
    // in order to avoid the situation where the user callback depends on data
    // that is still being computed in the default stream.
    CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));

    // If the pointer to the input data does not correspond to page-locked
    // memory, the host-to-device memory copy might be executed synchronously
    // (with a copy to pinned memory happening inside the driver). In that
    // case we first copy the data into a pre-allocated array of page-locked
    // memory.
    const bool copy_to_pinned_memory =
        MemoryTypeResultsInSynchronousCopy(from);
    T* batch_values_gpu[kNumBatches];
    T* batch_values_cpu[kNumBatches];
    auto streams = context_->streams_;
    for (int i = 0; i < kNumBatches; ++i) {
      batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
      batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
    }
    int batch_id = 0;
    for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
      const int num_values_batch =
          std::min(num_values - offset, kValuesPerBatch);
      const T* batch_from = from + offset;
      T* batch_to = batch_values_gpu[batch_id];
      auto stream = streams[batch_id];
      auto copy_finished = copy_finished_[batch_id];

      if (copy_to_pinned_memory) {
        // Copying values into the temporary buffer should only start after
        // the previous copy from the temporary buffer to the device has
        // completed.
        CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
        std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
        batch_from = batch_values_cpu[batch_id];
      }
      CHECK_EQ(cudaSuccess,
               cudaMemcpyAsync(batch_to,
                               batch_from,
                               sizeof(T) * num_values_batch,
                               cudaMemcpyHostToDevice,
                               stream));
      if (copy_to_pinned_memory) {
        // The next copy into the temporary buffer can start right after the
        // asynchronous copy has completed (and might start before the kernels
        // asynchronously executed in the stream by the user-supplied callback
        // have completed).
        // When copying directly from page-locked memory no explicit
        // synchronization is required, because the memory copy and the user
        // kernel execution on the corresponding part of the values_gpu_ array
        // are serialized by the stream.
        CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
      }
      callback(batch_to, num_values_batch, offset, stream);
      batch_id = (batch_id + 1) % kNumBatches;
    }
    // Explicitly synchronize on all CUDA streams that were utilized.
    for (int i = 0; i < kNumBatches; ++i) {
      CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
    }
  }

 private:
  // All host-to-device copies need to be completely asynchronous, which
  // requires the source memory to be page-locked.
  static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
    cudaPointerAttributes attributes;
    auto status = cudaPointerGetAttributes(&attributes, ptr);
#if CUDART_VERSION < 11000
    // In CUDA versions prior to 11, a call to cudaPointerGetAttributes with a
    // host pointer returns cudaErrorInvalidValue.
    if (status == cudaErrorInvalidValue) {
      return true;
    }
#endif
    CHECK_EQ(status, cudaSuccess);
    // This class only supports CPU memory as a source.
    CHECK_NE(attributes.type, cudaMemoryTypeDevice);
    // If the host memory was allocated (or registered) with the CUDA API, or
    // is managed memory, then the call to cudaMemcpyAsync will be
    // asynchronous. In the case of managed memory it might be slightly better
    // to perform a single invocation of the user-provided callback (and hope
    // that page migration provides similar throughput with zero effort on our
    // side).
    return attributes.type == cudaMemoryTypeUnregistered;
  }
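  // Note: a caller that repeatedly streams from the same regular (pageable)
  // host allocation could, in principle, avoid the staging copy in
  // CopyToGpu() by page-locking that allocation itself; after registration,
  // cudaPointerGetAttributes should no longer report the memory as
  // cudaMemoryTypeUnregistered. An illustrative sketch (host_ptr and
  // num_values are hypothetical):
  //
  //   CHECK_EQ(cudaSuccess,
  //            cudaHostRegister(host_ptr, num_values * sizeof(T),
  //                             cudaHostRegisterDefault));
  //   // ... CopyToGpu() now takes the page-locked path ...
  //   CHECK_EQ(cudaSuccess, cudaHostUnregister(host_ptr));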

  const int kValuesPerBatch;
  ContextImpl* context_ = nullptr;
  CudaBuffer<T> values_gpu_;
  T* values_cpu_pinned_ = nullptr;
  cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
};

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA
#endif  // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_