blob: a8532d0360ef90b80236408cc9445d850af40aed [file] [log] [blame] [edit]
// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
#ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
#include "ceres/internal/config.h"
#ifndef CERES_NO_CUDA
#include <algorithm>
#include "absl/log/check.h"
#include "ceres/cuda_buffer.h"
namespace ceres::internal {
// Most contemporary CUDA devices are capable of simultaneous code execution and
// host-to-device transfer. This class copies batches of data to GPU memory and
// executes processing of copied data in parallel (asynchronously).
// Data is copied to a fixed-size buffer on GPU (containing at most
// max_buffer_size values), and this memory is re-used when the previous
// batch of values is processed by user-provided callback
// Host-to-device copy uses a temporary buffer if required. Each batch of values
// has size of kValuesPerBatch, except the last one.
template <typename T>
class CERES_NO_EXPORT CudaStreamedBuffer {
public:
// If hardware supports only one host-to-device copy or one host-to-device
// copy is able to reach peak bandwidth, two streams are sufficient to reach
// maximum efficiency:
// - If transferring batch of values takes more time, than processing it on
// gpu, then at every moment of time one of the streams will be transferring
// data and other stream will be either processing data or idle; the whole
// process will be bounded by host-to-device copy.
// - If transferring batch of values takes less time, than processing it on
// gpu, then at every moment of time one of the streams will be processing
// data and other stream will be either performing computations or
// transferring data, and the whole process will be bounded by computations.
static constexpr int kNumBatches = 2;
// max_buffer_size is the maximal size (in elements of type T) of array
// to be pre-allocated in gpu memory. The size of array determines size of
// batch of values for simultaneous copying and processing. It should be large
// enough to allow highly-parallel execution of user kernels; making it too
// large increases latency.
CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
: kValuesPerBatch(max_buffer_size / kNumBatches),
context_(context),
values_gpu_(context, kValuesPerBatch * kNumBatches) {
static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
CHECK_GE(max_buffer_size, kNumBatches);
// Pre-allocate a buffer of page-locked memory for transfers from a regular
// cpu memory. Because we will be only writing into that buffer from cpu,
// memory is allocated with cudaHostAllocWriteCombined flag.
CHECK_EQ(cudaSuccess,
cudaHostAlloc(&values_cpu_pinned_,
sizeof(T) * kValuesPerBatch * kNumBatches,
cudaHostAllocWriteCombined));
for (auto& e : copy_finished_) {
CHECK_EQ(cudaSuccess,
cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
}
}
CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;
~CudaStreamedBuffer() {
CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
for (auto& e : copy_finished_) {
CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
}
}
// Transfer num_values at host-memory pointer from, calling
// callback(device_pointer, size_of_batch, offset_of_batch, stream_to_use)
// after scheduling transfer of each batch of data. User-provided callback
// should perform processing of data at device_pointer only in
// stream_to_use stream (device_pointer will be re-used in the next
// callback invocation with the same stream).
//
// Two diagrams below describe operation in two possible scenarios, depending
// on input data being stored in page-locked memory. In this example we will
// have max_buffer_size = 2 * K, num_values = N * K and callback
// scheduling a single asynchronous launch of
// Kernel<<..., stream_to_use>>(device_pointer,
// size_of_batch,
// offset_of_batch)
//
// a. Copying from page-locked memory
// In this case no copy on the host-side is necessary, and this method just
// schedules a bunch of interleaved memory copies and callback invocations:
//
// cudaStreamSynchronize(context->DefaultStream());
// - Iteration #0:
// - cudaMemcpyAsync(values_gpu_, from, K * sizeof(T), H->D, stream_0)
// - callback(values_gpu_, K, 0, stream_0)
// - Iteration #1:
// - cudaMemcpyAsync(values_gpu_ + K, from + K, K * sizeof(T), H->D,
// stream_1)
// - callback(values_gpu_ + K, K, K, stream_1)
// - Iteration #2:
// - cudaMemcpyAsync(values_gpu_, from + 2 * K, K * sizeof(T), H->D,
// stream_0)
// - callback(values_gpu_, K, 2 * K, stream_0)
// - Iteration #3:
// - cudaMemcpyAsync(values_gpu_ + K, from + 3 * K, K * sizeof(T), H->D,
// stream_1)
// - callback(values_gpu_ + K, K, 3 * K, stream_1)
// ...
// - Iteration #i:
// - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, from + i * K, K *
// sizeof(T), H->D, stream_(i % 2))
// - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2)
// ...
// cudaStreamSynchronize(stream_0)
// cudaStreamSynchronize(stream_1)
//
// This sequence of calls results in following activity on gpu (assuming that
// kernel invoked by callback takes less time than host-to-device copy):
// +-------------------+-------------------+
// | Stream #0 | Stream #1 |
// +-------------------+-------------------+
// | Copy host->device | |
// | | |
// | | |
// +-------------------+-------------------+
// | Kernel | Copy host->device |
// +-------------------+ |
// | | |
// +-------------------+-------------------+
// | Copy host->device | Kernel |
// | +-------------------+
// | | |
// +-------------------+-------------------+
// | Kernel | Copy host->device |
// | ... |
// +---------------------------------------+
//
// b. Copying from regular memory
// In this case a copy from regular memory to page-locked memory is required
// in order to get asynchronous operation. Because pinned memory on host-side
// is reused, additional synchronization is required. On each iteration method
// the following actions are performed:
// - Wait till previous copy operation in stream is completed
// - Copy batch of values from input array into pinned memory
// - Asynchronously launch host-to-device copy
// - Setup event for synchronization on copy completion
// - Invoke callback (that launches kernel asynchronously)
//
// Invocations are performed with the following arguments
// cudaStreamSynchronize(context->DefaultStream());
// - Iteration #0:
// - cudaEventSynchronize(copy_finished_0)
// - std::copy_n(from, K, values_cpu_pinned_)
// - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
// stream_0)
// - cudaEventRecord(copy_finished_0, stream_0)
// - callback(values_gpu_, K, 0, stream_0)
// - Iteration #1:
// - cudaEventSynchronize(copy_finished_1)
// - std::copy_n(from + K, K, values_cpu_pinned_ + K)
// - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
// sizeof(T), H->D, stream_1)
// - cudaEventRecord(copy_finished_1, stream_1)
// - callback(values_gpu_ + K, K, K, stream_1)
// - Iteration #2:
// - cudaEventSynchronize(copy_finished_0)
// - std::copy_n(from + 2 * K, K, values_cpu_pinned_)
// - cudaMemcpyAsync(values_gpu_, values_cpu_pinned_, K * sizeof(T), H->D,
// stream_0)
// - cudaEventRecord(copy_finished_0, stream_0)
// - callback(values_gpu_, K, 2 * K, stream_0)
// - Iteration #3:
// - cudaEventSynchronize(copy_finished_1)
// - std::copy_n(from + 3 * K, K, values_cpu_pinned_ + K)
// - cudaMemcpyAsync(values_gpu_ + K, values_cpu_pinned_ + K, K *
// sizeof(T), H->D, stream_1)
// - cudaEventRecord(copy_finished_1, stream_1)
// - callback(values_gpu_ + K, K, 3 * K, stream_1)
// ...
// - Iteration #i:
// - cudaEventSynchronize(copy_finished_(i % 2))
// - std::copy_n(from + i * K, K, values_cpu_pinned_ + (i % 2) * K)
// - cudaMemcpyAsync(values_gpu_ + (i % 2) * K, values_cpu_pinned_ + (i %
// 2) * K, K * sizeof(T), H->D, stream_(i % 2))
// - cudaEventRecord(copy_finished_(i % 2), stream_(i % 2))
// - callback(values_gpu_ + (i % 2) * K, K, i * K, stream_(i % 2))
// ...
// cudaStreamSynchronize(stream_0)
// cudaStreamSynchronize(stream_1)
//
// This sequence of calls results in following activity on cpu and gpu
// (assuming that kernel invoked by callback takes less time than
// host-to-device copy and copy in cpu memory, and copy in cpu memory is
// faster than host-to-device copy):
// +----------------------------+-------------------+-------------------+
// | Stream #0 | Stream #0 | Stream #1 |
// +----------------------------+-------------------+-------------------+
// | Copy to pinned memory | | |
// | | | |
// +----------------------------+-------------------| |
// | Copy to pinned memory | Copy host->device | |
// | | | |
// +----------------------------+ | |
// | Waiting previous h->d copy | | |
// +----------------------------+-------------------+-------------------+
// | Copy to pinned memory | Kernel | Copy host->device |
// | +-------------------+ |
// +----------------------------+ | |
// | Waiting previous h->d copy | | |
// +----------------------------+-------------------+-------------------+
// | Copy to pinned memory | Copy host->device | Kernel |
// | | +-------------------+
// | ... ... |
// +----------------------------+---------------------------------------+
//
template <typename Fun>
void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
// This synchronization is not required in some cases, but we perform it in
// order to avoid situation when user callback depends on data that is
// still to be computed in default stream
CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));
// If pointer to input data does not correspond to page-locked memory,
// host-to-device memory copy might be executed synchrnonously (with a copy
// to pinned memory happening inside the driver). In that case we perform
// copy to a pre-allocated array of page-locked memory.
const bool copy_to_pinned_memory = MemoryTypeResultsInSynchronousCopy(from);
T* batch_values_gpu[kNumBatches];
T* batch_values_cpu[kNumBatches];
auto streams = context_->streams_;
for (int i = 0; i < kNumBatches; ++i) {
batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
}
int batch_id = 0;
for (int offset = 0; offset < num_values; offset += kValuesPerBatch) {
const int num_values_batch =
std::min(num_values - offset, kValuesPerBatch);
const T* batch_from = from + offset;
T* batch_to = batch_values_gpu[batch_id];
auto stream = streams[batch_id];
auto copy_finished = copy_finished_[batch_id];
if (copy_to_pinned_memory) {
// Copying values to a temporary buffer should be started only after the
// previous copy from temporary buffer to device is completed.
CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
batch_from = batch_values_cpu[batch_id];
}
CHECK_EQ(cudaSuccess,
cudaMemcpyAsync(batch_to,
batch_from,
sizeof(T) * num_values_batch,
cudaMemcpyHostToDevice,
stream));
if (copy_to_pinned_memory) {
// Next copy to a temporary buffer can start straight after asynchronous
// copy is completed (and might be started before kernels asynchronously
// executed in stream by user-supplied callback are completed).
// No explicit synchronization is required when copying data from
// page-locked memory, because memory copy and user kernel execution
// with corresponding part of values_gpu_ array is serialized using
// stream
CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
}
callback(batch_to, num_values_batch, offset, stream);
batch_id = (batch_id + 1) % kNumBatches;
}
// Explicitly synchronize on all CUDA streams that were utilized.
for (int i = 0; i < kNumBatches; ++i) {
CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
}
}
private:
// It is necessary to have all host-to-device copies to be completely
// asynchronous. This requires source memory to be allocated in page-locked
// memory.
static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
cudaPointerAttributes attributes;
auto status = cudaPointerGetAttributes(&attributes, ptr);
#if CUDART_VERSION < 11000
// In CUDA versions prior 11 call to cudaPointerGetAttributes with host
// pointer will return cudaErrorInvalidValue
if (status == cudaErrorInvalidValue) {
return true;
}
#endif
CHECK_EQ(status, cudaSuccess);
// This class only supports cpu memory as a source
CHECK_NE(attributes.type, cudaMemoryTypeDevice);
// If host memory was allocated (or registered) with CUDA API, or is a
// managed memory, then call to cudaMemcpyAsync will be asynchronous. In
// case of managed memory it might be slightly better to perform a single
// call of user-provided call-back (and hope that page migration will
// provide a similar throughput with zero efforts from our side).
return attributes.type == cudaMemoryTypeUnregistered;
}
const int kValuesPerBatch;
ContextImpl* context_ = nullptr;
CudaBuffer<T> values_gpu_;
T* values_cpu_pinned_ = nullptr;
cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
};
} // namespace ceres::internal
#endif // CERES_NO_CUDA
#endif // CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_