// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)

#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
#define CERES_INTERNAL_CUDA_BUFFER_H_

#include "ceres/context_impl.h"
#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include <cstddef>
#include <utility>
#include <vector>

#include "absl/log/check.h"
#include "cuda_runtime.h"

namespace ceres::internal {
// An encapsulated buffer to maintain GPU memory, and handle transfers between
// GPU and system memory. It is the responsibility of the user to ensure that
// the appropriate GPU device is selected before each subroutine is called.
// This is particularly important when using multiple GPU devices on different
// CPU threads, since the active CUDA device is determined by the CUDA runtime
// on a per-thread basis.
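//
// A minimal usage sketch (illustrative only, not part of the API; it assumes
// `context` points to a ContextImpl that has already been initialized for
// CUDA on the desired device):
//
//   std::vector<double> host(1024, 1.0);
//   CudaBuffer<double> buffer(context);
//   // Asynchronous host-to-device copy on the context's default stream.
//   buffer.CopyFromCpuVector(host);
//   // ... launch kernels operating on buffer.data() on the same stream ...
//   // Device-to-host copy; synchronizes the stream before returning.
//   buffer.CopyToCpu(host.data(), host.size());
//
// When multiple GPUs are driven from different CPU threads, call
// cudaSetDevice() on each thread before invoking any of these methods.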
template <typename T>
class CudaBuffer {
 public:
  explicit CudaBuffer(ContextImpl* context) : context_(context) {}
  CudaBuffer(ContextImpl* context, int size) : context_(context) {
    Reserve(size);
  }

  CudaBuffer(CudaBuffer&& other)
      : data_(other.data_), size_(other.size_), context_(other.context_) {
    other.data_ = nullptr;
    other.size_ = 0;
  }

  CudaBuffer(const CudaBuffer&) = delete;
  CudaBuffer& operator=(const CudaBuffer&) = delete;

  ~CudaBuffer() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFree(data_), cudaSuccess);
    }
  }
  // Grow the GPU memory buffer if needed to accommodate data of the specified
  // size. Note that growing the buffer reallocates it, so any existing
  // contents are not preserved.
  void Reserve(const size_t size) {
    if (size > size_) {
      if (data_ != nullptr) {
        CHECK_EQ(cudaFree(data_), cudaSuccess);
      }
      CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of GPU memory";
      size_ = size;
    }
  }

  // Perform an asynchronous copy from CPU memory to GPU memory managed by
  // this CudaBuffer instance, using the default stream of the context.
  void CopyFromCpu(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from a vector in CPU memory to GPU memory
  // managed by this CudaBuffer instance.
  void CopyFromCpuVector(const std::vector<T>& data) {
    Reserve(data.size());
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data.data(),
                             data.size() * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from another GPU memory array to the GPU
  // memory managed by this CudaBuffer instance, using the default stream of
  // the context.
  void CopyFromGPUArray(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Copy `size` items from the GPU memory managed by this CudaBuffer instance
  // to CPU memory. It is the caller's responsibility to ensure that the CPU
  // memory pointer is valid, i.e. that it is not null and points to memory
  // large enough to hold at least `size` items of type T, and that `size`
  // does not exceed this->size(). This method synchronizes the context's
  // default stream, ensuring that all previously dispatched GPU operations on
  // that stream have completed before the data is copied to CPU memory.
  void CopyToCpu(T* data, const size_t size) const {
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data,
                             data_,
                             size * sizeof(T),
                             cudaMemcpyDeviceToHost,
                             context_->DefaultStream()),
             cudaSuccess);
    CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess);
  }

  // Copy n items from another GPU memory array to the GPU memory managed by
  // this CudaBuffer instance, growing this buffer's size if needed. This copy
  // is asynchronous, and operates on the default stream of the context.
  void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
    Reserve(n);
    CHECK(other.data_ != nullptr);
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             other.data_,
                             n * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Return a pointer to the GPU memory managed by this CudaBuffer instance.
  T* data() { return data_; }
  const T* data() const { return data_; }
  // Return the number of items of type T that can fit in the GPU memory
  // allocated so far by this CudaBuffer instance.
  size_t size() const { return size_; }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
  ContextImpl* context_ = nullptr;
};

// This class wraps a host memory region allocated via cudaMallocHost. Such a
// memory region is page-locked, which enables direct transfers to and from
// the device and avoids the implicit staging buffers used under the hood of
// the CUDA API for pageable memory.
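//
// A minimal usage sketch (illustrative only; `n`, `device_ptr`, and `stream`
// are assumed to be a valid element count, device pointer, and CUDA stream):
//
//   CudaPinnedHostBuffer<double> staging(n);
//   // Asynchronous device-to-host copy into page-locked memory.
//   CHECK_EQ(cudaMemcpyAsync(staging.data(), device_ptr, n * sizeof(double),
//                            cudaMemcpyDeviceToHost, stream),
//            cudaSuccess);
//   CHECK_EQ(cudaStreamSynchronize(stream), cudaSuccess);
//   // staging.data()[0 .. n - 1] now holds the copied values.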
template <typename T>
class CudaPinnedHostBuffer {
 public:
  CudaPinnedHostBuffer() noexcept = default;
  explicit CudaPinnedHostBuffer(int size) { Reserve(size); }
  CudaPinnedHostBuffer(CudaPinnedHostBuffer&& other) noexcept
      : data_(std::exchange(other.data_, nullptr)),
        size_(std::exchange(other.size_, 0)) {}
  CudaPinnedHostBuffer(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(CudaPinnedHostBuffer&& other) noexcept {
    Free();
    data_ = std::exchange(other.data_, nullptr);
    size_ = std::exchange(other.size_, 0);
    return *this;
  }
  ~CudaPinnedHostBuffer() { Free(); }

  void Reserve(const std::size_t size) {
    if (size > size_) {
      Free();
      CHECK_EQ(cudaMallocHost(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of pinned host memory";
      size_ = size;
    }
  }

  T* data() noexcept { return data_; }
  const T* data() const noexcept { return data_; }
  std::size_t size() const noexcept { return size_; }

 private:
  void Free() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFreeHost(data_), cudaSuccess);
      data_ = nullptr;
      size_ = 0;
    }
  }

  T* data_ = nullptr;
  std::size_t size_ = 0;
};

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA

#endif  // CERES_INTERNAL_CUDA_BUFFER_H_