// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)

#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
#define CERES_INTERNAL_CUDA_BUFFER_H_

#include "ceres/context_impl.h"
#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include <cstddef>
#include <utility>
#include <vector>

#include "absl/log/check.h"
#include "cuda_runtime.h"

namespace ceres::internal {
// An encapsulated buffer to maintain GPU memory, and handle transfers between
// GPU and system memory. It is the responsibility of the user to ensure that
// the appropriate GPU device is selected before each subroutine is called.
// This is particularly important when using multiple GPU devices on different
// CPU threads, since the active CUDA device is determined by the CUDA runtime
// on a per-thread basis.
template <typename T>
class CudaBuffer {
 public:
  explicit CudaBuffer(ContextImpl* context) : context_(context) {}
  CudaBuffer(ContextImpl* context, int size) : context_(context) {
    Reserve(size);
  }

  CudaBuffer(CudaBuffer&& other)
      : data_(other.data_), size_(other.size_), context_(other.context_) {
    other.data_ = nullptr;
    other.size_ = 0;
  }

  CudaBuffer(const CudaBuffer&) = delete;
  CudaBuffer& operator=(const CudaBuffer&) = delete;

  ~CudaBuffer() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFree(data_), cudaSuccess);
    }
  }

  // Grow the GPU memory buffer if needed to accommodate data of the specified
  // size.
  void Reserve(const size_t size) {
    if (size > size_) {
      if (data_ != nullptr) {
        CHECK_EQ(cudaFree(data_), cudaSuccess);
      }
      CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of GPU memory";
      size_ = size;
    }
  }

  // Perform an asynchronous copy from CPU memory to GPU memory managed by this
  // CudaBuffer instance using the context's default stream.
  void CopyFromCpu(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from a vector in CPU memory to GPU memory
  // managed by this CudaBuffer instance.
  void CopyFromCpuVector(const std::vector<T>& data) {
    Reserve(data.size());
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data.data(),
                             data.size() * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from another GPU memory array to the GPU
  // memory managed by this CudaBuffer instance using the context's default
  // stream.
  void CopyFromGPUArray(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Copy data from the GPU memory managed by this CudaBuffer instance to CPU
  // memory. It is the caller's responsibility to ensure that the CPU memory
  // pointer is valid, i.e. it is not null, and that it points to memory large
  // enough to hold size items of type T. This method ensures that all
  // previously dispatched GPU operations on the context's default stream,
  // including this copy, have completed before it returns.
  void CopyToCpu(T* data, const size_t size) const {
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data,
                             data_,
                             size * sizeof(T),
                             cudaMemcpyDeviceToHost,
                             context_->DefaultStream()),
             cudaSuccess);
    CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess);
  }

  // Copy N items from another GPU memory array to the GPU memory managed by
  // this CudaBuffer instance, growing this buffer's size if needed. This copy
  // is asynchronous, and operates on the context's default stream.
  void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
    Reserve(n);
    CHECK(other.data_ != nullptr);
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             other.data_,
                             n * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Return a pointer to the GPU memory managed by this CudaBuffer instance.
  T* data() { return data_; }
  const T* data() const { return data_; }
  // Return the number of items of type T that can fit in the GPU memory
  // allocated so far by this CudaBuffer instance.
  size_t size() const { return size_; }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
  ContextImpl* context_ = nullptr;
};
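
// Example usage (a minimal illustrative sketch, not from the Ceres sources; it
// assumes a valid ContextImpl* named `context` whose CUDA state has already
// been initialized):
//
//   std::vector<double> host_values = {1.0, 2.0, 3.0};
//   CudaBuffer<double> buffer(context);
//   // Asynchronously stage the host data in GPU memory, growing the buffer
//   // if needed.
//   buffer.CopyFromCpuVector(host_values);
//   // ... launch kernels that consume buffer.data() on
//   // context->DefaultStream() ...
//   std::vector<double> result(host_values.size());
//   // Synchronizes on the default stream before returning.
//   buffer.CopyToCpu(result.data(), result.size());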

// This class wraps a host memory region allocated via cudaMallocHost. Such a
// memory region is page-locked, enabling direct transfers to and from the
// device and avoiding the implicit staging buffer that the CUDA API uses for
// pageable host memory.
template <typename T>
class CudaPinnedHostBuffer {
 public:
  CudaPinnedHostBuffer() noexcept = default;
  CudaPinnedHostBuffer(int size) { Reserve(size); }
  CudaPinnedHostBuffer(CudaPinnedHostBuffer&& other) noexcept
      : data_(std::exchange(other.data_, nullptr)),
        size_(std::exchange(other.size_, 0)) {}
  CudaPinnedHostBuffer(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(CudaPinnedHostBuffer&& other) noexcept {
    Free();
    data_ = std::exchange(other.data_, nullptr);
    size_ = std::exchange(other.size_, 0);
    return *this;
  }
  ~CudaPinnedHostBuffer() { Free(); }

  // Grow the pinned host memory buffer if needed to accommodate the specified
  // number of items.
  void Reserve(const std::size_t size) {
    if (size > size_) {
      Free();
      CHECK_EQ(cudaMallocHost(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of pinned host memory";
      size_ = size;
    }
  }

  T* data() noexcept { return data_; }
  const T* data() const noexcept { return data_; }
  std::size_t size() const noexcept { return size_; }

 private:
  void Free() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFreeHost(data_), cudaSuccess);
      data_ = nullptr;
      size_ = 0;
    }
  }

  T* data_ = nullptr;
  std::size_t size_ = 0;
};
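
// Example usage (a minimal sketch; `device_ptr`, `stream`, and `kNumValues`
// are hypothetical names, and the raw CUDA calls are shown only to illustrate
// why pinned memory is useful; they are not part of this class):
//
//   CudaPinnedHostBuffer<double> staging(kNumValues);
//   // Because staging.data() is page-locked, this device-to-host copy can
//   // proceed asynchronously without an extra internal staging copy.
//   cudaMemcpyAsync(staging.data(),
//                   device_ptr,
//                   kNumValues * sizeof(double),
//                   cudaMemcpyDeviceToHost,
//                   stream);
//   cudaStreamSynchronize(stream);  // Wait before reading staging.data().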

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA

#endif  // CERES_INTERNAL_CUDA_BUFFER_H_