// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)

#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
#define CERES_INTERNAL_CUDA_BUFFER_H_

#include "ceres/context_impl.h"
#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include <cstddef>
#include <utility>
#include <vector>

#include "absl/log/check.h"
#include "cuda_runtime.h"

namespace ceres::internal {
// An encapsulated buffer to maintain GPU memory and handle transfers between
// GPU and system memory. It is the user's responsibility to ensure that the
// appropriate GPU device is selected before each subroutine is called. This
// is particularly important when using multiple GPU devices on different CPU
// threads, since the active CUDA device is determined by the CUDA runtime on
// a per-thread basis.
template <typename T>
class CudaBuffer {
 public:
  explicit CudaBuffer(ContextImpl* context) : context_(context) {}
  CudaBuffer(ContextImpl* context, int size) : context_(context) {
    Reserve(size);
  }

  CudaBuffer(CudaBuffer&& other) noexcept
      : data_(other.data_), size_(other.size_), context_(other.context_) {
    other.data_ = nullptr;
    other.size_ = 0;
  }

  CudaBuffer(const CudaBuffer&) = delete;
  CudaBuffer& operator=(const CudaBuffer&) = delete;

  ~CudaBuffer() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFree(data_), cudaSuccess);
    }
  }

  // Grow the GPU memory buffer if needed to accommodate data of the specified
  // size. Note that any existing buffer contents are discarded if a
  // reallocation occurs.
  void Reserve(const size_t size) {
    if (size > size_) {
      if (data_ != nullptr) {
        CHECK_EQ(cudaFree(data_), cudaSuccess);
      }
      CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of GPU memory";
      size_ = size;
    }
  }

  // Perform an asynchronous copy from CPU memory to the GPU memory managed by
  // this CudaBuffer instance, on the context's default CUDA stream.
  void CopyFromCpu(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from a vector in CPU memory to the GPU
  // memory managed by this CudaBuffer instance.
  void CopyFromCpuVector(const std::vector<T>& data) {
    Reserve(data.size());
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data.data(),
                             data.size() * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from another GPU memory array to the GPU
  // memory managed by this CudaBuffer instance, on the context's default
  // CUDA stream.
  void CopyFromGPUArray(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Copy data from the GPU memory managed by this CudaBuffer instance to CPU
  // memory. It is the caller's responsibility to ensure that the CPU memory
  // pointer is valid, i.e. it is not null and points to memory large enough
  // to hold at least size elements of type T. This method synchronizes the
  // context's default stream before returning, so the copy (and all
  // previously dispatched GPU operations on that stream) is guaranteed to
  // have completed.
  void CopyToCpu(T* data, const size_t size) const {
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data,
                             data_,
                             size * sizeof(T),
                             cudaMemcpyDeviceToHost,
                             context_->DefaultStream()),
             cudaSuccess);
    CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess);
  }

  // Copy n items from another GPU memory array to the GPU memory managed by
  // this CudaBuffer instance, growing this buffer's size if needed. The copy
  // is asynchronous, and operates on the context's default CUDA stream.
  void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
    Reserve(n);
    CHECK(other.data_ != nullptr);
    CHECK(data_ != nullptr);
    // Copy exactly n items: after Reserve, this buffer may be larger than n,
    // and copying size_ items could read past the end of other's allocation.
    CHECK_EQ(cudaMemcpyAsync(data_,
                             other.data_,
                             n * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Return a pointer to the GPU memory managed by this CudaBuffer instance.
  T* data() { return data_; }
  const T* data() const { return data_; }
  // Return the number of items of type T that can fit in the GPU memory
  // allocated so far by this CudaBuffer instance.
  size_t size() const { return size_; }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
  ContextImpl* context_ = nullptr;
};
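
// Example round trip through a CudaBuffer (a minimal sketch, not part of the
// Ceres API; it assumes `context` points to a ContextImpl whose CUDA state
// has already been successfully initialized):
//
//   std::vector<double> host = {1.0, 2.0, 3.0};
//   CudaBuffer<double> buffer(context);
//   buffer.CopyFromCpuVector(host);  // Async H2D copy on the default stream.
//   // ... enqueue kernels reading/writing buffer.data() on the same stream.
//   std::vector<double> result(host.size());
//   // Synchronizes the default stream, so result is valid upon return.
//   buffer.CopyToCpu(result.data(), result.size());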

// This class wraps a host memory region allocated via cudaMallocHost. Such a
// region is page-locked (pinned), which enables direct asynchronous transfers
// to and from the device and avoids the implicit staging copy that the CUDA
// API otherwise performs for pageable host memory.
template <typename T>
class CudaPinnedHostBuffer {
 public:
  CudaPinnedHostBuffer() noexcept = default;
  explicit CudaPinnedHostBuffer(int size) { Reserve(size); }
  CudaPinnedHostBuffer(CudaPinnedHostBuffer&& other) noexcept
      : data_(std::exchange(other.data_, nullptr)),
        size_(std::exchange(other.size_, 0)) {}
  CudaPinnedHostBuffer(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(CudaPinnedHostBuffer&& other) noexcept {
    Free();
    data_ = std::exchange(other.data_, nullptr);
    size_ = std::exchange(other.size_, 0);
    return *this;
  }
  ~CudaPinnedHostBuffer() { Free(); }

  // Grow the pinned host memory buffer if needed to accommodate data of the
  // specified size. Existing contents are discarded if a reallocation occurs.
  void Reserve(const std::size_t size) {
    if (size > size_) {
      Free();
      CHECK_EQ(cudaMallocHost(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of pinned host memory";
      size_ = size;
    }
  }

  T* data() noexcept { return data_; }
  const T* data() const noexcept { return data_; }
  std::size_t size() const noexcept { return size_; }

 private:
  void Free() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFreeHost(data_), cudaSuccess);
      data_ = nullptr;
      size_ = 0;
    }
  }

  T* data_ = nullptr;
  std::size_t size_ = 0;
};
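
// Example of using a pinned buffer as the host side of an asynchronous
// device-to-host copy (a sketch under assumptions: `device` is a
// CudaBuffer<double> holding at least n items, and `stream` is the
// cudaStream_t its work was issued on, e.g. the context's default stream):
//
//   CudaPinnedHostBuffer<double> host(n);
//   CHECK_EQ(cudaMemcpyAsync(host.data(),
//                            device.data(),
//                            n * sizeof(double),
//                            cudaMemcpyDeviceToHost,
//                            stream),
//            cudaSuccess);
//   CHECK_EQ(cudaStreamSynchronize(stream), cudaSuccess);
//   // host.data()[0..n) is now safe to read on the CPU.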

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA

#endif  // CERES_INTERNAL_CUDA_BUFFER_H_