// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)

#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
#define CERES_INTERNAL_CUDA_BUFFER_H_

#include "ceres/context_impl.h"
#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include <cstddef>
#include <utility>
#include <vector>

#include "absl/log/check.h"
#include "cuda_runtime.h"

namespace ceres::internal {
// An encapsulated buffer to maintain GPU memory, and handle transfers between
// GPU and system memory. It is the responsibility of the user to ensure that
// the appropriate GPU device is selected before each subroutine is called.
// This is particularly important when using multiple GPU devices on different
// CPU threads, since the active CUDA device is determined by the CUDA runtime
// on a per-thread basis.
template <typename T>
class CudaBuffer {
 public:
  explicit CudaBuffer(ContextImpl* context) : context_(context) {}
  CudaBuffer(ContextImpl* context, int size) : context_(context) {
    Reserve(size);
  }

  CudaBuffer(CudaBuffer&& other)
      : data_(other.data_), size_(other.size_), context_(other.context_) {
    other.data_ = nullptr;
    other.size_ = 0;
  }

  CudaBuffer(const CudaBuffer&) = delete;
  CudaBuffer& operator=(const CudaBuffer&) = delete;

  ~CudaBuffer() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFree(data_), cudaSuccess);
    }
  }

  // Grow the GPU memory buffer if needed to accommodate data of the specified
  // size.
  void Reserve(const size_t size) {
    if (size > size_) {
      if (data_ != nullptr) {
        CHECK_EQ(cudaFree(data_), cudaSuccess);
      }
      CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of GPU memory";
      size_ = size;
    }
  }

  // Perform an asynchronous copy from CPU memory to GPU memory managed by this
  // CudaBuffer instance using the context's default stream.
  void CopyFromCpu(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from a vector in CPU memory to GPU memory
  // managed by this CudaBuffer instance.
  void CopyFromCpuVector(const std::vector<T>& data) {
    Reserve(data.size());
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data.data(),
                             data.size() * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from another GPU memory array to the GPU
  // memory managed by this CudaBuffer instance using the context's default
  // stream.
  void CopyFromGPUArray(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Copy data from the GPU memory managed by this CudaBuffer instance to CPU
  // memory. It is the caller's responsibility to ensure that the CPU memory
  // pointer is valid, i.e. it is not null, and that it points to memory large
  // enough to hold size items of type T. This method ensures that all
  // previously dispatched GPU operations on the context's default stream,
  // including this copy, have completed before it returns.
  void CopyToCpu(T* data, const size_t size) const {
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data,
                             data_,
                             size * sizeof(T),
                             cudaMemcpyDeviceToHost,
                             context_->DefaultStream()),
             cudaSuccess);
    CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess);
  }

  // Copy N items from another GPU memory array to the GPU memory managed by
  // this CudaBuffer instance, growing this buffer's size if needed. This copy
  // is asynchronous, and operates on the context's default stream.
  void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
    Reserve(n);
    CHECK(other.data_ != nullptr);
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             other.data_,
                             n * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Return a pointer to the GPU memory managed by this CudaBuffer instance.
  T* data() { return data_; }
  const T* data() const { return data_; }
  // Return the number of items of type T that can fit in the GPU memory
  // allocated so far by this CudaBuffer instance.
  size_t size() const { return size_; }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
  ContextImpl* context_ = nullptr;
};
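
// Example usage (a minimal illustrative sketch, not from the Ceres sources; it
// assumes a valid ContextImpl* named `context` whose CUDA state has already
// been initialized):
//
//   std::vector<double> host_values = {1.0, 2.0, 3.0};
//   CudaBuffer<double> buffer(context);
//   // Asynchronously stage the host data in GPU memory, growing the buffer
//   // if needed.
//   buffer.CopyFromCpuVector(host_values);
//   // ... launch kernels that consume buffer.data() on
//   // context->DefaultStream() ...
//   std::vector<double> result(host_values.size());
//   // Synchronizes on the default stream before returning.
//   buffer.CopyToCpu(result.data(), result.size());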

// This class wraps a host memory region allocated via cudaMallocHost. Such a
// memory region is page-locked, enabling direct transfers to and from the
// device and avoiding the implicit staging buffer that the CUDA API uses for
// pageable host memory.
template <typename T>
class CudaPinnedHostBuffer {
 public:
  CudaPinnedHostBuffer() noexcept = default;
  CudaPinnedHostBuffer(int size) { Reserve(size); }
  CudaPinnedHostBuffer(CudaPinnedHostBuffer&& other) noexcept
      : data_(std::exchange(other.data_, nullptr)),
        size_(std::exchange(other.size_, 0)) {}
  CudaPinnedHostBuffer(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(CudaPinnedHostBuffer&& other) noexcept {
    Free();
    data_ = std::exchange(other.data_, nullptr);
    size_ = std::exchange(other.size_, 0);
    return *this;
  }
  ~CudaPinnedHostBuffer() { Free(); }

  // Grow the pinned host memory buffer if needed to accommodate the specified
  // number of items.
  void Reserve(const std::size_t size) {
    if (size > size_) {
      Free();
      CHECK_EQ(cudaMallocHost(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of pinned host memory";
      size_ = size;
    }
  }

  T* data() noexcept { return data_; }
  const T* data() const noexcept { return data_; }
  std::size_t size() const noexcept { return size_; }

 private:
  void Free() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFreeHost(data_), cudaSuccess);
      data_ = nullptr;
      size_ = 0;
    }
  }

  T* data_ = nullptr;
  std::size_t size_ = 0;
};
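
// Example usage (a minimal sketch; `device_ptr`, `stream`, and `kNumValues`
// are hypothetical names, and the raw CUDA calls are shown only to illustrate
// why pinned memory is useful; they are not part of this class):
//
//   CudaPinnedHostBuffer<double> staging(kNumValues);
//   // Because staging.data() is page-locked, this device-to-host copy can
//   // proceed asynchronously without an extra internal staging copy.
//   cudaMemcpyAsync(staging.data(),
//                   device_ptr,
//                   kNumValues * sizeof(double),
//                   cudaMemcpyDeviceToHost,
//                   stream);
//   cudaStreamSynchronize(stream);  // Wait before reading staging.data().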

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA

#endif  // CERES_INTERNAL_CUDA_BUFFER_H_