// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)

#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
#define CERES_INTERNAL_CUDA_BUFFER_H_

#include "ceres/context_impl.h"
#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include <cstddef>
#include <utility>
#include <vector>

#include "absl/log/check.h"
#include "cuda_runtime.h"

namespace ceres::internal {
// An encapsulated buffer to maintain GPU memory, and handle transfers between
// GPU and system memory. It is the responsibility of the user to ensure that
// the appropriate GPU device is selected before each subroutine is called.
// This is particularly important when using multiple GPU devices on different
// CPU threads, since the active CUDA device is determined by the CUDA runtime
// on a per-thread basis.
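//
// A minimal usage sketch (illustrative only, not part of the API; it assumes
// `context` points to a ContextImpl that has already been initialized for
// CUDA on the desired device):
//
//   std::vector<double> host(1024, 1.0);
//   CudaBuffer<double> buffer(context);
//   // Asynchronous host-to-device copy on the context's default stream.
//   buffer.CopyFromCpuVector(host);
//   // ... launch kernels operating on buffer.data() on the same stream ...
//   // Device-to-host copy; synchronizes the stream before returning.
//   buffer.CopyToCpu(host.data(), host.size());
//
// When multiple GPUs are driven from different CPU threads, call
// cudaSetDevice() on each thread before invoking any of these methods.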
template <typename T>
class CudaBuffer {
 public:
  explicit CudaBuffer(ContextImpl* context) : context_(context) {}
  CudaBuffer(ContextImpl* context, int size) : context_(context) {
    Reserve(size);
  }

  CudaBuffer(CudaBuffer&& other)
      : data_(other.data_), size_(other.size_), context_(other.context_) {
    other.data_ = nullptr;
    other.size_ = 0;
  }

  CudaBuffer(const CudaBuffer&) = delete;
  CudaBuffer& operator=(const CudaBuffer&) = delete;

  ~CudaBuffer() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFree(data_), cudaSuccess);
    }
  }
  // Grow the GPU memory buffer if needed to accommodate data of the specified
  // size. Note that growing the buffer reallocates it, so any existing
  // contents are not preserved.
  void Reserve(const size_t size) {
    if (size > size_) {
      if (data_ != nullptr) {
        CHECK_EQ(cudaFree(data_), cudaSuccess);
      }
      CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of GPU memory";
      size_ = size;
    }
  }

  // Perform an asynchronous copy from CPU memory to GPU memory managed by
  // this CudaBuffer instance, using the default stream of the context.
  void CopyFromCpu(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from a vector in CPU memory to GPU memory
  // managed by this CudaBuffer instance.
  void CopyFromCpuVector(const std::vector<T>& data) {
    Reserve(data.size());
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data.data(),
                             data.size() * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from another GPU memory array to the GPU
  // memory managed by this CudaBuffer instance, using the default stream of
  // the context.
  void CopyFromGPUArray(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Copy `size` items from the GPU memory managed by this CudaBuffer instance
  // to CPU memory. It is the caller's responsibility to ensure that the CPU
  // memory pointer is valid, i.e. that it is not null and points to memory
  // large enough to hold at least `size` items of type T, and that `size`
  // does not exceed this->size(). This method synchronizes the context's
  // default stream, ensuring that all previously dispatched GPU operations on
  // that stream have completed before the data is copied to CPU memory.
  void CopyToCpu(T* data, const size_t size) const {
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data,
                             data_,
                             size * sizeof(T),
                             cudaMemcpyDeviceToHost,
                             context_->DefaultStream()),
             cudaSuccess);
    CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess);
  }

  // Copy n items from another GPU memory array to the GPU memory managed by
  // this CudaBuffer instance, growing this buffer's size if needed. This copy
  // is asynchronous, and operates on the default stream of the context.
  void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
    Reserve(n);
    CHECK(other.data_ != nullptr);
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             other.data_,
                             n * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Return a pointer to the GPU memory managed by this CudaBuffer instance.
  T* data() { return data_; }
  const T* data() const { return data_; }
  // Return the number of items of type T that can fit in the GPU memory
  // allocated so far by this CudaBuffer instance.
  size_t size() const { return size_; }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
  ContextImpl* context_ = nullptr;
};

// This class wraps a host memory region allocated via cudaMallocHost. Such a
// memory region is page-locked, which enables direct transfers to and from
// the device and avoids the implicit staging buffers used under the hood of
// the CUDA API for pageable memory.
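//
// A minimal usage sketch (illustrative only; `n`, `device_ptr`, and `stream`
// are assumed to be a valid element count, device pointer, and CUDA stream):
//
//   CudaPinnedHostBuffer<double> staging(n);
//   // Asynchronous device-to-host copy into page-locked memory.
//   CHECK_EQ(cudaMemcpyAsync(staging.data(), device_ptr, n * sizeof(double),
//                            cudaMemcpyDeviceToHost, stream),
//            cudaSuccess);
//   CHECK_EQ(cudaStreamSynchronize(stream), cudaSuccess);
//   // staging.data()[0 .. n - 1] now holds the copied values.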
template <typename T>
class CudaPinnedHostBuffer {
 public:
  CudaPinnedHostBuffer() noexcept = default;
  explicit CudaPinnedHostBuffer(int size) { Reserve(size); }
  CudaPinnedHostBuffer(CudaPinnedHostBuffer&& other) noexcept
      : data_(std::exchange(other.data_, nullptr)),
        size_(std::exchange(other.size_, 0)) {}
  CudaPinnedHostBuffer(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(const CudaPinnedHostBuffer&) = delete;
  CudaPinnedHostBuffer& operator=(CudaPinnedHostBuffer&& other) noexcept {
    Free();
    data_ = std::exchange(other.data_, nullptr);
    size_ = std::exchange(other.size_, 0);
    return *this;
  }
  ~CudaPinnedHostBuffer() { Free(); }

  void Reserve(const std::size_t size) {
    if (size > size_) {
      Free();
      CHECK_EQ(cudaMallocHost(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of pinned host memory";
      size_ = size;
    }
  }

  T* data() noexcept { return data_; }
  const T* data() const noexcept { return data_; }
  std::size_t size() const noexcept { return size_; }

 private:
  void Free() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFreeHost(data_), cudaSuccess);
      data_ = nullptr;
      size_ = 0;
    }
  }

  T* data_ = nullptr;
  std::size_t size_ = 0;
};

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA

#endif  // CERES_INTERNAL_CUDA_BUFFER_H_