// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2022 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)

#include "cuda_runtime.h"

namespace ceres::internal {

// The CUDA Toolkit documentation notes that a block size of 256 threads,
// "although arbitrary in this case, is a common choice". The choice is
// informed by the warp size, maximum block size, and multiprocessor sizes of
// recent GPUs. For complex kernels with significant register usage and
// unusual memory patterns, the occupancy calculator API might yield better
// performance. See "Occupancy Calculator" in the CUDA Toolkit documentation.
constexpr int kCudaBlockSize = 256;
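
// A minimal sketch of the occupancy-based alternative mentioned above
// (illustrative only; not used by this file). The occupancy API suggests a
// block size for a specific kernel at runtime, e.g. for the type-conversion
// kernel defined below:
//
//   int min_grid_size = 0;
//   int block_size = 0;
//   cudaOccupancyMaxPotentialBlockSize(
//       &min_grid_size, &block_size, TypeConversionKernel<double, float>);
//   // block_size now holds an occupancy-derived suggestion for that kernel.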

// Casts each of the size elements of input from SrcType to DstType; one
// thread per element.
template <typename SrcType, typename DstType>
__global__ void TypeConversionKernel(const SrcType* __restrict__ input,
                                     DstType* __restrict__ output,
                                     const int size) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size) {
    output[i] = static_cast<DstType>(input[i]);
  }
}
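
// Each launch below uses a one-dimensional grid of
// (size + kCudaBlockSize - 1) / kCudaBlockSize blocks, i.e.
// ceil(size / kCudaBlockSize), so that every element is covered by a thread.
// For example, size = 1000 with a block size of 256 yields 4 blocks (1024
// threads); the bounds check in each kernel leaves the trailing 24 threads
// idle.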

void CudaFP64ToFP32(const double* input,
                    float* output,
                    const int size,
                    cudaStream_t stream) {
  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
  TypeConversionKernel<double, float>
      <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
}
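
// A minimal host-side usage sketch for the conversion wrappers (illustrative
// only; the buffer names and sizes are hypothetical):
//
//   const int n = 1024;
//   double* d_input = nullptr;   // device buffer holding n doubles
//   float* d_output = nullptr;   // device buffer receiving n floats
//   cudaMalloc(&d_input, n * sizeof(double));
//   cudaMalloc(&d_output, n * sizeof(float));
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   CudaFP64ToFP32(d_input, d_output, n, stream);
//   cudaStreamSynchronize(stream);  // wait for the conversion to finish
//   cudaFree(d_input);
//   cudaFree(d_output);
//   cudaStreamDestroy(stream);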

void CudaFP32ToFP64(const float* input,
                    double* output,
                    const int size,
                    cudaStream_t stream) {
  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
  TypeConversionKernel<float, double>
      <<<num_blocks, kCudaBlockSize, 0, stream>>>(input, output, size);
}

template <typename T>
__global__ void SetZeroKernel(T* __restrict__ output, const int size) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size) {
    output[i] = T(0.0);
  }
}

void CudaSetZeroFP32(float* output, const int size, cudaStream_t stream) {
  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
  SetZeroKernel<float><<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
}

void CudaSetZeroFP64(double* output, const int size, cudaStream_t stream) {
  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
  SetZeroKernel<double>
      <<<num_blocks, kCudaBlockSize, 0, stream>>>(output, size);
}

// Elementwise x[i] += y[i], with y[i] converted to DstType before the add.
template <typename SrcType, typename DstType>
__global__ void XPlusEqualsYKernel(DstType* __restrict__ x,
                                   const SrcType* __restrict__ y,
                                   const int size) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size) {
    x[i] = x[i] + DstType(y[i]);
  }
}

// Accumulates the single-precision vector y into the double-precision
// vector x (x += y).
void CudaDsxpy(double* x, float* y, const int size, cudaStream_t stream) {
  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
  XPlusEqualsYKernel<float, double>
      <<<num_blocks, kCudaBlockSize, 0, stream>>>(x, y, size);
}
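
// For reference, the update performed by CudaDsxpy is equivalent to this
// scalar loop on the host (a sketch for checking results, not used by the
// library):
//
//   for (int i = 0; i < size; ++i) {
//     x[i] += static_cast<double>(y[i]);
//   }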

// Elementwise y[i] += D[i] * D[i] * x[i].
__global__ void CudaDtDxpyKernel(double* __restrict__ y,
                                 const double* D,
                                 const double* __restrict__ x,
                                 const int size) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size) {
    y[i] = y[i] + D[i] * D[i] * x[i];
  }
}

void CudaDtDxpy(double* y,
                const double* D,
                const double* x,
                const int size,
                cudaStream_t stream) {
  const int num_blocks = (size + kCudaBlockSize - 1) / kCudaBlockSize;
  CudaDtDxpyKernel<<<num_blocks, kCudaBlockSize, 0, stream>>>(y, D, x, size);
}
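
// With D interpreted as the diagonal of a matrix, CudaDtDxpy computes
// y <- y + D^T D x. A host-side reference sketch (for checking results only):
//
//   for (int i = 0; i < size; ++i) {
//     y[i] += D[i] * D[i] * x[i];
//   }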

}  // namespace ceres::internal