Fix bug in cuda_kernels_test

* Use the context stream, not the default stream, since all CUDA
  operations now operate on the context stream.

Change-Id: I24c872c467f13ee4d276fc04d6822b79a18070fc
diff --git a/internal/ceres/cuda_kernels_test.cc b/internal/ceres/cuda_kernels_test.cc index 9290aa5..a364c90 100644 --- a/internal/ceres/cuda_kernels_test.cc +++ b/internal/ceres/cuda_kernels_test.cc
@@ -58,7 +58,7 @@ CudaBuffer<float> fp32_gpu(&context); fp32_gpu.Reserve(fp64_cpu.size()); CudaFP64ToFP32( - fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), cudaStreamDefault); + fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), context.stream_); std::vector<float> fp32_cpu(fp64_cpu.size()); fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size()); for (int i = 0; i < fp32_cpu.size(); ++i) { @@ -83,7 +83,7 @@ CudaBuffer<float> fp32_gpu(&context); fp32_gpu.Reserve(fp64_cpu.size()); CudaFP64ToFP32( - fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), cudaStreamDefault); + fp64_gpu.data(), fp32_gpu.data(), fp64_cpu.size(), context.stream_); std::vector<float> fp32_cpu(fp64_cpu.size()); fp32_gpu.CopyToCpu(fp32_cpu.data(), fp32_cpu.size()); EXPECT_EQ(fp32_cpu[0], 0.0f); @@ -102,7 +102,7 @@ CudaBuffer<double> fp64_gpu(&context); fp64_gpu.Reserve(fp32_cpu.size()); CudaFP32ToFP64( - fp32_gpu.data(), fp64_gpu.data(), fp32_cpu.size(), cudaStreamDefault); + fp32_gpu.data(), fp64_gpu.data(), fp32_cpu.size(), context.stream_); std::vector<double> fp64_cpu(fp32_cpu.size()); fp64_gpu.CopyToCpu(fp64_cpu.data(), fp64_cpu.size()); for (int i = 0; i < fp64_cpu.size(); ++i) { @@ -117,7 +117,7 @@ std::vector<float> fp32_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; CudaBuffer<float> fp32_gpu(&context); fp32_gpu.CopyFromCpuVector(fp32_cpu); - CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), cudaStreamDefault); + CudaSetZeroFP32(fp32_gpu.data(), fp32_cpu.size(), context.stream_); std::vector<float> fp32_cpu_zero(fp32_cpu.size()); fp32_gpu.CopyToCpu(fp32_cpu_zero.data(), fp32_cpu_zero.size()); for (int i = 0; i < fp32_cpu_zero.size(); ++i) { @@ -132,7 +132,7 @@ std::vector<double> fp64_cpu = {1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}; CudaBuffer<double> fp64_gpu(&context); fp64_gpu.CopyFromCpuVector(fp64_cpu); - CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), cudaStreamDefault); + CudaSetZeroFP64(fp64_gpu.data(), fp64_cpu.size(), context.stream_); std::vector<double> 
fp64_cpu_zero(fp64_cpu.size()); fp64_gpu.CopyToCpu(fp64_cpu_zero.data(), fp64_cpu_zero.size()); for (int i = 0; i < fp64_cpu_zero.size(); ++i) { @@ -154,7 +154,7 @@ CudaDsxpy(fp64_gpu_b.data(), fp32_gpu_a.data(), fp32_gpu_a.size(), - cudaStreamDefault); + context.stream_); fp64_gpu_b.CopyToCpu(fp64_cpu_b.data(), fp64_cpu_b.size()); for (int i = 0; i < fp64_cpu_b.size(); ++i) { EXPECT_DOUBLE_EQ(fp64_cpu_b[i], 2.0 * fp32_cpu_a[i]); @@ -178,7 +178,7 @@ d_gpu.data(), x_gpu.data(), y_gpu.size(), - cudaStreamDefault); + context.stream_); y_gpu.CopyToCpu(y_cpu.data(), y_cpu.size()); EXPECT_DOUBLE_EQ(y_cpu[0], 4.0 + 10.0 * 10.0 * 1.0); EXPECT_DOUBLE_EQ(y_cpu[1], 3.0 + 20.0 * 20.0 * 2.0);