Fix a missing CERES_NO_CUDA guard in sparse_linear_operator_benchmark.cc,
so the CUDA-specific benchmarks and the cuda_runtime.h include are compiled
only when CUDA is enabled. Also run format_all.sh.
Change-Id: I13902c1d3eb0d3a97548540fee13ec67c490a5ff
diff --git a/internal/ceres/cuda_sparse_matrix.cc b/internal/ceres/cuda_sparse_matrix.cc
index 2f857e2..da64981 100644
--- a/internal/ceres/cuda_sparse_matrix.cc
+++ b/internal/ceres/cuda_sparse_matrix.cc
@@ -38,41 +38,37 @@
#include "ceres/cuda_sparse_matrix.h"
#include <math.h>
+
#include <memory>
-#include "ceres/internal/export.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/compressed_row_sparse_matrix.h"
-#include "ceres/crs_matrix.h"
-#include "ceres/types.h"
#include "ceres/context_impl.h"
+#include "ceres/crs_matrix.h"
+#include "ceres/internal/export.h"
+#include "ceres/types.h"
#include "ceres/wall_time.h"
#ifndef CERES_NO_CUDA
+#include "ceres/ceres_cuda_kernels.h"
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_vector.h"
-#include "ceres/ceres_cuda_kernels.h"
#include "cusparse.h"
-
namespace ceres::internal {
CudaSparseMatrix::CudaSparseMatrix(
- ContextImpl* context,
- const CompressedRowSparseMatrix& crs_matrix) {
+ ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) {
DCHECK_NE(context, nullptr);
CHECK(context->IsCUDAInitialized());
context_ = context;
num_rows_ = crs_matrix.num_rows();
num_cols_ = crs_matrix.num_cols();
num_nonzeros_ = crs_matrix.num_nonzeros();
- rows_.CopyFromCpu(
- crs_matrix.rows(), num_rows_ + 1, context_->stream_);
- cols_.CopyFromCpu(
- crs_matrix.cols(), num_nonzeros_, context_->stream_);
- values_.CopyFromCpu(
- crs_matrix.values(), num_nonzeros_, context_->stream_);
+ rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1, context_->stream_);
+ cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_, context_->stream_);
+ values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
cusparseCreateCsr(&descr_,
num_rows_,
num_cols_,
@@ -99,8 +95,7 @@
CHECK_EQ(num_rows_, crs_matrix.num_rows());
CHECK_EQ(num_cols_, crs_matrix.num_cols());
CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
- values_.CopyFromCpu(
- crs_matrix.values(), num_nonzeros_, context_->stream_);
+ values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_, context_->stream_);
}
void CudaSparseMatrix::SpMv(cusparseOperation_t op,
@@ -135,11 +130,13 @@
CUSPARSE_STATUS_SUCCESS);
}
-void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) {
+void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
+ CudaVector* y) {
SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x, y);
}
-void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) {
+void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
+ CudaVector* y) {
// TODO(Joydeep Biswas): We should consider storing a transposed copy of the
// matrix by converting CSR to CSC. From the cuSPARSE documentation:
// "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
diff --git a/internal/ceres/cuda_sparse_matrix.h b/internal/ceres/cuda_sparse_matrix.h
index 7661fb9..62f6b77 100644
--- a/internal/ceres/cuda_sparse_matrix.h
+++ b/internal/ceres/cuda_sparse_matrix.h
@@ -43,9 +43,9 @@
#include <string>
#include "ceres/compressed_row_sparse_matrix.h"
+#include "ceres/context_impl.h"
#include "ceres/internal/export.h"
#include "ceres/types.h"
-#include "ceres/context_impl.h"
#ifndef CERES_NO_CUDA
#include "ceres/cuda_buffer.h"
@@ -58,7 +58,6 @@
// CUDA-accelerated operations.
class CERES_NO_EXPORT CudaSparseMatrix {
public:
-
// Create a GPU copy of the matrix provided. The caller must ensure that
// InitCuda() has already been successfully called on context before calling
// this constructor.
@@ -86,7 +85,6 @@
const cusparseSpMatDescr_t& descr() const { return descr_; }
private:
-
// Disable copy and assignment.
CudaSparseMatrix(const CudaSparseMatrix&) = delete;
CudaSparseMatrix& operator=(const CudaSparseMatrix&) = delete;
diff --git a/internal/ceres/cuda_sparse_matrix_test.cc b/internal/ceres/cuda_sparse_matrix_test.cc
index 3a01ff9..ae76b8f 100644
--- a/internal/ceres/cuda_sparse_matrix_test.cc
+++ b/internal/ceres/cuda_sparse_matrix_test.cc
@@ -28,15 +28,16 @@
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+#include "ceres/cuda_sparse_matrix.h"
+
#include <string>
+#include "ceres/block_sparse_matrix.h"
#include "ceres/casts.h"
+#include "ceres/cuda_vector.h"
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_least_squares_problems.h"
-#include "ceres/block_sparse_matrix.h"
-#include "ceres/cuda_vector.h"
-#include "ceres/cuda_sparse_matrix.h"
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
@@ -113,24 +114,11 @@
// b: [1 2 3 4]'
// A1 * b = [3 5]'
// A2 * b = [5 18]'
- TripletSparseMatrix A1(
- 2,
- 4,
- {0, 0, 1, 1},
- {0, 1, 1, 2},
- {1, 1, 1, 1}
- );
- TripletSparseMatrix A2(
- 2,
- 4,
- {0, 0, 1, 1},
- {0, 1, 1, 2},
- {1, 2, 3, 4}
- );
+ TripletSparseMatrix A1(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 1, 1, 1});
+ TripletSparseMatrix A2(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4});
Vector b(4);
b << 1, 2, 3, 4;
-
ContextImpl context;
std::string message;
CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
@@ -163,13 +151,7 @@
// 0 3 4 0]
// b: [1 2 3 4]'
// A * b = [5 18]'
- TripletSparseMatrix A(
- 2,
- 4,
- {0, 0, 1, 1},
- {0, 1, 1, 2},
- {1, 2, 3, 4}
- );
+ TripletSparseMatrix A(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4});
Vector b(4);
b << 1, 2, 3, 4;
Vector x_expected(2);
@@ -199,13 +181,7 @@
// 0 3 4 0]
// b: [1 2]'
// A'* b = [1 8 8 0]'
- TripletSparseMatrix A(
- 2,
- 4,
- {0, 0, 1, 1},
- {0, 1, 1, 2},
- {1, 2, 3, 4}
- );
+ TripletSparseMatrix A(2, 4, {0, 0, 1, 1}, {0, 1, 1, 2}, {1, 2, 3, 4});
Vector b(2);
b << 1, 2;
Vector x_expected(4);
@@ -213,8 +189,7 @@
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
CudaSparseMatrix A_gpu(&context, *A_crs);
CudaVector b_gpu(&context, A.num_rows());
@@ -269,15 +244,13 @@
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
auto A_crs = CompressedRowSparseMatrix::FromTripletSparseMatrix(A);
CudaSparseMatrix A_gpu(&context, *A_crs);
CudaVector b_gpu(&context, N);
CudaVector x_gpu(&context, N);
x_gpu.CopyFromCpu(x);
-
// First check RightMultiply.
{
b_gpu.SetZero();
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
index 5a128de..7debeba 100644
--- a/internal/ceres/cuda_vector.cc
+++ b/internal/ceres/cuda_vector.cc
@@ -37,15 +37,15 @@
#include <math.h>
+#include "ceres/context_impl.h"
#include "ceres/internal/export.h"
#include "ceres/types.h"
-#include "ceres/context_impl.h"
#ifndef CERES_NO_CUDA
+#include "ceres/ceres_cuda_kernels.h"
#include "ceres/cuda_buffer.h"
#include "ceres/cuda_vector.h"
-#include "ceres/ceres_cuda_kernels.h"
#include "cublas_v2.h"
namespace ceres::internal {
@@ -72,18 +72,14 @@
}
}
-CudaVector::~CudaVector() {
- DestroyDescriptor();
-}
+CudaVector::~CudaVector() { DestroyDescriptor(); }
void CudaVector::Resize(int size) {
data_.Reserve(size);
num_rows_ = size;
DestroyDescriptor();
- CHECK_EQ(cusparseCreateDnVec(&descr_,
- num_rows_,
- data_.data(),
- CUDA_R_64F), CUSPARSE_STATUS_SUCCESS);
+ CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F),
+ CUSPARSE_STATUS_SUCCESS);
}
double CudaVector::Dot(const CudaVector& x) const {
@@ -93,19 +89,19 @@
data_.data(),
1,
x.data().data(),
- 1, &result),
- CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDdot failed.";
+ 1,
+ &result),
+ CUBLAS_STATUS_SUCCESS)
+ << "CuBLAS cublasDdot failed.";
return result;
}
double CudaVector::Norm() const {
double result = 0;
- CHECK_EQ(cublasDnrm2(context_->cublas_handle_,
- num_rows_,
- data_.data(),
- 1,
- &result),
- CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDnrm2 failed.";
+ CHECK_EQ(cublasDnrm2(
+ context_->cublas_handle_, num_rows_, data_.data(), 1, &result),
+ CUBLAS_STATUS_SUCCESS)
+ << "CuBLAS cublasDnrm2 failed.";
return result;
}
@@ -114,10 +110,8 @@
data_.CopyFromCpu(x.data(), x.rows(), context_->stream_);
num_rows_ = x.rows();
DestroyDescriptor();
- CHECK_EQ(cusparseCreateDnVec(&descr_,
- num_rows_,
- data_.data(),
- CUDA_R_64F), CUSPARSE_STATUS_SUCCESS);
+ CHECK_EQ(cusparseCreateDnVec(&descr_, num_rows_, data_.data(), CUDA_R_64F),
+ CUSPARSE_STATUS_SUCCESS);
}
void CudaVector::CopyTo(Vector* x) const {
@@ -150,12 +144,10 @@
CHECK_EQ(num_rows_, x.num_rows_);
if (b != 1.0) {
// First scale y by b.
- CHECK_EQ(cublasDscal(context_->cublas_handle_,
- num_rows_,
- &b,
- data_.data(),
- 1),
- CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDscal failed.";
+ CHECK_EQ(
+ cublasDscal(context_->cublas_handle_, num_rows_, &b, data_.data(), 1),
+ CUBLAS_STATUS_SUCCESS)
+ << "CuBLAS cublasDscal failed.";
}
// Then add a * x to y.
CHECK_EQ(cublasDaxpy(context_->cublas_handle_,
@@ -165,7 +157,8 @@
1,
data_.data(),
1),
- CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDaxpy failed.";
+ CUBLAS_STATUS_SUCCESS)
+ << "CuBLAS cublasDaxpy failed.";
}
void CudaVector::DtDxpy(const CudaVector& D, const CudaVector& x) {
@@ -177,12 +170,10 @@
}
void CudaVector::Scale(double s) {
- CHECK_EQ(cublasDscal(context_->cublas_handle_,
- num_rows_,
- &s,
- data_.data(),
- 1),
- CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDscal failed.";
+ CHECK_EQ(
+ cublasDscal(context_->cublas_handle_, num_rows_, &s, data_.data(), 1),
+ CUBLAS_STATUS_SUCCESS)
+ << "CuBLAS cublasDscal failed.";
}
} // namespace ceres::internal
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h
index 4018c1b..e7c4b81 100644
--- a/internal/ceres/cuda_vector.h
+++ b/internal/ceres/cuda_vector.h
@@ -39,17 +39,18 @@
// clang-format on
#include <math.h>
+
#include <memory>
#include <string>
+#include "ceres/context_impl.h"
#include "ceres/internal/export.h"
#include "ceres/types.h"
-#include "ceres/context_impl.h"
#ifndef CERES_NO_CUDA
-#include "ceres/cuda_buffer.h"
#include "ceres/ceres_cuda_kernels.h"
+#include "ceres/cuda_buffer.h"
#include "ceres/internal/eigen.h"
#include "cublas_v2.h"
#include "cusparse.h"
@@ -59,7 +60,6 @@
// An Nx1 vector, denoted y hosted on the GPU, with CUDA-accelerated operations.
class CERES_NO_EXPORT CudaVector {
public:
-
// Create a pre-allocated vector of size N and return a pointer to it. The
// caller must ensure that InitCuda() has already been successfully called on
// context before calling this method.
@@ -123,12 +123,11 @@
// object in the conjugate gradients linear solver.
inline double Norm(const CudaVector& x) { return x.Norm(); }
inline void SetZero(CudaVector& x) { x.SetZero(); }
-inline void Axpby(
- double a,
- const CudaVector& x,
- double b,
- const CudaVector& y,
- CudaVector& z) {
+inline void Axpby(double a,
+ const CudaVector& x,
+ double b,
+ const CudaVector& y,
+ CudaVector& z) {
if (&x == &y && &y == &z) {
// z = (a + b) * z;
z.Scale(a + b);
diff --git a/internal/ceres/cuda_vector_test.cc b/internal/ceres/cuda_vector_test.cc
index db930ed..84193c0 100644
--- a/internal/ceres/cuda_vector_test.cc
+++ b/internal/ceres/cuda_vector_test.cc
@@ -28,11 +28,12 @@
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+#include "ceres/cuda_vector.h"
+
#include <string>
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
-#include "ceres/cuda_vector.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
@@ -130,23 +131,15 @@
CudaVector x_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
- EXPECT_NEAR(x_gpu.Norm(),
- 2.0,
- std::numeric_limits<double>::epsilon());
+ EXPECT_NEAR(x_gpu.Norm(), 2.0, std::numeric_limits<double>::epsilon());
x_gpu.SetZero();
- EXPECT_NEAR(x_gpu.Norm(),
- 0.0,
- std::numeric_limits<double>::epsilon());
+ EXPECT_NEAR(x_gpu.Norm(), 0.0, std::numeric_limits<double>::epsilon());
x_gpu.CopyFromCpu(x);
- EXPECT_NEAR(x_gpu.Norm(),
- 2.0,
- std::numeric_limits<double>::epsilon());
+ EXPECT_NEAR(x_gpu.Norm(), 2.0, std::numeric_limits<double>::epsilon());
SetZero(x_gpu);
- EXPECT_NEAR(x_gpu.Norm(),
- 0.0,
- std::numeric_limits<double>::epsilon());
+ EXPECT_NEAR(x_gpu.Norm(), 0.0, std::numeric_limits<double>::epsilon());
}
TEST(CudaVector, Resize) {
@@ -187,8 +180,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -209,8 +201,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -231,8 +222,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -251,8 +241,7 @@
x << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -273,8 +262,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
CudaVector z_gpu(&context, 4);
@@ -296,8 +284,7 @@
x << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector z_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -318,8 +305,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
CudaVector x_gpu(&context, 10);
CudaVector y_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
@@ -340,8 +326,7 @@
y << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
CudaVector x_gpu(&context, 4);
CudaVector y_gpu(&context, 4);
x_gpu.CopyFromCpu(x);
@@ -360,8 +345,7 @@
x << 100, 10, 1, 0;
ContextImpl context;
std::string message;
- CHECK(context.InitCUDA(&message))
- << "InitCUDA() failed because: " << message;
+ CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
CudaVector x_gpu(&context, 10);
x_gpu.CopyFromCpu(x);
diff --git a/internal/ceres/implicit_schur_complement_test.cc b/internal/ceres/implicit_schur_complement_test.cc
index 66c6707..f7abca1 100644
--- a/internal/ceres/implicit_schur_complement_test.cc
+++ b/internal/ceres/implicit_schur_complement_test.cc
@@ -155,8 +155,8 @@
// Here, assuming that block_diagonal(F'F) == diagonal(F'F)
Matrix Z_reference =
(F.transpose() * F + DF).diagonal().asDiagonal().inverse() *
- F.transpose() * E * (E.transpose() * E + DE).inverse() *
- E.transpose() * F;
+ F.transpose() * E * (E.transpose() * E + DE).inverse() * E.transpose() *
+ F;
for (int i = 0; i < num_f_cols; ++i) {
Vector x(num_f_cols);
@@ -166,7 +166,6 @@
Vector y(num_f_cols);
y = lhs * x;
-
Vector z(num_f_cols);
isc.RightMultiplyAndAccumulate(x.data(), z.data());
diff --git a/internal/ceres/sparse_linear_operator_benchmark.cc b/internal/ceres/sparse_linear_operator_benchmark.cc
index c224e35..e2e22c9 100644
--- a/internal/ceres/sparse_linear_operator_benchmark.cc
+++ b/internal/ceres/sparse_linear_operator_benchmark.cc
@@ -34,16 +34,19 @@
#include <string>
#include "Eigen/Dense"
-#include "gflags/gflags.h"
#include "benchmark/benchmark.h"
-#include "ceres/context_impl.h"
#include "ceres/block_sparse_matrix.h"
+#include "ceres/context_impl.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_solver.h"
+#include "gflags/gflags.h"
+
+#ifndef CERES_NO_CUDA
#include "cuda_runtime.h"
+#endif
namespace ceres::internal {
@@ -153,6 +156,10 @@
CHECK_NE(sum, 0.0);
}
+BENCHMARK(BM_CpuRightMultiplyAndAccumulate);
+BENCHMARK(BM_CpuLeftMultiplyAndAccumulate);
+
+#ifndef CERES_NO_CUDA
static void BM_CudaRightMultiplyAndAccumulate(benchmark::State& state) {
// Perform setup here
std::unique_ptr<BlockSparseMatrix> jacobian =
@@ -165,9 +172,7 @@
std::string message;
context.InitCUDA(&message);
CompressedRowSparseMatrix jacobian_crs(
- jacobian->num_rows(),
- jacobian->num_cols(),
- jacobian->num_nonzeros());
+ jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros());
jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
CudaSparseMatrix cuda_jacobian(&context, jacobian_crs);
CudaVector cuda_x(&context, 0);
@@ -202,9 +207,7 @@
std::string message;
context.InitCUDA(&message);
CompressedRowSparseMatrix jacobian_crs(
- jacobian->num_rows(),
- jacobian->num_cols(),
- jacobian->num_nonzeros());
+ jacobian->num_rows(), jacobian->num_cols(), jacobian->num_nonzeros());
jacobian->ToCompressedRowSparseMatrix(&jacobian_crs);
CudaSparseMatrix cuda_jacobian(&context, jacobian_crs);
CudaVector cuda_x(&context, 0);
@@ -227,10 +230,9 @@
CHECK_NE(sum, 0.0);
}
-BENCHMARK(BM_CpuRightMultiplyAndAccumulate);
-BENCHMARK(BM_CpuLeftMultiplyAndAccumulate);
BENCHMARK(BM_CudaRightMultiplyAndAccumulate);
BENCHMARK(BM_CudaLeftMultiplyAndAccumulate);
+#endif
BENCHMARK_MAIN();