CUDA CGNR, Part 2: CudaVector

* Added CudaVector, a GPU-resident vector class with CUDA-accelerated
  vector operations (dot product, norm, scaling, axpby).
* Added tests for CudaVector.
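
Example usage (a sketch based on the API added in this change; see
cuda_vector_test.cc for complete coverage):

  ContextImpl context;
  std::string message;
  CHECK(context.InitCUDA(&message)) << message;

  Vector x_cpu(3), y_cpu(3);    // Eigen vectors on the host.
  x_cpu << 1, 2, 3;
  y_cpu << 4, 5, 6;

  CudaVector x(&context, 3);
  CudaVector y(&context, 3);
  x.CopyFromCpu(x_cpu);
  y.CopyFromCpu(y_cpu);
  y.Axpby(2.0, x, 1.0);         // y = 2 * x + y, computed on the GPU.
  const double d = x.Dot(y);    // Inner product via cuBLAS.
  y.CopyTo(&y_cpu);             // Copy the result back to the host.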

Change-Id: I528258c8e883011e8709bddf59edb1e933af8060
diff --git a/internal/ceres/CMakeLists.txt b/internal/ceres/CMakeLists.txt
index afe91fb..02916b1 100644
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -198,6 +198,7 @@
     cost_function.cc
     covariance.cc
     covariance_impl.cc
+    cuda_vector.cc
     dense_cholesky.cc
     dense_normal_cholesky_solver.cc
     dense_qr.cc
@@ -483,6 +484,7 @@
   ceres_test(cuda_dense_cholesky)
   ceres_test(cuda_dense_qr)
   ceres_test(cuda_kernels)
+  ceres_test(cuda_vector)
   ceres_test(dense_linear_solver)
   ceres_test(dense_cholesky)
   ceres_test(dense_qr)
diff --git a/internal/ceres/context_impl.cc b/internal/ceres/context_impl.cc
index b8f9ff5..9f6cc25 100644
--- a/internal/ceres/context_impl.cc
+++ b/internal/ceres/context_impl.cc
@@ -63,11 +63,11 @@
     cudaStreamDestroy(stream_);
     stream_ = nullptr;
   }
-  cuda_initialized_ = false;
+  is_cuda_initialized_ = false;
 }
 
 bool ContextImpl::InitCUDA(std::string* message) {
-  if (cuda_initialized_) {
+  if (is_cuda_initialized_) {
     return true;
   }
   EventLogger event_logger("InitCuda");
@@ -105,7 +105,7 @@
     return false;
   }
   event_logger.AddEvent("SetStream");
-  cuda_initialized_ = true;
+  is_cuda_initialized_ = true;
   return true;
 }
 #endif  // CERES_NO_CUDA
diff --git a/internal/ceres/context_impl.h b/internal/ceres/context_impl.h
index d4bd436..4bd18e1 100644
--- a/internal/ceres/context_impl.h
+++ b/internal/ceres/context_impl.h
@@ -78,13 +78,13 @@
   // returned.
   bool InitCUDA(std::string* message);
   void TearDown();
+  inline bool IsCUDAInitialized() const { return is_cuda_initialized_; }
 
   cusolverDnHandle_t cusolver_handle_ = nullptr;
   cublasHandle_t cublas_handle_ = nullptr;
   cudaStream_t stream_ = nullptr;
   cusparseHandle_t cusparse_handle_ = nullptr;
-  // Indicates whether all the CUDA resources have been initialized.
-  bool cuda_initialized_ = false;
+  bool is_cuda_initialized_ = false;
 #endif  // CERES_NO_CUDA
 };
 
diff --git a/internal/ceres/cuda_vector.cc b/internal/ceres/cuda_vector.cc
new file mode 100644
index 0000000..5a128de
--- /dev/null
+++ b/internal/ceres/cuda_vector.cc
@@ -0,0 +1,190 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+//
+// A simple CUDA vector class.
+
+// This include must come before any #ifndef check on Ceres compile options.
+// clang-format off
+#include "ceres/internal/config.h"
+// clang-format on
+
+#include <math.h>
+
+#include "ceres/internal/export.h"
+#include "ceres/types.h"
+#include "ceres/context_impl.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "ceres/cuda_buffer.h"
+#include "ceres/cuda_vector.h"
+#include "ceres/ceres_cuda_kernels.h"
+#include "cublas_v2.h"
+
+namespace ceres::internal {
+
+CudaVector::CudaVector(ContextImpl* context, int size) {
+  CHECK(context != nullptr);
+  CHECK(context->IsCUDAInitialized());
+  context_ = context;
+  Resize(size);
+}
+
+CudaVector& CudaVector::operator=(const CudaVector& other) {
+  if (this != &other) {
+    Resize(other.num_rows());
+    data_.CopyFromGPUArray(other.data_.data(), num_rows_, context_->stream_);
+  }
+  return *this;
+}
+
+void CudaVector::DestroyDescriptor() {
+  if (descr_ != nullptr) {
+    CHECK_EQ(cusparseDestroyDnVec(descr_), CUSPARSE_STATUS_SUCCESS);
+    descr_ = nullptr;
+  }
+}
+
+CudaVector::~CudaVector() {
+  DestroyDescriptor();
+}
+
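+// Resizes the vector. Reserve() may reallocate the underlying device buffer,
+// so the cuSPARSE dense vector descriptor is recreated to point at the
+// (possibly new) device pointer and logical size.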
+void CudaVector::Resize(int size) {
+  data_.Reserve(size);
+  num_rows_ = size;
+  DestroyDescriptor();
+  CHECK_EQ(cusparseCreateDnVec(&descr_,
+                               num_rows_,
+                               data_.data(),
+                               CUDA_R_64F), CUSPARSE_STATUS_SUCCESS);
+}
+
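+// Note: cuBLAS defaults to the host pointer mode, so scalar-returning
+// reductions such as Dot() and Norm() below block until the result is
+// available on the host.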
+double CudaVector::Dot(const CudaVector& x) const {
+  double result = 0;
+  CHECK_EQ(cublasDdot(context_->cublas_handle_,
+                      num_rows_,
+                      data_.data(),
+                      1,
+                      x.data().data(),
+                      1, &result),
+           CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDdot failed.";
+  return result;
+}
+
+double CudaVector::Norm() const {
+  double result = 0;
+  CHECK_EQ(cublasDnrm2(context_->cublas_handle_,
+                       num_rows_,
+                       data_.data(),
+                       1,
+                       &result),
+           CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDnrm2 failed.";
+  return result;
+}
+
+void CudaVector::CopyFromCpu(const Vector& x) {
+  data_.Reserve(x.rows());
+  data_.CopyFromCpu(x.data(), x.rows(), context_->stream_);
+  num_rows_ = x.rows();
+  DestroyDescriptor();
+  CHECK_EQ(cusparseCreateDnVec(&descr_,
+                               num_rows_,
+                               data_.data(),
+                               CUDA_R_64F), CUSPARSE_STATUS_SUCCESS);
+}
+
+void CudaVector::CopyTo(Vector* x) const {
+  CHECK(x != nullptr);
+  x->resize(num_rows_);
+  // Need to synchronize with any GPU kernels that may be writing to the
+  // buffer before the transfer happens.
+  CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
+  data_.CopyToCpu(x->data(), num_rows_);
+}
+
+void CudaVector::CopyTo(double* x) const {
+  CHECK(x != nullptr);
+  // Need to synchronize with any GPU kernels that may be writing to the
+  // buffer before the transfer happens.
+  CHECK_EQ(cudaStreamSynchronize(context_->stream_), cudaSuccess);
+  data_.CopyToCpu(x, num_rows_);
+}
+
+void CudaVector::SetZero() {
+  CHECK(data_.data() != nullptr);
+  CudaSetZeroFP64(data_.data(), num_rows_, context_->stream_);
+}
+
+void CudaVector::Axpby(double a, const CudaVector& x, double b) {
+  if (&x == this) {
+    Scale(a + b);
+    return;
+  }
+  CHECK_EQ(num_rows_, x.num_rows_);
+  if (b != 1.0) {
+    // First scale y by b.
+    CHECK_EQ(cublasDscal(context_->cublas_handle_,
+                         num_rows_,
+                         &b,
+                         data_.data(),
+                         1),
+             CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDscal failed.";
+  }
+  // Then add a * x to y.
+  CHECK_EQ(cublasDaxpy(context_->cublas_handle_,
+                       num_rows_,
+                       &a,
+                       x.data().data(),
+                       1,
+                       data_.data(),
+                       1),
+           CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDaxpy failed.";
+}
+
+void CudaVector::DtDxpy(const CudaVector& D, const CudaVector& x) {
+  CudaDtDxpy(data_.data(),
+             D.data().data(),
+             x.data().data(),
+             num_rows_,
+             context_->stream_);
+}
+
+void CudaVector::Scale(double s) {
+  CHECK_EQ(cublasDscal(context_->cublas_handle_,
+                       num_rows_,
+                       &s,
+                       data_.data(),
+                       1),
+           CUBLAS_STATUS_SUCCESS) << "CuBLAS cublasDscal failed.";
+}
+
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
diff --git a/internal/ceres/cuda_vector.h b/internal/ceres/cuda_vector.h
new file mode 100644
index 0000000..4018c1b
--- /dev/null
+++ b/internal/ceres/cuda_vector.h
@@ -0,0 +1,156 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+//
+// A simple CUDA vector class.
+
+#ifndef CERES_INTERNAL_CUDA_VECTOR_H_
+#define CERES_INTERNAL_CUDA_VECTOR_H_
+
+// This include must come before any #ifndef check on Ceres compile options.
+// clang-format off
+#include "ceres/internal/config.h"
+// clang-format on
+
+#include <math.h>
+#include <memory>
+#include <string>
+
+#include "ceres/internal/export.h"
+#include "ceres/types.h"
+#include "ceres/context_impl.h"
+
+#ifndef CERES_NO_CUDA
+
+#include "ceres/cuda_buffer.h"
+#include "ceres/ceres_cuda_kernels.h"
+#include "ceres/internal/eigen.h"
+#include "cublas_v2.h"
+#include "cusparse.h"
+
+namespace ceres::internal {
+
+// An N x 1 vector, denoted y, hosted on the GPU, with CUDA-accelerated
+// operations.
+class CERES_NO_EXPORT CudaVector {
+ public:
+
+  // Creates a pre-allocated vector of size N. The caller must ensure that
+  // InitCUDA() has already been called successfully on the context before
+  // calling this constructor.
+  CudaVector(ContextImpl* context, int size);
+
+  ~CudaVector();
+
+  void Resize(int size);
+
+  // Perform a deep copy of the vector.
+  CudaVector& operator=(const CudaVector&);
+
+  // Return the inner product x' * y.
+  double Dot(const CudaVector& x) const;
+
+  // Return the L2 norm of the vector (||y||_2).
+  double Norm() const;
+
+  // Set all elements to zero.
+  void SetZero();
+
+  // Copy from Eigen vector.
+  void CopyFromCpu(const Vector& x);
+
+  // Copy to Eigen vector.
+  void CopyTo(Vector* x) const;
+
+  // Copy to CPU memory array. It is the caller's responsibility to ensure
+  // that the array is large enough.
+  void CopyTo(double* x) const;
+
+  // y = a * x + b * y.
+  void Axpby(double a, const CudaVector& x, double b);
+
+  // y = diag(D)' * diag(D) * x + y.
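+  // Equivalently, y_i += D_i * D_i * x_i for each element i.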
+  void DtDxpy(const CudaVector& D, const CudaVector& x);
+
+  // y = s * y.
+  void Scale(double s);
+
+  int num_rows() const { return num_rows_; }
+  int num_cols() const { return 1; }
+
+  const CudaBuffer<double>& data() const { return data_; }
+
+  const cusparseDnVecDescr_t& descr() const { return descr_; }
+
+ private:
+  CudaVector(const CudaVector&) = delete;
+  void DestroyDescriptor();
+
+  int num_rows_ = 0;
+  ContextImpl* context_ = nullptr;
+  CudaBuffer<double> data_;
+  // CuSparse object that describes this dense vector.
+  cusparseDnVecDescr_t descr_ = nullptr;
+};
+
+// BLAS level-1 style operations on CUDA vectors. These free functions provide
+// an abstraction layer so that the conjugate gradients linear solver can be
+// written against different vector-style objects.
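+// For example, a conjugate gradients implementation written in terms of
+// Norm(x), Dot(x, y), Axpby(a, x, b, y, z), SetZero(x), and Copy(from, to)
+// can operate on CudaVector objects without further changes.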
+inline double Norm(const CudaVector& x) { return x.Norm(); }
+inline void SetZero(CudaVector& x) { x.SetZero(); }
+inline void Axpby(
+    double a,
+    const CudaVector& x,
+    double b,
+    const CudaVector& y,
+    CudaVector& z) {
+  if (&x == &y && &y == &z) {
+    // z = (a + b) * z;
+    z.Scale(a + b);
+  } else if (&x == &z) {
+    // x is aliased to z.
+    // z = x
+    //   = b * y + a * x;
+    z.Axpby(b, y, a);
+  } else if (&y == &z) {
+    // y is aliased to z.
+    // z = y = a * x + b * y;
+    z.Axpby(a, x, b);
+  } else {
+    // General case: all inputs and outputs are distinct.
+    z = y;
+    z.Axpby(a, x, b);
+  }
+}
+inline double Dot(const CudaVector& x, const CudaVector& y) { return x.Dot(y); }
+inline void Copy(const CudaVector& from, CudaVector& to) { to = from; }
+
+}  // namespace ceres::internal
+
+#endif  // CERES_NO_CUDA
+#endif  // CERES_INTERNAL_CUDA_VECTOR_H_
diff --git a/internal/ceres/cuda_vector_test.cc b/internal/ceres/cuda_vector_test.cc
new file mode 100644
index 0000000..db930ed
--- /dev/null
+++ b/internal/ceres/cuda_vector_test.cc
@@ -0,0 +1,422 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2022 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
+
+#include <cmath>
+#include <limits>
+#include <string>
+
+#include "ceres/internal/config.h"
+#include "ceres/internal/eigen.h"
+#include "ceres/cuda_vector.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+namespace ceres {
+namespace internal {
+
+#ifndef CERES_NO_CUDA
+
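+// Note: these tests require a CUDA-capable device at runtime. If
+// ContextImpl::InitCUDA() fails, the CHECK in each test aborts with the
+// returned error message.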
+TEST(CudaVector, Creation) {
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector x(&context, 1000);
+  EXPECT_EQ(x.num_rows(), 1000);
+  EXPECT_NE(x.data().data(), nullptr);
+}
+
+TEST(CudaVector, CopyVector) {
+  Vector x(3);
+  x << 1, 2, 3;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector y(&context, 10);
+  y.CopyFromCpu(x);
+  EXPECT_EQ(y.num_rows(), 3);
+
+  Vector z(3);
+  z << 0, 0, 0;
+  y.CopyTo(&z);
+  EXPECT_EQ(x, z);
+}
+
+TEST(CudaVector, DeepCopy) {
+  Vector x(3);
+  x << 1, 2, 3;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 3);
+  x_gpu.CopyFromCpu(x);
+
+  CudaVector y_gpu(&context, 3);
+  y_gpu.SetZero();
+  EXPECT_EQ(y_gpu.Norm(), 0.0);
+
+  y_gpu = x_gpu;
+  Vector y(3);
+  y << 0, 0, 0;
+  y_gpu.CopyTo(&y);
+  EXPECT_EQ(x, y);
+}
+
+TEST(CudaVector, Dot) {
+  Vector x(3);
+  Vector y(3);
+  x << 1, 2, 3;
+  y << 100, 10, 1;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 10);
+  CudaVector y_gpu(&context, 10);
+  x_gpu.CopyFromCpu(x);
+  y_gpu.CopyFromCpu(y);
+
+  EXPECT_EQ(x_gpu.Dot(y_gpu), 123.0);
+  EXPECT_EQ(Dot(x_gpu, y_gpu), 123.0);
+}
+
+TEST(CudaVector, Norm) {
+  Vector x(3);
+  x << 1, 2, 3;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 10);
+  x_gpu.CopyFromCpu(x);
+
+  EXPECT_NEAR(x_gpu.Norm(),
+              std::sqrt(1.0 + 4.0 + 9.0),
+              std::numeric_limits<double>::epsilon());
+
+  EXPECT_NEAR(Norm(x_gpu),
+              std::sqrt(1.0 + 4.0 + 9.0),
+              std::numeric_limits<double>::epsilon());
+}
+
+TEST(CudaVector, SetZero) {
+  Vector x(4);
+  x << 1, 1, 1, 1;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 10);
+  x_gpu.CopyFromCpu(x);
+
+  EXPECT_NEAR(x_gpu.Norm(),
+              2.0,
+              std::numeric_limits<double>::epsilon());
+
+  x_gpu.SetZero();
+  EXPECT_NEAR(x_gpu.Norm(),
+              0.0,
+              std::numeric_limits<double>::epsilon());
+
+  x_gpu.CopyFromCpu(x);
+  EXPECT_NEAR(x_gpu.Norm(),
+              2.0,
+              std::numeric_limits<double>::epsilon());
+  SetZero(x_gpu);
+  EXPECT_NEAR(x_gpu.Norm(),
+              0.0,
+              std::numeric_limits<double>::epsilon());
+}
+
+TEST(CudaVector, Resize) {
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 10);
+  EXPECT_EQ(x_gpu.num_rows(), 10);
+  x_gpu.Resize(4);
+  EXPECT_EQ(x_gpu.num_rows(), 4);
+}
+
+TEST(CudaVector, Axpy) {
+  Vector x(4);
+  Vector y(4);
+  x << 1, 1, 1, 1;
+  y << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  CudaVector y_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+  y_gpu.CopyFromCpu(y);
+
+  x_gpu.Axpby(2.0, y_gpu, 1.0);
+  Vector result;
+  Vector expected(4);
+  expected << 201, 21, 3, 1;
+  x_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, AxpbyBEquals1) {
+  Vector x(4);
+  Vector y(4);
+  x << 1, 1, 1, 1;
+  y << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  CudaVector y_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+  y_gpu.CopyFromCpu(y);
+
+  x_gpu.Axpby(2.0, y_gpu, 1.0);
+  Vector result;
+  Vector expected(4);
+  expected << 201, 21, 3, 1;
+  x_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, AxpbyMemberFunctionBNotEqual1) {
+  Vector x(4);
+  Vector y(4);
+  x << 1, 1, 1, 1;
+  y << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  CudaVector y_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+  y_gpu.CopyFromCpu(y);
+
+  x_gpu.Axpby(2.0, y_gpu, 3.0);
+  Vector result;
+  Vector expected(4);
+  expected << 203, 23, 5, 3;
+  x_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, AxpbyMemberFunctionBEqual1) {
+  Vector x(4);
+  Vector y(4);
+  x << 1, 1, 1, 1;
+  y << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  CudaVector y_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+  y_gpu.CopyFromCpu(y);
+
+  x_gpu.Axpby(2.0, y_gpu, 1.0);
+  Vector result;
+  Vector expected(4);
+  expected << 201, 21, 3, 1;
+  x_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, AxpbyMemberXAliasesY) {
+  Vector x(4);
+  x << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+
+  x_gpu.Axpby(2.0, x_gpu, 1.0);
+  Vector result;
+  Vector expected(4);
+  expected << 300, 30, 3, 0;
+  x_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, AxpbyNonMemberMethodNoAliases) {
+  Vector x(4);
+  Vector y(4);
+  x << 1, 1, 1, 1;
+  y << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  CudaVector y_gpu(&context, 4);
+  CudaVector z_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+  y_gpu.CopyFromCpu(y);
+  z_gpu.Resize(4);
+  z_gpu.SetZero();
+
+  Axpby(2.0, x_gpu, 3.0, y_gpu, z_gpu);
+  Vector result;
+  Vector expected(4);
+  expected << 302, 32, 5, 2;
+  z_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, AxpbyNonMemberMethodXAliasesY) {
+  Vector x(4);
+  x << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  CudaVector z_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+  z_gpu.SetZero();
+
+  Axpby(2.0, x_gpu, 3.0, x_gpu, z_gpu);
+  Vector result;
+  Vector expected(4);
+  expected << 500, 50, 5, 0;
+  z_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, AxpbyNonMemberMethodXAliasesZ) {
+  Vector x(4);
+  Vector y(4);
+  x << 1, 1, 1, 1;
+  y << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 10);
+  CudaVector y_gpu(&context, 10);
+  x_gpu.CopyFromCpu(x);
+  y_gpu.CopyFromCpu(y);
+
+  Axpby(2.0, x_gpu, 3.0, y_gpu, x_gpu);
+  Vector result;
+  Vector expected(4);
+  expected << 302, 32, 5, 2;
+  x_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, AxpbyNonMemberMethodYAliasesZ) {
+  Vector x(4);
+  Vector y(4);
+  x << 1, 1, 1, 1;
+  y << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  CudaVector y_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+  y_gpu.CopyFromCpu(y);
+
+  Axpby(2.0, x_gpu, 3.0, y_gpu, y_gpu);
+  Vector result;
+  Vector expected(4);
+  expected << 302, 32, 5, 2;
+  y_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, AxpbyNonMemberMethodXAliasesYAliasesZ) {
+  Vector x(4);
+  x << 100, 10, 1, 0;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message))
+      << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 10);
+  x_gpu.CopyFromCpu(x);
+
+  Axpby(2.0, x_gpu, 3.0, x_gpu, x_gpu);
+  Vector result;
+  Vector expected(4);
+  expected << 500, 50, 5, 0;
+  x_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, DtDxpy) {
+  Vector x(4);
+  Vector y(4);
+  Vector D(4);
+  x << 1, 2, 3, 4;
+  y << 100, 10, 1, 0;
+  D << 4, 3, 2, 1;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  CudaVector y_gpu(&context, 4);
+  CudaVector D_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+  y_gpu.CopyFromCpu(y);
+  D_gpu.CopyFromCpu(D);
+
+  y_gpu.DtDxpy(D_gpu, x_gpu);
+  Vector result;
+  Vector expected(4);
+  expected << 116, 28, 13, 4;
+  y_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+TEST(CudaVector, Scale) {
+  Vector x(4);
+  x << 1, 2, 3, 4;
+  ContextImpl context;
+  std::string message;
+  CHECK(context.InitCUDA(&message)) << "InitCUDA() failed because: " << message;
+  CudaVector x_gpu(&context, 4);
+  x_gpu.CopyFromCpu(x);
+
+  x_gpu.Scale(-3.0);
+
+  Vector result;
+  Vector expected(4);
+  expected << -3.0, -6.0, -9.0, -12.0;
+  x_gpu.CopyTo(&result);
+  EXPECT_EQ(result, expected);
+}
+
+#endif  // CERES_NO_CUDA
+
+}  // namespace internal
+}  // namespace ceres