#include <deal.II/base/cuda.h>
#include <deal.II/base/exceptions.h>

#include <deal.II/lac/cuda_kernels.h>
#include <deal.II/lac/cuda_vector.h>
#include <deal.II/lac/read_write_vector.h>

#include <cmath>

#ifdef DEAL_II_WITH_CUDA

DEAL_II_NAMESPACE_OPEN

namespace LinearAlgebra
{
  namespace CUDAWrappers
  {
    using ::dealii::CUDAWrappers::block_size;
    using ::dealii::CUDAWrappers::chunk_size;
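
    // block_size is the number of CUDA threads per block and chunk_size is
    // the number of vector elements each thread works on, so every kernel
    // below is launched with 1 + (n_elements - 1) / (chunk_size * block_size)
    // blocks -- an integer ceiling division that covers all elements.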

    template <typename Number>
    Vector<Number>::Vector()
      : val(nullptr, Utilities::CUDA::delete_device_data<Number>)
      , n_elements(0)
    {}
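
    // val is a std::unique_ptr<Number[]> with a custom deleter
    // (Utilities::CUDA::delete_device_data), so device memory is released
    // automatically when the vector is destroyed.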

    template <typename Number>
    Vector<Number>::Vector(const Vector<Number> &V)
      : val(Utilities::CUDA::allocate_device_data<Number>(V.n_elements),
            Utilities::CUDA::delete_device_data<Number>)
      , n_elements(V.n_elements)
    {
      // Copy the values.
      const cudaError_t error_code = cudaMemcpy(val.get(),
                                                V.val.get(),
                                                n_elements * sizeof(Number),
                                                cudaMemcpyDeviceToDevice);
      AssertCuda(error_code);
    }

    template <typename Number>
    Vector<Number> &
    Vector<Number>::operator=(const Vector<Number> &V)
    {
      // Grow the allocation if necessary, then copy device-to-device.
      if (n_elements < V.n_elements)
        reinit(V.n_elements, true);
      else
        n_elements = V.n_elements;

      const cudaError_t error_code = cudaMemcpy(val.get(),
                                                V.val.get(),
                                                n_elements * sizeof(Number),
                                                cudaMemcpyDeviceToDevice);
      AssertCuda(error_code);

      return *this;
    }

    template <typename Number>
    Vector<Number>::Vector(const size_type n)
      : val(nullptr, Utilities::CUDA::delete_device_data<Number>)
      , n_elements(0)
    {
      // Allocate and zero-initialize n elements.
      reinit(n, false);
    }

    template <typename Number>
    void
    Vector<Number>::reinit(const size_type n, const bool omit_zeroing_entries)
    {
      // Resize the underlying array if necessary.
      if (n == 0)
        val.reset();
      else if (n != n_elements)
        val.reset(Utilities::CUDA::allocate_device_data<Number>(n));

      // If requested, set the elements to zero.
      if (omit_zeroing_entries == false)
        {
          const cudaError_t error_code =
            cudaMemset(val.get(), 0, n * sizeof(Number));
          AssertCuda(error_code);
        }
      n_elements = n;
    }
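
    // Example of the two reinit modes above, for a Vector<Number> v:
    //   v.reinit(100);        // 100 elements, zero-initialized
    //   v.reinit(100, true);  // resize only, leave the values uninitialized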

    template <typename Number>
    void
    Vector<Number>::reinit(const VectorSpaceVector<Number> &V,
                           const bool omit_zeroing_entries)
    {
      reinit(V.size(), omit_zeroing_entries);
    }

    template <typename Number>
    void
    Vector<Number>::import_elements(
      const ReadWriteVector<Number> &V,
      VectorOperation::values        operation,
      std::shared_ptr<const Utilities::MPI::CommunicationPatternBase>)
    {
      if (operation == VectorOperation::insert)
        {
          const cudaError_t error_code = cudaMemcpy(val.get(),
                                                    V.begin(),
                                                    n_elements * sizeof(Number),
                                                    cudaMemcpyHostToDevice);
          AssertCuda(error_code);
        }
      else if (operation == VectorOperation::add)
        {
          // Copy the host values to a temporary device array...
          Number *    tmp;
          cudaError_t error_code =
            cudaMalloc(&tmp, n_elements * sizeof(Number));
          AssertCuda(error_code);
          error_code = cudaMemcpy(tmp,
                                  V.begin(),
                                  n_elements * sizeof(Number),
                                  cudaMemcpyHostToDevice);
          AssertCuda(error_code);

          // ...and add them to the locally owned elements.
          const int n_blocks =
            1 + (n_elements - 1) / (chunk_size * block_size);
          kernel::vector_bin_op<Number, kernel::Binop_Addition>
            <<<n_blocks, block_size>>>(val.get(), tmp, n_elements);
          AssertCudaKernel();

          // Delete the temporary array.
          Utilities::CUDA::free(tmp);
        }
      else
        AssertThrow(false, ExcNotImplemented());
    }
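
    // Note on the VectorOperation::add branch above: the host data first has
    // to be staged into the temporary device array tmp, because the
    // vector_bin_op kernel can only dereference device pointers.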

    template <typename Number>
    Vector<Number> &
    Vector<Number>::operator=(const Number s)
    {
      Assert(s == Number(), ExcMessage("Only 0 can be assigned to a vector."));
      (void)s;

      // Assigning zero amounts to zeroing the device array.
      const cudaError_t error_code =
        cudaMemset(val.get(), 0, n_elements * sizeof(Number));
      AssertCuda(error_code);

      return *this;
    }

    template <typename Number>
    Vector<Number> &
    Vector<Number>::operator*=(const Number factor)
    {
      AssertIsFinite(factor);
      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::vec_scale<Number>
        <<<n_blocks, block_size>>>(val.get(), factor, n_elements);
      AssertCudaKernel();

      return *this;
    }

    template <typename Number>
    Vector<Number> &
    Vector<Number>::operator/=(const Number factor)
    {
      AssertIsFinite(factor);
      Assert(factor != Number(0.), ExcZero());
      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::vec_scale<Number>
        <<<n_blocks, block_size>>>(val.get(), 1. / factor, n_elements);
      AssertCudaKernel();

      return *this;
    }
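
    // Division is implemented as multiplication by the reciprocal so that
    // the same vec_scale kernel serves both operator*= and operator/=.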

    template <typename Number>
    Vector<Number> &
    Vector<Number>::operator+=(const VectorSpaceVector<Number> &V)
    {
      // Check that casting to Vector<Number> will work.
      Assert(dynamic_cast<const Vector<Number> *>(&V) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_V = dynamic_cast<const Vector<Number> &>(V);
      Assert(down_V.size() == this->size(),
             ExcMessage(
               "Cannot add two vectors with different numbers of elements"));

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::vector_bin_op<Number, kernel::Binop_Addition>
        <<<n_blocks, block_size>>>(val.get(), down_V.val.get(), n_elements);
      AssertCudaKernel();

      return *this;
    }
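
    // operator-= below is identical except that vector_bin_op is
    // instantiated with the subtraction functor instead of the addition one.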

    template <typename Number>
    Vector<Number> &
    Vector<Number>::operator-=(const VectorSpaceVector<Number> &V)
    {
      Assert(dynamic_cast<const Vector<Number> *>(&V) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_V = dynamic_cast<const Vector<Number> &>(V);
      Assert(down_V.size() == this->size(),
             ExcMessage(
               "Cannot add two vectors with different numbers of elements."));

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::vector_bin_op<Number, kernel::Binop_Subtraction>
        <<<n_blocks, block_size>>>(val.get(), down_V.val.get(), n_elements);
      AssertCudaKernel();

      return *this;
    }

    template <typename Number>
    Number
    Vector<Number>::operator*(const VectorSpaceVector<Number> &V) const
    {
      Assert(dynamic_cast<const Vector<Number> *>(&V) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_V = dynamic_cast<const Vector<Number> &>(V);
      Assert(down_V.size() == this->size(),
             ExcMessage(
               "Cannot add two vectors with different numbers of elements"));

      Number *    result_device;
      cudaError_t error_code =
        cudaMalloc(&result_device, n_elements * sizeof(Number));
      AssertCuda(error_code);
      error_code = cudaMemset(result_device, 0, sizeof(Number));
      AssertCuda(error_code);

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::double_vector_reduction<Number, kernel::DotProduct<Number>>
        <<<dim3(n_blocks, 1), dim3(block_size)>>>(result_device,
                                                  val.get(),
                                                  down_V.val.get(),
                                                  static_cast<unsigned int>(
                                                    n_elements));

      // Copy the result back to the host.
      Number result;
      error_code = cudaMemcpy(&result,
                              result_device,
                              sizeof(Number),
                              cudaMemcpyDeviceToHost);
      AssertCuda(error_code);
      // Free the memory on the device.
      error_code = cudaFree(result_device);
      AssertCuda(error_code);

      return result;
    }
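
    // result_device is zero-initialized before the launch because the
    // reduction kernel accumulates the per-block partial results into it;
    // only the leading Number is copied back to the host afterwards.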

    template <typename Number>
    void
    Vector<Number>::add(const Number a)
    {
      AssertIsFinite(a);
      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::vec_add<Number>
        <<<n_blocks, block_size>>>(val.get(), a, n_elements);
      AssertCudaKernel();
    }

    template <typename Number>
    void
    Vector<Number>::add(const Number a, const VectorSpaceVector<Number> &V)
    {
      AssertIsFinite(a);
      Assert(dynamic_cast<const Vector<Number> *>(&V) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_V = dynamic_cast<const Vector<Number> &>(V);
      Assert(down_V.size() == this->size(),
             ExcMessage(
               "Cannot add two vectors with different numbers of elements."));

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::add_aV<Number><<<dim3(n_blocks, 1), dim3(block_size)>>>(
        val.get(), a, down_V.val.get(), n_elements);
      AssertCudaKernel();
    }

    template <typename Number>
    void
    Vector<Number>::add(const Number                     a,
                        const VectorSpaceVector<Number> &V,
                        const Number                     b,
                        const VectorSpaceVector<Number> &W)
    {
      AssertIsFinite(a);
      AssertIsFinite(b);
      Assert(dynamic_cast<const Vector<Number> *>(&V) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_V = dynamic_cast<const Vector<Number> &>(V);
      Assert(down_V.size() == this->size(),
             ExcMessage(
               "Cannot add two vectors with different numbers of elements."));
      Assert(dynamic_cast<const Vector<Number> *>(&W) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_W = dynamic_cast<const Vector<Number> &>(W);
      Assert(down_W.size() == this->size(),
             ExcMessage(
               "Cannot add two vectors with different numbers of elements."));

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::add_aVbW<Number><<<dim3(n_blocks, 1), dim3(block_size)>>>(
        val.get(), a, down_V.val.get(), b, down_W.val.get(), n_elements);
      AssertCudaKernel();
    }

    template <typename Number>
    void
    Vector<Number>::sadd(const Number                     s,
                         const Number                     a,
                         const VectorSpaceVector<Number> &V)
    {
      AssertIsFinite(s);
      AssertIsFinite(a);
      Assert(dynamic_cast<const Vector<Number> *>(&V) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_V = dynamic_cast<const Vector<Number> &>(V);
      Assert(down_V.size() == this->size(),
             ExcMessage(
               "Cannot add two vectors with different numbers of elements."));

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::sadd<Number><<<dim3(n_blocks, 1), dim3(block_size)>>>(
        s, val.get(), a, down_V.val.get(), n_elements);
      AssertCudaKernel();
    }
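
    // sadd above computes *this = s * (*this) + a * V in a single kernel
    // launch.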

    template <typename Number>
    void
    Vector<Number>::scale(const VectorSpaceVector<Number> &scaling_factors)
    {
      Assert(dynamic_cast<const Vector<Number> *>(&scaling_factors) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_scaling_factors =
        dynamic_cast<const Vector<Number> &>(scaling_factors);
      Assert(down_scaling_factors.size() == this->size(),
             ExcMessage(
               "Cannot scale two vectors with different numbers of elements."));

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::scale<Number><<<dim3(n_blocks, 1), dim3(block_size)>>>(
        val.get(), down_scaling_factors.val.get(), n_elements);
      AssertCudaKernel();
    }
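
    // scale multiplies element-wise: val[i] *= scaling_factors[i].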

    template <typename Number>
    void
    Vector<Number>::equ(const Number a, const VectorSpaceVector<Number> &V)
    {
      AssertIsFinite(a);
      Assert(dynamic_cast<const Vector<Number> *>(&V) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_V = dynamic_cast<const Vector<Number> &>(V);
      Assert(down_V.size() == this->size(),
             ExcMessage(
               "Cannot assign two vectors with different numbers of elements."));

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::equ<Number><<<dim3(n_blocks, 1), dim3(block_size)>>>(
        val.get(), a, down_V.val.get(), n_elements);
      AssertCudaKernel();
    }
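
    // equ overwrites the vector: val[i] = a * V[i].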

    template <typename Number>
    bool
    Vector<Number>::all_zero() const
    {
      return (linfty_norm() == 0) ? true : false;
    }
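
    // all_zero can rely on linfty_norm(): the maximum absolute value is zero
    // exactly when every element is zero.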

    template <typename Number>
    typename Vector<Number>::value_type
    Vector<Number>::mean_value() const
    {
      Number *    result_device;
      cudaError_t error_code = cudaMalloc(&result_device, sizeof(Number));
      AssertCuda(error_code);
      error_code = cudaMemset(result_device, 0, sizeof(Number));
      AssertCuda(error_code);

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::reduction<Number, kernel::ElemSum<Number>>
        <<<dim3(n_blocks, 1), dim3(block_size)>>>(result_device,
                                                  val.get(),
                                                  n_elements);

      // Copy the result back to the host.
      Number result;
      error_code = cudaMemcpy(&result,
                              result_device,
                              sizeof(Number),
                              cudaMemcpyDeviceToHost);
      AssertCuda(error_code);
      // Free the memory on the device.
      error_code = cudaFree(result_device);
      AssertCuda(error_code);

      return result /
             static_cast<typename Vector<Number>::value_type>(n_elements);
    }
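
    // mean_value is the ElemSum reduction divided by the number of elements;
    // the same allocate/reduce/copy-back pattern recurs in the norms below.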

    template <typename Number>
    typename Vector<Number>::real_type
    Vector<Number>::l1_norm() const
    {
      Number *    result_device;
      cudaError_t error_code = cudaMalloc(&result_device, sizeof(Number));
      AssertCuda(error_code);
      error_code = cudaMemset(result_device, 0, sizeof(Number));
      AssertCuda(error_code);

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::reduction<Number, kernel::L1Norm<Number>>
        <<<dim3(n_blocks, 1), dim3(block_size)>>>(result_device,
                                                  val.get(),
                                                  n_elements);

      // Copy the result back to the host.
      Number result;
      error_code = cudaMemcpy(&result,
                              result_device,
                              sizeof(Number),
                              cudaMemcpyDeviceToHost);
      AssertCuda(error_code);
      // Free the memory on the device.
      error_code = cudaFree(result_device);
      AssertCuda(error_code);

      return result;
    }

    template <typename Number>
    typename Vector<Number>::real_type
    Vector<Number>::l2_norm() const
    {
      return std::sqrt(norm_sqr());
    }

    template <typename Number>
    typename Vector<Number>::real_type
    Vector<Number>::norm_sqr() const
    {
      return (*this) * (*this);
    }

    template <typename Number>
    typename Vector<Number>::real_type
    Vector<Number>::linfty_norm() const
    {
      Number *    result_device;
      cudaError_t error_code = cudaMalloc(&result_device, sizeof(Number));
      AssertCuda(error_code);
      error_code = cudaMemset(result_device, 0, sizeof(Number));
      AssertCuda(error_code);

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::reduction<Number, kernel::LInfty<Number>>
        <<<dim3(n_blocks, 1), dim3(block_size)>>>(result_device,
                                                  val.get(),
                                                  n_elements);

      // Copy the result back to the host.
      Number result;
      error_code = cudaMemcpy(&result,
                              result_device,
                              sizeof(Number),
                              cudaMemcpyDeviceToHost);
      AssertCuda(error_code);
      // Free the memory on the device.
      error_code = cudaFree(result_device);
      AssertCuda(error_code);

      return result;
    }
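
    // l1_norm and linfty_norm only differ in the reduction functor: sum of
    // absolute values versus maximum absolute value.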

    template <typename Number>
    Number
    Vector<Number>::add_and_dot(const Number                     a,
                                const VectorSpaceVector<Number> &V,
                                const VectorSpaceVector<Number> &W)
    {
      AssertIsFinite(a);
      Assert(dynamic_cast<const Vector<Number> *>(&V) != nullptr,
             ExcVectorTypeNotCompatible());
      Assert(dynamic_cast<const Vector<Number> *>(&W) != nullptr,
             ExcVectorTypeNotCompatible());
      const Vector<Number> &down_V = dynamic_cast<const Vector<Number> &>(V);
      Assert(down_V.size() == this->size(),
             ExcMessage(
               "Cannot add two vectors with different numbers of elements."));
      const Vector<Number> &down_W = dynamic_cast<const Vector<Number> &>(W);
      Assert(down_W.size() == this->size(),
             ExcMessage(
               "Cannot add two vectors with different numbers of elements."));

      Number *    result_device;
      cudaError_t error_code = cudaMalloc(&result_device, sizeof(Number));
      AssertCuda(error_code);
      error_code = cudaMemset(result_device, 0, sizeof(Number));
      AssertCuda(error_code);

      const int n_blocks = 1 + (n_elements - 1) / (chunk_size * block_size);
      kernel::add_and_dot<Number>
        <<<dim3(n_blocks, 1), dim3(block_size)>>>(result_device,
                                                  val.get(),
                                                  down_V.val.get(),
                                                  down_W.val.get(),
                                                  a,
                                                  n_elements);

      Number result;
      error_code = cudaMemcpy(&result,
                              result_device,
                              sizeof(Number),
                              cudaMemcpyDeviceToHost);
      AssertCuda(error_code);
      error_code = cudaFree(result_device);
      AssertCuda(error_code);

      return result;
    }
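
    // add_and_dot fuses two operations: it updates *this += a * V and returns
    // the dot product of the updated vector with W, saving one sweep through
    // memory compared to calling add() followed by operator*().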

    template <typename Number>
    void
    Vector<Number>::print(std::ostream &     out,
                          const unsigned int precision,
                          const bool         scientific,
                          const bool) const
    {
      AssertThrow(out.fail() == false, ExcIO());
      std::ios::fmtflags old_flags     = out.flags();
      unsigned int       old_precision = out.precision(precision);

      out.precision(precision);
      if (scientific)
        out.setf(std::ios::scientific, std::ios::floatfield);
      else
        out.setf(std::ios::fixed, std::ios::floatfield);

      out << "IndexSet: ";
      complete_index_set(n_elements).print(out);
      out << std::endl;

      // Copy the vector to the host and print the elements.
      std::vector<Number> cpu_val(n_elements);
      Utilities::CUDA::copy_to_host(val.get(), cpu_val);
      for (unsigned int i = 0; i < n_elements; ++i)
        out << cpu_val[i] << std::endl;
      out << std::flush;

      AssertThrow(out.fail() == false, ExcIO());
      // Reset the output format.
      out.flags(old_flags);
      out.precision(old_precision);
    }
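
    /* Usage sketch (illustrative only, not part of this translation unit):

         LinearAlgebra::CUDAWrappers::Vector<double> v(1000);
         v.add(1.);                       // v_i = 1 for all i
         const double norm = v.l2_norm(); // sqrt(1000)
         v *= 2.;                         // v_i = 2
         std::cout << v.mean_value();     // prints 2
    */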

    template <typename Number>
    std::size_t
    Vector<Number>::memory_consumption() const
    {
      std::size_t memory = sizeof(*this);
      memory += sizeof(Number) * static_cast<std::size_t>(n_elements);

      return memory;
    }

    // Explicit instantiations for the supported scalar types.
    template class Vector<float>;
    template class Vector<double>;
  } // namespace CUDAWrappers
} // namespace LinearAlgebra

DEAL_II_NAMESPACE_CLOSE

#endif