11#ifndef CUBBYFLOW_CUDA_ARRAY_IMPL_HPP
12#define CUBBYFLOW_CUDA_ARRAY_IMPL_HPP
14#ifdef CUBBYFLOW_USE_CUDA
// Recursive block-copy helper (generic case for dimension index I).
// Call() loops i over [0, size[I - 1]) and forwards src/size/dst plus i to
// CUDABlockCopyHelper<T, N, I - 1>::Call.
// NOTE(review): the struct header, the braces and the tail of the recursive
// call (presumably the already-collected `indices...`) are not visible in
// this listing — confirm against the original header.
21template <
typename T,
size_t N,
size_t I>
24 template <
typename... RemainingIndices>
25 CUBBYFLOW_CUDA_HOST_DEVICE
static void Call(CUDAArrayView<const T, N> src,
26 CUDAStdArray<size_t, N> size,
27 CUDAArrayView<T, N> dst,
28 RemainingIndices... indices)
30 for (
size_t i = 0; i < size[I - 1]; ++i)
32 CUDABlockCopyHelper<T, N, I - 1>::Call(src, size, dst, i,
// Terminal case of the block-copy recursion: Call() loops i over
// [0, size[0]) and assigns dst(i, indices...) = src(i, indices...).
// NOTE(review): the specialization header is not visible in this listing;
// presumably this is CUDABlockCopyHelper<T, N, 1> — confirm.
38template <
typename T,
size_t N>
41 template <
typename... RemainingIndices>
42 CUBBYFLOW_CUDA_HOST_DEVICE
static void Call(CUDAArrayView<const T, N> src,
43 CUDAStdArray<size_t, N> size,
44 CUDAArrayView<T, N> dst,
45 RemainingIndices... indices)
47 for (
size_t i = 0; i < size[0]; ++i)
49 dst(i, indices...) = src(i, indices...);
// Forwards src/size/dst and a single index i to
// CUDABlockCopyHelper<T, N, N - 1>::Call.
// NOTE(review): the function signature and the computation/bounds check of
// i are not visible in this listing; presumably this is the body of the
// CUDABlockCopyKernelN kernel launched further below — confirm.
54template <
typename T,
size_t N>
62 CUDABlockCopyHelper<T, N, N - 1>::Call(
src,
size,
dst, i);
// Launcher for the N-dimensional block copy.
// Passes the extent of the last dimension (size[N - 1], narrowed from
// size_t to unsigned int) and 256 to CUDAComputeGridSize, launches
// CUDABlockCopyKernelN with the resulting numBlocks/numThreads, then checks
// the last CUDA error.
// NOTE(review): several lines of this function (including any early-out for
// empty input and the tail of the CUDAComputeGridSize call) are not visible
// in this listing — confirm against the original header.
78template <
typename T,
size_t N>
81 static void Call(CUDAArrayView<const T, N> src,
82 CUDAStdArray<size_t, N> size, CUDAArrayView<T, N> dst)
90 unsigned int numBlocks, numThreads;
91 CUDAComputeGridSize((
unsigned int)size[N - 1], 256, numBlocks,
93 CUDABlockCopyKernelN<<<numBlocks, numThreads>>>(src, size, dst);
94 CUBBYFLOW_CUDA_CHECK_LAST_ERROR(
95 "Failed executing CUDABlockCopyKernelN");
// Launcher for the 1-dimensional block copy.
// Passes size[0] (narrowed from size_t to unsigned int) and 256 to
// CUDAComputeGridSize, launches CUDABlockCopyKernel1 with the resulting
// numBlocks/numThreads, then checks the last CUDA error.
// NOTE(review): the enclosing specialization header and the lines between
// the signature and the grid-size computation are not visible here.
102 static void Call(CUDAArrayView<const T, 1> src,
103 CUDAStdArray<size_t, 1> size, CUDAArrayView<T, 1> dst)
111 unsigned int numBlocks, numThreads;
112 CUDAComputeGridSize((
unsigned int)size[0], 256, numBlocks, numThreads);
113 CUDABlockCopyKernel1<<<numBlocks, numThreads>>>(src, size, dst);
114 CUBBYFLOW_CUDA_CHECK_LAST_ERROR(
115 "Failed executing CUDABlockCopyKernel1");
// Default constructor: default-constructs the Base sub-object.
// NOTE(review): the constructor body is not visible in this listing.
121template <
typename T,
size_t N>
122CUDAArray<T, N>::CUDAArray() : Base()
// Constructs an array with the given per-dimension size: resizes m_data to
// `l` elements initialised with initVal, then points the base view at
// m_data's storage with extents `size`.
// NOTE(review): the `initVal` parameter, the member-initializer list and the
// computation of `l` are not visible in this listing; the loop over
// i in [1, N) presumably accumulates the product of the extents — confirm.
127template <
typename T,
size_t N>
128CUDAArray<T, N>::CUDAArray(
const CUDAStdArray<size_t, N>& size,
134 for (
size_t i = 1; i < N; ++i)
139 m_data.Resize(l, initVal);
140 Base::SetPtrAndSize(m_data.data(), size);
// Variadic constructor taking nx followed by further arguments.
// Delegates to the default constructor, lets Internal::GetSizeAndInitVal
// fill newSizeVec and initVal from the argument list, builds a temporary
// CUDAArray of that size/initial value and move-assigns it into *this.
// NOTE(review): the declaration of `initVal` and the tail of the
// GetSizeAndInitVal call (presumably `args...`) are not visible here.
144template <
typename T,
size_t N>
145template <
typename... Args>
146CUDAArray<T, N>::CUDAArray(
size_t nx, Args... args) : CUDAArray{}
148 Vector<size_t, N> newSizeVec;
150 Internal::GetSizeAndInitVal<T, N, N - 1>::Call(newSizeVec, initVal, nx,
152 CUDAStdArray<size_t, N> newSize(newSizeVec);
153 CUDAArray newArray(newSize, initVal);
154 *
this = std::move(newArray);
157template <
typename T,
size_t N>
158CUDAArray<T, N>::CUDAArray(NestedInitializerListsT<T, N> lst) : CUDAArray{}
160 Vector<size_t, N> newSize;
161 Internal::GetSizeFromInitList<T, N, N>::Call(newSize, lst);
163 Array<T, N> newCpuArray(newSize);
164 Internal::SetArrayFromInitList<T, N, N>::Call(newCpuArray, lst);
165 CopyFrom(newCpuArray);
// Constructor taking a std::vector<T>; the parameter type is gated by
// std::enable_if_t<(M == 1), ...>.
// NOTE(review): the declaration of M (presumably a `template <size_t M>`
// header) and the constructor body are not visible in this listing.
169template <
typename T,
size_t N>
171CUDAArray<T, N>::CUDAArray(
172 const std::enable_if_t<(M == 1), std::vector<T>>& vec)
// Converting constructor from any ArrayBase<T, N, OtherDerived>.
// NOTE(review): the initializer list and body are not visible in this
// listing; presumably it forwards to CopyFrom(other) — confirm.
178template <
typename T,
size_t N>
179template <
typename OtherDerived>
180CUDAArray<T, N>::CUDAArray(
const ArrayBase<T, N, OtherDerived>& other)
// Converting constructor from any CUDAArrayBase<T, N, OtherDerived>.
// NOTE(review): the initializer list and body are not visible in this
// listing; presumably it forwards to CopyFrom(other) — confirm.
186template <
typename T,
size_t N>
187template <
typename OtherDerived>
188CUDAArray<T, N>::CUDAArray(
const CUDAArrayBase<T, N, OtherDerived>& other)
// Copy constructor: delegates to the default constructor first.
// NOTE(review): the body that actually copies `other` is not visible in
// this listing.
194template <
typename T,
size_t N>
195CUDAArray<T, N>::CUDAArray(
const CUDAArray& other) : CUDAArray{}
200template <
typename T,
size_t N>
201CUDAArray<T, N>::CUDAArray(CUDAArray&& other) noexcept : CUDAArray{}
203 *
this = std::move(other);
// Assignment from a std::vector<T>; the parameter type is gated by
// std::enable_if_t<(M == 1), ...>. Returns CUDAArray<T, N>&.
// NOTE(review): the declaration of M and the function body are not visible
// in this listing.
206template <
typename T,
size_t N>
208CUDAArray<T, N>& CUDAArray<T, N>::operator=(
209 const std::enable_if_t<(M == 1), std::vector<T>>& vec)
// Assignment from any ArrayBase<T, N, OtherDerived>; returns
// CUDAArray<T, N>&.
// NOTE(review): the function body is not visible in this listing;
// presumably it calls CopyFrom(other) and returns *this — confirm.
215template <
typename T,
size_t N>
216template <
typename OtherDerived>
217CUDAArray<T, N>& CUDAArray<T, N>::operator=(
218 const ArrayBase<T, N, OtherDerived>& other)
// Assignment from any ArrayBase<const T, N, OtherDerived>; returns
// CUDAArray<T, N>&.
// NOTE(review): the function body is not visible in this listing;
// presumably it calls CopyFrom(other) and returns *this — confirm.
224template <
typename T,
size_t N>
225template <
typename OtherDerived>
226CUDAArray<T, N>& CUDAArray<T, N>::operator=(
227 const ArrayBase<const T, N, OtherDerived>& other)
// Assignment from any CUDAArrayBase<T, N, OtherDerived>; returns
// CUDAArray<T, N>&.
// NOTE(review): the function body is not visible in this listing;
// presumably it calls CopyFrom(other) and returns *this — confirm.
233template <
typename T,
size_t N>
234template <
typename OtherDerived>
235CUDAArray<T, N>& CUDAArray<T, N>::operator=(
236 const CUDAArrayBase<T, N, OtherDerived>& other)
// Assignment from any CUDAArrayBase<const T, N, OtherDerived>; returns
// CUDAArray<T, N>&.
// NOTE(review): the function body is not visible in this listing;
// presumably it calls CopyFrom(other) and returns *this — confirm.
242template <
typename T,
size_t N>
243template <
typename OtherDerived>
244CUDAArray<T, N>& CUDAArray<T, N>::operator=(
245 const CUDAArrayBase<const T, N, OtherDerived>& other)
// Copy assignment: copies other's m_data into this array's m_data, then
// re-points the base view at this array's own storage using other's size.
// NOTE(review): the return statement is not visible in this listing.
251template <
typename T,
size_t N>
252CUDAArray<T, N>& CUDAArray<T, N>::operator=(
const CUDAArray& other)
254 m_data = other.m_data;
255 Base::SetPtrAndSize(m_data.data(), other.Size());
// Move assignment (noexcept); returns CUDAArray<T, N>&.
// NOTE(review): the function body is not visible in this listing.
259template <
typename T,
size_t N>
260CUDAArray<T, N>& CUDAArray<T, N>::operator=(CUDAArray&& other)
noexcept
267template <
typename T,
size_t N>
268template <
typename A,
size_t M>
269std::enable_if_t<(M == 1),
void> CUDAArray<T, N>::CopyFrom(
270 const std::vector<T, A>& vec)
272 CUDAArray newArray(vec.size());
273 newArray.m_data.CopyFrom(vec);
274 newArray.SetPtrAndSize(newArray.m_data.data(), newArray.Size());
275 *
this = std::move(newArray);
278template <
typename T,
size_t N>
279template <
typename OtherDerived>
280void CUDAArray<T, N>::CopyFrom(
const ArrayBase<T, N, OtherDerived>& other)
282 CUDAArray newArray(other.Size());
283 CUDACopyHostToDevice(other.data(), other.Length(), newArray.data());
284 *
this = std::move(newArray);
287template <
typename T,
size_t N>
288template <
typename OtherDerived>
289void CUDAArray<T, N>::CopyFrom(
const ArrayBase<const T, N, OtherDerived>& other)
291 CUDAArray newArray(other.Size());
292 CUDACopyHostToDevice(other.data(), other.Length(), newArray.data());
293 *
this = std::move(newArray);
296template <
typename T,
size_t N>
297template <
typename OtherDerived>
298void CUDAArray<T, N>::CopyFrom(
const CUDAArrayBase<T, N, OtherDerived>& other)
300 CUDAArray newArray(other.Size());
301 CUDACopyDeviceToDevice(other.data(), other.Length(), newArray.data());
302 *
this = std::move(newArray);
305template <
typename T,
size_t N>
306template <
typename OtherDerived>
307void CUDAArray<T, N>::CopyFrom(
308 const CUDAArrayBase<const T, N, OtherDerived>& other)
310 CUDAArray newArray(other.Size());
311 CUDACopyDeviceToDevice(other.data(), other.Length(), newArray.data());
312 *
this = std::move(newArray);
315template <
typename T,
size_t N>
316template <
typename A,
size_t M>
317std::enable_if_t<(M == 1),
void> CUDAArray<T, N>::CopyTo(std::vector<T, A>& vec)
319 vec.resize(Length());
320 CUDACopyDeviceToHost(data(), Length(), vec.data());
323template <
typename T,
size_t N>
324void CUDAArray<T, N>::CopyTo(Array<T, N>& other)
326 other.Resize(m_size.ToVector());
327 CUDACopyDeviceToHost(data(), Length(), other.data());
330template <
typename T,
size_t N>
331void CUDAArray<T, N>::CopyTo(ArrayView<T, N>& other)
333 assert(m_size.ToVector() == other.Size());
334 CUDACopyDeviceToHost(data(), Length(), other.data());
337template <
typename T,
size_t N>
338void CUDAArray<T, N>::CopyTo(CUDAArray<T, N>& other)
340 other.Resize(m_size.ToVector());
341 CUDACopyDeviceToDevice(data(), Length(), other.data());
344template <
typename T,
size_t N>
345void CUDAArray<T, N>::CopyTo(CUDAArrayView<T, N>& other)
347 assert(Length() == other.Length());
348 CUDACopyDeviceToDevice(data(), Length(), other.data());
// Fill(val): takes the fill value by const reference and returns nothing.
// NOTE(review): the function body is not visible in this listing.
351template <
typename T,
size_t N>
352void CUDAArray<T, N>::Fill(
const T& val)
358template <
typename T,
size_t N>
359void CUDAArray<T, N>::Resize(CUDAStdArray<size_t, N> newSize,
const T& initVal)
361 CUDAArray newArray(newSize, initVal);
362 CUDAStdArray<size_t, N> minSize;
364 for (
size_t i = 0; i < N; ++i)
366 minSize[i] = std::min(m_size[i], newArray.m_size[i]);
369 Internal::CUDABlockCopy<T, N>::Call(View(), minSize, newArray.View());
371 *
this = std::move(newArray);
// Variadic Resize taking nx followed by further arguments.
// Lets Internal::GetSizeAndInitVal fill newSizeVec and initVal from the
// argument list, converts the size to a CUDAStdArray and forwards to
// Resize(newSize, initVal).
// NOTE(review): the declaration of `initVal` and the tail of the
// GetSizeAndInitVal call (presumably `args...`) are not visible here.
374template <
typename T,
size_t N>
375template <
typename... Args>
376void CUDAArray<T, N>::Resize(
size_t nx, Args... args)
378 Vector<size_t, N> newSizeVec;
381 Internal::GetSizeAndInitVal<T, N, N - 1>::Call(newSizeVec, initVal, nx,
384 CUDAStdArray<size_t, N> newSize(newSizeVec);
385 Resize(newSize, initVal);
// Appends a single value: pushes val onto m_data, then refreshes the base
// pointer and size from m_data (data() / Size()).
// The return type is enabled only when M == 1.
// NOTE(review): the declaration of M (presumably a `template <size_t M>`
// header) is not visible in this listing.
389template <
typename T,
size_t N>
391std::enable_if_t<(M == 1),
void> CUDAArray<T, N>::Append(
const T& val)
393 m_data.PushBack(val);
394 Base::SetPtrAndSize(m_data.data(), m_data.Size());
397template <
typename T,
size_t N>
398template <
typename A,
size_t M>
399std::enable_if_t<(M == 1),
void> CUDAArray<T, N>::Append(
400 const std::vector<T, A>& extra)
402 m_data.Append(extra);
403 m_size[0] = m_data.size();
// Appends the elements of an ArrayBase<T, N, OtherDerived>.
// Allocates newArray with Length() + extra.Length() elements, copies the
// current contents into it with CUDACopy, then copies `extra` behind them
// (offset m_size[0]) with CUDACopyHostToDevice.
// The return type is enabled only when M == 1.
// NOTE(review): the statement that installs newArray into *this (e.g. a
// Swap or move-assignment) is not visible in this listing — confirm it
// exists, otherwise the appended data is discarded when newArray goes out
// of scope.
406template <
typename T,
size_t N>
407template <
typename OtherDerived,
size_t M>
408std::enable_if_t<(M == 1),
void> CUDAArray<T, N>::Append(
409 const ArrayBase<T, N, OtherDerived>& extra)
411 CUDAArray newArray(Length() + extra.Length());
412 CUDACopy(data(), Length(), newArray.data());
413 CUDACopyHostToDevice(extra.data(), extra.Length(),
414 newArray.data() + m_size[0]);
// Appends the elements of a CUDAArrayBase<T, N, OtherDerived>.
// Allocates newArray with Length() + extra.Length() elements, copies the
// current contents into it with CUDACopy, then copies `extra` behind them
// (offset m_size[0]) with a second CUDACopy.
// The return type is enabled only when M == 1.
// NOTE(review): the statement that installs newArray into *this (e.g. a
// Swap or move-assignment) is not visible in this listing — confirm it
// exists, otherwise the appended data is discarded when newArray goes out
// of scope.
418template <
typename T,
size_t N>
419template <
typename OtherDerived,
size_t M>
420std::enable_if_t<(M == 1),
void> CUDAArray<T, N>::Append(
421 const CUDAArrayBase<T, N, OtherDerived>& extra)
423 CUDAArray newArray(Length() + extra.Length());
424 CUDACopy(data(), Length(), newArray.data());
425 CUDACopy(extra.data(), extra.Length(), newArray.data() + m_size[0]);
// Clears the array: resets the base pointer and size through
// Base::ClearPtrAndSize().
// NOTE(review): no statement that empties m_data is visible in this
// listing (a line of the original body may be missing) — confirm.
429template <
typename T,
size_t N>
430void CUDAArray<T, N>::Clear()
432 Base::ClearPtrAndSize();
436template <
typename T,
size_t N>
437void CUDAArray<T, N>::Swap(CUDAArray& other)
439 Base::SwapPtrAndSize(other);
440 m_data.Swap(other.m_data);
443template <
typename T,
size_t N>
444CUDAArrayView<T, N> CUDAArray<T, N>::View()
446 return CUDAArrayView<T, N>(*
this);
449template <
typename T,
size_t N>
450CUDAArrayView<const T, N> CUDAArray<T, N>::View()
const
452 return CUDAArrayView<const T, N>(*
this);
Definition pybind11Utils.hpp:21
Matrix< T, Rows, 1 > Vector
Definition Matrix.hpp:738