Taskflow: A General - Taskflow

| | Taskflow: A General-purpose Task-parallel Programming System |

Searching...

No Matches

cuda_memory.hpp

1#pragma once

3#include "cuda_device.hpp"

10namespace tf {

12// ----------------------------------------------------------------------------

13// memory

14// ----------------------------------------------------------------------------

19inline size_t cuda_get_free_mem(int d) {

20cudaScopedDevice ctx(d);

21size_t free, total;

22 TF_CHECK_CUDA(

23 cudaMemGetInfo(&free, &total), "failed to get mem info on device ", d

24 );

25return free;

26}

31inline size_t cuda_get_total_mem(int d) {

32cudaScopedDevice ctx(d);

33size_t free, total;

34 TF_CHECK_CUDA(

35 cudaMemGetInfo(&free, &total), "failed to get mem info on device ", d

36 );

37return total;

38}

47template <typename T>

48T* cuda_malloc_device(size_t N, int d) {

49cudaScopedDevice ctx(d);

50 T* ptr {nullptr};

51 TF_CHECK_CUDA(

52 cudaMalloc(&ptr, N*sizeof(T)),

53"failed to allocate memory (", N*sizeof(T), "bytes) on device ", d

54 )

55return ptr;

56}

64template <typename T>

65T* cuda_malloc_device(size_t N) {

66 T* ptr {nullptr};

67 TF_CHECK_CUDA(

68 cudaMalloc(&ptr, N*sizeof(T)),

69"failed to allocate memory (", N*sizeof(T), "bytes)"

70 )

71return ptr;

72}

80template <typename T>

81T* cuda_malloc_shared(size_t N) {

82 T* ptr {nullptr};

83 TF_CHECK_CUDA(

84 cudaMallocManaged(&ptr, N*sizeof(T)),

85"failed to allocate shared memory (", N*sizeof(T), "bytes)"

86 )

87return ptr;

88}

100template <typename T>

101void cuda_free(T* ptr, int d) {

102cudaScopedDevice ctx(d);

103 TF_CHECK_CUDA(cudaFree(ptr), "failed to free memory ", ptr, " on GPU ", d);

104}

105

115template <typename T>

116void cuda_free(T* ptr) {

117 TF_CHECK_CUDA(cudaFree(ptr), "failed to free memory ", ptr);

118}

119

132inline void cuda_memcpy_async(

133 cudaStream_t stream, void* dst, const void* src, size_t count

{

135 TF_CHECK_CUDA(

136 cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream),

137"failed to perform cudaMemcpyAsync"

138 );

139}

140

153inline void cuda_memset_async(

154 cudaStream_t stream, void* devPtr, int value, size_t count

155){

156 TF_CHECK_CUDA(

157 cudaMemsetAsync(devPtr, value, count, stream),

158"failed to perform cudaMemsetAsync"

159 );

160}

161

162// ----------------------------------------------------------------------------

163// Shared Memory

164// ----------------------------------------------------------------------------

165//

166// Because dynamically sized shared memory arrays are declared "extern",

167// we can't templatize them directly. To get around this, we declare a

168// simple wrapper struct that will declare the extern array with a different

169// name depending on the type. This avoids compiler errors about duplicate

170// definitions.

171//

172// To use dynamically allocated shared memory in a templatized __global__ or

173// __device__ function, just replace code like this:

174//

175// template<class T>

176// __global__ void

177// foo( T* g_idata, T* g_odata)

178// {

179// // Shared mem size is determined by the host app at run time

180// extern __shared__ T sdata[];

181// ...

182// doStuff(sdata);

183// ...

184// }

185//

186// With this:

187//

188// template<class T>

189// __global__ void

190// foo( T* g_idata, T* g_odata)

191// {

192// // Shared mem size is determined by the host app at run time

193// cudaSharedMemory<T> smem;

194// T* sdata = smem.get();

195// ...

196// doStuff(sdata);

197// ...

198// }

199// ----------------------------------------------------------------------------

200

201// This is the un-specialized struct. Note that we prevent instantiation of this

202// struct by putting an undefined symbol in the function body so it won't compile.

206template <typename T>

207struct cudaSharedMemory

208{

209// Ensure that we won't compile any un-specialized types

210 __device__ T *get()

211 {

212extern __device__ void error(void);

213 error();

214return NULL;

215 }

216};

217

218// Following are the specializations for the following types.

219// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double

220// One could also specialize it for user-defined types.

221

225template <>

226struct cudaSharedMemory <int>

227{

228 __device__ int *get()

229 {

230extern __shared__ int s_int[];

231return s_int;

232 }

233};

234

238template <>

239struct cudaSharedMemory <unsigned int>

240{

241 __device__ unsigned int *get()

242 {

243extern __shared__ unsigned int s_uint[];

244return s_uint;

245 }

246};

247

251template <>

252struct cudaSharedMemory <char>

253{

254 __device__ char *get()

255 {

256extern __shared__ char s_char[];

257return s_char;

258 }

259};

260

264template <>

265struct cudaSharedMemory <unsigned char>

266{

267 __device__ unsigned char *get()

268 {

269extern __shared__ unsigned char s_uchar[];

270return s_uchar;

271 }

272};

273

277template <>

278struct cudaSharedMemory <short>

279{

280 __device__ short *get()

281 {

282extern __shared__ short s_short[];

283return s_short;

284 }

285};

286

290template <>

291struct cudaSharedMemory <unsigned short>

292{

293 __device__ unsigned short *get()

294 {

295extern __shared__ unsigned short s_ushort[];

296return s_ushort;

297 }

298};

299

303template <>

304struct cudaSharedMemory <long>

305{

306 __device__ long *get()

307 {

308extern __shared__ long s_long[];

309return s_long;

310 }

311};

312

316template <>

317struct cudaSharedMemory <unsigned long>

318{

319 __device__ unsigned long *get()

320 {

321extern __shared__ unsigned long s_ulong[];

322return s_ulong;

323 }

324};

325

326//template <>

327//struct cudaSharedMemory <size_t>

328//{

329// __device__ size_t *get()

330// {

331// extern __shared__ size_t s_sizet[];

332// return s_sizet;

333// }

334//};

335

339template <>

340struct cudaSharedMemory <bool>

341{

342 __device__ bool *get()

343 {

344extern __shared__ bool s_bool[];

345return s_bool;

346 }

347};

348

352template <>

353struct cudaSharedMemory <float>

354{

355 __device__ float *get()

356 {

357extern __shared__ float s_float[];

358return s_float;

359 }

360};

361

365template <>

366struct cudaSharedMemory <double>

367{

368 __device__ double *get()

369 {

370extern __shared__ double s_double[];

371return s_double;

372 }

373};

374

375

376

377// ----------------------------------------------------------------------------

378// cudaDeviceAllocator

379// ----------------------------------------------------------------------------

380

384template<typename T>

385class cudaDeviceAllocator {

386

387public:

388

392using value_type = T;

393

397using pointer = T*;

398

402using reference = T&;

403

407using const_pointer = const T*;

408

412using const_reference = const T&;

413

417using size_type = std::size_t;

418

422using difference_type = std::ptrdiff_t;

423

427template<typename U>

428struct rebind {

432using other = cudaDeviceAllocator;

433 };

434

438 cudaDeviceAllocator() noexcept {}

439

443 cudaDeviceAllocator( const cudaDeviceAllocator& ) noexcept {}

444

449template<typename U>

450 cudaDeviceAllocator( const cudaDeviceAllocator& ) noexcept {}

451

455 ~cudaDeviceAllocator() noexcept {}

456

465 pointer address( reference x ) { return &x; }

466

475 const_pointer address( const_reference x ) const { return &x; }

476

493 pointer allocate( size_type n, const void* = 0 )

494 {

495void* ptr = NULL;

496 TF_CHECK_CUDA(

497 cudaMalloc( &ptr, n*sizeof(T) ),

498"failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"

499 )

500 return static_cast<pointer>(ptr);

501 }

502

510void deallocate( pointer ptr, size_type )

511 {

512if(ptr){

513 cudaFree(ptr);

514 }

515 }

516

527 size_type max_size() const noexcept { return size_type {-1}; }

528

532void construct( pointer, const_reference) { }

533

537void destroy( pointer) { }

538

546template <typename U>

547bool operator == (const cudaDeviceAllocator&) const noexcept {

548return true;

549 }

550

558template <typename U>

559bool operator != (const cudaDeviceAllocator&) const noexcept {

560return false;

561 }

562

563};

564

565// ----------------------------------------------------------------------------

566// cudaUSMAllocator

567// ----------------------------------------------------------------------------

568

572template<typename T>

573class cudaUSMAllocator {

574

575public:

576

580using value_type = T;

581

585using pointer = T*;

586

590using reference = T&;

591

595using const_pointer = const T*;

596

600using const_reference = const T&;

601

605using size_type = std::size_t;

606

610using difference_type = std::ptrdiff_t;

611

615template<typename U>

616struct rebind {

620using other = cudaUSMAllocator;

621 };

622

626 cudaUSMAllocator() noexcept {}

627

631 cudaUSMAllocator( const cudaUSMAllocator& ) noexcept {}

632

637template<typename U>

638 cudaUSMAllocator( const cudaUSMAllocator& ) noexcept {}

639

643 ~cudaUSMAllocator() noexcept {}

644

653 pointer address( reference x ) { return &x; }

654

663 const_pointer address( const_reference x ) const { return &x; }

664

681 pointer allocate( size_type n, const void* = 0 )

682 {

683void* ptr {nullptr};

684 TF_CHECK_CUDA(

685 cudaMallocManaged( &ptr, n*sizeof(T) ),

686"failed to allocate ", n, " elements (", n*sizeof(T), "bytes)"

687 )

688 return static_cast<pointer>(ptr);

689 }

690

698void deallocate( pointer ptr, size_type )

699 {

700if(ptr){

701 cudaFree(ptr);

702 }

703 }

704

715 size_type max_size() const noexcept { return size_type {-1}; }

716

724void construct( pointer ptr, const_reference val ) {

725new ((void*)ptr) value_type(val);

726 }

727

736void destroy( pointer ptr ) {

737 ptr->~value_type();

738 }

739

747template <typename U>

748bool operator == (const cudaUSMAllocator&) const noexcept {

749return true;

750 }

751

759template <typename U>

760bool operator != (const cudaUSMAllocator&) const noexcept {

761return false;

762 }

763

764};

765

766// ----------------------------------------------------------------------------

767// GPU vector object

768// ----------------------------------------------------------------------------

769

770//template <typename T>

771//using cudaDeviceVector = std::vector<NoInit<T>, cudaDeviceAllocator<NoInit<T>>>;

772

773//template <typename T>

774//using cudaUSMVector = std::vector<T, cudaUSMAllocator<T>>;

775

779template <typename T>

780class cudaDeviceVector {

781

782public:

783

784 cudaDeviceVector() = default;

785

786 cudaDeviceVector(size_t N) : _N {N} {

787if(N) {

788 TF_CHECK_CUDA(

789 cudaMalloc(&_data, N*sizeof(T)),

790"failed to allocate device memory (", N*sizeof(T), " bytes)"

791 );

792 }

793 }

794

795 cudaDeviceVector(cudaDeviceVector&& rhs) :

796 _data{rhs._data}, _N {rhs._N} {

797 rhs._data = nullptr;

798 rhs._N = 0;

799 }

800

801 ~cudaDeviceVector() {

802if(_data) {

803 cudaFree(_data);

804 }

805 }

806

807 cudaDeviceVector& operator = (cudaDeviceVector&& rhs) {

808if(_data) {

809 cudaFree(_data);

810 }

811 _data = rhs._data;

812 _N = rhs._N;

813 rhs._data = nullptr;

814 rhs._N = 0;

815return *this;

816 }

817

818size_t size() const { return _N; }

819

820 T* data() { return _data; }

821const T* data() const { return _data; }

822

823 cudaDeviceVector(const cudaDeviceVector&) = delete;

824 cudaDeviceVector& operator = (const cudaDeviceVector&) = delete;

825

826private:

827

828 T* _data {nullptr};

829size_t _N {0};

830};

831

832

833} // end of namespace tf -----------------------------------------------------

834

835

836

837

838

839

tf::cudaScopedDevice

class to create an RAII-styled context switch

Definition cuda_device.hpp:289

taskflow namespace

Definition small_vector.hpp:20

tf::cuda_get_free_mem

size_t cuda_get_free_mem(int d)

queries the free memory (expensive call)

Definition cuda_memory.hpp:19

tf::cuda_malloc_device

T * cuda_malloc_device(size_t N, int d)

allocates memory on the given device for holding N elements of type T

Definition cuda_memory.hpp:48

tf::cuda_get_total_mem

size_t cuda_get_total_mem(int d)

queries the total available memory (expensive call)

Definition cuda_memory.hpp:31

tf::cuda_memset_async

void cuda_memset_async(cudaStream_t stream, void *devPtr, int value, size_t count)

initializes or sets GPU memory to the given value byte by byte

Definition cuda_memory.hpp:153

tf::cuda_memcpy_async

void cuda_memcpy_async(cudaStream_t stream, void *dst, const void *src, size_t count)

copies data between host and device asynchronously through a stream

Definition cuda_memory.hpp:132

tf::cuda_free

void cuda_free(T *ptr, int d)

frees memory on the GPU device

Definition cuda_memory.hpp:101

tf::cuda_malloc_shared

T * cuda_malloc_shared(size_t N)

allocates shared memory for holding N elements of type T

Definition cuda_memory.hpp:81

tf::cudaDeviceAllocator::rebind

its member type U is the equivalent allocator type to allocate elements of type U

Definition cuda_memory.hpp:428

tf::cudaDeviceAllocator::rebind::other

cudaDeviceAllocator other

allocator of a different data type

Definition cuda_memory.hpp:432

tf::cudaUSMAllocator::rebind

its member type U is the equivalent allocator type to allocate elements of type U

Definition cuda_memory.hpp:616

tf::cudaUSMAllocator::rebind::other

cudaUSMAllocator other

allocator of a different data type

Definition cuda_memory.hpp:620

taskflow
cuda
cuda_memory.hpp
Maintained by Dr. Tsung-Wei Huang — Generated by 1.13.1