docs/cuda__device_8hpp_source.html
| | Taskflow: A General-purpose Task-parallel Programming System |
Loading...
Searching...
No Matches
cuda_device.hpp
1#pragma once
2
3#include "cuda_error.hpp"
4
9
10namespace tf {
11
15inline size_t cuda_get_num_devices() {
16int N = 0;
17 TF_CHECK_CUDA(cudaGetDeviceCount(&N), "failed to get device count");
18return static_cast<size_t>(N);
19}
20
24inline int cuda_get_device() {
25int id;
26 TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id");
27return id;
28}
29
33inline void cuda_set_device(int id) {
34 TF_CHECK_CUDA(cudaSetDevice(id), "failed to switch to device ", id);
35}
36
40inline void cuda_get_device_property(int i, cudaDeviceProp& p) {
41 TF_CHECK_CUDA(
42 cudaGetDeviceProperties(&p, i), "failed to get property of device ", i
43 );
44}
45
49inline cudaDeviceProp cuda_get_device_property(int i) {
50 cudaDeviceProp p;
51 TF_CHECK_CUDA(
52 cudaGetDeviceProperties(&p, i), "failed to get property of device ", i
53 );
54return p;
55}
56
60inline void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) {
61
62 os << "Major revision number: " << p.major << '\n'
63 << "Minor revision number: " << p.minor << '\n'
64 << "Name: " << p.name << '\n'
65 << "Total global memory: " << p.totalGlobalMem << '\n'
66 << "Total shared memory per block: " << p.sharedMemPerBlock << '\n'
67 << "Total registers per block: " << p.regsPerBlock << '\n'
68 << "Warp size: " << p.warpSize << '\n'
69 << "Maximum memory pitch: " << p.memPitch << '\n'
70 << "Maximum threads per block: " << p.maxThreadsPerBlock << '\n';
71
72 os << "Maximum dimension of block: ";
73for (int i = 0; i < 3; ++i) {
74if(i) os << 'x';
75 os << p.maxThreadsDim[i];
76 }
77 os << '\n';
78
79 os << "Maximum dimension of grid: ";
80for (int i = 0; i < 3; ++i) {
81if(i) os << 'x';
82 os << p.maxGridSize[i];;
83 }
84 os << '\n';
85 os << "Total constant memory: " << p.totalConstMem << '\n'
86 << "Texture alignment: " << p.textureAlignment << '\n'
87 << "Number of multiprocessors: " << p.multiProcessorCount << '\n'
88 << "GPU sharing Host Memory: " << p.integrated << '\n'
89 << "Host page-locked mem mapping: " << p.canMapHostMemory << '\n'
90 << "Alignment for Surfaces: " << p.surfaceAlignment << '\n'
91 << "Device has ECC support: " << p.ECCEnabled << '\n'
92 << "Unified Addressing (UVA): " << p.unifiedAddressing << '\n';
93}
94
98inline size_t cuda_get_device_max_threads_per_block(int d) {
99int threads = 0;
100 TF_CHECK_CUDA(
101 cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, d),
102"failed to query the maximum threads per block on device ", d
103 )
104return threads;
105}
106
110inline size_t cuda_get_device_max_x_dim_per_block(int d) {
111int dim = 0;
112 TF_CHECK_CUDA(
113 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimX, d),
114"failed to query the maximum x-dimension per block on device ", d
115 )
116return dim;
117}
118
122inline size_t cuda_get_device_max_y_dim_per_block(int d) {
123int dim = 0;
124 TF_CHECK_CUDA(
125 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimY, d),
126"failed to query the maximum y-dimension per block on device ", d
127 )
128return dim;
129}
130
134inline size_t cuda_get_device_max_z_dim_per_block(int d) {
135int dim = 0;
136 TF_CHECK_CUDA(
137 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxBlockDimZ, d),
138"failed to query the maximum z-dimension per block on device ", d
139 )
140return dim;
141}
142
146inline size_t cuda_get_device_max_x_dim_per_grid(int d) {
147int dim = 0;
148 TF_CHECK_CUDA(
149 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimX, d),
150"failed to query the maximum x-dimension per grid on device ", d
151 )
152return dim;
153}
154
158inline size_t cuda_get_device_max_y_dim_per_grid(int d) {
159int dim = 0;
160 TF_CHECK_CUDA(
161 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimY, d),
162"failed to query the maximum y-dimension per grid on device ", d
163 )
164return dim;
165}
166
170inline size_t cuda_get_device_max_z_dim_per_grid(int d) {
171int dim = 0;
172 TF_CHECK_CUDA(
173 cudaDeviceGetAttribute(&dim, cudaDevAttrMaxGridDimZ, d),
174"failed to query the maximum z-dimension per grid on device ", d
175 )
176return dim;
177}
178
182inline size_t cuda_get_device_max_shm_per_block(int d) {
183int num = 0;
184 TF_CHECK_CUDA(
185 cudaDeviceGetAttribute(&num, cudaDevAttrMaxSharedMemoryPerBlock, d),
186"failed to query the maximum shared memory per block on device ", d
187 )
188return num;
189}
190
194inline size_t cuda_get_device_warp_size(int d) {
195int num = 0;
196 TF_CHECK_CUDA(
197 cudaDeviceGetAttribute(&num, cudaDevAttrWarpSize, d),
198"failed to query the warp size per block on device ", d
199 )
200return num;
201}
202
206inline int cuda_get_device_compute_capability_major(int d) {
207int num = 0;
208 TF_CHECK_CUDA(
209 cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMajor, d),
210"failed to query the major number of compute capability of device ", d
211 )
212return num;
213}
214
218inline int cuda_get_device_compute_capability_minor(int d) {
219int num = 0;
220 TF_CHECK_CUDA(
221 cudaDeviceGetAttribute(&num, cudaDevAttrComputeCapabilityMinor, d),
222"failed to query the minor number of compute capability of device ", d
223 )
224return num;
225}
226
230inline bool cuda_get_device_unified_addressing(int d) {
231int num = 0;
232 TF_CHECK_CUDA(
233 cudaDeviceGetAttribute(&num, cudaDevAttrUnifiedAddressing, d),
234"failed to query unified addressing status on device ", d
235 )
236return num;
237}
238
239// ----------------------------------------------------------------------------
240// CUDA Version
241// ----------------------------------------------------------------------------
242
246inline int cuda_get_driver_version() {
247int num = 0;
248 TF_CHECK_CUDA(
249 cudaDriverGetVersion(&num),
250"failed to query the latest cuda version supported by the driver"
251 );
252return num;
253}
254
258inline int cuda_get_runtime_version() {
259int num = 0;
260 TF_CHECK_CUDA(
261 cudaRuntimeGetVersion(&num), "failed to query cuda runtime version"
262 );
263return num;
264}
265
266// ----------------------------------------------------------------------------
267// cudaScopedDevice
268// ----------------------------------------------------------------------------
269
/**
@class cudaScopedDevice

@brief RAII-styled device switcher

An object of this class switches to the device given at construction and
switches back to the previous device context when it is destroyed.
Objects cannot be default-constructed, copied, or moved.
*/
class cudaScopedDevice {

  public:

    /**
    @brief constructs a RAII-styled device switcher

    @param device device context to scope in the guard
    */
    explicit cudaScopedDevice(int device);

    /**
    @brief destructs the guard and switches back to the previous device context
    */
    ~cudaScopedDevice();

  private:

    // A guard always refers to exactly one scope, so default construction,
    // copying, and moving are all disabled.
    cudaScopedDevice() = delete;
    cudaScopedDevice(const cudaScopedDevice&) = delete;
    cudaScopedDevice(cudaScopedDevice&&) = delete;

    // Device id to restore on destruction; the constructor stores -1 here
    // when the requested device was already current.
    int _p;
};
313
314// Constructor
315inline cudaScopedDevice::cudaScopedDevice(int dev) {
316 TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");
317if(_p == dev) {
318 _p = -1;
319 }
320else {
321 TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);
322 }
323}
324
325// Destructor
326inline cudaScopedDevice::~cudaScopedDevice() {
327if(_p != -1) {
328 cudaSetDevice(_p);
329//TF_CHECK_CUDA(cudaSetDevice(_p), "failed to scope back to device ", _p);
330 }
331}
332
333} // end of namespace tf -----------------------------------------------------
334
335
336
337
338
tf::cudaScopedDevice::cudaScopedDevice
cudaScopedDevice(int device)
constructs a RAII-styled device switcher
Definition cuda_device.hpp:315
tf::cudaScopedDevice::~cudaScopedDevice
~cudaScopedDevice()
destructs the guard and switches back to the previous device context
Definition cuda_device.hpp:326
taskflow namespace
Definition small_vector.hpp:20
tf::cuda_get_device_max_z_dim_per_grid
size_t cuda_get_device_max_z_dim_per_grid(int d)
queries the maximum z-dimension per grid on a device
Definition cuda_device.hpp:170
tf::cuda_get_device_compute_capability_major
int cuda_get_device_compute_capability_major(int d)
queries the major number of compute capability of a device
Definition cuda_device.hpp:206
int cuda_get_device()
gets the current device associated with the caller thread
Definition cuda_device.hpp:24
int cuda_get_runtime_version()
queries the CUDA Runtime version (1000 * major + 10 * minor)
Definition cuda_device.hpp:258
void cuda_get_device_property(int i, cudaDeviceProp &p)
obtains the device property
Definition cuda_device.hpp:40
int cuda_get_driver_version()
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
Definition cuda_device.hpp:246
tf::cuda_get_device_max_z_dim_per_block
size_t cuda_get_device_max_z_dim_per_block(int d)
queries the maximum z-dimension per block on a device
Definition cuda_device.hpp:134
tf::cuda_get_device_max_x_dim_per_grid
size_t cuda_get_device_max_x_dim_per_grid(int d)
queries the maximum x-dimension per grid on a device
Definition cuda_device.hpp:146
tf::cuda_get_device_compute_capability_minor
int cuda_get_device_compute_capability_minor(int d)
queries the minor number of compute capability of a device
Definition cuda_device.hpp:218
tf::cuda_get_device_max_y_dim_per_grid
size_t cuda_get_device_max_y_dim_per_grid(int d)
queries the maximum y-dimension per grid on a device
Definition cuda_device.hpp:158
tf::cuda_get_device_max_y_dim_per_block
size_t cuda_get_device_max_y_dim_per_block(int d)
queries the maximum y-dimension per block on a device
Definition cuda_device.hpp:122
tf::cuda_get_device_max_threads_per_block
size_t cuda_get_device_max_threads_per_block(int d)
queries the maximum threads per block on a device
Definition cuda_device.hpp:98
size_t cuda_get_num_devices()
queries the number of available devices
Definition cuda_device.hpp:15
tf::cuda_get_device_unified_addressing
bool cuda_get_device_unified_addressing(int d)
queries if the device supports unified addressing
Definition cuda_device.hpp:230
void cuda_set_device(int id)
switches to a given device context
Definition cuda_device.hpp:33
size_t cuda_get_device_warp_size(int d)
queries the warp size on a device
Definition cuda_device.hpp:194
tf::cuda_get_device_max_shm_per_block
size_t cuda_get_device_max_shm_per_block(int d)
queries the maximum shared memory size in bytes per block on a device
Definition cuda_device.hpp:182
tf::cuda_get_device_max_x_dim_per_block
size_t cuda_get_device_max_x_dim_per_block(int d)
queries the maximum x-dimension per block on a device
Definition cuda_device.hpp:110
void cuda_dump_device_property(std::ostream &os, const cudaDeviceProp &p)
dumps the device property
Definition cuda_device.hpp:60