tf::cudaGraphBase

class to create a CUDA graph with unique ownership
| Template parameters | |
|---|---|
| Creator | functor type that creates the native `cudaGraph_t` handle |
| Deleter | functor type that destroys the native `cudaGraph_t` handle |
This class wraps a cudaGraph_t handle with std::unique_ptr to ensure proper resource management and automatic cleanup.
using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>
base std::unique_ptr type

template<typename... ArgsT>
explicit cudaGraphBase(ArgsT&&... args)
constructs a cudaGraph object by passing the given arguments to the CUDA graph creator

cudaGraphBase(cudaGraphBase&&) = default
constructs a cudaGraph from the given rhs using move semantics

auto operator=(cudaGraphBase&&) -> cudaGraphBase& = default
assigns the rhs to *this using move semantics

auto num_nodes() const -> size_t
queries the number of nodes in a native CUDA graph

auto num_edges() const -> size_t
queries the number of edges in a native CUDA graph

auto empty() const -> bool
queries if the graph is empty

void dump(std::ostream& os)
dumps the CUDA graph in DOT format to the given output stream

auto noop() -> cudaTask
creates a no-operation task

template<typename C>
auto host(C&& callable, void* user_data) -> cudaTask
creates a host task that runs a callable on the host

template<typename F, typename... ArgsT>
auto kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args) -> cudaTask
creates a kernel task

auto memset(void* dst, int v, size_t count) -> cudaTask
creates a memset task that fills untyped data with a byte value

auto memcpy(void* tgt, const void* src, size_t bytes) -> cudaTask
creates a memcpy task that copies untyped data in bytes

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
auto zero(T* dst, size_t count) -> cudaTask
creates a memset task that sets a typed memory block to zero

template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
auto fill(T* dst, T value, size_t count) -> cudaTask
creates a memset task that fills a typed memory block with a value

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
auto copy(T* tgt, const T* src, size_t num) -> cudaTask
creates a memcpy task that copies typed data

template<typename C>
auto single_task(C c) -> cudaTask
runs a callable with only a single kernel thread

template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
auto for_each(I first, I last, C callable) -> cudaTask
applies a callable to each dereferenced element of the data array

template<typename I, typename C, typename E = cudaDefaultExecutionPolicy>
auto for_each_index(I first, I last, I step, C callable) -> cudaTask
applies a callable to each index in the range with the step size

template<typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
auto transform(I first, I last, O output, C op) -> cudaTask
applies a callable to a source range and stores the result in a target range

template<typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
auto transform(I1 first1, I1 last1, I2 first2, O output, C op) -> cudaTask
creates a task to perform parallel transforms over two ranges of items
constructs a cudaGraph object by passing the given arguments to the CUDA graph creator

| Parameters | |
|---|---|
| args | arguments to forward to the CUDA graph creator |
dumps the CUDA graph in DOT format to the given output stream

| Parameters | |
|---|---|
| os | target output stream |
creates a no-operation task
| Returns |
|---|
| a tf::cudaTask handle |
An empty node performs no operation during execution, but can be used for transitive ordering. For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges.
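The edge-count arithmetic in that example can be checked with a small host-side helper (the function names here are illustrative, not part of the API):

```cpp
#include <cassert>
#include <cstddef>

// Dependency edges needed to order two groups of n nodes each.
// Without a barrier, every node in group 1 must precede every node in
// group 2: n * n edges. With one empty (no-op) barrier node, each
// group-1 node feeds the barrier and the barrier feeds each group-2
// node: n + n = 2 * n edges.
std::size_t edges_without_barrier(std::size_t n) { return n * n; }
std::size_t edges_with_barrier(std::size_t n) { return 2 * n; }
```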
creates a host task that runs a callable on the host
| Template parameters | |
|---|---|
| C | callable type |

| Parameters | |
|---|---|
| callable | a callable object to run on the host |
| user_data | a pointer to user-defined data passed to the callable |

| Returns |
|---|
| a tf::cudaTask handle |

A host task can only execute CPU-specific functions and cannot make any CUDA calls (e.g., cudaMalloc).
creates a kernel task
| Template parameters | |
|---|---|
| F | kernel function type |
| ArgsT | kernel argument types |

| Parameters | |
|---|---|
| g | configured grid dimension |
| b | configured block dimension |
| s | configured shared memory size in bytes |
| f | kernel function |
| args | arguments to forward to the kernel function |

| Returns |
|---|
| a tf::cudaTask handle |
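As a minimal sketch of how these parameters line up, assuming the library provides a default instantiation named `tf::cudaGraph` (`saxpy` and `build` are illustrative names; this fragment requires a CUDA compiler and device, so it is shown uncompiled):

```cpp
// saxpy kernel: y = a * x + y over n elements
__global__ void saxpy(int n, float a, const float* x, float* y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = a * x[i] + y[i];
  }
}

// builds a one-task graph; dx and dy point to n floats in device memory
void build(tf::cudaGraph& cg, int n, float* dx, float* dy) {
  // g: 1-D grid covering n items, b: 256 threads per block,
  // s: 0 bytes of dynamic shared memory, then the kernel and its arguments
  cg.kernel((n + 255) / 256, 256, 0, saxpy, n, 2.0f, dx, dy);
}
```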
creates a memset task that fills untyped data with a byte value
| Parameters | |
|---|---|
| dst | pointer to the destination device memory area |
| v | value to set for each byte of the memory area |
| count | number of bytes to set |

| Returns |
|---|
| a tf::cudaTask handle |
A memset task fills the first count bytes of device memory area pointed by dst with the byte value v.
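The byte-wise interpretation of v matters; this host-only C++ sketch (using std::memset, with an illustrative helper name) shows that setting every byte of an int to 1 yields 0x01010101, not 1:

```cpp
#include <cassert>
#include <cstring>

// Sets every byte of an int to the byte value `v` and returns the
// resulting int, mirroring a memset task's byte-wise semantics.
int memset_int_bytes(int v) {
  int x = 0;
  std::memset(&x, v, sizeof(int));
  return x;
}
```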
creates a memcpy task that copies untyped data in bytes
| Parameters | |
|---|---|
| tgt | pointer to the target memory block |
| src | pointer to the source memory block |
| bytes | number of bytes to copy |

| Returns |
|---|
| a tf::cudaTask handle |
A memcpy task transfers bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
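On the host side, the untyped semantics are those of std::memcpy; a small sketch (illustrative helper name) round-trips a double through raw bytes:

```cpp
#include <cassert>
#include <cstring>

// Copies sizeof(double) raw bytes out of v and back into a fresh
// double, the same untyped byte-copy semantics as a memcpy task
// (which can additionally cross CPU/GPU memory spaces).
double roundtrip(double v) {
  double out = 0.0;
  std::memcpy(&out, &v, sizeof(double));
  return out;
}
```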
creates a memset task that sets a typed memory block to zero
| Template parameters | |
|---|---|
| T | element type (must be a POD type of 1, 2, or 4 bytes) |

| Parameters | |
|---|---|
| dst | pointer to the destination device memory area |
| count | number of elements to set to zero |

| Returns |
|---|
| a tf::cudaTask handle |
A zero task zeroes the first count elements of type T in a device memory area pointed by dst.
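A host-side analogue of the element-wise semantics, using std::fill_n on ordinary memory (illustrative helper; the real task operates on device memory):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Zeroes the first `count` elements of type T, the host-side
// analogue of a zero task.
template <typename T>
std::vector<T> zero_first(std::vector<T> v, std::size_t count) {
  std::fill_n(v.begin(), count, T(0));
  return v;
}
```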
creates a memset task that fills a typed memory block with a value
| Template parameters | |
|---|---|
| T | element type (must be a POD type of 1, 2, or 4 bytes) |

| Parameters | |
|---|---|
| dst | pointer to the destination device memory area |
| value | value to fill into each element |
| count | number of elements to fill |

| Returns |
|---|
| a tf::cudaTask handle |
A fill task fills the first count elements of type T with value in a device memory area pointed by dst. The value to fill is interpreted in type T rather than byte.
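The element-wise (rather than byte-wise) interpretation can be sketched on the host with std::fill_n (illustrative helper; the real task operates on device memory):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Fills the first `count` elements with `value`, interpreted in
// type T -- unlike memset, which interprets its value as one byte.
template <typename T>
std::vector<T> fill_first(std::vector<T> v, T value, std::size_t count) {
  std::fill_n(v.begin(), count, value);
  return v;
}
```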
creates a memcpy task that copies typed data
| Template parameters | |
|---|---|
| T | element type (non-void) |

| Parameters | |
|---|---|
| tgt | pointer to the target memory block |
| src | pointer to the source memory block |
| num | number of elements to copy |

| Returns |
|---|
| a tf::cudaTask handle |
A copy task transfers num*sizeof(T) bytes of data from a source location to a target location. Direction can be arbitrary among CPUs and GPUs.
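A host-side sketch of the typed-copy semantics using std::copy_n (illustrative helper; the real task may cross CPU/GPU memory spaces):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Copies `num` typed elements (num * sizeof(T) bytes) from src into
// tgt, the host-side analogue of a copy task.
template <typename T>
std::vector<T> copy_n_into(const std::vector<T>& src, std::vector<T> tgt,
                           std::size_t num) {
  std::copy_n(src.begin(), num, tgt.begin());
  return tgt;
}
```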
runs a callable with only a single kernel thread
| Template parameters | |
|---|---|
| C | callable type |

| Parameters | |
|---|---|
| c | callable to run by a single kernel thread |

| Returns |
|---|
| a tf::cudaTask handle |
applies a callable to each dereferenced element of the data array
| Template parameters | |
|---|---|
| I | iterator type |
| C | callable type |
| E | execution policy type (defaults to cudaDefaultExecutionPolicy) |

| Parameters | |
|---|---|
| first | iterator to the beginning of the range |
| last | iterator to the end of the range |
| callable | a callable object to apply to each dereferenced element |

| Returns |
|---|
| a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
```cpp
for(auto itr = first; itr != last; itr++) {
  callable(*itr);
}
```
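A runnable host-side version of that sequential equivalent (illustrative helper names; the actual task executes the iterations in parallel on a GPU):

```cpp
#include <cassert>
#include <vector>

// Sequential CPU equivalent of a for_each task: applies `callable`
// to each dereferenced element in [first, last).
template <typename I, typename C>
void seq_for_each(I first, I last, C callable) {
  for (auto itr = first; itr != last; itr++) {
    callable(*itr);
  }
}

// doubles every element of v in place
std::vector<int> doubled(std::vector<int> v) {
  seq_for_each(v.begin(), v.end(), [](int& x) { x *= 2; });
  return v;
}
```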
applies a callable to each index in the range with the step size
| Template parameters | |
|---|---|
| I | index type |
| C | callable type |
| E | execution policy type (defaults to cudaDefaultExecutionPolicy) |

| Parameters | |
|---|---|
| first | beginning index |
| last | last index (exclusive) |
| step | step size |
| callable | a callable object to apply to each index |

| Returns |
|---|
| a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
```cpp
// step is positive: [first, last)
for(auto i = first; i < last; i += step) {
  callable(i);
}
// step is negative: [first, last)
for(auto i = first; i > last; i += step) {
  callable(i);
}
```
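A runnable host-side version of those loops (illustrative helper names; the actual task executes the iterations in parallel on a GPU):

```cpp
#include <cassert>
#include <vector>

// Sequential CPU equivalent of a for_each_index task over
// [first, last) with a positive or negative step.
template <typename I, typename C>
void seq_for_each_index(I first, I last, I step, C callable) {
  if (step > 0) {
    for (auto i = first; i < last; i += step) { callable(i); }
  } else {
    for (auto i = first; i > last; i += step) { callable(i); }
  }
}

// collects the visited indices for a given range and step
std::vector<int> visited(int first, int last, int step) {
  std::vector<int> out;
  seq_for_each_index(first, last, step, [&](int i) { out.push_back(i); });
  return out;
}
```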
applies a callable to a source range and stores the result in a target range
| Template parameters | |
|---|---|
| I | input iterator type |
| O | output iterator type |
| C | unary operator type |
| E | execution policy type (defaults to cudaDefaultExecutionPolicy) |

| Parameters | |
|---|---|
| first | iterator to the beginning of the input range |
| last | iterator to the end of the input range |
| output | iterator to the beginning of the output range |
| op | unary operator to apply to transform each item |

| Returns |
|---|
| a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
```cpp
while (first != last) {
  *output++ = op(*first++);
}
```
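A runnable host-side version of that sequential equivalent (illustrative helper names; the actual task executes the transform in parallel on a GPU):

```cpp
#include <cassert>
#include <vector>

// Sequential CPU equivalent of a unary transform task:
// *output++ = op(*first++) until the input range is exhausted.
template <typename I, typename O, typename C>
void seq_transform(I first, I last, O output, C op) {
  while (first != last) {
    *output++ = op(*first++);
  }
}

// squares each element of the input range
std::vector<int> squares(const std::vector<int>& in) {
  std::vector<int> out(in.size());
  seq_transform(in.begin(), in.end(), out.begin(),
                [](int x) { return x * x; });
  return out;
}
```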
creates a task to perform parallel transforms over two ranges of items
| Template parameters | |
|---|---|
| I1 | first input iterator type |
| I2 | second input iterator type |
| O | output iterator type |
| C | binary operator type |
| E | execution policy type (defaults to cudaDefaultExecutionPolicy) |

| Parameters | |
|---|---|
| first1 | iterator to the beginning of the first input range |
| last1 | iterator to the end of the first input range |
| first2 | iterator to the beginning of the second input range |
| output | iterator to the beginning of the output range |
| op | binary operator to apply to transform each pair of items |

| Returns |
|---|
| a tf::cudaTask handle |
This method is equivalent to the parallel execution of the following loop on a GPU:
```cpp
while (first1 != last1) {
  *output++ = op(*first1++, *first2++);
}
```
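A runnable host-side version of that sequential equivalent (illustrative helper names; the actual task executes the transform in parallel on a GPU):

```cpp
#include <cassert>
#include <vector>

// Sequential CPU equivalent of the binary transform task:
// *output++ = op(*first1++, *first2++) until the first range ends.
template <typename I1, typename I2, typename O, typename C>
void seq_transform2(I1 first1, I1 last1, I2 first2, O output, C op) {
  while (first1 != last1) {
    *output++ = op(*first1++, *first2++);
  }
}

// element-wise sum of two equally sized ranges
std::vector<int> sums(const std::vector<int>& a, const std::vector<int>& b) {
  std::vector<int> out(a.size());
  seq_transform2(a.begin(), a.end(), b.begin(), out.begin(),
                 [](int x, int y) { return x + y; });
  return out;
}
```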