docs/cuda__graph_8hpp_source.html
| | Taskflow: A General-purpose Task-parallel Programming System |
Loading...
Searching...
No Matches
cuda_graph.hpp
1#pragma once
2
3#include <filesystem>
4
5#include "cuda_memory.hpp"
6#include "cuda_stream.hpp"
7#include "cuda_meta.hpp"
8
9#include "../utility/traits.hpp"
10
11namespace tf {
12
13// ----------------------------------------------------------------------------
14// cudaGraph_t routines
15// ----------------------------------------------------------------------------
16
20template <typename T,
21 std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
22>
23cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) {
24
25using U = std::decay_t<T>;
26
27 cudaMemcpy3DParms p;
28
29 p.srcArray = nullptr;
30 p.srcPos = ::make_cudaPos(0, 0, 0);
31 p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);
32 p.dstArray = nullptr;
33 p.dstPos = ::make_cudaPos(0, 0, 0);
34 p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
35 p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
36 p.kind = cudaMemcpyDefault;
37
38return p;
39}
40
44inline cudaMemcpy3DParms cuda_get_memcpy_parms(
45void* tgt, const void* src, size_t bytes
47
48// Parameters in cudaPitchedPtr
49// d - Pointer to allocated memory
50// p - Pitch of allocated memory in bytes
51// xsz - Logical width of allocation in elements
52// ysz - Logical height of allocation in elements
53 cudaMemcpy3DParms p;
54 p.srcArray = nullptr;
55 p.srcPos = ::make_cudaPos(0, 0, 0);
56 p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);
57 p.dstArray = nullptr;
58 p.dstPos = ::make_cudaPos(0, 0, 0);
59 p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);
60 p.extent = ::make_cudaExtent(bytes, 1, 1);
61 p.kind = cudaMemcpyDefault;
62
63return p;
64}
65
69inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) {
70
71 cudaMemsetParams p;
72 p.dst = dst;
73 p.value = ch;
74 p.pitch = 0;
75//p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;
76//p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;
77 p.elementSize = 1; // either 1, 2, or 4
78 p.width = count;
79 p.height = 1;
80
81return p;
82}
83
87template <typename T, std::enable_if_t<
88 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
89>
90cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) {
91
92 cudaMemsetParams p;
93 p.dst = dst;
94
95// perform bit-wise copy
96 p.value = 0; // crucial
97static_assert(sizeof(T) <= sizeof(p.value), "internal error");
98 std::memcpy(&p.value, &value, sizeof(T));
99
100 p.pitch = 0;
101 p.elementSize = sizeof(T); // either 1, 2, or 4
102 p.width = count;
103 p.height = 1;
104
105return p;
106}
107
111template <typename T, std::enable_if_t<
112 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
113>
114cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) {
115
116 cudaMemsetParams p;
117 p.dst = dst;
118 p.value = 0;
119 p.pitch = 0;
120 p.elementSize = sizeof(T); // either 1, 2, or 4
121 p.width = count;
122 p.height = 1;
123
124return p;
125}
126
130inline size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph) {
131size_t num_nodes;
132 TF_CHECK_CUDA(
133 cudaGraphGetRootNodes(graph, nullptr, &num_nodes),
134"failed to get native graph root nodes"
135 );
136return num_nodes;
137}
138
142inline size_t cuda_graph_get_num_nodes(cudaGraph_t graph) {
143size_t num_nodes;
144 TF_CHECK_CUDA(
145 cudaGraphGetNodes(graph, nullptr, &num_nodes),
146"failed to get native graph nodes"
147 );
148return num_nodes;
149}
150
154inline size_t cuda_graph_get_num_edges(cudaGraph_t graph, cudaGraphNode_t* from, cudaGraphNode_t* to) {
155size_t num_edges;
156 TF_CHECK_CUDA(
157 TF_CUDA_PRE13(cudaGraphGetEdges(graph, from, to, &num_edges))
158 TF_CUDA_POST13(cudaGraphGetEdges(graph, from, to, nullptr, &num_edges)),
159"failed to get native graph edges"
160 );
161return num_edges;
162}
163
170inline size_t cuda_graph_node_get_dependencies(cudaGraphNode_t node, cudaGraphNode_t* dependencies) {
171size_t num_predecessors;
172 TF_CHECK_CUDA(
173 TF_CUDA_PRE13(cudaGraphNodeGetDependencies(node, dependencies, &num_predecessors))
174 TF_CUDA_POST13(cudaGraphNodeGetDependencies(node, dependencies, nullptr, &num_predecessors)),
175"Failed to get number of dependencies");
176return num_predecessors;
177}
178
185inline size_t cuda_graph_node_get_dependent_nodes(cudaGraphNode_t node, cudaGraphNode_t *dependent_nodes) {
186size_t num_successors;
187 TF_CHECK_CUDA(
188 TF_CUDA_PRE13(cudaGraphNodeGetDependentNodes(node, dependent_nodes, &num_successors))
189 TF_CUDA_POST13(cudaGraphNodeGetDependentNodes(node, dependent_nodes, nullptr, &num_successors)),
190"Failed to get CUDA dependent nodes");
191return num_successors;
192}
193
201inline void cuda_graph_add_dependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies) {
202 TF_CHECK_CUDA(
203 TF_CUDA_PRE13(cudaGraphAddDependencies(graph, from, to, numDependencies))
204 TF_CUDA_POST13(cudaGraphAddDependencies(graph, from, to, nullptr, numDependencies)),
205"Failed to add CUDA graph node dependencies"
206 );
207}
208
212inline size_t cuda_graph_get_num_edges(cudaGraph_t graph) {
213return cuda_graph_get_num_edges(graph, nullptr, nullptr);
214}
215
216
217
221inline std::vector<cudaGraphNode_t> cuda_graph_get_nodes(cudaGraph_t graph) {
222size_t num_nodes = cuda_graph_get_num_nodes(graph);
223 std::vector<cudaGraphNode_t> nodes(num_nodes);
224 TF_CHECK_CUDA(
225 cudaGraphGetNodes(graph, nodes.data(), &num_nodes),
226"failed to get native graph nodes"
227 );
228return nodes;
229}
230
234inline std::vector<cudaGraphNode_t> cuda_graph_get_root_nodes(cudaGraph_t graph) {
235size_t num_nodes = cuda_graph_get_num_root_nodes(graph);
236 std::vector<cudaGraphNode_t> nodes(num_nodes);
237 TF_CHECK_CUDA(
238 cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes),
239"failed to get native graph nodes"
240 );
241return nodes;
242}
243
247inline std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>
248cuda_graph_get_edges(cudaGraph_t graph) {
249size_t num_edges = cuda_graph_get_num_edges(graph);
250 std::vector<cudaGraphNode_t> froms(num_edges), tos(num_edges);
251 num_edges = cuda_graph_get_num_edges(graph, froms.data(), tos.data());
252 std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> edges(num_edges);
253for(size_t i=0; i<num_edges; i++) {
254 edges[i] = std::make_pair(froms[i], tos[i]);
255 }
256return edges;
257}
258
272inline cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node) {
273 cudaGraphNodeType type;
274 TF_CHECK_CUDA(
275 cudaGraphNodeGetType(node, &type), "failed to get native graph node type"
276 );
277return type;
278}
279
280// ----------------------------------------------------------------------------
281// cudaTask Types
282// ----------------------------------------------------------------------------
283
287constexpr const char* to_string(cudaGraphNodeType type) {
288switch (type) {
289case cudaGraphNodeTypeKernel: return "Kernel";
290case cudaGraphNodeTypeMemcpy: return "Memcpy";
291case cudaGraphNodeTypeMemset: return "Memset";
292case cudaGraphNodeTypeHost: return "Host";
293case cudaGraphNodeTypeGraph: return "Graph";
294case cudaGraphNodeTypeEmpty: return "Empty";
295case cudaGraphNodeTypeWaitEvent: return "WaitEvent";
296case cudaGraphNodeTypeEventRecord: return "EventRecord";
297case cudaGraphNodeTypeExtSemaphoreSignal: return "ExtSemaphoreSignal";
298case cudaGraphNodeTypeExtSemaphoreWait: return "ExtSemaphoreWait";
299case cudaGraphNodeTypeMemAlloc: return "MemAlloc";
300case cudaGraphNodeTypeMemFree: return "MemFree";
301case cudaGraphNodeTypeConditional: return "Conditional";
302default: return "undefined";
303 }
304}
305
306// ----------------------------------------------------------------------------
307// cudaTask
308// ----------------------------------------------------------------------------
309
316
317template <typename Creator, typename Deleter>
318friend class cudaGraphBase;
319
320template <typename Creator, typename Deleter>
321friend class cudaGraphExecBase;
322
323friend class cudaFlow;
324friend class cudaFlowCapturer;
325friend class cudaFlowCapturerBase;
326
327friend std::ostream& operator <<(std::ostream&, const cudaTask&);
328
329public:
330
335
339cudaTask(const cudaTask&) = default;
340
344cudaTask& operator =(const cudaTask&) = default;
345
355template <typename... Ts>
356cudaTask& precede(Ts&&... tasks);
357
367template <typename... Ts>
368cudaTask& succeed(Ts&&... tasks);
369
373size_t num_successors() const;
374
378size_t num_predecessors() const;
379
383auto type() const;
384
390void dump(std::ostream& os) const;
391
392private:
393
394cudaTask(cudaGraph_t, cudaGraphNode_t);
395
396 cudaGraph_t _native_graph {nullptr};
397 cudaGraphNode_t _native_node {nullptr};
398};
399
400// Constructor
401inline cudaTask::cudaTask(cudaGraph_t native_graph, cudaGraphNode_t native_node) :
402 _native_graph {native_graph}, _native_node {native_node} {
403}
404
405// Function: precede
406template <typename... Ts>
407cudaTask& cudaTask::precede(Ts&&... tasks) {
408 (
409cuda_graph_add_dependencies(
410 _native_graph, &_native_node, &(tasks._native_node), 1
411 ), ...
412 );
413return *this;
414}
415
416// Function: succeed
417template <typename... Ts>
418cudaTask& cudaTask::succeed(Ts&&... tasks) {
419 (tasks.precede(*this), ...);
420return *this;
421}
422
423// Function: num_predecessors
424inline size_t cudaTask::num_predecessors() const {
425return cuda_graph_node_get_dependencies(_native_node, nullptr);
426}
427
428// Function: num_successors
429inline size_t cudaTask::num_successors() const {
430return cuda_graph_node_get_dependent_nodes(_native_node, nullptr);
431}
432
433// Function: type
434inline auto cudaTask::type() const {
435 cudaGraphNodeType type;
436 cudaGraphNodeGetType(_native_node, &type);
437return type;
438}
439
440// Function: dump
441inline void cudaTask::dump(std::ostream& os) const {
442 os << "cudaTask [type=" << to_string(type()) << ']';
443}
444
448inline std::ostream& operator <<(std::ostream& os, const cudaTask& ct) {
449 ct.dump(os);
450return os;
451}
452
453// ----------------------------------------------------------------------------
454// cudaGraph
455// ----------------------------------------------------------------------------
456
465class cudaGraphCreator {
466
467public:
468
478 cudaGraph_t operator ()() const {
479 cudaGraph_t g;
480 TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create a CUDA native graph");
481return g;
482 }
483
487 cudaGraph_t operator ()(cudaGraph_t graph) const {
488return graph;
489 }
490
491};
492
502class cudaGraphDeleter {
503
504public:
505
513void operator ()(cudaGraph_t g) const {
514 cudaGraphDestroy(g);
515 }
516};
517
518
530template <typename Creator, typename Deleter>
531class cudaGraphBase : public std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, cudaGraphDeleter> {
532
533static_assert(std::is_pointer_v<cudaGraph_t>, "cudaGraph_t is not a pointer type");
534
535public:
536
540using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>;
541
549template <typename... ArgsT>
550explicit cudaGraphBase(ArgsT&& ... args) : base_type(
551 Creator{}(std::forward<ArgsT>(args)...), Deleter()
552 ) {
553 }
554
558cudaGraphBase(cudaGraphBase&&) = default;
559
563cudaGraphBase& operator =(cudaGraphBase&&) = default;
564
569
574
579
585void dump(std::ostream& os);
586
587// ------------------------------------------------------------------------
588// Graph building routines
589// ------------------------------------------------------------------------
590
604
619template <typename C>
620cudaTask host(C&& callable, void* user_data);
621
636template <typename F, typename... ArgsT>
637cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args);
638
651cudaTask memset(void* dst, int v, size_t count);
652
665cudaTask memcpy(void* tgt, const void* src, size_t bytes);
666
679template <typename T, std::enable_if_t<
680 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
681 >
682cudaTask zero(T* dst, size_t count);
683
699template <typename T, std::enable_if_t<
700 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr
701 >
702cudaTask fill(T* dst, T value, size_t count);
703
718template <typename T,
719 std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr
720 >
721cudaTask copy(T* tgt, const T* src, size_t num);
722
723// ------------------------------------------------------------------------
724// generic algorithms
725// ------------------------------------------------------------------------
726
736template <typename C>
737cudaTask single_task(C c);
738
760template <typename I, typename C, typename E = cudaDefaultExecutionPolicy>
761cudaTask for_each(I first, I last, C callable);
762
791template <typename I, typename C, typename E = cudaDefaultExecutionPolicy>
792cudaTask for_each_index(I first, I last, I step, C callable);
793
817template <typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
818cudaTask transform(I first, I last, O output, C op);
819
846template <typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>
847cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);
848
849private:
850
851cudaGraphBase(const cudaGraphBase&) = delete;
852cudaGraphBase& operator =(const cudaGraphBase&) = delete;
853};
854
855// query the number of nodes
856template <typename Creator, typename Deleter>
857size_t cudaGraphBase<Creator, Deleter>::num_nodes() const {
858size_t n;
859 TF_CHECK_CUDA(
860 cudaGraphGetNodes(this->get(), nullptr, &n),
861"failed to get native graph nodes"
862 );
863return n;
864}
865
866// query the emptiness
867template <typename Creator, typename Deleter>
868bool cudaGraphBase<Creator, Deleter>::empty() const {
869return num_nodes() == 0;
870}
871
872// query the number of edges
873template <typename Creator, typename Deleter>
874size_t cudaGraphBase<Creator, Deleter>::num_edges() const {
875return cuda_graph_get_num_edges(this->get());
876}
877
879//inline void cudaGraph::dump(std::ostream& os) {
880//
881// // acquire the native handle
882// auto g = this->get();
883//
884// os << "digraph cudaGraph {\n";
885//
886// std::stack<std::tuple<cudaGraph_t, cudaGraphNode_t, int>> stack;
887// stack.push(std::make_tuple(g, nullptr, 1));
888//
889// int pl = 0;
890//
891// while(stack.empty() == false) {
892//
893// auto [graph, parent, l] = stack.top();
894// stack.pop();
895//
896// for(int i=0; i<pl-l+1; i++) {
897// os << "}\n";
898// }
899//
900// os << "subgraph cluster_p" << graph << " {\n"
901// << "label="cudaGraph-L" << l << "";\n"
902// << "color="purple";\n";
903//
904// auto nodes = cuda_graph_get_nodes(graph);
905// auto edges = cuda_graph_get_edges(graph);
906//
907// for(auto& [from, to] : edges) {
908// os << 'p' << from << " -> " << 'p' << to << ";\n";
909// }
910//
911// for(auto& node : nodes) {
912// auto type = cuda_get_graph_node_type(node);
913// if(type == cudaGraphNodeTypeGraph) {
914//
915// cudaGraph_t child_graph;
916// TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &child_graph), "");
917// stack.push(std::make_tuple(child_graph, node, l+1));
918//
919// os << 'p' << node << "["
920// << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, "
921// << "label="cudaGraph-L" << l+1
922// << ""];\n";
923// }
924// else {
925// os << 'p' << node << "[label=""
926// << to_string(type)
927// << ""];\n";
928// }
929// }
930//
931// // precede to parent
932// if(parent != nullptr) {
933// std::unordered_set<cudaGraphNode_t> successors;
934// for(const auto& p : edges) {
935// successors.insert(p.first);
936// }
937// for(auto node : nodes) {
938// if(successors.find(node) == successors.end()) {
939// os << 'p' << node << " -> " << 'p' << parent << ";\n";
940// }
941// }
942// }
943//
944// // set the previous level
945// pl = l;
946// }
947//
948// for(int i=0; i<=pl; i++) {
949// os << "}\n";
950// }
951//}
952
953// dump the graph
954template <typename Creator, typename Deleter>
955void cudaGraphBase<Creator, Deleter>::dump(std::ostream& os) {
956
957// Generate a unique temporary filename in the system's temp directory using filesystem
958auto temp_path = std::filesystem::temp_directory_path() / "graph_";
959 std::random_device rd;
960 std::uniform_int_distribution<int> dist(100000, 999999); // Generates a random number
961 temp_path += std::to_string(dist(rd)) + ".dot";
962
963// Call the original function with the temporary file
964 TF_CHECK_CUDA(cudaGraphDebugDotPrint(this->get(), temp_path.string().c_str(), 0), "");
965
966// Read the file and write to the output stream
967 std::ifstream file(temp_path);
968if (file) {
969 os << file.rdbuf(); // Copy file contents to the stream
970 file.close();
971 std::filesystem::remove(temp_path); // Clean up the temporary file
972 } else {
973 TF_THROW("failed to open ", temp_path, " for dumping the CUDA graph");
974 }
975}
976
977// Function: noop
978template <typename Creator, typename Deleter>
979cudaTask cudaGraphBase<Creator, Deleter>::noop() {
980
981 cudaGraphNode_t node;
982
983 TF_CHECK_CUDA(
984 cudaGraphAddEmptyNode(&node, this->get(), nullptr, 0),
985"failed to create a no-operation (empty) node"
986 );
987
988return cudaTask(this->get(), node);
989}
990
991// Function: host
992template <typename Creator, typename Deleter>
993template <typename C>
994cudaTask cudaGraphBase<Creator, Deleter>::host(C&& callable, void* user_data) {
995
996 cudaGraphNode_t node;
997 cudaHostNodeParams p {callable, user_data};
998
999 TF_CHECK_CUDA(
1000 cudaGraphAddHostNode(&node, this->get(), nullptr, 0, &p),
1001"failed to create a host node"
1002 );
1003
1004return cudaTask(this->get(), node);
1005}
1006
1007// Function: kernel
1008template <typename Creator, typename Deleter>
1009template <typename F, typename... ArgsT>
1010cudaTask cudaGraphBase<Creator, Deleter>::kernel(
1011 dim3 g, dim3 b, size_t s, F f, ArgsT... args
1013
1014 cudaGraphNode_t node;
1015 cudaKernelNodeParams p;
1016
1017void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
1018
1019 p.func = (void*)f;
1020 p.gridDim = g;
1021 p.blockDim = b;
1022 p.sharedMemBytes = s;
1023 p.kernelParams = arguments;
1024 p.extra = nullptr;
1025
1026 TF_CHECK_CUDA(
1027 cudaGraphAddKernelNode(&node, this->get(), nullptr, 0, &p),
1028"failed to create a kernel task"
1029 );
1030
1031return cudaTask(this->get(), node);
1032}
1033
1034// Function: zero
1035template <typename Creator, typename Deleter>
1036template <typename T, std::enable_if_t<
1037 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
1038>
1039cudaTask cudaGraphBase<Creator, Deleter>::zero(T* dst, size_t count) {
1040
1041 cudaGraphNode_t node;
1042auto p = cuda_get_zero_parms(dst, count);
1043
1044 TF_CHECK_CUDA(
1045 cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),
1046"failed to create a memset (zero) task"
1047 );
1048
1049return cudaTask(this->get(), node);
1050}
1051
1052// Function: fill
1053template <typename Creator, typename Deleter>
1054template <typename T, std::enable_if_t<
1055 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*
1056>
1057cudaTask cudaGraphBase<Creator, Deleter>::fill(T* dst, T value, size_t count) {
1058
1059 cudaGraphNode_t node;
1060auto p = cuda_get_fill_parms(dst, value, count);
1061 TF_CHECK_CUDA(
1062 cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),
1063"failed to create a memset (fill) task"
1064 );
1065
1066return cudaTask(this->get(), node);
1067}
1068
1069// Function: copy
1070template <typename Creator, typename Deleter>
1071template <
1072typename T,
1073 std::enable_if_t<!std::is_same_v<T, void>, void>*
1074>
1075cudaTask cudaGraphBase<Creator, Deleter>::copy(T* tgt, const T* src, size_t num) {
1076
1077 cudaGraphNode_t node;
1078auto p = cuda_get_copy_parms(tgt, src, num);
1079
1080 TF_CHECK_CUDA(
1081 cudaGraphAddMemcpyNode(&node, this->get(), nullptr, 0, &p),
1082"failed to create a memcpy (copy) task"
1083 );
1084
1085return cudaTask(this->get(), node);
1086}
1087
1088// Function: memset
1089template <typename Creator, typename Deleter>
1090cudaTask cudaGraphBase<Creator, Deleter>::memset(void* dst, int ch, size_t count) {
1091
1092 cudaGraphNode_t node;
1093auto p = cuda_get_memset_parms(dst, ch, count);
1094
1095 TF_CHECK_CUDA(
1096 cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),
1097"failed to create a memset task"
1098 );
1099
1100return cudaTask(this->get(), node);
1101}
1102
1103// Function: memcpy
1104template <typename Creator, typename Deleter>
1105cudaTask cudaGraphBase<Creator, Deleter>::memcpy(void* tgt, const void* src, size_t bytes) {
1106
1107 cudaGraphNode_t node;
1108auto p = cuda_get_memcpy_parms(tgt, src, bytes);
1109
1110 TF_CHECK_CUDA(
1111 cudaGraphAddMemcpyNode(&node, this->get(), nullptr, 0, &p),
1112"failed to create a memcpy task"
1113 );
1114
1115return cudaTask(this->get(), node);
1116}
1117
1118
1119
1120
1121
1122} // end of namespace tf -----------------------------------------------------
1123
1124
1125
1126
cudaTask copy(T *tgt, const T *src, size_t num)
creates a memcpy task that copies typed data
Definition cuda_graph.hpp:1075
size_t num_edges() const
queries the number of edges in a native CUDA graph
Definition cuda_graph.hpp:874
cudaTask for_each(I first, I last, C callable)
applies a callable to each dereferenced element of the data array
Definition for_each.hpp:53
cudaTask memset(void *dst, int v, size_t count)
creates a memset task that fills untyped data with a byte value
Definition cuda_graph.hpp:1090
cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)
creates a kernel task
Definition cuda_graph.hpp:1010
cudaTask fill(T *dst, T value, size_t count)
creates a memset task that fills a typed memory block with a value
Definition cuda_graph.hpp:1057
tf::cudaGraphBase::cudaGraphBase
cudaGraphBase(cudaGraphBase &&)=default
constructs a cudaGraph from the given rhs using move semantics
cudaTask host(C &&callable, void *user_data)
creates a host task that runs a callable on the host
Definition cuda_graph.hpp:994
bool empty() const
queries if the graph is empty
Definition cuda_graph.hpp:868
cudaTask memcpy(void *tgt, const void *src, size_t bytes)
creates a memcpy task that copies untyped data in bytes
Definition cuda_graph.hpp:1105
tf::cudaGraphBase::cudaGraphBase
cudaGraphBase(ArgsT &&... args)
constructs a cudaGraph object by passing the given arguments to the executable CUDA graph creator
Definition cuda_graph.hpp:550
std::unique_ptr< std::remove_pointer_t< cudaGraph_t >, Deleter > base_type
base std::unique_ptr type
Definition cuda_graph.hpp:540
cudaTask transform(I first, I last, O output, C op)
applies a callable to a source range and stores the result in a target range
Definition transform.hpp:65
cudaTask zero(T *dst, size_t count)
creates a memset task that sets a typed memory block to zero
Definition cuda_graph.hpp:1039
tf::cudaGraphBase::single_task
cudaTask single_task(C c)
runs a callable with only a single kernel thread
void dump(std::ostream &os)
dumps the CUDA graph to a DOT format through the given output stream
Definition cuda_graph.hpp:955
cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op)
creates a task to perform parallel transforms over two ranges of items
Definition transform.hpp:79
tf::cudaGraphBase::for_each_index
cudaTask for_each_index(I first, I last, I step, C callable)
applies a callable to each index in the range with the step size
Definition for_each.hpp:79
size_t num_nodes() const
queries the number of nodes in a native CUDA graph
Definition cuda_graph.hpp:857
cudaGraphBase & operator=(cudaGraphBase &&)=default
assign the rhs to *this using move semantics
cudaTask noop()
creates a no-operation task
Definition cuda_graph.hpp:979
class to create functors that construct CUDA graphs
Definition cuda_graph.hpp:465
tf::cudaGraphCreator::operator()
cudaGraph_t operator()() const
creates a new CUDA graph
Definition cuda_graph.hpp:478
class to create a functor that deletes a CUDA graph
Definition cuda_graph.hpp:502
tf::cudaGraphDeleter::operator()
void operator()(cudaGraph_t g) const
deletes a CUDA graph
Definition cuda_graph.hpp:513
class to create a task handle of a CUDA Graph node
Definition cuda_graph.hpp:315
cudaTask(const cudaTask &)=default
copy-constructs a cudaTask
cudaTask & succeed(Ts &&... tasks)
adds precedence links from other tasks to this
Definition cuda_graph.hpp:418
friend std::ostream & operator<<(std::ostream &, const cudaTask &)
overload of ostream inserter operator for cudaTask
Definition cuda_graph.hpp:448
tf::cudaTask::num_predecessors
size_t num_predecessors() const
queries the number of predecessors
Definition cuda_graph.hpp:424
size_t num_successors() const
queries the number of successors
Definition cuda_graph.hpp:429
cudaTask()=default
constructs an empty cudaTask
auto type() const
queries the type of this task
Definition cuda_graph.hpp:434
cudaTask & operator=(const cudaTask &)=default
copy-assigns a cudaTask
cudaTask & precede(Ts &&... tasks)
adds precedence links from this to other tasks
Definition cuda_graph.hpp:407
void dump(std::ostream &os) const
dumps the task through an output stream
Definition cuda_graph.hpp:441
taskflow namespace
Definition small_vector.hpp:20
const char * to_string(TaskType type)
convert a task type to a human-readable string
Definition task.hpp:66
cudaMemsetParams cuda_get_zero_parms(T *dst, size_t count)
gets the memset node parameter of a zero task (typed)
Definition cuda_graph.hpp:114
std::vector< cudaGraphNode_t > cuda_graph_get_root_nodes(cudaGraph_t graph)
acquires the root nodes in a native CUDA graph
Definition cuda_graph.hpp:234
tf::cuda_graph_node_get_dependencies
size_t cuda_graph_node_get_dependencies(cudaGraphNode_t node, cudaGraphNode_t *dependencies)
Handles compatibility with CUDA <= 12.x and CUDA 13.
Definition cuda_graph.hpp:170
tf::cuda_graph_node_get_dependent_nodes
size_t cuda_graph_node_get_dependent_nodes(cudaGraphNode_t node, cudaGraphNode_t *dependent_nodes)
Handles compatibility with CUDA <= 12.x and CUDA 13.
Definition cuda_graph.hpp:185
std::vector< cudaGraphNode_t > cuda_graph_get_nodes(cudaGraph_t graph)
acquires the nodes in a native CUDA graph
Definition cuda_graph.hpp:221
cudaMemcpy3DParms cuda_get_memcpy_parms(void *tgt, const void *src, size_t bytes)
gets the memcpy node parameter of a memcpy task (untyped)
Definition cuda_graph.hpp:44
size_t cuda_graph_get_num_nodes(cudaGraph_t graph)
queries the number of nodes in a native CUDA graph
Definition cuda_graph.hpp:142
tf::cuda_graph_get_num_root_nodes
size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph)
queries the number of root nodes in a native CUDA graph
Definition cuda_graph.hpp:130
cudaMemsetParams cuda_get_memset_parms(void *dst, int ch, size_t count)
gets the memset node parameter of a memset task (untyped)
Definition cuda_graph.hpp:69
cudaMemsetParams cuda_get_fill_parms(T *dst, T value, size_t count)
gets the memset node parameter of a fill task (typed)
Definition cuda_graph.hpp:90
std::ostream & operator<<(std::ostream &os, const Task &task)
overload of ostream inserter operator for Task
Definition task.hpp:1532
size_t cuda_graph_get_num_edges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to)
Handles compatibility with CUDA <= 12.x and CUDA == 13.x.
Definition cuda_graph.hpp:154
cudaMemcpy3DParms cuda_get_copy_parms(T *tgt, const T *src, size_t num)
gets the memcpy node parameter of a copy task
Definition cuda_graph.hpp:23
tf::cuda_graph_add_dependencies
void cuda_graph_add_dependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies)
Handles compatibility with CUDA <= 12.x and CUDA 13.
Definition cuda_graph.hpp:201
std::vector< std::pair< cudaGraphNode_t, cudaGraphNode_t > > cuda_graph_get_edges(cudaGraph_t graph)
acquires the edges in a native CUDA graph
Definition cuda_graph.hpp:248
cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node)
queries the type of a native CUDA graph node
Definition cuda_graph.hpp:272