Back to Taskflow

Taskflow: A General

docs/cuda__graph_8hpp_source.html

4.1.046.8 KB
Original Source

| | Taskflow: A General-purpose Task-parallel Programming System |

Loading...

Searching...

No Matches

cuda_graph.hpp

1#pragma once

2

#include <filesystem>
#include <fstream>
#include <random>
#include <system_error>

#include "cuda_memory.hpp"
#include "cuda_stream.hpp"
#include "cuda_meta.hpp"

#include "../utility/traits.hpp"

10

11namespace tf {

12

13// ----------------------------------------------------------------------------

14// cudaGraph_t routines

15// ----------------------------------------------------------------------------

16

20template <typename T,

21 std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr

22>

23cudaMemcpy3DParms cuda_get_copy_parms(T* tgt, const T* src, size_t num) {

24

25using U = std::decay_t<T>;

26

27 cudaMemcpy3DParms p;

28

29 p.srcArray = nullptr;

30 p.srcPos = ::make_cudaPos(0, 0, 0);

31 p.srcPtr = ::make_cudaPitchedPtr(const_cast<T*>(src), num*sizeof(U), num, 1);

32 p.dstArray = nullptr;

33 p.dstPos = ::make_cudaPos(0, 0, 0);

34 p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);

35 p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);

36 p.kind = cudaMemcpyDefault;

37

38return p;

39}

40

44inline cudaMemcpy3DParms cuda_get_memcpy_parms(

45void* tgt, const void* src, size_t bytes

  1. {

47

48// Parameters in cudaPitchedPtr

49// d - Pointer to allocated memory

50// p - Pitch of allocated memory in bytes

51// xsz - Logical width of allocation in elements

52// ysz - Logical height of allocation in elements

53 cudaMemcpy3DParms p;

54 p.srcArray = nullptr;

55 p.srcPos = ::make_cudaPos(0, 0, 0);

56 p.srcPtr = ::make_cudaPitchedPtr(const_cast<void*>(src), bytes, bytes, 1);

57 p.dstArray = nullptr;

58 p.dstPos = ::make_cudaPos(0, 0, 0);

59 p.dstPtr = ::make_cudaPitchedPtr(tgt, bytes, bytes, 1);

60 p.extent = ::make_cudaExtent(bytes, 1, 1);

61 p.kind = cudaMemcpyDefault;

62

63return p;

64}

65

69inline cudaMemsetParams cuda_get_memset_parms(void* dst, int ch, size_t count) {

70

71 cudaMemsetParams p;

72 p.dst = dst;

73 p.value = ch;

74 p.pitch = 0;

75//p.elementSize = (count & 1) == 0 ? ((count & 3) == 0 ? 4 : 2) : 1;

76//p.width = (count & 1) == 0 ? ((count & 3) == 0 ? count >> 2 : count >> 1) : count;

77 p.elementSize = 1; // either 1, 2, or 4

78 p.width = count;

79 p.height = 1;

80

81return p;

82}

83

87template <typename T, std::enable_if_t<

88 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr

89>

90cudaMemsetParams cuda_get_fill_parms(T* dst, T value, size_t count) {

91

92 cudaMemsetParams p;

93 p.dst = dst;

94

95// perform bit-wise copy

96 p.value = 0; // crucial

97static_assert(sizeof(T) <= sizeof(p.value), "internal error");

98 std::memcpy(&p.value, &value, sizeof(T));

99

100 p.pitch = 0;

101 p.elementSize = sizeof(T); // either 1, 2, or 4

102 p.width = count;

103 p.height = 1;

104

105return p;

106}

107

111template <typename T, std::enable_if_t<

112 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr

113>

114cudaMemsetParams cuda_get_zero_parms(T* dst, size_t count) {

115

116 cudaMemsetParams p;

117 p.dst = dst;

118 p.value = 0;

119 p.pitch = 0;

120 p.elementSize = sizeof(T); // either 1, 2, or 4

121 p.width = count;

122 p.height = 1;

123

124return p;

125}

126

130inline size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph) {

131size_t num_nodes;

132 TF_CHECK_CUDA(

133 cudaGraphGetRootNodes(graph, nullptr, &num_nodes),

134"failed to get native graph root nodes"

135 );

136return num_nodes;

137}

138

142inline size_t cuda_graph_get_num_nodes(cudaGraph_t graph) {

143size_t num_nodes;

144 TF_CHECK_CUDA(

145 cudaGraphGetNodes(graph, nullptr, &num_nodes),

146"failed to get native graph nodes"

147 );

148return num_nodes;

149}

150

154inline size_t cuda_graph_get_num_edges(cudaGraph_t graph, cudaGraphNode_t* from, cudaGraphNode_t* to) {

155size_t num_edges;

156 TF_CHECK_CUDA(

157 TF_CUDA_PRE13(cudaGraphGetEdges(graph, from, to, &num_edges))

158 TF_CUDA_POST13(cudaGraphGetEdges(graph, from, to, nullptr, &num_edges)),

159"failed to get native graph edges"

160 );

161return num_edges;

162}

163

170inline size_t cuda_graph_node_get_dependencies(cudaGraphNode_t node, cudaGraphNode_t* dependencies) {

171size_t num_predecessors;

172 TF_CHECK_CUDA(

173 TF_CUDA_PRE13(cudaGraphNodeGetDependencies(node, dependencies, &num_predecessors))

174 TF_CUDA_POST13(cudaGraphNodeGetDependencies(node, dependencies, nullptr, &num_predecessors)),

175"Failed to get number of dependencies");

176return num_predecessors;

177}

178

185inline size_t cuda_graph_node_get_dependent_nodes(cudaGraphNode_t node, cudaGraphNode_t *dependent_nodes) {

186size_t num_successors;

187 TF_CHECK_CUDA(

188 TF_CUDA_PRE13(cudaGraphNodeGetDependentNodes(node, dependent_nodes, &num_successors))

189 TF_CUDA_POST13(cudaGraphNodeGetDependentNodes(node, dependent_nodes, nullptr, &num_successors)),

190"Failed to get CUDA dependent nodes");

191return num_successors;

192}

193

201inline void cuda_graph_add_dependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies) {

202 TF_CHECK_CUDA(

203 TF_CUDA_PRE13(cudaGraphAddDependencies(graph, from, to, numDependencies))

204 TF_CUDA_POST13(cudaGraphAddDependencies(graph, from, to, nullptr, numDependencies)),

205"Failed to add CUDA graph node dependencies"

206 );

207}

208

212inline size_t cuda_graph_get_num_edges(cudaGraph_t graph) {

213return cuda_graph_get_num_edges(graph, nullptr, nullptr);

214}

215

216

217

221inline std::vector<cudaGraphNode_t> cuda_graph_get_nodes(cudaGraph_t graph) {

222size_t num_nodes = cuda_graph_get_num_nodes(graph);

223 std::vector<cudaGraphNode_t> nodes(num_nodes);

224 TF_CHECK_CUDA(

225 cudaGraphGetNodes(graph, nodes.data(), &num_nodes),

226"failed to get native graph nodes"

227 );

228return nodes;

229}

230

234inline std::vector<cudaGraphNode_t> cuda_graph_get_root_nodes(cudaGraph_t graph) {

235size_t num_nodes = cuda_graph_get_num_root_nodes(graph);

236 std::vector<cudaGraphNode_t> nodes(num_nodes);

237 TF_CHECK_CUDA(

238 cudaGraphGetRootNodes(graph, nodes.data(), &num_nodes),

239"failed to get native graph nodes"

240 );

241return nodes;

242}

243

247inline std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>

248cuda_graph_get_edges(cudaGraph_t graph) {

249size_t num_edges = cuda_graph_get_num_edges(graph);

250 std::vector<cudaGraphNode_t> froms(num_edges), tos(num_edges);

251 num_edges = cuda_graph_get_num_edges(graph, froms.data(), tos.data());

252 std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>> edges(num_edges);

253for(size_t i=0; i<num_edges; i++) {

254 edges[i] = std::make_pair(froms[i], tos[i]);

255 }

256return edges;

257}

258

272inline cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node) {

273 cudaGraphNodeType type;

274 TF_CHECK_CUDA(

275 cudaGraphNodeGetType(node, &type), "failed to get native graph node type"

276 );

277return type;

278}

279

280// ----------------------------------------------------------------------------

281// cudaTask Types

282// ----------------------------------------------------------------------------

283

287constexpr const char* to_string(cudaGraphNodeType type) {

288switch (type) {

289case cudaGraphNodeTypeKernel: return "Kernel";

290case cudaGraphNodeTypeMemcpy: return "Memcpy";

291case cudaGraphNodeTypeMemset: return "Memset";

292case cudaGraphNodeTypeHost: return "Host";

293case cudaGraphNodeTypeGraph: return "Graph";

294case cudaGraphNodeTypeEmpty: return "Empty";

295case cudaGraphNodeTypeWaitEvent: return "WaitEvent";

296case cudaGraphNodeTypeEventRecord: return "EventRecord";

297case cudaGraphNodeTypeExtSemaphoreSignal: return "ExtSemaphoreSignal";

298case cudaGraphNodeTypeExtSemaphoreWait: return "ExtSemaphoreWait";

299case cudaGraphNodeTypeMemAlloc: return "MemAlloc";

300case cudaGraphNodeTypeMemFree: return "MemFree";

301case cudaGraphNodeTypeConditional: return "Conditional";

302default: return "undefined";

303 }

304}

305

306// ----------------------------------------------------------------------------

307// cudaTask

308// ----------------------------------------------------------------------------

309

315class cudaTask {

316

317template <typename Creator, typename Deleter>

318friend class cudaGraphBase;

319

320template <typename Creator, typename Deleter>

321friend class cudaGraphExecBase;

322

323friend class cudaFlow;

324friend class cudaFlowCapturer;

325friend class cudaFlowCapturerBase;

326

327friend std::ostream& operator <<(std::ostream&, const cudaTask&);

328

329public:

330

334cudaTask() = default;

335

339cudaTask(const cudaTask&) = default;

340

344cudaTask& operator =(const cudaTask&) = default;

345

355template <typename... Ts>

356cudaTask& precede(Ts&&... tasks);

357

367template <typename... Ts>

368cudaTask& succeed(Ts&&... tasks);

369

373size_t num_successors() const;

374

378size_t num_predecessors() const;

379

383auto type() const;

384

390void dump(std::ostream& os) const;

391

392private:

393

394cudaTask(cudaGraph_t, cudaGraphNode_t);

395

396 cudaGraph_t _native_graph {nullptr};

397 cudaGraphNode_t _native_node {nullptr};

398};

399

400// Constructor

401inline cudaTask::cudaTask(cudaGraph_t native_graph, cudaGraphNode_t native_node) :

402 _native_graph {native_graph}, _native_node {native_node} {

403}

404

405// Function: precede

406template <typename... Ts>

407cudaTask& cudaTask::precede(Ts&&... tasks) {

408 (

409cuda_graph_add_dependencies(

410 _native_graph, &_native_node, &(tasks._native_node), 1

411 ), ...

412 );

413return *this;

414}

415

416// Function: succeed

417template <typename... Ts>

418cudaTask& cudaTask::succeed(Ts&&... tasks) {

419 (tasks.precede(*this), ...);

420return *this;

421}

422

423// Function: num_predecessors

424inline size_t cudaTask::num_predecessors() const {

425return cuda_graph_node_get_dependencies(_native_node, nullptr);

426}

427

428// Function: num_successors

429inline size_t cudaTask::num_successors() const {

430return cuda_graph_node_get_dependent_nodes(_native_node, nullptr);

431}

432

433// Function: type

434inline auto cudaTask::type() const {

435 cudaGraphNodeType type;

436 cudaGraphNodeGetType(_native_node, &type);

437return type;

438}

439

440// Function: dump

441inline void cudaTask::dump(std::ostream& os) const {

442 os << "cudaTask [type=" << to_string(type()) << ']';

443}

444

448inline std::ostream& operator <<(std::ostream& os, const cudaTask& ct) {

449 ct.dump(os);

450return os;

451}

452

453// ----------------------------------------------------------------------------

454// cudaGraph

455// ----------------------------------------------------------------------------

456

465class cudaGraphCreator {

466

467public:

468

478 cudaGraph_t operator ()() const {

479 cudaGraph_t g;

480 TF_CHECK_CUDA(cudaGraphCreate(&g, 0), "failed to create a CUDA native graph");

481return g;

482 }

483

487 cudaGraph_t operator ()(cudaGraph_t graph) const {

488return graph;

489 }

490

491};

492

502class cudaGraphDeleter {

503

504public:

505

513void operator ()(cudaGraph_t g) const {

514 cudaGraphDestroy(g);

515 }

516};

517

518

530template <typename Creator, typename Deleter>

531class cudaGraphBase : public std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, cudaGraphDeleter> {

532

533static_assert(std::is_pointer_v<cudaGraph_t>, "cudaGraph_t is not a pointer type");

534

535public:

536

540using base_type = std::unique_ptr<std::remove_pointer_t<cudaGraph_t>, Deleter>;

541

549template <typename... ArgsT>

550explicit cudaGraphBase(ArgsT&& ... args) : base_type(

551 Creator{}(std::forward<ArgsT>(args)...), Deleter()

552 ) {

553 }

554

558cudaGraphBase(cudaGraphBase&&) = default;

559

563cudaGraphBase& operator =(cudaGraphBase&&) = default;

564

568size_t num_nodes() const;

569

573size_t num_edges() const;

574

578bool empty() const;

579

585void dump(std::ostream& os);

586

587// ------------------------------------------------------------------------

588// Graph building routines

589// ------------------------------------------------------------------------

590

603cudaTask noop();

604

619template <typename C>

620cudaTask host(C&& callable, void* user_data);

621

636template <typename F, typename... ArgsT>

637cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args);

638

651cudaTask memset(void* dst, int v, size_t count);

652

665cudaTask memcpy(void* tgt, const void* src, size_t bytes);

666

679template <typename T, std::enable_if_t<

680 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr

681 >

682cudaTask zero(T* dst, size_t count);

683

699template <typename T, std::enable_if_t<

700 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>* = nullptr

701 >

702cudaTask fill(T* dst, T value, size_t count);

703

718template <typename T,

719 std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr

720 >

721cudaTask copy(T* tgt, const T* src, size_t num);

722

723// ------------------------------------------------------------------------

724// generic algorithms

725// ------------------------------------------------------------------------

726

736template <typename C>

737cudaTask single_task(C c);

738

760template <typename I, typename C, typename E = cudaDefaultExecutionPolicy>

761cudaTask for_each(I first, I last, C callable);

762

791template <typename I, typename C, typename E = cudaDefaultExecutionPolicy>

792cudaTask for_each_index(I first, I last, I step, C callable);

793

817template <typename I, typename O, typename C, typename E = cudaDefaultExecutionPolicy>

818cudaTask transform(I first, I last, O output, C op);

819

846template <typename I1, typename I2, typename O, typename C, typename E = cudaDefaultExecutionPolicy>

847cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op);

848

849private:

850

851cudaGraphBase(const cudaGraphBase&) = delete;

852cudaGraphBase& operator =(const cudaGraphBase&) = delete;

853};

854

855// query the number of nodes

856template <typename Creator, typename Deleter>

857size_t cudaGraphBase<Creator, Deleter>::num_nodes() const {

858size_t n;

859 TF_CHECK_CUDA(

860 cudaGraphGetNodes(this->get(), nullptr, &n),

861"failed to get native graph nodes"

862 );

863return n;

864}

865

866// query the emptiness

867template <typename Creator, typename Deleter>

868bool cudaGraphBase<Creator, Deleter>::empty() const {

869return num_nodes() == 0;

870}

871

872// query the number of edges

873template <typename Creator, typename Deleter>

874size_t cudaGraphBase<Creator, Deleter>::num_edges() const {

875return cuda_graph_get_num_edges(this->get());

876}

877

879//inline void cudaGraph::dump(std::ostream& os) {

880//

881// // acquire the native handle

882// auto g = this->get();

883//

884// os << "digraph cudaGraph {\n";

885//

886// std::stack<std::tuple<cudaGraph_t, cudaGraphNode_t, int>> stack;

887// stack.push(std::make_tuple(g, nullptr, 1));

888//

889// int pl = 0;

890//

891// while(stack.empty() == false) {

892//

893// auto [graph, parent, l] = stack.top();

894// stack.pop();

895//

896// for(int i=0; i<pl-l+1; i++) {

897// os << "}\n";

898// }

899//

900// os << "subgraph cluster_p" << graph << " {\n"

901// << "label="cudaGraph-L" << l << "";\n"

902// << "color="purple";\n";

903//

904// auto nodes = cuda_graph_get_nodes(graph);

905// auto edges = cuda_graph_get_edges(graph);

906//

907// for(auto& [from, to] : edges) {

908// os << 'p' << from << " -> " << 'p' << to << ";\n";

909// }

910//

911// for(auto& node : nodes) {

912// auto type = cuda_get_graph_node_type(node);

913// if(type == cudaGraphNodeTypeGraph) {

914//

915// cudaGraph_t child_graph;

916// TF_CHECK_CUDA(cudaGraphChildGraphNodeGetGraph(node, &child_graph), "");

917// stack.push(std::make_tuple(child_graph, node, l+1));

918//

919// os << 'p' << node << "["

920// << "shape=folder, style=filled, fontcolor=white, fillcolor=purple, "

921// << "label="cudaGraph-L" << l+1

922// << ""];\n";

923// }

924// else {

925// os << 'p' << node << "[label=""

926// << to_string(type)

927// << ""];\n";

928// }

929// }

930//

931// // precede to parent

932// if(parent != nullptr) {

933// std::unordered_set<cudaGraphNode_t> successors;

934// for(const auto& p : edges) {

935// successors.insert(p.first);

936// }

937// for(auto node : nodes) {

938// if(successors.find(node) == successors.end()) {

939// os << 'p' << node << " -> " << 'p' << parent << ";\n";

940// }

941// }

942// }

943//

944// // set the previous level

945// pl = l;

946// }

947//

948// for(int i=0; i<=pl; i++) {

949// os << "}\n";

950// }

951//}

952

953// dump the graph

954template <typename Creator, typename Deleter>

955void cudaGraphBase<Creator, Deleter>::dump(std::ostream& os) {

956

957// Generate a unique temporary filename in the system's temp directory using filesystem

958auto temp_path = std::filesystem::temp_directory_path() / "graph_";

959 std::random_device rd;

960 std::uniform_int_distribution<int> dist(100000, 999999); // Generates a random number

961 temp_path += std::to_string(dist(rd)) + ".dot";

962

963// Call the original function with the temporary file

964 TF_CHECK_CUDA(cudaGraphDebugDotPrint(this->get(), temp_path.string().c_str(), 0), "");

965

966// Read the file and write to the output stream

967 std::ifstream file(temp_path);

968if (file) {

969 os << file.rdbuf(); // Copy file contents to the stream

970 file.close();

971 std::filesystem::remove(temp_path); // Clean up the temporary file

972 } else {

973 TF_THROW("failed to open ", temp_path, " for dumping the CUDA graph");

974 }

975}

976

977// Function: noop

978template <typename Creator, typename Deleter>

979cudaTask cudaGraphBase<Creator, Deleter>::noop() {

980

981 cudaGraphNode_t node;

982

983 TF_CHECK_CUDA(

984 cudaGraphAddEmptyNode(&node, this->get(), nullptr, 0),

985"failed to create a no-operation (empty) node"

986 );

987

988return cudaTask(this->get(), node);

989}

990

991// Function: host

992template <typename Creator, typename Deleter>

993template <typename C>

994cudaTask cudaGraphBase<Creator, Deleter>::host(C&& callable, void* user_data) {

995

996 cudaGraphNode_t node;

997 cudaHostNodeParams p {callable, user_data};

998

999 TF_CHECK_CUDA(

1000 cudaGraphAddHostNode(&node, this->get(), nullptr, 0, &p),

1001"failed to create a host node"

1002 );

1003

1004return cudaTask(this->get(), node);

1005}

1006

1007// Function: kernel

1008template <typename Creator, typename Deleter>

1009template <typename F, typename... ArgsT>

1010cudaTask cudaGraphBase<Creator, Deleter>::kernel(

1011 dim3 g, dim3 b, size_t s, F f, ArgsT... args

  1. {

1013

1014 cudaGraphNode_t node;

1015 cudaKernelNodeParams p;

1016

1017void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };

1018

1019 p.func = (void*)f;

1020 p.gridDim = g;

1021 p.blockDim = b;

1022 p.sharedMemBytes = s;

1023 p.kernelParams = arguments;

1024 p.extra = nullptr;

1025

1026 TF_CHECK_CUDA(

1027 cudaGraphAddKernelNode(&node, this->get(), nullptr, 0, &p),

1028"failed to create a kernel task"

1029 );

1030

1031return cudaTask(this->get(), node);

1032}

1033

1034// Function: zero

1035template <typename Creator, typename Deleter>

1036template <typename T, std::enable_if_t<

1037 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*

1038>

1039cudaTask cudaGraphBase<Creator, Deleter>::zero(T* dst, size_t count) {

1040

1041 cudaGraphNode_t node;

1042auto p = cuda_get_zero_parms(dst, count);

1043

1044 TF_CHECK_CUDA(

1045 cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),

1046"failed to create a memset (zero) task"

1047 );

1048

1049return cudaTask(this->get(), node);

1050}

1051

1052// Function: fill

1053template <typename Creator, typename Deleter>

1054template <typename T, std::enable_if_t<

1055 is_pod_v<T> && (sizeof(T)==1 || sizeof(T)==2 || sizeof(T)==4), void>*

1056>

1057cudaTask cudaGraphBase<Creator, Deleter>::fill(T* dst, T value, size_t count) {

1058

1059 cudaGraphNode_t node;

1060auto p = cuda_get_fill_parms(dst, value, count);

1061 TF_CHECK_CUDA(

1062 cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),

1063"failed to create a memset (fill) task"

1064 );

1065

1066return cudaTask(this->get(), node);

1067}

1068

1069// Function: copy

1070template <typename Creator, typename Deleter>

1071template <

1072typename T,

1073 std::enable_if_t<!std::is_same_v<T, void>, void>*

1074>

1075cudaTask cudaGraphBase<Creator, Deleter>::copy(T* tgt, const T* src, size_t num) {

1076

1077 cudaGraphNode_t node;

1078auto p = cuda_get_copy_parms(tgt, src, num);

1079

1080 TF_CHECK_CUDA(

1081 cudaGraphAddMemcpyNode(&node, this->get(), nullptr, 0, &p),

1082"failed to create a memcpy (copy) task"

1083 );

1084

1085return cudaTask(this->get(), node);

1086}

1087

1088// Function: memset

1089template <typename Creator, typename Deleter>

1090cudaTask cudaGraphBase<Creator, Deleter>::memset(void* dst, int ch, size_t count) {

1091

1092 cudaGraphNode_t node;

1093auto p = cuda_get_memset_parms(dst, ch, count);

1094

1095 TF_CHECK_CUDA(

1096 cudaGraphAddMemsetNode(&node, this->get(), nullptr, 0, &p),

1097"failed to create a memset task"

1098 );

1099

1100return cudaTask(this->get(), node);

1101}

1102

1103// Function: memcpy

1104template <typename Creator, typename Deleter>

1105cudaTask cudaGraphBase<Creator, Deleter>::memcpy(void* tgt, const void* src, size_t bytes) {

1106

1107 cudaGraphNode_t node;

1108auto p = cuda_get_memcpy_parms(tgt, src, bytes);

1109

1110 TF_CHECK_CUDA(

1111 cudaGraphAddMemcpyNode(&node, this->get(), nullptr, 0, &p),

1112"failed to create a memcpy task"

1113 );

1114

1115return cudaTask(this->get(), node);

1116}

1117

1118

1119

1120

1121

1122} // end of namespace tf -----------------------------------------------------

1123

1124

1125

1126

tf::cudaGraphBase::copy

cudaTask copy(T *tgt, const T *src, size_t num)

creates a memcpy task that copies typed data

Definition cuda_graph.hpp:1075

tf::cudaGraphBase::num_edges

size_t num_edges() const

queries the number of edges in a native CUDA graph

Definition cuda_graph.hpp:874

tf::cudaGraphBase::for_each

cudaTask for_each(I first, I last, C callable)

applies a callable to each dereferenced element of the data array

Definition for_each.hpp:53

tf::cudaGraphBase::memset

cudaTask memset(void *dst, int v, size_t count)

creates a memset task that fills untyped data with a byte value

Definition cuda_graph.hpp:1090

tf::cudaGraphBase::kernel

cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)

creates a kernel task

Definition cuda_graph.hpp:1010

tf::cudaGraphBase::fill

cudaTask fill(T *dst, T value, size_t count)

creates a memset task that fills a typed memory block with a value

Definition cuda_graph.hpp:1057

tf::cudaGraphBase::cudaGraphBase

cudaGraphBase(cudaGraphBase &&)=default

constructs a cudaGraph from the given rhs using move semantics

tf::cudaGraphBase::host

cudaTask host(C &&callable, void *user_data)

creates a host task that runs a callable on the host

Definition cuda_graph.hpp:994

tf::cudaGraphBase::empty

bool empty() const

queries if the graph is empty

Definition cuda_graph.hpp:868

tf::cudaGraphBase::memcpy

cudaTask memcpy(void *tgt, const void *src, size_t bytes)

creates a memcpy task that copies untyped data in bytes

Definition cuda_graph.hpp:1105

tf::cudaGraphBase::cudaGraphBase

cudaGraphBase(ArgsT &&... args)

constructs a cudaGraph object by passing the given arguments to the CUDA graph creator

Definition cuda_graph.hpp:550

tf::cudaGraphBase::base_type

std::unique_ptr< std::remove_pointer_t< cudaGraph_t >, Deleter > base_type

base std::unique_ptr type

Definition cuda_graph.hpp:540

tf::cudaGraphBase::transform

cudaTask transform(I first, I last, O output, C op)

applies a callable to a source range and stores the result in a target range

Definition transform.hpp:65

tf::cudaGraphBase::zero

cudaTask zero(T *dst, size_t count)

creates a memset task that sets a typed memory block to zero

Definition cuda_graph.hpp:1039

tf::cudaGraphBase::single_task

cudaTask single_task(C c)

runs a callable with only a single kernel thread

tf::cudaGraphBase::dump

void dump(std::ostream &os)

dumps the CUDA graph to a DOT format through the given output stream

Definition cuda_graph.hpp:955

tf::cudaGraphBase::transform

cudaTask transform(I1 first1, I1 last1, I2 first2, O output, C op)

creates a task to perform parallel transforms over two ranges of items

Definition transform.hpp:79

tf::cudaGraphBase::for_each_index

cudaTask for_each_index(I first, I last, I step, C callable)

applies a callable to each index in the range with the step size

Definition for_each.hpp:79

tf::cudaGraphBase::num_nodes

size_t num_nodes() const

queries the number of nodes in a native CUDA graph

Definition cuda_graph.hpp:857

tf::cudaGraphBase::operator=

cudaGraphBase & operator=(cudaGraphBase &&)=default

assign the rhs to *this using move semantics

tf::cudaGraphBase::noop

cudaTask noop()

creates a no-operation task

Definition cuda_graph.hpp:979

tf::cudaGraphCreator

class to create functors that construct CUDA graphs

Definition cuda_graph.hpp:465

tf::cudaGraphCreator::operator()

cudaGraph_t operator()() const

creates a new CUDA graph

Definition cuda_graph.hpp:478

tf::cudaGraphDeleter

class to create a functor that deletes a CUDA graph

Definition cuda_graph.hpp:502

tf::cudaGraphDeleter::operator()

void operator()(cudaGraph_t g) const

deletes a CUDA graph

Definition cuda_graph.hpp:513

tf::cudaTask

class to create a task handle of a CUDA Graph node

Definition cuda_graph.hpp:315

tf::cudaTask::cudaTask

cudaTask(const cudaTask &)=default

copy-constructs a cudaTask

tf::cudaTask::succeed

cudaTask & succeed(Ts &&... tasks)

adds precedence links from other tasks to this

Definition cuda_graph.hpp:418

tf::cudaTask::operator<<

friend std::ostream & operator<<(std::ostream &, const cudaTask &)

overload of ostream inserter operator for cudaTask

Definition cuda_graph.hpp:448

tf::cudaTask::num_predecessors

size_t num_predecessors() const

queries the number of predecessors

Definition cuda_graph.hpp:424

tf::cudaTask::num_successors

size_t num_successors() const

queries the number of successors

Definition cuda_graph.hpp:429

tf::cudaTask::cudaTask

cudaTask()=default

constructs an empty cudaTask

tf::cudaTask::type

auto type() const

queries the type of this task

Definition cuda_graph.hpp:434

tf::cudaTask::operator=

cudaTask & operator=(const cudaTask &)=default

copy-assigns a cudaTask

tf::cudaTask::precede

cudaTask & precede(Ts &&... tasks)

adds precedence links from this to other tasks

Definition cuda_graph.hpp:407

tf::cudaTask::dump

void dump(std::ostream &os) const

dumps the task through an output stream

Definition cuda_graph.hpp:441

tf

taskflow namespace

Definition small_vector.hpp:20

tf::to_string

const char * to_string(TaskType type)

convert a task type to a human-readable string

Definition task.hpp:66

tf::cuda_get_zero_parms

cudaMemsetParams cuda_get_zero_parms(T *dst, size_t count)

gets the memset node parameter of a zero task (typed)

Definition cuda_graph.hpp:114

tf::cuda_graph_get_root_nodes

std::vector< cudaGraphNode_t > cuda_graph_get_root_nodes(cudaGraph_t graph)

acquires the root nodes in a native CUDA graph

Definition cuda_graph.hpp:234

tf::cuda_graph_node_get_dependencies

size_t cuda_graph_node_get_dependencies(cudaGraphNode_t node, cudaGraphNode_t *dependencies)

Handles compatibility with CUDA <= 12.x and CUDA 13.

Definition cuda_graph.hpp:170

tf::cuda_graph_node_get_dependent_nodes

size_t cuda_graph_node_get_dependent_nodes(cudaGraphNode_t node, cudaGraphNode_t *dependent_nodes)

Handles compatibility with CUDA <= 12.x and CUDA 13.

Definition cuda_graph.hpp:185

tf::cuda_graph_get_nodes

std::vector< cudaGraphNode_t > cuda_graph_get_nodes(cudaGraph_t graph)

acquires the nodes in a native CUDA graph

Definition cuda_graph.hpp:221

tf::cuda_get_memcpy_parms

cudaMemcpy3DParms cuda_get_memcpy_parms(void *tgt, const void *src, size_t bytes)

gets the memcpy node parameter of a memcpy task (untyped)

Definition cuda_graph.hpp:44

tf::cuda_graph_get_num_nodes

size_t cuda_graph_get_num_nodes(cudaGraph_t graph)

queries the number of nodes in a native CUDA graph

Definition cuda_graph.hpp:142

tf::cuda_graph_get_num_root_nodes

size_t cuda_graph_get_num_root_nodes(cudaGraph_t graph)

queries the number of root nodes in a native CUDA graph

Definition cuda_graph.hpp:130

tf::cuda_get_memset_parms

cudaMemsetParams cuda_get_memset_parms(void *dst, int ch, size_t count)

gets the memset node parameter of a memset task (untyped)

Definition cuda_graph.hpp:69

tf::cuda_get_fill_parms

cudaMemsetParams cuda_get_fill_parms(T *dst, T value, size_t count)

gets the memset node parameter of a fill task (typed)

Definition cuda_graph.hpp:90

tf::operator<<

std::ostream & operator<<(std::ostream &os, const Task &task)

overload of ostream inserter operator for Task

Definition task.hpp:1532

tf::cuda_graph_get_num_edges

size_t cuda_graph_get_num_edges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to)

Handles compatibility with CUDA <= 12.x and CUDA == 13.x.

Definition cuda_graph.hpp:154

tf::cuda_get_copy_parms

cudaMemcpy3DParms cuda_get_copy_parms(T *tgt, const T *src, size_t num)

gets the memcpy node parameter of a copy task

Definition cuda_graph.hpp:23

tf::cuda_graph_add_dependencies

void cuda_graph_add_dependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies)

Handles compatibility with CUDA <= 12.x and CUDA 13.

Definition cuda_graph.hpp:201

tf::cuda_graph_get_edges

std::vector< std::pair< cudaGraphNode_t, cudaGraphNode_t > > cuda_graph_get_edges(cudaGraph_t graph)

acquires the edges in a native CUDA graph

Definition cuda_graph.hpp:248

tf::cuda_get_graph_node_type

cudaGraphNodeType cuda_get_graph_node_type(cudaGraphNode_t node)

queries the type of a native CUDA graph node

Definition cuda_graph.hpp:272