Back to Taskflow

CUDA Kernels

docs/KMeansWithCUDAGPU.html

4.1.09.7 KB
Original Source

| | Taskflow: A General-purpose Task-parallel Programming System |

Loading...

Searching...

No Matches

k-means Clustering with CUDA GPU

Following k-means Clustering, we accelerate k-means clustering on a CUDA GPU using tf::cudaGraph. The GPU's massive thread-level parallelism makes the assignment step and centroid update step much faster, achieving over 12× speed-up over the parallel CPU implementation at large problem sizes.

CUDA Kernels

Both the assignment step and the centroid update step are parallelisable at the GPU thread level. We write two CUDA kernels: one that assigns each point to its nearest centroid (one thread per point), and one that recomputes each centroid mean (one thread per centroid):

// assign each point to its nearest centroid

// one thread per point

__global__ void assign_clusters(

float* px, float* py, int N,

float* mx, float* my, float* sx, float* sy, int K, int* c

) {

const int i = blockIdx.x * blockDim.x + threadIdx.x;

if(i >= N) return;

float x = px[i], y = py[i];

float best_d = FLT_MAX;

int best_k = 0;

for(int k = 0; k < K; k++) {

float d = L2(x, y, mx[k], my[k]);

if(d < best_d) { best_d = d; best_k = k; }

}

// use atomics because multiple threads update the same centroid bucket

atomicAdd(&sx[best_k], x);

atomicAdd(&sy[best_k], y);

atomicAdd(&c [best_k], 1);

}

// recompute each centroid as the mean of its assigned points

// one thread per centroid

__global__ void compute_new_means(

float* mx, float* my, float* sx, float* sy, int* c

) {

int k = threadIdx.x;

int count = max(1, c[k]); // guard against empty clusters

mx[k] = sx[k] / count;

my[k] = sy[k] / count;

}

Because multiple GPU threads may update the same centroid bucket in assign_clusters, we use atomic operations (atomicAdd) on sx, sy, and c to avoid data races.

CUDA Graph Task

We build a Taskflow that allocates GPU memory in parallel, copies data to the GPU, runs the CUDA graph for M iterations, copies results back, and frees GPU memory:

void kmeans_gpu(

int N, int K, int M,

const std::vector<float>& px, const std::vector<float>& py

) {

std::vector<float> h_mx(px.begin(), px.begin() + K);

std::vector<float> h_my(py.begin(), py.begin() + K);

float *d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy;

int *d_c;

tf::Executor executor;

tf::Taskflow taskflow("K-Means GPU");

// allocate GPU memory for all arrays in parallel

auto alloc_px = taskflow.emplace(& {

cudaMalloc(&d_px, N * sizeof(float));

}).name("alloc_px");

auto alloc_py = taskflow.emplace(& {

cudaMalloc(&d_py, N * sizeof(float));

}).name("alloc_py");

auto alloc_mx = taskflow.emplace(& {

cudaMalloc(&d_mx, K * sizeof(float));

}).name("alloc_mx");

auto alloc_my = taskflow.emplace(& {

cudaMalloc(&d_my, K * sizeof(float));

}).name("alloc_my");

auto alloc_sx = taskflow.emplace(& {

cudaMalloc(&d_sx, K * sizeof(float));

}).name("alloc_sx");

auto alloc_sy = taskflow.emplace(& {

cudaMalloc(&d_sy, K * sizeof(float));

}).name("alloc_sy");

auto alloc_c = taskflow.emplace(& {

cudaMalloc(&d_c, K * sizeof(int));

}).name("alloc_c");

// copy point data and initial centroids to GPU

auto h2d = taskflow.emplace(& {

cudaMemcpy(d_px, px.data(), N * sizeof(float), cudaMemcpyDefault);

cudaMemcpy(d_py, py.data(), N * sizeof(float), cudaMemcpyDefault);

cudaMemcpy(d_mx, h_mx.data(), K * sizeof(float), cudaMemcpyDefault);

cudaMemcpy(d_my, h_my.data(), K * sizeof(float), cudaMemcpyDefault);

}).name("h2d");

// build and run the CUDA graph for M iterations

auto kmeans = taskflow.emplace(& {

tf::cudaGraph cg;

// zero accumulator arrays before each iteration

auto zero_sx = cg.zero(d_sx, K);

auto zero_sy = cg.zero(d_sy, K);

auto zero_c = cg.zero(d_c, K);

// one thread per point for assignment

auto cluster = cg.kernel(

(N + 511) / 512, 512, 0,

assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c

);

// one thread per centroid for update

auto new_means = cg.kernel(

1, K, 0,

compute_new_means, d_mx, d_my, d_sx, d_sy, d_c

);

cluster.succeed(zero_sx, zero_sy, zero_c)

.precede(new_means);

// instantiate and run the CUDA graph M times on a single stream

tf::cudaStream stream;

tf::cudaGraphExec exec(cg);

for(int i = 0; i < M; i++) {

stream.run(exec);

}

stream.synchronize();

}).name("update_means");

// copy final centroids back to host

auto d2h = taskflow.emplace(& {

cudaMemcpy(h_mx.data(), d_mx, K * sizeof(float), cudaMemcpyDefault);

cudaMemcpy(h_my.data(), d_my, K * sizeof(float), cudaMemcpyDefault);

}).name("d2h");

// free all GPU memory

auto free_mem = taskflow.emplace(& {

cudaFree(d_px); cudaFree(d_py);

cudaFree(d_mx); cudaFree(d_my);

cudaFree(d_sx); cudaFree(d_sy);

cudaFree(d_c);

}).name("free");

// wire dependencies

h2d.succeed(alloc_px, alloc_py, alloc_mx, alloc_my);

kmeans.succeed(alloc_sx, alloc_sy, alloc_c, h2d)

.precede(d2h);

d2h.precede(free_mem);

executor.run(taskflow).wait();

}

tf::Executor

class to create an executor

Definition executor.hpp:62

tf::Executor::run

tf::Future< void > run(Taskflow &taskflow)

runs a taskflow once

tf::Taskflow

class to create a taskflow object

Definition taskflow.hpp:64

tf::cudaGraphBase::kernel

cudaTask kernel(dim3 g, dim3 b, size_t s, F f, ArgsT... args)

creates a kernel task

Definition cuda_graph.hpp:1010

tf::cudaGraphBase::zero

cudaTask zero(T *dst, size_t count)

creates a memset task that sets a typed memory block to zero

Definition cuda_graph.hpp:1039

tf::cudaStreamBase::synchronize

cudaStreamBase & synchronize()

synchronizes the associated stream

Definition cuda_stream.hpp:232

tf::cudaStreamBase::run

cudaStreamBase & run(const cudaGraphExecBase< C, D > &exec)

runs the given executable CUDA graph

tf::cudaTask::succeed

cudaTask & succeed(Ts &&... tasks)

adds precedence links from other tasks to this

Definition cuda_graph.hpp:418

tf::cudaTask::precede

cudaTask & precede(Ts &&... tasks)

adds precedence links from this to other tasks

Definition cuda_graph.hpp:407

tf::cudaGraphExec

cudaGraphExecBase< cudaGraphExecCreator, cudaGraphExecDeleter > cudaGraphExec

default smart pointer type to manage a cudaGraphExec_t object with unique ownership

Definition cudaflow.hpp:23

tf::cudaGraph

cudaGraphBase< cudaGraphCreator, cudaGraphDeleter > cudaGraph

default smart pointer type to manage a cudaGraph_t object with unique ownership

Definition cudaflow.hpp:18

tf::cudaStream

cudaStreamBase< cudaStreamCreator, cudaStreamDeleter > cudaStream

default smart pointer type to manage a cudaStream_t object with unique ownership

Definition cuda_stream.hpp:340

The outer Taskflow orchestrates all CPU-side work: allocation tasks run in parallel where possible, the CUDA graph task runs M iterations entirely on the GPU without returning to the CPU between iterations, and finally results are copied back and memory is freed.

The task graph structure is shown below:

Embedded content

Benchmarking

Runtime comparison across three implementations on a 12-core Intel i7-8700 at 3.20 GHz and a Nvidia RTX 2080:

NKMCPU sequentialCPU parallelGPU
105100.14 ms77 ms1 ms
100101000.56 ms86 ms7 ms
100010100010 ms98 ms55 ms
1000010100001006 ms713 ms458 ms
10000010100000102483 ms49966 ms7952 ms

For small problem sizes, kernel launch and data transfer overhead makes the GPU slower than the CPU. At N = 100K, the GPU is over 12× faster than the parallel CPU and over 12× faster than the sequential CPU, demonstrating the value of offloading data-intensive inner loops to the GPU.