docs/KMeansWithCUDAGPU.html
Following up on k-means Clustering, this page studies how to accelerate a k-means workload on a GPU using tf::cudaGraph.
Recall that the k-means algorithm has the following steps:

1. Step 1: pick K points as the initial centroids (here, simply the first K points).
2. Step 2: for every point, find its nearest centroid and assign the point to that centroid's cluster.
3. Step 3: for every cluster, recompute its centroid as the mean of all points assigned to it.
4. Step 4: repeat Step 2 and Step 3 until the centroids converge or a fixed number of iterations (M) is reached.
We observe that Step 2 and Step 3 of the algorithm are parallelizable across individual points and centroids, allowing us to harness the power of the GPU:
At a fine-grained level, we request one GPU thread to work on one point for Step 2 and one GPU thread to work on one centroid for Step 3.
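For reference, the following sequential sketch of Step 2 and Step 3 (not part of the original tutorial; the helper name assign_and_update_cpu is hypothetical) shows the loops this mapping parallelizes: every iteration of the point loop becomes one GPU thread in Step 2, and every iteration of the cluster loop becomes one GPU thread in Step 3.

```cpp
#include <algorithm>
#include <limits>
#include <vector>

// Sequential reference for Step 2 and Step 3 (illustrative only).
// px/py: points, mx/my: centroids.
void assign_and_update_cpu(
  int N, int K,
  const std::vector<float>& px, const std::vector<float>& py,
  std::vector<float>& mx, std::vector<float>& my
) {
  std::vector<float> sx(K, 0.0f), sy(K, 0.0f);  // per-cluster coordinate sums
  std::vector<int> c(K, 0);                     // per-cluster point counts

  // Step 2: each point independently finds its nearest centroid
  for (int i = 0; i < N; ++i) {                 // -> one GPU thread per point
    float best_d = std::numeric_limits<float>::max();
    int best_k = 0;
    for (int k = 0; k < K; ++k) {
      float dx = px[i] - mx[k], dy = py[i] - my[k];
      float d = dx*dx + dy*dy;
      if (d < best_d) { best_d = d; best_k = k; }
    }
    sx[best_k] += px[i];
    sy[best_k] += py[i];
    c[best_k]  += 1;
  }

  // Step 3: each centroid independently becomes the mean of its cluster
  for (int k = 0; k < K; ++k) {                 // -> one GPU thread per centroid
    int count = std::max(1, c[k]);              // avoid division by zero
    mx[k] = sx[k] / count;
    my[k] = sy[k] / count;
  }
}
```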
```cpp
#include <cfloat>  // FLT_MAX

// L2: squared Euclidean distance between two 2D points
__device__ float L2(float x1, float y1, float x2, float y2) {
  return (x1-x2)*(x1-x2) + (y1-y2)*(y1-y2);
}

// px/py: 2D points
// N: number of points
// mx/my: centroids
// K: number of clusters
// sx/sy/c: storage to compute the average
__global__ void assign_clusters(
  float* px, float* py, int N,
  float* mx, float* my,
  float* sx, float* sy, int K, int* c
) {
  const int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index >= N) {
    return;
  }

  // Make global loads once.
  float x = px[index];
  float y = py[index];

  float best_d = FLT_MAX;
  int best_k = 0;
  for (int k = 0; k < K; ++k) {
    float d = L2(x, y, mx[k], my[k]);
    if (d < best_d) {
      best_d = d;
      best_k = k;
    }
  }

  atomicAdd(&sx[best_k], x);
  atomicAdd(&sy[best_k], y);
  atomicAdd(&c[best_k], 1);
}

// mx/my: centroids, sx/sy/c: storage to compute the average
__global__ void compute_new_means(
  float* mx, float* my, float* sx, float* sy, int* c
) {
  int k = threadIdx.x;
  int count = max(1, c[k]);  // turn 0/0 into 0/1
  mx[k] = sx[k] / count;
  my[k] = sy[k] / count;
}
```
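Before wiring the kernels into a CUDA graph, it may help to see how they would be launched directly with plain CUDA for a single iteration. The sketch below is illustrative, assumes the device buffers have already been allocated and filled, and uses the same launch shape (one thread per point, one thread per centroid) as the CUDA graph later on.

```cpp
// Illustrative direct launch of the two kernels for one k-means iteration.
// Assumes d_px/d_py (N points), d_mx/d_my (K centroids), and the scratch
// buffers d_sx/d_sy/d_c already live on the device.
void one_iteration(float* d_px, float* d_py, int N,
                   float* d_mx, float* d_my,
                   float* d_sx, float* d_sy, int* d_c, int K) {
  // reset the per-cluster sums and counts
  cudaMemset(d_sx, 0, K*sizeof(float));
  cudaMemset(d_sy, 0, K*sizeof(float));
  cudaMemset(d_c,  0, K*sizeof(int));

  // Step 2: one thread per point
  assign_clusters<<<(N+512-1)/512, 512>>>(
    d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c
  );

  // Step 3: one thread per centroid
  compute_new_means<<<1, K>>>(d_mx, d_my, d_sx, d_sy, d_c);

  cudaDeviceSynchronize();
}
```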
When we recompute the cluster centroids to be the mean of all points assigned to a particular centroid, multiple GPU threads may access the sum arrays, sx and sy, and the count array, c. To avoid data race, we use a simple atomicAdd method.
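If atomic contention on the global arrays sx, sy, and c ever becomes a bottleneck for very large N, one common refinement (not part of this tutorial) is to accumulate per-block partial sums in shared memory first and issue only one global atomicAdd per cluster per block. A rough sketch, assuming K is small enough for the per-block accumulators to fit in shared memory (the kernel name is hypothetical):

```cpp
// Variant of assign_clusters that reduces global atomic traffic by first
// accumulating per-block partial sums in shared memory (illustrative only).
// Launch with 2*K*sizeof(float) + K*sizeof(int) bytes of dynamic shared memory.
__global__ void assign_clusters_block_reduced(
  float* px, float* py, int N,
  float* mx, float* my,
  float* sx, float* sy, int K, int* c
) {
  extern __shared__ float shm[];          // layout: [K sx | K sy | K counts]
  float* bsx = shm;
  float* bsy = shm + K;
  int*   bc  = reinterpret_cast<int*>(shm + 2*K);

  // zero the per-block accumulators
  for (int k = threadIdx.x; k < K; k += blockDim.x) {
    bsx[k] = 0.0f; bsy[k] = 0.0f; bc[k] = 0;
  }
  __syncthreads();

  const int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < N) {
    float x = px[index], y = py[index];
    float best_d = FLT_MAX;
    int best_k = 0;
    for (int k = 0; k < K; ++k) {
      float d = L2(x, y, mx[k], my[k]);
      if (d < best_d) { best_d = d; best_k = k; }
    }
    // shared-memory atomics are much cheaper than global ones
    atomicAdd(&bsx[best_k], x);
    atomicAdd(&bsy[best_k], y);
    atomicAdd(&bc[best_k], 1);
  }
  __syncthreads();

  // one global atomicAdd per cluster per block
  for (int k = threadIdx.x; k < K; k += blockDim.x) {
    atomicAdd(&sx[k], bsx[k]);
    atomicAdd(&sy[k], bsy[k]);
    atomicAdd(&c[k],  bc[k]);
  }
}
```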
Based on the two kernels, we can define a CUDA graph for the k-means workload below:
```cpp
// N: number of points
// K: number of clusters
// M: number of iterations
// h_px/h_py: 2D point vectors
std::pair<std::vector<float>, std::vector<float>> kmeans_gpu(
  int N, int K, int M,
  const std::vector<float>& h_px, const std::vector<float>& h_py
) {

  std::vector<float> h_mx, h_my;
  float *d_px, *d_py, *d_mx, *d_my, *d_sx, *d_sy;
  int *d_c;

  // use the first K points as the initial centroids
  for (int i = 0; i < K; ++i) {
    h_mx.push_back(h_px[i]);
    h_my.push_back(h_py[i]);
  }

  // create a taskflow graph
  tf::Executor executor;
  tf::Taskflow taskflow("K-Means");

  auto allocate_px = taskflow.emplace([&](){
    TF_CHECK_CUDA(cudaMalloc(&d_px, N*sizeof(float)), "failed to allocate d_px");
  }).name("allocate_px");

  auto allocate_py = taskflow.emplace([&](){
    TF_CHECK_CUDA(cudaMalloc(&d_py, N*sizeof(float)), "failed to allocate d_py");
  }).name("allocate_py");

  auto allocate_mx = taskflow.emplace([&](){
    TF_CHECK_CUDA(cudaMalloc(&d_mx, K*sizeof(float)), "failed to allocate d_mx");
  }).name("allocate_mx");

  auto allocate_my = taskflow.emplace([&](){
    TF_CHECK_CUDA(cudaMalloc(&d_my, K*sizeof(float)), "failed to allocate d_my");
  }).name("allocate_my");

  auto allocate_sx = taskflow.emplace([&](){
    TF_CHECK_CUDA(cudaMalloc(&d_sx, K*sizeof(float)), "failed to allocate d_sx");
  }).name("allocate_sx");

  auto allocate_sy = taskflow.emplace([&](){
    TF_CHECK_CUDA(cudaMalloc(&d_sy, K*sizeof(float)), "failed to allocate d_sy");
  }).name("allocate_sy");

  auto allocate_c = taskflow.emplace([&](){
    TF_CHECK_CUDA(cudaMalloc(&d_c, K*sizeof(int)), "failed to allocate d_c");
  }).name("allocate_c");

  auto h2d = taskflow.emplace([&](){
    cudaMemcpy(d_px, h_px.data(), N*sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(d_py, h_py.data(), N*sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(d_mx, h_mx.data(), K*sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(d_my, h_my.data(), K*sizeof(float), cudaMemcpyDefault);
  }).name("h2d");

  auto kmeans = taskflow.emplace([&](){

    tf::cudaGraph cg;

    auto zero_c  = cg.zero(d_c,  K);
    auto zero_sx = cg.zero(d_sx, K);
    auto zero_sy = cg.zero(d_sy, K);

    auto cluster = cg.kernel(
      (N+512-1) / 512, 512, 0,
      assign_clusters, d_px, d_py, N, d_mx, d_my, d_sx, d_sy, K, d_c
    );

    auto new_centroid = cg.kernel(
      1, K, 0,
      compute_new_means, d_mx, d_my, d_sx, d_sy, d_c
    );

    cluster.precede(new_centroid)
           .succeed(zero_c, zero_sx, zero_sy);

    // dump the CUDA graph
    cg.dump(std::cout);

    // instantiate an executable CUDA graph
    tf::cudaGraphExec exec(cg);

    // repeat the execution for M times and then synchronize
    tf::cudaStream stream;
    for (int i = 0; i < M; i++) {
      stream.run(exec);
    }
    stream.synchronize();

  }).name("update_means");

  auto d2h = taskflow.emplace([&](){
    cudaMemcpy(h_mx.data(), d_mx, K*sizeof(float), cudaMemcpyDefault);
    cudaMemcpy(h_my.data(), d_my, K*sizeof(float), cudaMemcpyDefault);
  }).name("d2h");

  auto free = taskflow.emplace([&](){
    cudaFree(d_px);
    cudaFree(d_py);
    cudaFree(d_mx);
    cudaFree(d_my);
    cudaFree(d_sx);
    cudaFree(d_sy);
    cudaFree(d_c);
  }).name("free");

  // build up the dependency
  h2d.succeed(allocate_px, allocate_py, allocate_mx, allocate_my);
  kmeans.succeed(allocate_sx, allocate_sy, allocate_c, h2d)
        .precede(d2h);
  d2h.precede(free);

  // run the taskflow
  executor.run(taskflow).wait();

  return {h_mx, h_my};
}
```
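A minimal driver for kmeans_gpu might look like the sketch below; the random point generation and the particular values of N, K, and M are illustrative and not part of the original tutorial.

```cpp
#include <iostream>
#include <random>
#include <vector>

int main() {
  const int N = 10000;   // number of points (illustrative)
  const int K = 10;      // number of clusters
  const int M = 1000;    // number of iterations

  // generate N random 2D points in [0, 1) x [0, 1)
  std::vector<float> h_px(N), h_py(N);
  std::mt19937 gen(0);
  std::uniform_real_distribution<float> dis(0.0f, 1.0f);
  for (int i = 0; i < N; ++i) {
    h_px[i] = dis(gen);
    h_py[i] = dis(gen);
  }

  // run k-means on the GPU and print the resulting centroids
  auto [h_mx, h_my] = kmeans_gpu(N, K, M, h_px, h_py);
  for (int k = 0; k < K; ++k) {
    std::cout << "centroid " << k << ": " << h_mx[k] << ' ' << h_my[k] << '\n';
  }
}
```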
Dumping the taskflow before execution produces the following diagram. Inside the update_means task, the closure constructs the CUDA graph from the parameters captured in the closure, instantiates it into an executable CUDA graph, and offloads that executable graph to the GPU M times through a stream.
(Taskflow diagram "K-Means": allocate_px, allocate_py, allocate_mx, and allocate_my precede h2d; h2d, allocate_sx, allocate_sy, and allocate_c precede update_means; update_means precedes d2h, which precedes free.)
The main CUDA graph task, update_means, must not run before all required data has settled down. It precedes the d2h task, which copies the resulting centroids back to h_mx and h_my, and the final free task then deallocates all GPU memory.
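The essential pattern inside update_means is to build and instantiate the CUDA graph once and then offload only the executable graph M times. Stripped of the k-means details, that pattern looks roughly like this (d_data and count are hypothetical placeholders):

```cpp
// Build a CUDA graph once, instantiate it once, and run the executable
// graph many times on a stream (the same pattern as update_means above).
void run_graph_m_times(float* d_data, size_t count, int M) {
  tf::cudaGraph cg;
  cg.zero(d_data, count);        // the GPU work captured in the graph

  tf::cudaGraphExec exec(cg);    // instantiate the executable graph once

  tf::cudaStream stream;
  for (int i = 0; i < M; ++i) {
    stream.run(exec);            // cheap: no graph reconstruction per run
  }
  stream.synchronize();          // wait for all M runs to finish
}
```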
We run three versions of k-means, a sequential CPU version, a parallel CPU version, and a GPU version, on a machine with 12 Intel i7-8700 CPUs at 3.20 GHz and an NVIDIA RTX 2080 GPU, using various numbers of 2D points and iterations.
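The exact timing harness is not shown on this page; one simple way to reproduce such a comparison is to wrap each complete call in a wall-clock timer, as in the sketch below (measure_ms, kmeans_seq, and kmeans_par are hypothetical names; the CPU baselines come from the previous k-means page).

```cpp
#include <chrono>

// Wall-clock timing of one complete k-means run (illustrative helper).
template <typename F>
long long measure_ms(F&& run) {
  auto beg = std::chrono::steady_clock::now();
  run();  // execute one full k-means implementation
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::milliseconds>(end - beg).count();
}

// Example usage:
//   auto seq_ms = measure_ms([&](){ kmeans_seq(N, K, M, h_px, h_py); });
//   auto par_ms = measure_ms([&](){ kmeans_par(N, K, M, h_px, h_py); });
//   auto gpu_ms = measure_ms([&](){ kmeans_gpu(N, K, M, h_px, h_py); });
```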
| N | K | M | CPU Sequential | CPU Parallel | GPU |
|---|---|---|---|---|---|
| 10 | 5 | 10 | 0.14 ms | 77 ms | 1 ms |
| 100 | 10 | 100 | 0.56 ms | 86 ms | 7 ms |
| 1000 | 10 | 1000 | 10 ms | 98 ms | 55 ms |
| 10000 | 10 | 10000 | 1006 ms | 713 ms | 458 ms |
| 100000 | 10 | 100000 | 102483 ms | 49966 ms | 7952 ms |
When the number of points grows beyond 10K, both the parallel CPU and GPU implementations start to outpace the sequential version. At N = 100000, for example, the GPU implementation is roughly 13x faster than the sequential CPU version (7952 ms vs. 102483 ms) and roughly 6x faster than the parallel CPU version (7952 ms vs. 49966 ms).