CUTLASS: CUDA Templates for Linear Algebra Subroutines and Solvers
device/tensor_compare.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Defines device-side elementwise comparison operations on TensorView data.
*/

#pragma once

// Standard Library includes
#include <stdexcept>
#include <utility>

// CUTLASS includes
#include "cutlass/cutlass.h"
#include "cutlass/relatively_equal.h"

#include "cutlass/util/distribution.h"

#include "tensor_foreach.h"

namespace cutlass {
namespace reference {
namespace device {

namespace kernel {

template <typename Element>
__global__ void BlockCompareEqual(
  int *equal,
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity) {

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Grid-stride loop: each thread visits a strided subset of the elements.
  for (; idx < capacity; idx += gridDim.x * blockDim.x) {

    if (ptr_A[idx] != ptr_B[idx]) {
      *equal = 0;
      return;
    }
  }
}

template <typename Element>
__global__ void BlockCompareRelativelyEqual(
  int *equal,
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  Element epsilon,
  Element nonzero_floor) {

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Grid-stride loop: compare each pair of elements within the given relative tolerance.
  for (; idx < capacity; idx += gridDim.x * blockDim.x) {

    Element a = ptr_A[idx];
    Element b = ptr_B[idx];

    if (!relatively_equal(a, b, epsilon, nonzero_floor)) {
      *equal = 0;
      return;
    }
  }
}

} // namespace kernel


/// Returns true if two blocks of device memory hold bitwise identical values.
template <typename Element>
bool BlockCompareEqual(
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  int grid_size = 0,
  int block_size = 0) {

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
    device_equal_flag,
    &equal_flag,
    sizeof(int),
    cudaMemcpyHostToDevice) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If grid_size or block_size are zero, query occupancy using the CUDA Occupancy API.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareEqual<Element>));

    if (result != cudaSuccess) {
      cudaFree(device_equal_flag);
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit block size. This has the effect of increasing the number of items processed by a
    // single thread and reduces the impact of initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareEqual<Element><<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity);

  if (cudaMemcpy(
    &equal_flag,
    device_equal_flag,
    sizeof(int),
    cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag;
}

/// Returns true if two blocks of device memory are approximately equal within a relative tolerance.
template <typename Element>
bool BlockCompareRelativelyEqual(
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  Element epsilon,
  Element nonzero_floor,
  int grid_size = 0,
  int block_size = 0) {

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
    device_equal_flag,
    &equal_flag,
    sizeof(int),
    cudaMemcpyHostToDevice) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If grid_size or block_size are zero, query occupancy using the CUDA Occupancy API.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareRelativelyEqual<Element>));

    if (result != cudaSuccess) {
      cudaFree(device_equal_flag);
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit block size. This has the effect of increasing the number of items processed by a
    // single thread and reduces the impact of initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareRelativelyEqual<Element><<< grid, block >>>(
    device_equal_flag,
    ptr_A,
    ptr_B,
    capacity,
    epsilon,
    nonzero_floor
  );

  if (cudaMemcpy(
    &equal_flag,
    device_equal_flag,
    sizeof(int),
    cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag;
}

} // namespace device
} // namespace reference
} // namespace cutlass
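
A minimal usage sketch, not part of the header above. It assumes device_A and device_B are device allocations holding N float elements each (for example, outputs of a reference and an optimized kernel); the helper name verify_blocks, the pointer names, and the tolerance values are hypothetical, and the include path may differ between CUTLASS versions.

#include <cstddef>
#include "cutlass/util/reference/device/tensor_compare.h"  // include path may vary by CUTLASS version

// Hypothetical driver: compares two device buffers of N floats, first bitwise,
// then within a relative tolerance. Passing grid_size = block_size = 0 (the
// defaults) lets the routines choose a launch configuration via the occupancy
// query shown in the listing above.
bool verify_blocks(float const *device_A, float const *device_B, std::size_t N) {

  bool bitwise_equal = cutlass::reference::device::BlockCompareEqual<float>(
    device_A, device_B, N);

  bool approx_equal = cutlass::reference::device::BlockCompareRelativelyEqual<float>(
    device_A, device_B, N,
    0.001f,    // epsilon: maximum allowed relative error
    1.0e-6f);  // nonzero_floor: magnitudes below this are treated as zero

  return bitwise_equal || approx_equal;
}

The bitwise check is the natural choice for integer data, while the relative comparison is typically used for floating-point results whose accumulation order may differ between implementations.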