CUTLASS: CUDA Templates for Linear Algebra Subroutines and Solvers
device/tensor_compare.h
/***************************************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Defines device-side elementwise comparison operations on TensorView data.
*/

#pragma once

// Standard Library includes
#include <stdexcept>
#include <utility>

// CUTLASS includes
#include "cutlass/cutlass.h"
#include "cutlass/relatively_equal.h"

#include "cutlass/util/distribution.h"

#include "tensor_foreach.h"

namespace cutlass {
namespace reference {
namespace device {

namespace kernel {

template <typename Element>
__global__ void BlockCompareEqual(
  int *equal,
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity) {

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Grid-stride loop: each thread visits a strided subset of the elements.
  for (; idx < capacity; idx += gridDim.x * blockDim.x) {

    if (ptr_A[idx] != ptr_B[idx]) {
      *equal = 0;
      return;
    }
  }
}

template <typename Element>
__global__ void BlockCompareRelativelyEqual(
  int *equal,
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  Element epsilon,
  Element nonzero_floor) {

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Grid-stride loop: compare each pair of elements within the given relative tolerance.
  for (; idx < capacity; idx += gridDim.x * blockDim.x) {

    Element a = ptr_A[idx];
    Element b = ptr_B[idx];

    if (!relatively_equal(a, b, epsilon, nonzero_floor)) {
      *equal = 0;
      return;
    }
  }
}

} // namespace kernel


/// Returns true if two blocks of device memory hold bitwise identical values.
template <typename Element>
bool BlockCompareEqual(
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  int grid_size = 0,
  int block_size = 0) {

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
    device_equal_flag,
    &equal_flag,
    sizeof(int),
    cudaMemcpyHostToDevice) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If grid_size or block_size are zero, query occupancy using the CUDA Occupancy API.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareEqual<Element>));

    if (result != cudaSuccess) {
      cudaFree(device_equal_flag);
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit block size. This has the effect of increasing the number of items processed by a
    // single thread and reduces the impact of initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareEqual<Element><<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity);

  if (cudaMemcpy(
    &equal_flag,
    device_equal_flag,
    sizeof(int),
    cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag;
}

/// Returns true if two blocks of device memory are approximately equal within a relative tolerance.
template <typename Element>
bool BlockCompareRelativelyEqual(
  Element const *ptr_A,
  Element const *ptr_B,
  size_t capacity,
  Element epsilon,
  Element nonzero_floor,
  int grid_size = 0,
  int block_size = 0) {

  int equal_flag = 1;
  int *device_equal_flag = nullptr;

  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
    throw std::runtime_error("Failed to allocate device flag.");
  }

  if (cudaMemcpy(
    device_equal_flag,
    &equal_flag,
    sizeof(int),
    cudaMemcpyHostToDevice) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag to device.");
  }

  if (!grid_size || !block_size) {

    // If grid_size or block_size are zero, query occupancy using the CUDA Occupancy API.
    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
      &grid_size,
      &block_size,
      reinterpret_cast<void const *>(kernel::BlockCompareRelativelyEqual<Element>));

    if (result != cudaSuccess) {
      cudaFree(device_equal_flag);
      throw std::runtime_error("Failed to query occupancy.");
    }

    // Limit block size. This has the effect of increasing the number of items processed by a
    // single thread and reduces the impact of initialization overhead.
    block_size = (block_size < 128 ? block_size : 128);
  }

  dim3 grid(grid_size, 1, 1);
  dim3 block(block_size, 1, 1);

  kernel::BlockCompareRelativelyEqual<Element><<< grid, block >>>(
    device_equal_flag,
    ptr_A,
    ptr_B,
    capacity,
    epsilon,
    nonzero_floor
  );

  if (cudaMemcpy(
    &equal_flag,
    device_equal_flag,
    sizeof(int),
    cudaMemcpyDeviceToHost) != cudaSuccess) {

    cudaFree(device_equal_flag);

    throw std::runtime_error("Failed to copy equality flag from device.");
  }

  cudaFree(device_equal_flag);

  return equal_flag;
}

} // namespace device
} // namespace reference
} // namespace cutlass
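
A minimal usage sketch, not part of the header above. It assumes device_A and device_B are device allocations holding N float elements each (for example, outputs of a reference and an optimized kernel); the helper name verify_blocks, the pointer names, and the tolerance values are hypothetical, and the include path may differ between CUTLASS versions.

#include <cstddef>
#include "cutlass/util/reference/device/tensor_compare.h"  // include path may vary by CUTLASS version

// Hypothetical driver: compares two device buffers of N floats, first bitwise,
// then within a relative tolerance. Passing grid_size = block_size = 0 (the
// defaults) lets the routines choose a launch configuration via the occupancy
// query shown in the listing above.
bool verify_blocks(float const *device_A, float const *device_B, std::size_t N) {

  bool bitwise_equal = cutlass::reference::device::BlockCompareEqual<float>(
    device_A, device_B, N);

  bool approx_equal = cutlass::reference::device::BlockCompareRelativelyEqual<float>(
    device_A, device_B, N,
    0.001f,    // epsilon: maximum allowed relative error
    1.0e-6f);  // nonzero_floor: magnitudes below this are treated as zero

  return bitwise_equal || approx_equal;
}

The bitwise check is the natural choice for integer data, while the relative comparison is typically used for floating-point results whose accumulation order may differ between implementations.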