Back to Cutlass

CUTLASS: linear_combination_relu.h Source File

docs/linear__combination__relu_8h_source.html

4.4.247.9 KB
Original Source

| | CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers |

linear_combination_relu.h

[Go to the documentation of this file.](linear combination relu_8h.html)

1 /***************************************************************************************************

2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.

3 *

4 * Redistribution and use in source and binary forms, with or without modification, are permitted

5 * provided that the following conditions are met:

6 * * Redistributions of source code must retain the above copyright notice, this list of

7 * conditions and the following disclaimer.

8 * * Redistributions in binary form must reproduce the above copyright notice, this list of

9 * conditions and the following disclaimer in the documentation and/or other materials

10 * provided with the distribution.

11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used

12 * to endorse or promote products derived from this software without specific prior written

13 * permission.

14 *

15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR

16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE

18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;

20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

21 * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23 *

24 **************************************************************************************************/

30 #pragma once

31

32 #include "cutlass/cutlass.h"

33 #include "cutlass/numeric_types.h"

34 #include "cutlass/array.h"

35 #include "cutlass/functional.h"

36 #include "cutlass/numeric_conversion.h"

37

39

40 namespace cutlass {

41 namespace epilogue {

42 namespace thread {

43

45

51 template <

52typename ElementOutput_,

53int Count,

54typename ElementAccumulator_ = ElementOutput_,

55typename ElementCompute_ = ElementOutput_,

56FloatRoundStyle Round = FloatRoundStyle::round_to_nearest

57 >

58 class LinearCombinationRelu {

59 public:

60

61using ElementOutput = ElementOutput_;

62using ElementAccumulator = ElementAccumulator_;

63using ElementCompute = ElementCompute_;

64

65static int const kCount = Count;

66

67using FragmentOutput = Array<ElementOutput, kCount>;

68using FragmentAccumulator = Array<ElementAccumulator, kCount>;

69using ComputeFragment = Array<ElementCompute, kCount>;

70

71static FloatRoundStyle const kRound = Round;

72

74struct Params {

75

76ElementCompute alpha;

77ElementCompute beta;

78ElementCompute threshold;

79ElementCompute const *alpha_ptr;

80ElementCompute const *beta_ptr;

81

82//

83// Methods

84//

85

86CUTLASS_HOST_DEVICE

87Params():

88 alpha(ElementCompute(1)),

89 beta(ElementCompute(0)),

90 threshold(ElementCompute(0)),

91 alpha_ptr(nullptr),

92 beta_ptr(nullptr) { }

93

94CUTLASS_HOST_DEVICE

95Params(

96ElementCompute alpha,

97ElementCompute beta,

98ElementCompute threshold = ElementCompute(0)

99 ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {

100

101 }

102

103CUTLASS_HOST_DEVICE

104Params(

105ElementCompute const *alpha_ptr,

106ElementCompute const *beta_ptr,

107ElementCompute threshold = ElementCompute(0)

108 ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {

109

110 }

111 };

112

113 private:

114

115//

116// Data members

117//

118

119ElementCompute alpha_;

120ElementCompute beta_;

121ElementCompute threshold_;

122

123 public:

124

126CUTLASS_HOST_DEVICE

127LinearCombinationRelu(Params const &params) {

128

129 alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);

130 beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);

131 threshold_ = params.threshold;

132 }

133

135CUTLASS_HOST_DEVICE

136bool is_source_needed() const {

137return beta_ != ElementCompute(0);

138 }

139

141CUTLASS_HOST_DEVICE

142void set_k_partition(int k_partition) {

143if (k_partition) {

144 beta_ = ElementCompute(1);

145 }

146 }

147

149CUTLASS_HOST_DEVICE

150FragmentOutput operator()(

151FragmentAccumulator const &accumulator,

152FragmentOutput const &source,

153ElementCompute uniform = ElementCompute(0)) const {

154

155// Convert source to interal compute numeric type

156NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;

157NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;

158

159ComputeFragment converted_source = source_converter(source);

160ComputeFragment converted_accumulator = accumulator_converter(accumulator);

161

162// Perform binary operations

163

164ComputeFragment intermediate;

165

166multiplies<ComputeFragment> mul_add_source;

167multiply_add<ComputeFragment> mul_add_accumulator;

168

169maximum<ComputeFragment> max_accumulator;

170

171 intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform

172 intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X

173

174 intermediate = max_accumulator(intermediate, threshold_);

175

176// Convert to destination numeric type

177NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> destination_converter;

178

179return destination_converter(intermediate);

180 }

181 };

182

183

185

191 template <

192typename ElementOutput_,

193int Count,

194FloatRoundStyle Round

195 >

196 class LinearCombinationRelu<ElementOutput_, Count, int, float, Round> {

197 public:

198

199using ElementOutput = ElementOutput_;

200using ElementAccumulator = int;

201using ElementCompute = float;

202

203static int const kCount = Count;

204

205using FragmentOutput = Array<ElementOutput, kCount>;

206using FragmentAccumulator = Array<ElementAccumulator, kCount>;

207using ComputeFragment = Array<ElementCompute, kCount>;

208

209static FloatRoundStyle const kRound = Round;

210

212struct Params {

213

214ElementCompute alpha;

215ElementCompute beta;

216ElementCompute threshold;

217ElementCompute const *alpha_ptr;

218ElementCompute const *beta_ptr;

219

220//

221// Methods

222//

223

224CUTLASS_HOST_DEVICE

225Params():

226 alpha(ElementCompute(1)),

227 beta(ElementCompute(0)),

228 threshold(ElementCompute(0)),

229 alpha_ptr(nullptr),

230 beta_ptr(nullptr) { }

231

232CUTLASS_HOST_DEVICE

233Params(

234ElementCompute alpha,

235ElementCompute beta,

236ElementCompute threshold = ElementCompute(0)

237 ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) {

238

239 }

240

241CUTLASS_HOST_DEVICE

242Params(

243ElementCompute const *alpha_ptr,

244ElementCompute const *beta_ptr,

245ElementCompute threshold = ElementCompute(0)

246 ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {

247

248 }

249 };

250

251 private:

252

253//

254// Data members

255//

256

257ElementCompute alpha_;

258ElementCompute beta_;

259ElementCompute threshold_;

260

261 public:

262

264CUTLASS_HOST_DEVICE

265LinearCombinationRelu(Params const &params) {

266

267 alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);

268 beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);

269 threshold_ = params.threshold;

270 }

271

273CUTLASS_HOST_DEVICE

274bool is_source_needed() const {

275return beta_ != ElementCompute(0);

276 }

277

279CUTLASS_HOST_DEVICE

280void set_k_partition(int k_partition) {

281if (k_partition) {

282 beta_ = ElementCompute(1);

283 }

284 }

285

287CUTLASS_HOST_DEVICE

288FragmentOutput operator()(

289FragmentAccumulator const &accumulator,

290FragmentOutput const &source,

291ElementCompute uniform = ElementCompute(0)) const {

292

293// Convert source to interal compute numeric type

294NumericArrayConverter<ElementCompute, ElementOutput, kCount, Round> source_converter;

295NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> accumulator_converter;

296

297ComputeFragment converted_source = source_converter(source);

298ComputeFragment converted_accumulator = accumulator_converter(accumulator);

299

300// Perform binary operations

301

302ComputeFragment intermediate;

303

304multiplies<ComputeFragment> mul_add_source;

305multiply_add<ComputeFragment> mul_add_accumulator;

306

307maximum<ComputeFragment> max_accumulator;

308

309 intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform

310 intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X

311

312// Clamp to theshold

313 intermediate = max_accumulator(intermediate, threshold_);

314

315// Convert back to accumulator data type

316FragmentAccumulator scaled_accumulator;

317

318CUTLASS_PRAGMA_UNROLL

319for (int i = 0; i < kCount; ++i) {

320 scaled_accumulator[i] = static_cast<int>(intermediate[i]);

321 }

322

323// Convert to destination numeric type and pack

324NumericArrayConverter<ElementOutput, ElementAccumulator, kCount, Round> destination_converter;

325

326return destination_converter(scaled_accumulator);

327 }

328 };

329

331

332 } // namespace thread

333 } // namespace epilogue

334 } // namespace cutlass

cutlass::multiply_add

Fused multiply-add.

Definition: functional.h:92

cutlass::epilogue::thread::LinearCombinationRelu::operator()

CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &accumulator, FragmentOutput const &source, ElementCompute uniform=ElementCompute(0)) const

Computes linear scaling: D = alpha * accumulator + beta * source.

Definition: linear_combination_relu.h:150

cutlass::epilogue::thread::LinearCombinationRelu::Params::Params

CUTLASS_HOST_DEVICE Params()

Definition: linear_combination_relu.h:87

cutlass

Definition: aligned_buffer.h:35

cutlass::epilogue::thread::LinearCombinationRelu::Params::Params

CUTLASS_HOST_DEVICE Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, ElementCompute threshold=ElementCompute(0))

Definition: linear_combination_relu.h:104

cutlass::epilogue::thread::LinearCombinationRelu::Params::beta

ElementCompute beta

scales source tensor

Definition: linear_combination_relu.h:77

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::Params

CUTLASS_HOST_DEVICE Params(ElementCompute alpha, ElementCompute beta, ElementCompute threshold=ElementCompute(0))

Definition: linear_combination_relu.h:233

cutlass::epilogue::thread::LinearCombinationRelu::FragmentOutput

Array< ElementOutput, kCount > FragmentOutput

Definition: linear_combination_relu.h:67

cutlass::epilogue::thread::LinearCombinationRelu::FragmentAccumulator

Array< ElementAccumulator, kCount > FragmentAccumulator

Definition: linear_combination_relu.h:68

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::beta_ptr

ElementCompute const * beta_ptr

pointer to source scalar - if not null, loads it from memory

Definition: linear_combination_relu.h:218

cutlass::epilogue::thread::LinearCombinationRelu

Definition: linear_combination_relu.h:58

cutlass::maximum

Definition: functional.h:235

cutlass::epilogue::thread::LinearCombinationRelu::Params::beta_ptr

ElementCompute const * beta_ptr

pointer to source scalar - if not null, loads it from memory

Definition: linear_combination_relu.h:80

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::LinearCombinationRelu

CUTLASS_HOST_DEVICE LinearCombinationRelu(Params const &params)

Constructs the function object, possibly loading from pointers in host memory.

Definition: linear_combination_relu.h:265

cutlass::epilogue::thread::LinearCombinationRelu::ElementCompute

ElementCompute_ ElementCompute

Definition: linear_combination_relu.h:63

array.h

Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL

#define CUTLASS_PRAGMA_UNROLL

Definition: cutlass.h:110

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::set_k_partition

CUTLASS_HOST_DEVICE void set_k_partition(int k_partition)

Functionally required for serial reduction in the epilogue.

Definition: linear_combination_relu.h:280

numeric_conversion.h

Boost-like numeric conversion operator for CUTLASS numeric types.

nullptr

#define nullptr

nullptr

Definition: platform.h:144

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::alpha

ElementCompute alpha

scales accumulators

Definition: linear_combination_relu.h:214

cutlass::epilogue::thread::LinearCombinationRelu::LinearCombinationRelu

CUTLASS_HOST_DEVICE LinearCombinationRelu(Params const &params)

Constructs the function object, possibly loading from pointers in host memory.

Definition: linear_combination_relu.h:127

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::beta

ElementCompute beta

scales source tensor

Definition: linear_combination_relu.h:215

cutlass::epilogue::thread::LinearCombinationRelu::Params::threshold

ElementCompute threshold

Relu threshold.

Definition: linear_combination_relu.h:78

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::FragmentOutput

Array< ElementOutput, kCount > FragmentOutput

Definition: linear_combination_relu.h:205

cutlass::epilogue::thread::LinearCombinationRelu::Params::Params

CUTLASS_HOST_DEVICE Params(ElementCompute alpha, ElementCompute beta, ElementCompute threshold=ElementCompute(0))

Definition: linear_combination_relu.h:95

cutlass::multiplies

Definition: functional.h:64

CUTLASS_HOST_DEVICE

#define CUTLASS_HOST_DEVICE

Definition: cutlass.h:89

cutlass::epilogue::thread::LinearCombinationRelu::kRound

static FloatRoundStyle const kRound

Definition: linear_combination_relu.h:71

numeric_types.h

Top-level include for all CUTLASS numeric types.

cutlass::epilogue::thread::LinearCombinationRelu::ComputeFragment

Array< ElementCompute, kCount > ComputeFragment

Definition: linear_combination_relu.h:69

cutlass::epilogue::thread::LinearCombinationRelu::set_k_partition

CUTLASS_HOST_DEVICE void set_k_partition(int k_partition)

Functionally required for serial reduction in the epilogue.

Definition: linear_combination_relu.h:142

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::Params

CUTLASS_HOST_DEVICE Params()

Definition: linear_combination_relu.h:225

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::ComputeFragment

Array< ElementCompute, kCount > ComputeFragment

Definition: linear_combination_relu.h:207

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::ElementOutput

ElementOutput_ ElementOutput

Definition: linear_combination_relu.h:199

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::is_source_needed

CUTLASS_HOST_DEVICE bool is_source_needed() const

Returns true if source is needed.

Definition: linear_combination_relu.h:274

cutlass::FloatRoundStyle::round_to_nearest

round to nearest even

cutlass::FloatRoundStyle

FloatRoundStyle

Definition: numeric_conversion.h:43

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::alpha_ptr

ElementCompute const * alpha_ptr

pointer to accumulator scalar - if not null, loads it from memory

Definition: linear_combination_relu.h:217

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::Params

CUTLASS_HOST_DEVICE Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, ElementCompute threshold=ElementCompute(0))

Definition: linear_combination_relu.h:242

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::Params::threshold

ElementCompute threshold

Relu threshold.

Definition: linear_combination_relu.h:216

cutlass::NumericArrayConverter

Conversion operator for Array.

Definition: numeric_conversion.h:294

cutlass::epilogue::thread::LinearCombinationRelu::Params::alpha

ElementCompute alpha

scales accumulators

Definition: linear_combination_relu.h:76

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::ElementAccumulator

int ElementAccumulator

Definition: linear_combination_relu.h:200

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::ElementCompute

float ElementCompute

Definition: linear_combination_relu.h:201

cutlass::epilogue::thread::LinearCombinationRelu::ElementAccumulator

ElementAccumulator_ ElementAccumulator

Definition: linear_combination_relu.h:62

cutlass::epilogue::thread::LinearCombinationRelu::is_source_needed

CUTLASS_HOST_DEVICE bool is_source_needed() const

Returns true if source is needed.

Definition: linear_combination_relu.h:136

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::operator()

CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &accumulator, FragmentOutput const &source, ElementCompute uniform=ElementCompute(0)) const

Computes linear scaling: D = alpha * accumulator + beta * source.

Definition: linear_combination_relu.h:288

cutlass::epilogue::thread::LinearCombinationRelu::kCount

static int const kCount

Definition: linear_combination_relu.h:65

cutlass.h

Basic include for CUTLASS.

cutlass::epilogue::thread::LinearCombinationRelu::Params::alpha_ptr

ElementCompute const * alpha_ptr

pointer to accumulator scalar - if not null, loads it from memory

Definition: linear_combination_relu.h:79

cutlass::epilogue::thread::LinearCombinationRelu< ElementOutput_, Count, int, float, Round >::FragmentAccumulator

Array< ElementAccumulator, kCount > FragmentAccumulator

Definition: linear_combination_relu.h:206

functional.h

Define basic numeric operators with specializations for Array<T, N>. SIMD-ize where possible...

cutlass::epilogue::thread::LinearCombinationRelu::Params

Host-constructable parameters structure.

Definition: linear_combination_relu.h:74

cutlass::epilogue::thread::LinearCombinationRelu::ElementOutput

ElementOutput_ ElementOutput

Definition: linear_combination_relu.h:61


Generated by 1.8.11