CUTLASS: default_thread_map_tensor_op.h Source File - Cutlass

| | CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers |

default_thread_map_tensor_op.h

[Go to the documentation of this file.](default thread map tensor op_8h.html)

1 /***************************************************************************************************

3 *

4 * Redistribution and use in source and binary forms, with or without modification, are permitted

5 * provided that the following conditions are met:

6 * * Redistributions of source code must retain the above copyright notice, this list of

7 * conditions and the following disclaimer.

8 * * Redistributions in binary form must reproduce the above copyright notice, this list of

9 * conditions and the following disclaimer in the documentation and/or other materials

10 * provided with the distribution.

11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used

12 * to endorse or promote products derived from this software without specific prior written

13 * permission.

14 *

15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR

16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE

18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;

20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

21 * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23 *

24 **************************************************************************************************/

30 #pragma once

32 #include "[predicated_tile_iterator.h](epilogue_2threadblock_2predicated tile iterator_8h.html)"

33 #include "cutlass/gemm/gemm.h"

34 #include "cutlass/layout/pitch_linear.h"

38 namespace cutlass {

39 namespace epilogue {

40 namespace threadblock {

45 template <

46typename ThreadblockShape_,

47typename WarpShape_,

48int PartitionsK,

49typename Element_,

50int ElementsPerAccess

51 >

52 struct DefaultThreadMapTensorOp {

54using ThreadblockShape = ThreadblockShape_;

55using WarpShape = WarpShape_;

56static int const kPartitionsK = PartitionsK;

57using Element = Element_;

58static int const kElementsPerAccess = ElementsPerAccess;

60//

61// Definitions

62//

64struct Detail {

67static int const kTensorOpRows = 8;

68static int const kWarpSize = 32;

70static_assert(

71 !(ThreadblockShape::kM % WarpShape::kM) &&

72 !(ThreadblockShape::kM % WarpShape::kM), "Divisibility");

75using WarpCount = gemm::GemmShape<

76 ThreadblockShape::kM / WarpShape::kM,

77 ThreadblockShape::kN / WarpShape::kN,

78 kPartitionsK

79 >;

82static int const kThreads = WarpCount::kCount * kWarpSize;

83 };

85//

86// ThreadMap

87//

90using Type = OutputTileOptimalThreadMap <

91OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,

92OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,

93Detail::kThreads,

94kElementsPerAccess,

95sizeof_bits<Element>::value

96 >;

97 };

100

102 template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK,

103typename Element_, int ElementsPerAccess, int InterleavedK>

104 struct DefaultInterleavedThreadMapTensorOp {

105using ThreadblockShape = ThreadblockShape_;

106using WarpShape = WarpShape_;

107static int const kPartitionsK = PartitionsK;

108using Element = Element_;

109static int const kElementsPerAccess = ElementsPerAccess;

110static int const kInterleavedK = InterleavedK;

111

112//

113// Definitions

114//

115

116struct Detail {

118static int const kTensorOpRows = 8;

119static int const kWarpSize = 32;

120

121static_assert(!(ThreadblockShape::kM % WarpShape::kM) &&

122 !(ThreadblockShape::kM % WarpShape::kM),

123"Divisibility");

124

126using WarpCount =

127gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,

128 ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;

129

131static int const kThreads = WarpCount::kCount * kWarpSize;

132 };

133

134//

135// ThreadMap

136//

137

140using Type = InterleavedOutputTileThreadMap<

141layout::PitchLinearShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,

142layout::PitchLinearShape<WarpShape::kM / Detail::kTensorOpRows,

143 WarpShape::kN / InterleavedK>,

144 Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;

145 };

146

148

149 } // namespace threadblock

150 } // namespace epilogue

151 } // namespace cutlass

152

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail

Definition: default_thread_map_tensor_op.h:64

cutlass::epilogue::threadblock::OutputTileOptimalThreadMap

Definition: output_tile_thread_map.h:228

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail::kWarpSize

static int const kWarpSize

Definition: default_thread_map_tensor_op.h:68

cutlass

Definition: aligned_buffer.h:35

cutlass::epilogue::threadblock::OutputTileShape

Tuple defining point in output tile.

Definition: output_tile_thread_map.h:57

[predicated_tile_iterator.h](epilogue_2threadblock_2predicated tile iterator_8h.html)

Epilogue for threadblock scoped GEMMs using Tensor Ops.

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::Detail

Definition: default_thread_map_tensor_op.h:116

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::kPartitionsK

static int const kPartitionsK

Definition: default_thread_map_tensor_op.h:56

gemm.h

Defines common types used for all GEMM-like operators.

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::Element

Element_ Element

Definition: default_thread_map_tensor_op.h:108

cutlass::gemm::GemmShape::kCount

static int const kCount

Definition: include/cutlass/gemm/gemm.h:67

cutlass::epilogue::threadblock::InterleavedOutputTileThreadMap

Definition: output_tile_thread_map.h:442

cutlass::layout::PitchLinearShape

Template defining a shape used by pitch-linear operators.

Definition: pitch_linear.h:43

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail::kThreads

static int const kThreads

Number of participating threads.

Definition: default_thread_map_tensor_op.h:82

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp

Defines the optimal thread map for TensorOp accumulator layouts.

Definition: default_thread_map_tensor_op.h:104

cutlass::sizeof_bits

Defines the size of an element in bits.

Definition: numeric_types.h:42

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp

Defines the optimal thread map for TensorOp accumulator layouts.

Definition: default_thread_map_tensor_op.h:52

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail::kTensorOpRows

static int const kTensorOpRows

Tensor Operations fundamentally perform operations on 8 rows.

Definition: default_thread_map_tensor_op.h:67

cutlass::gemm::GemmShape

Shape of a matrix multiply-add operation.

Definition: include/cutlass/gemm/gemm.h:57

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::ThreadblockShape

ThreadblockShape_ ThreadblockShape

Definition: default_thread_map_tensor_op.h:105

static_assert

#define static_assert(__e, __m)

Definition: platform.h:153

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Element

Element_ Element

Definition: default_thread_map_tensor_op.h:57

cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::WarpShape

WarpShape_ WarpShape

Definition: default_thread_map_tensor_op.h:106

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::WarpShape

WarpShape_ WarpShape

Definition: default_thread_map_tensor_op.h:55

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::ThreadblockShape

ThreadblockShape_ ThreadblockShape

Definition: default_thread_map_tensor_op.h:54

pitch_linear.h

Defines layout functions used by TensorRef and derived classes for pitch-linear memory.

cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::kElementsPerAccess

static int const kElementsPerAccess

Definition: default_thread_map_tensor_op.h:58

Generated by 1.8.11