Back to Cutlass

CUTLASS: shared_load_iterator.h Source File

docs/shared__load__iterator_8h_source.html

4.4.221.8 KB
Original Source

| | CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers |

shared_load_iterator.h

[Go to the documentation of this file.](shared__load__iterator_8h.html)

1 /***************************************************************************************************

2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.

3 *

4 * Redistribution and use in source and binary forms, with or without modification, are permitted

5 * provided that the following conditions are met:

6 * * Redistributions of source code must retain the above copyright notice, this list of

7 * conditions and the following disclaimer.

8 * * Redistributions in binary form must reproduce the above copyright notice, this list of

9 * conditions and the following disclaimer in the documentation and/or other materials

10 * provided with the distribution.

11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used

12 * to endorse or promote products derived from this software without specific prior written

13 * permission.

14 *

15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR

16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE

18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;

20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

21 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23 *

24 **************************************************************************************************/

33 #pragma once

34

35 #include "cutlass/cutlass.h"

36 #include "cutlass/numeric_types.h"

37 #include "cutlass/array.h"

38 #include "cutlass/layout/matrix.h"

39 #include "cutlass/matrix_shape.h"

40 #include "cutlass/tensor_ref.h"

41

42 #include "[cutlass/epilogue/threadblock/output_tile_thread_map.h](output__tile__thread__map_8h.html)"

43

45

46 namespace cutlass {

47 namespace epilogue {

48 namespace threadblock {

49

51

56 template <

57typename ThreadMap_,

58typename Element_,

59int MaxAlignment = ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8

60 >

61 class SharedLoadIterator {

62 public:

63using ThreadMap = ThreadMap_;

64using Shape = typename ThreadMap::Shape;

65

66using Element = Element_;

67

68using Layout = layout::RowMajor;

69using TensorRef = TensorRef<Element, Layout>;

70using ConstTensorRef = typename TensorRef::ConstTensorRef;

71

72using Index = typename Layout::Index;

73using LongIndex = typename Layout::LongIndex;

74using TensorCoord = MatrixCoord;

75

76static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

77

78static int const kMinAlignment = ThreadMap_::kElementsPerAccess * sizeof_bits<Element_>::value / 8;

79

80static int const kAlignment = (MaxAlignment < kMinAlignment ? MaxAlignment : kMinAlignment);

81

82static int const kThreads = ThreadMap::kThreads;

83

85using Fragment = Array<

86Element,

87 ThreadMap::Iterations::kColumn *

88 ThreadMap::Iterations::kRow *

89 ThreadMap::Iterations::kGroup *

90 ThreadMap::Iterations::kCluster *

91 ThreadMap::kElementsPerAccess>;

92

94using AccessType = AlignedArray<

95Element,

96 ThreadMap::kElementsPerAccess,

97 kAlignment>;

98

99 private:

100

101//

102// Data members

103//

104

106 uint8_t *byte_pointer_;

107

109int stride_;

110

111 public:

112

113//

114// Methods

115//

116

118 CUTLASS_DEVICE

119SharedLoadIterator(

120TensorRef ref,

121int thread_idx

122 ):

123 byte_pointer_(reinterpret_cast<uint8_t *>(ref.data())),

124 stride_((ref.stride(0) * sizeof_bits<Element>::value) / 8) {

125

126TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx);

127

128// Initialize pointer

129 byte_pointer_ +=

130 thread_offset.row() * stride_ +

131 thread_offset.column() * sizeof(AccessType) / kElementsPerAccess;

132

133int byte_offset = thread_offset.row() * stride_ +

134 thread_offset.column() * sizeof(AccessType) / kElementsPerAccess;

135 }

136

138CUTLASS_HOST_DEVICE

139void add_pointer_offset(LongIndex pointer_offset) {

140 byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;

141 }

142

143 CUTLASS_DEVICE

144void add_tile_offset(TensorCoord const &offset) {

145add_pointer_offset(offset.row() * stride_ / (sizeof_bits<Element>::value / 8) + offset.column() * Shape::kColumn);

146 }

147

149 CUTLASS_DEVICE

150void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {

151

152AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);

153

154CUTLASS_PRAGMA_UNROLL

155for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {

156

157CUTLASS_PRAGMA_UNROLL

158for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {

159

160CUTLASS_PRAGMA_UNROLL

161for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {

162

163 uint8_t const *byte_pointer = byte_pointer_ +

164 row * ThreadMap::Delta::kRow * stride_ +

165 group * ThreadMap::Delta::kGroup* stride_ +

166 cluster * ThreadMap::Delta::kCluster * stride_ +

167 pointer_offset * sizeof_bits<Element>::value / 8;

168

169int frag_row_idx =

170 (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));

171

172AccessType const *memory_pointer = reinterpret_cast<AccessType const *>(byte_pointer);

173

174CUTLASS_PRAGMA_UNROLL

175for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {

176

177int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column;

178

179 frag_ptr[frag_idx] =

180 memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess];

181 }

182 }

183 }

184 }

185 }

186

188 CUTLASS_DEVICE

189void load(Fragment &frag) {

190

191load_with_pointer_offset(frag, 0);

192 }

193 };

194

196

197 } // namespace threadblock

198 } // namespace epilogue

199 } // namespace cutlass

200

cutlass::layout::RowMajor::LongIndex

int64_t LongIndex

Long index type used for offsets.

Definition: layout/matrix.h:62

cutlass::MatrixCoord::column

CUTLASS_HOST_DEVICE Index const & column() const

Returns the column of the coordinate.

Definition: matrix_coord.h:85

cutlass::epilogue::threadblock::SharedLoadIterator::Fragment

Array< Element, ThreadMap::Iterations::kColumn *ThreadMap::Iterations::kRow *ThreadMap::Iterations::kGroup *ThreadMap::Iterations::kCluster *ThreadMap::kElementsPerAccess > Fragment

Fragment object.

Definition: shared_load_iterator.h:91

cutlass

Definition: aligned_buffer.h:35

cutlass::sizeof_bits::value

static int const value

Definition: numeric_types.h:43

tensor_ref.h

Defines a structure containing strides, bounds, and a pointer to tensor data.

cutlass::epilogue::threadblock::SharedLoadIterator::load_with_pointer_offset

CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset)

Loads a fragment from memory.

Definition: shared_load_iterator.h:150

cutlass::epilogue::threadblock::SharedLoadIterator::kThreads

static int const kThreads

Definition: shared_load_iterator.h:82

cutlass::AlignedArray

Aligned array type.

Definition: array.h:511

cutlass::epilogue::threadblock::SharedLoadIterator::SharedLoadIterator

CUTLASS_DEVICE SharedLoadIterator(TensorRef ref, int thread_idx)

Constructor.

Definition: shared_load_iterator.h:119

cutlass::MatrixCoord::row

CUTLASS_HOST_DEVICE Index const & row() const

Returns the row of the coordinate.

Definition: matrix_coord.h:77

cutlass::epilogue::threadblock::SharedLoadIterator::kMinAlignment

static int const kMinAlignment

Definition: shared_load_iterator.h:78

cutlass::epilogue::threadblock::SharedLoadIterator::ConstTensorRef

typename TensorRef::ConstTensorRef ConstTensorRef

Definition: shared_load_iterator.h:70

cutlass::TensorRef< Element, Layout >::ConstTensorRef

TensorRef< typename platform::remove_const< Element >::type const, Layout > ConstTensorRef

TensorRef to constant data.

Definition: tensor_ref.h:179

cutlass::epilogue::threadblock::SharedLoadIterator::ThreadMap

ThreadMap_ ThreadMap

Definition: shared_load_iterator.h:63

array.h

Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

CUTLASS_PRAGMA_UNROLL

#define CUTLASS_PRAGMA_UNROLL

Definition: cutlass.h:110

cutlass::layout::RowMajor::Index

int32_t Index

Index type used for coordinates.

Definition: layout/matrix.h:59

cutlass::epilogue::threadblock::SharedLoadIterator::kAlignment

static int const kAlignment

Definition: shared_load_iterator.h:80

matrix_shape.h

Defines a Shape template for matrix tiles.

cutlass::sizeof_bits

Defines the size of an element in bits.

Definition: numeric_types.h:42

cutlass::TensorRef< Element, Layout >

cutlass::epilogue::threadblock::SharedLoadIterator::AccessType

AlignedArray< Element, ThreadMap::kElementsPerAccess, kAlignment > AccessType

Memory access size.

Definition: shared_load_iterator.h:97

cutlass::epilogue::threadblock::SharedLoadIterator::Index

typename Layout::Index Index

Definition: shared_load_iterator.h:72

CUTLASS_HOST_DEVICE

#define CUTLASS_HOST_DEVICE

Definition: cutlass.h:89

numeric_types.h

Top-level include for all CUTLASS numeric types.

[output_tile_thread_map.h](output__tile__thread__map_8h.html)

Metaprogram for determining the mapping of output elements to threads for epilogue tiles...

cutlass::layout::RowMajor

Mapping function for row-major matrices.

Definition: layout/matrix.h:50

cutlass::epilogue::threadblock::SharedLoadIterator::load

CUTLASS_DEVICE void load(Fragment &frag)

Loads a fragment.

Definition: shared_load_iterator.h:189

cutlass::epilogue::threadblock::SharedLoadIterator::Element

Element_ Element

Definition: shared_load_iterator.h:66

matrix.h

Defines layout functions used by TensorRef and derived classes.

cutlass::epilogue::threadblock::SharedLoadIterator::Shape

typename ThreadMap::Shape Shape

Definition: shared_load_iterator.h:64

cutlass::epilogue::threadblock::SharedLoadIterator::add_pointer_offset

CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset)

Adds a pointer offset in units of Element.

Definition: shared_load_iterator.h:139

cutlass::epilogue::threadblock::SharedLoadIterator::kElementsPerAccess

static int const kElementsPerAccess

Definition: shared_load_iterator.h:76

cutlass::epilogue::threadblock::SharedLoadIterator

Definition: shared_load_iterator.h:61

cutlass::epilogue::threadblock::SharedLoadIterator::LongIndex

typename Layout::LongIndex LongIndex

Definition: shared_load_iterator.h:73

cutlass.h

Basic include for CUTLASS.

cutlass::MatrixCoord

Definition: matrix_coord.h:39

cutlass::epilogue::threadblock::SharedLoadIterator::add_tile_offset

CUTLASS_DEVICE void add_tile_offset(TensorCoord const &offset)

Definition: shared_load_iterator.h:144


Generated by 1.8.11