docs/default__thread__map__tensor__op_8h_source.html
| | CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers |
default_thread_map_tensor_op.h
[Go to the documentation of this file.](default thread map tensor op_8h.html)
1 /***************************************************************************************************
2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted
5 * provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright notice, this list of
7 * conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright notice, this list of
9 * conditions and the following disclaimer in the documentation and/or other materials
10 * provided with the distribution.
11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12 * to endorse or promote products derived from this software without specific prior written
13 * permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21 * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 *
24 **************************************************************************************************/
30 #pragma once
31
32 #include "[predicated_tile_iterator.h](epilogue_2threadblock_2predicated tile iterator_8h.html)"
33 #include "cutlass/gemm/gemm.h"
34 #include "cutlass/layout/pitch_linear.h"
35
37
38 namespace cutlass {
39 namespace epilogue {
40 namespace threadblock {
41
43
45 template <
46typename ThreadblockShape_,
47typename WarpShape_,
48int PartitionsK,
49typename Element_,
50int ElementsPerAccess
51 >
52 struct DefaultThreadMapTensorOp {
53
54using ThreadblockShape = ThreadblockShape_;
55using WarpShape = WarpShape_;
56static int const kPartitionsK = PartitionsK;
58static int const kElementsPerAccess = ElementsPerAccess;
59
60//
61// Definitions
62//
63
65
67static int const kTensorOpRows = 8;
68static int const kWarpSize = 32;
69
71 !(ThreadblockShape::kM % WarpShape::kM) &&
72 !(ThreadblockShape::kM % WarpShape::kM), "Divisibility");
73
75using WarpCount = gemm::GemmShape<
76 ThreadblockShape::kM / WarpShape::kM,
77 ThreadblockShape::kN / WarpShape::kN,
78 kPartitionsK
79 >;
80
82static int const kThreads = WarpCount::kCount * kWarpSize;
83 };
84
85//
86// ThreadMap
87//
88
90using Type = OutputTileOptimalThreadMap <
91OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
92OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
96 >;
97 };
98
100
102 template <typename ThreadblockShape_, typename WarpShape_, int PartitionsK,
103typename Element_, int ElementsPerAccess, int InterleavedK>
104 struct DefaultInterleavedThreadMapTensorOp {
105using ThreadblockShape = ThreadblockShape_;
106using WarpShape = WarpShape_;
107static int const kPartitionsK = PartitionsK;
109static int const kElementsPerAccess = ElementsPerAccess;
110static int const kInterleavedK = InterleavedK;
111
112//
113// Definitions
114//
115
118static int const kTensorOpRows = 8;
119static int const kWarpSize = 32;
120
121static_assert(!(ThreadblockShape::kM % WarpShape::kM) &&
122 !(ThreadblockShape::kM % WarpShape::kM),
123"Divisibility");
124
126using WarpCount =
127gemm::GemmShape<ThreadblockShape::kM / WarpShape::kM,
128 ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;
129
131static int const kThreads = WarpCount::kCount * kWarpSize;
132 };
133
134//
135// ThreadMap
136//
137
140using Type = InterleavedOutputTileThreadMap<
141layout::PitchLinearShape<Detail::WarpCount::kM, Detail::WarpCount::kN>,
142layout::PitchLinearShape<WarpShape::kM / Detail::kTensorOpRows,
143 WarpShape::kN / InterleavedK>,
144Detail::kThreads, kElementsPerAccess, sizeof_bits<Element>::value>;
145 };
146
148
149 } // namespace threadblock
150 } // namespace epilogue
151 } // namespace cutlass
152
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail
Definition: default_thread_map_tensor_op.h:64
cutlass::epilogue::threadblock::OutputTileOptimalThreadMap
Definition: output_tile_thread_map.h:228
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail::kWarpSize
static int const kWarpSize
Definition: default_thread_map_tensor_op.h:68
Definition: aligned_buffer.h:35
cutlass::epilogue::threadblock::OutputTileShape
Tuple defining point in output tile.
Definition: output_tile_thread_map.h:57
[predicated_tile_iterator.h](epilogue_2threadblock_2predicated tile iterator_8h.html)
Epilogue for threadblock scoped GEMMs using Tensor Ops.
cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::Detail
Definition: default_thread_map_tensor_op.h:116
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::kPartitionsK
static int const kPartitionsK
Definition: default_thread_map_tensor_op.h:56
Defines common types used for all GEMM-like operators.
cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::Element
Element_ Element
Definition: default_thread_map_tensor_op.h:108
cutlass::gemm::GemmShape::kCount
static int const kCount
Definition: include/cutlass/gemm/gemm.h:67
cutlass::epilogue::threadblock::InterleavedOutputTileThreadMap
Definition: output_tile_thread_map.h:442
cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail::kThreads
static int const kThreads
Number of participating threads.
Definition: default_thread_map_tensor_op.h:82
cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp
Defines the optimal thread map for TensorOp accumulator layouts.
Definition: default_thread_map_tensor_op.h:104
Defines the size of an element in bits.
Definition: numeric_types.h:42
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp
Defines the optimal thread map for TensorOp accumulator layouts.
Definition: default_thread_map_tensor_op.h:52
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Detail::kTensorOpRows
static int const kTensorOpRows
Tensor Operations fundamentally perform operations on 8 rows.
Definition: default_thread_map_tensor_op.h:67
Shape of a matrix multiply-add operation.
Definition: include/cutlass/gemm/gemm.h:57
cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::ThreadblockShape
ThreadblockShape_ ThreadblockShape
Definition: default_thread_map_tensor_op.h:105
#define static_assert(__e, __m)
Definition: platform.h:153
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::Element
Element_ Element
Definition: default_thread_map_tensor_op.h:57
cutlass::epilogue::threadblock::DefaultInterleavedThreadMapTensorOp::WarpShape
WarpShape_ WarpShape
Definition: default_thread_map_tensor_op.h:106
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::WarpShape
WarpShape_ WarpShape
Definition: default_thread_map_tensor_op.h:55
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::ThreadblockShape
ThreadblockShape_ ThreadblockShape
Definition: default_thread_map_tensor_op.h:54
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
cutlass::epilogue::threadblock::DefaultThreadMapTensorOp::kElementsPerAccess
static int const kElementsPerAccess
Definition: default_thread_map_tensor_op.h:58
Generated by 1.8.11