docs/pitch__linear__thread__map_8h_source.html
| | CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers |
pitch_linear_thread_map.h
[Go to the documentation of this file.](pitch linear thread__map_8h.html)
1 /***************************************************************************************************
2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted
5 * provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright notice, this list of
7 * conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright notice, this list of
9 * conditions and the following disclaimer in the documentation and/or other materials
10 * provided with the distribution.
11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12 * to endorse or promote products derived from this software without specific prior written
13 * permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21 * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 *
24 **************************************************************************************************/
30 #pragma once
31
32 #include "cutlass/cutlass.h"
33 #include "cutlass/array.h"
34 #include "cutlass/coord.h"
35 #include "cutlass/predicate_vector.h"
36 #include "cutlass/tensor_ref.h"
37 #include "cutlass/tensor_view.h"
38 #include "cutlass/layout/pitch_linear.h"
39
41
42 namespace cutlass {
43 namespace transform {
44
46
54 template <
55typename Shape_,
56int Threads,
57int ElementsPerAccess = 1
58 >
59 struct PitchLinearStripminedThreadMap {
60
62using TensorCoord = layout::PitchLinearCoord;
63
66
68static int const kThreads = Threads;
69
71static int const kElementsPerAccess = ElementsPerAccess;
72
74using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
75
78
79static_assert(!(Shape::kContiguous % kElementsPerAccess), "");
80
81static_assert(!((Shape::kContiguous * Shape::kStrided) % (kThreads * kElementsPerAccess)),
82"Shape must be divisible thread count.");
83
85using ShapeVec = layout::PitchLinearShape<
86 Shape::kContiguous / kElementsPerAccess,
87 Shape::kStrided
88 >;
89
91 (Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||
92 (!(kThreads % ShapeVec::kContiguous) && !(ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous))),
93"Shape must be divisible by number of iterations of each thread."
94 );
95 };
96
98using Iterations = typename platform::conditional<
99 Threads >= Detail::ShapeVec::kContiguous,
101 1,
102 (Threads >= Detail::ShapeVec::kContiguous ? Detail::ShapeVec::kStrided / (kThreads / Detail::ShapeVec::kContiguous) : 0)
103 >,
104 layout::PitchLinearShape<
105Detail::ShapeVec::kContiguous / kThreads,
107 >
108 >::type;
109
112using Delta = typename platform::conditional<
113 Threads >= Detail::ShapeVec::kContiguous,
114 layout::PitchLinearShape<
115 1,
116 kThreads / Detail::ShapeVec::kContiguous
117 >,
118 layout::PitchLinearShape<
119 kThreads * kElementsPerAccess,
120 1
121 >
122 >::type;
123
127static TensorCoord initial_offset(int thread_id) {
128
129return TensorCoord(
130 (thread_id % Detail::ShapeVec::kContiguous) * kElementsPerAccess,
131 thread_id / Detail::ShapeVec::kContiguous);
132 }
133 };
134
135 template <
136typename Shape,
137int Threads,
138int ElementsPerAccess = 1
139 >
140 struct PitchLinearTilePolicyStripminedThreadContiguous
141 {
142static_assert((Shape::kContiguous % (Threads * ElementsPerAccess)) == 0,
143"Contiguous shape must divide number of threads");
144
145using TensorCoord = layout::PitchLinearCoord;
146
147static int const kThreads = Threads;
148static int const kElementsPerAccess = ElementsPerAccess;
149
150using Iterations = layout::PitchLinearShape<
151 Shape::kContiguous / (kThreads * kElementsPerAccess),
152 Shape::kStrided>;
153
154using Delta = layout::PitchLinearShape<1, 1>;
155
157static TensorCoord initial_offset(int thread_id)
158 {
159return TensorCoord(thread_id * Iterations::kContiguous * kElementsPerAccess, 0);
160 }
161 };
162
163 template <
164typename Shape,
165int Threads,
166int ElementsPerAccess = 1
167 >
168 struct PitchLinearTilePolicyStripminedThreadStrided
169 {
170static_assert((Shape::kStrided % Threads == 0),
171"Strided shape must divide number of threads");
172
173using TensorCoord = layout::PitchLinearCoord;
174
175static int const kThreads = Threads;
176static int const kElementsPerAccess = ElementsPerAccess;
177
178using Iterations = layout::PitchLinearShape<
179 Shape::kContiguous / kElementsPerAccess,
180 Shape::kStrided / kThreads>;
181
182using Delta = layout::PitchLinearShape<1, 1>;
183
185
187static TensorCoord initial_offset(int thread_id)
188 {
189
190return TensorCoord(0, thread_id * Iterations::kStrided);
191 }
192 };
193
194
196
199 template <
200typename Shape_,
201int Threads,
202typename WarpThreadArrangement_,
203int ElementsPerAccess = 1
204 >
205 struct PitchLinearWarpRakedThreadMap {
206
208using TensorCoord = layout::PitchLinearCoord;
209
212
214static int const kThreads = Threads;
215
217static int const kElementsPerAccess = ElementsPerAccess;
218
220using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
221
224
226using WarpThreadArrangement = WarpThreadArrangement_;
227
229static int const kWarpSize = WarpThreadArrangement::kCount;
230
232static int const kWarpCount = kThreads / kWarpSize;
233
234static_assert(
235 !(Shape::kContiguous % kElementsPerAccess),
236"Shape must be divisible by vector length.");
237
239using ShapeInAccesses = layout::PitchLinearShape<
240 Shape::kContiguous / kElementsPerAccess,
241 Shape::kStrided
242 >;
243
244// compute number of warp-level accesses total
245using WarpAccessIterations = layout::PitchLinearShape<
246 ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
247 ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
248 >;
249
250// Divide it into the number of warps, first partitioning the strided dimension then the
251// contiguous.
252static int const kWarpsStrided =
253 (WarpAccessIterations::kStrided >= kWarpCount
254 ? kWarpCount
255 : WarpAccessIterations::kStrided);
256
257static int const kWarpsContiguous =
258 (kWarpCount > WarpAccessIterations::kStrided
259 ? kWarpCount / kWarpsStrided
260 : 1);
261
263using WarpArrangement = layout::PitchLinearShape<
264 kWarpsContiguous, kWarpsStrided
265 >;
266 };
267
269using Iterations = layout::PitchLinearShape<
270 Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
271 Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
272 >;
273
274static_assert(Iterations::kCount,
275"Number of iterations must be non-zero");
276
278using Delta = layout::PitchLinearShape<
279 Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
280 Detail::WarpThreadArrangement::kStrided
281 >;
282
285static TensorCoord initial_offset(int thread_id) {
286
287int warp_id = (thread_id / Detail::kWarpSize);
288int lane_id = (thread_id % Detail::kWarpSize);
289
290//
291// compute warp-level offset
292//
293
294// This is the shape of the entire area covered by a warp's memory access (in units of vectors)
295layout::PitchLinearCoord warp_footprint{
296 Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
297 Detail::WarpThreadArrangement::kStrided * Iterations::kStrided
298 };
299
300// This is the offset of a specific warp (in units of vectors)
301layout::PitchLinearCoord warp_offset{
302 (warp_id % Detail::kWarpsContiguous),
303 (warp_id / Detail::kWarpsContiguous)
304 };
305
306// This is the offset of a specific thread within a warp (units of vectors)
307layout::PitchLinearCoord thread_offset_in_warp{
308 lane_id % Detail::WarpThreadArrangement::kContiguous,
309 lane_id / Detail::WarpThreadArrangement::kContiguous
310 };
311
312// This is the offset of a thread within a threadblock tile (units of vectors)
313layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
314 warp_footprint * warp_offset + thread_offset_in_warp;
315
316// This is the offset of a thread within a threadblock tile (units of elements)
317layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
318 thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
319 thread_offset_in_threadblock_tile_vec.strided()
320 };
321
322return thread_offset_in_threadblock_tile_base;
323 }
324 };
325
327
331
332 template <typename ThreadMap_, typename WarpThreadArrangement_>
333 struct TransposePitchLinearThreadMap {
335using ThreadMap = ThreadMap_;
336
338using TensorCoord = typename ThreadMap::TensorCoord;
339
341using Shape = typename ThreadMap::Shape;
342
344static int const kThreads = ThreadMap::kThreads;
345
347static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
348
350using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
351
355using WarpThreadArrangement = WarpThreadArrangement_;
356
358static int const kWarpSize = WarpThreadArrangement::kCount;
359
361static int const kWarpCount = kThreads / kWarpSize;
362
363static_assert(!(Shape::kContiguous % kElementsPerAccess),
364"Shape must be divisible by vector length.");
365
367using WarpArrangement =
368layout::PitchLinearShape<ThreadMap::Detail::kWarpsStrided,
369 ThreadMap::Detail::kWarpsContiguous>;
370 };
371
373using Iterations =
374layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
375 ThreadMap::Iterations::kContiguous>;
376
377static_assert(Iterations::kCount, "Number of iterations must be non-zero");
378
380using Delta =
381layout::PitchLinearShape<Detail::WarpThreadArrangement::kContiguous *
383 Detail::WarpThreadArrangement::kStrided>;
384
389static TensorCoord initial_offset(int thread_id) {
390
391int warp_id = (thread_id / Detail::kWarpSize);
392int lane_id = (thread_id % Detail::kWarpSize);
393
394//
395// compute warp-level offset
396//
397
398// This is the shape of the entire area covered by a warp's memory access
399// (in units of vectors)
400layout::PitchLinearCoord warp_footprint{
401 Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
402 Detail::WarpThreadArrangement::kStrided * Iterations::kStrided};
403
404// This is the offset of a specific warp (in units of vectors)
405// Note the order of / and %. Also the 2nd operand is kStrided.
406layout::PitchLinearCoord warp_offset{
407 (warp_id / Detail::WarpArrangement::kStrided),
408 (warp_id % Detail::WarpArrangement::kStrided)};
409
410// This is the offset of a specific thread within a warp (units of vectors)
411layout::PitchLinearCoord thread_offset_in_warp{
412 lane_id % Detail::WarpThreadArrangement::kContiguous,
413 lane_id / Detail::WarpThreadArrangement::kContiguous};
414
415// This is the offset of a thread within a threadblock tile (units of
416// vectors)
417layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
418 warp_footprint * warp_offset + thread_offset_in_warp;
419
420// This is the offset of a thread within a threadblock tile (units of
421// elements)
422layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
423 thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
424 thread_offset_in_threadblock_tile_vec.strided()};
425
426return thread_offset_in_threadblock_tile_base;
427 }
428 };
429
430 template <typename ThreadMap_>
431 struct TransposePitchLinearThreadMapSimt {
433using ThreadMap = ThreadMap_;
434
436using TensorCoord = typename ThreadMap::TensorCoord;
437
439using Shape = typename ThreadMap::Shape;
440
442static int const kThreads = ThreadMap::kThreads;
443
445static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
446
447static_assert(kElementsPerAccess == 1 , "Simt transpose requires elements per access to be 1");
449using Iterations =
450layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
451 ThreadMap::Iterations::kContiguous>;
452
453static_assert(Iterations::kCount, "Number of iterations must be non-zero");
454
456using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;
457
459using Delta =
460layout::PitchLinearShape<ThreadMap::Delta::kStrided,
461 ThreadMap::Delta::kContiguous>;
462
463
468static TensorCoord initial_offset(int thread_id) {
469
470TensorCoord coord = ThreadMap::initial_offset(thread_id);
471
472return TensorCoord(
473 coord.strided(),
474 coord.contiguous()
475 );
476 }
477 };
478
480
481
485 template <
486typename Shape_,
487int Threads,
488typename WarpThreadArrangement_,
489int ElementsPerAccess = 1
490 >
491 struct PitchLinearWarpStripedThreadMap {
492
494using TensorCoord = layout::PitchLinearCoord;
495
498
500static int const kThreads = Threads;
501
503static int const kElementsPerAccess = ElementsPerAccess;
504
506using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;
507
510
512using WarpThreadArrangement = WarpThreadArrangement_;
513
515static int const kWarpSize = WarpThreadArrangement::kCount;
516
518static int const kWarpCount = kThreads / kWarpSize;
519
520static_assert(
521 !(Shape::kContiguous % kElementsPerAccess),
522"Shape must be divisible by vector length.");
523
525using ShapeInAccesses = layout::PitchLinearShape<
526 Shape::kContiguous / kElementsPerAccess,
527 Shape::kStrided
528 >;
529
530// compute number of warp-level accesses total
531using WarpAccessIterations = layout::PitchLinearShape<
532 ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,
533 ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided
534 >;
535
536// Divide it into the number of warps, first partitioning the strided dimension then the
537// contiguous.
538static int const kWarpsStrided =
539 (WarpAccessIterations::kStrided >= kWarpCount
540 ? kWarpCount : (kWarpCount / WarpAccessIterations::kStrided));
541
542static int const kWarpsContiguous =
543 (kWarpCount > WarpAccessIterations::kStrided ?
544 WarpAccessIterations::kContiguous / kWarpsStrided : 1);
545
547using WarpArrangement = layout::PitchLinearShape<
548 kWarpsContiguous, kWarpsStrided
549 >;
550 };
551
553using Iterations = layout::PitchLinearShape<
554 Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,
555 Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided
556 >;
557
558static_assert(Iterations::kCount,
559"Number of iterations must be non-zero");
560
562using Delta = layout::PitchLinearShape<
563 Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,
564 Detail::WarpThreadArrangement::kStrided * Detail::WarpArrangement::kStrided
565 >;
566
569static TensorCoord initial_offset(int thread_id) {
570
571int warp_id = (thread_id / Detail::kWarpSize);
572int lane_id = (thread_id % Detail::kWarpSize);
573
574//
575// compute warp-level offset
576//
577
578// This is the shape of the entire area covered by a warp's memory access (in units of vectors)
579layout::PitchLinearCoord warp_footprint{
580 Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,
581 Detail::WarpThreadArrangement::kStrided
582 };
583
584// This is the offset of a specific warp (in units of vectors)
585layout::PitchLinearCoord warp_offset{
586 (warp_id % Detail::kWarpsContiguous),
587 (warp_id / Detail::kWarpsContiguous)
588 };
589
590// This is the offset of a specific thread within a warp (units of vectors)
591layout::PitchLinearCoord thread_offset_in_warp{
592 lane_id % Detail::WarpThreadArrangement::kContiguous,
593 lane_id / Detail::WarpThreadArrangement::kContiguous
594 };
595
596// This is the offset of a thread within a threadblock tile (units of vectors)
597layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =
598 warp_footprint * warp_offset + thread_offset_in_warp;
599
600// This is the offset of a thread within a threadblock tile (units of elements)
601layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{
602 thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,
603 thread_offset_in_threadblock_tile_vec.strided()
604 };
605
606return thread_offset_in_threadblock_tile_base;
607 }
608 };
609
618 template <
619typename Shape_,
620int Threads,
621typename ThreadTileShape
622 >
623 struct PitchLinear2DThreadTileStripminedThreadMap;
624
625
626 template <
627typename Shape_,
628int Threads
629 >
630 struct PitchLinear2DThreadTileStripminedThreadMap <Shape_, Threads, cutlass::layout::PitchLinearShape<4, 4>>{
631
633using TensorCoord = layout::PitchLinearCoord;
634
637
639using ThreadAccessShape = cutlass::layout::PitchLinearShape<4, 4>;
640//using ThreadAccessShape = ThreadTileShape;
641
643static int const kThreads = Threads;
644
646static int const kElementsPerAccess = ThreadAccessShape::kContiguous;
647
648static_assert(!(kElementsPerAccess % 4) , "kElementsPerAccess, needs to be multiple of 4 (32bits)");
649
651struct Detail {
652
653static_assert(!(ThreadAccessShape::kContiguous % 4), "ThreadAccessShape, needs to be multiple of 4");
654
655static_assert(!(Shape::kContiguous % ThreadAccessShape::kContiguous), "");
656
657static_assert(!((Shape::kContiguous * Shape::kStrided) % (kThreads * ThreadAccessShape::kCount)),
658"Shape must be divisible thread count * accesses per thread.");
659
661using ShapeVec = layout::PitchLinearShape<
662 Shape::kContiguous / ThreadAccessShape::kContiguous,
663 Shape::kStrided / ThreadAccessShape::kStrided
664 >;
665
666static_assert(
667 (Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||
668 (!(kThreads % ShapeVec::kContiguous) && !(ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous))),
669"Shape must be divisible by number of iterations of each thread."
670 );
671 };
672
674using Iterations = typename platform::conditional<
675 Threads >= Detail::ShapeVec::kContiguous,
677 1,
678 (Threads >= Detail::ShapeVec::kContiguous ? Detail::ShapeVec::kStrided / (kThreads / Detail::ShapeVec::kContiguous) : 0)
679 >,
680 layout::PitchLinearShape<
681Detail::ShapeVec::kContiguous / kThreads,
683 >
684 >::type;
685
688using Delta = typename platform::conditional<
689 Threads >= Detail::ShapeVec::kContiguous,
690 layout::PitchLinearShape<
691 Shape::kContiguous,
692 kThreads * ThreadAccessShape::kStrided / Detail::ShapeVec::kContiguous
693 >,
694 layout::PitchLinearShape<
695 kThreads * ThreadAccessShape::kContiguous,
696 1
697 >
698 >::type;
699
703static TensorCoord initial_offset(int thread_id) {
704
705return TensorCoord(
706 (thread_id % Detail::ShapeVec::kContiguous) * ThreadAccessShape::kContiguous,
707 (thread_id / Detail::ShapeVec::kContiguous) * ThreadAccessShape::kStrided);
708 }
709 };
710
712 template <typename ThreadMap_>
713 struct TransposePitchLinearThreadMap2DThreadTile {
715using ThreadMap = ThreadMap_;
716
718using TensorCoord = typename ThreadMap::TensorCoord;
719
721using Shape = typename ThreadMap::Shape;
722
724static int const kThreads = ThreadMap::kThreads;
725
727static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
728
729
730static_assert(kElementsPerAccess > 1 , "Simt transpose requires elements per access to be 1");
732using Iterations =
733layout::PitchLinearShape<ThreadMap::Iterations::kStrided,
734 ThreadMap::Iterations::kContiguous>;
735
736static_assert(Iterations::kCount, "Number of iterations must be non-zero");
737
739using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;
740
742using Delta =
743layout::PitchLinearShape<ThreadMap::Delta::kStrided,
744 ThreadMap::Delta::kContiguous>;
745
746
751static TensorCoord initial_offset(int thread_id) {
752
753TensorCoord coord = ThreadMap::initial_offset(thread_id);
754return TensorCoord(
755 coord.strided(),
756 coord.contiguous()
757 );
758 }
759 };
760
761
763
764 } // namespace transform
765 } // namespace cutlass
766
cutlass::layout::PitchLinearShape::kCount
static int const kCount
Definition: pitch_linear.h:46
Definition: aligned_buffer.h:35
cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap
Definition: pitch_linear_thread_map.h:623
cutlass::layout::PitchLinearCoord
Coordinate in pitch-linear space.
Definition: pitch_linear.h:52
Defines a structure containing strides, bounds, and a pointer to tensor data.
cutlass::transform::PitchLinearStripminedThreadMap::Shape
Shape_ Shape
Tile shape.
Definition: pitch_linear_thread_map.h:65
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
cutlass::transform::PitchLinearStripminedThreadMap::TensorCoord
layout::PitchLinearCoord TensorCoord
Tensor coordinate.
Definition: pitch_linear_thread_map.h:62
cutlass::transform::TransposePitchLinearThreadMapSimt::ThreadMap
ThreadMap_ ThreadMap
Underlying ThreadMap.
Definition: pitch_linear_thread_map.h:433
cutlass::transform::TransposePitchLinearThreadMapSimt
Definition: pitch_linear_thread_map.h:431
cutlass::transform::PitchLinearWarpStripedThreadMap::Detail::WarpThreadArrangement
WarpThreadArrangement_ WarpThreadArrangement
Fixed arrangement of threads within a warp (units of threads).
Definition: pitch_linear_thread_map.h:512
cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::ThreadAccessShape
typename ThreadMap::ThreadAccessShape ThreadAccessShape
Delta betweeen accesses (units of elements, concept: PitchLinearShape)
Definition: pitch_linear_thread_map.h:741
cutlass::transform::PitchLinearStripminedThreadMap::kElementsPerAccess
static int const kElementsPerAccess
Extract vector length from Layout.
Definition: pitch_linear_thread_map.h:71
cutlass::transform::PitchLinearWarpStripedThreadMap::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Maps thread ID to a coordinate offset within the tensor's logical coordinate space.
Definition: pitch_linear_thread_map.h:569
cutlass::transform::PitchLinearWarpStripedThreadMap::Shape
Shape_ Shape
Tile shape.
Definition: pitch_linear_thread_map.h:497
cutlass::transform::TransposePitchLinearThreadMapSimt::Shape
typename ThreadMap::Shape Shape
Tile shape.
Definition: pitch_linear_thread_map.h:439
Defines a structure containing strides and a pointer to tensor data.
cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided::ShapeVec
Shape ShapeVec
Definition: pitch_linear_thread_map.h:184
Shape_ Shape
Tile shape.
Definition: pitch_linear_thread_map.h:636
cutlass::transform::TransposePitchLinearThreadMapSimt::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:468
cutlass::layout::PitchLinearShape
Template defining a shape used by pitch-linear operators.
Definition: pitch_linear.h:43
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...
cutlass::transform::TransposePitchLinearThreadMap
Definition: pitch_linear_thread_map.h:333
Defines container classes and iterators for managing a statically sized vector of boolean predicates...
cutlass::transform::PitchLinearWarpRakedThreadMap::Shape
Shape_ Shape
Tile shape.
Definition: pitch_linear_thread_map.h:211
cutlass::layout::PitchLinearShape::kStrided
static int const kStrided
Definition: pitch_linear.h:45
typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< 1,(Threads >=Detail::ShapeVec::kContiguous?Detail::ShapeVec::kStrided/(kThreads/Detail::ShapeVec::kContiguous):0) >, layout::PitchLinearShape< Detail::ShapeVec::kContiguous/kThreads, Detail::ShapeVec::kStrided > >::type Iterations
Number of iterations by each thread.
Definition: pitch_linear_thread_map.h:684
cutlass::layout::PitchLinearShape::kContiguous
static int const kContiguous
Definition: pitch_linear.h:44
cutlass::transform::PitchLinearWarpStripedThreadMap
Definition: pitch_linear_thread_map.h:491
cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:187
cutlass::transform::PitchLinearWarpRakedThreadMap::Detail::WarpThreadArrangement
WarpThreadArrangement_ WarpThreadArrangement
Fixed arrangement of threads within a warp (units of threads).
Definition: pitch_linear_thread_map.h:226
cutlass::transform::TransposePitchLinearThreadMap2DThreadTile
Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping.
Definition: pitch_linear_thread_map.h:713
cutlass::transform::TransposePitchLinearThreadMap::Detail::WarpThreadArrangement
WarpThreadArrangement_ WarpThreadArrangement
Fixed arrangement of threads within a warp (units of threads).
Definition: pitch_linear_thread_map.h:355
cutlass::transform::TransposePitchLinearThreadMap::Detail
Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...
Definition: pitch_linear_thread_map.h:353
cutlass::transform::PitchLinearWarpRakedThreadMap
Definition: pitch_linear_thread_map.h:205
cutlass::transform::TransposePitchLinearThreadMap::ThreadMap
ThreadMap_ ThreadMap
Underlying ThreadMap.
Definition: pitch_linear_thread_map.h:335
cutlass::transform::PitchLinearStripminedThreadMap::Detail
Internal implementation details.
Definition: pitch_linear_thread_map.h:77
cutlass::transform::TransposePitchLinearThreadMap::Shape
typename ThreadMap::Shape Shape
Tile shape.
Definition: pitch_linear_thread_map.h:341
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
cutlass::layout::PitchLinearCoord::contiguous
CUTLASS_HOST_DEVICE Index const & contiguous() const
Returns the contiguous dimension.
Definition: pitch_linear.h:89
cutlass::platform::conditional
std::conditional (true specialization)
Definition: platform.h:325
#define static_assert(__e, __m)
Definition: platform.h:153
cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:157
cutlass::transform::TransposePitchLinearThreadMap::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:389
cutlass::transform::PitchLinearStripminedThreadMap::kThreads
static int const kThreads
Number of threads total.
Definition: pitch_linear_thread_map.h:68
cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:751
cutlass::transform::PitchLinearWarpRakedThreadMap::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Maps thread ID to a coordinate offset within the tensor's logical coordinate space.
Definition: pitch_linear_thread_map.h:285
cutlass::transform::PitchLinearStripminedThreadMap::initial_offset
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:127
cutlass::transform::PitchLinearWarpRakedThreadMap::Detail
Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...
Definition: pitch_linear_thread_map.h:223
cutlass::transform::TransposePitchLinearThreadMapSimt::TensorCoord
typename ThreadMap::TensorCoord TensorCoord
Tensor coordinate.
Definition: pitch_linear_thread_map.h:436
Defines layout functions used by TensorRef and derived classes for pitch-linear memory.
cutlass::transform::PitchLinearStripminedThreadMap::Iterations
typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< 1,(Threads >=Detail::ShapeVec::kContiguous?Detail::ShapeVec::kStrided/(kThreads/Detail::ShapeVec::kContiguous):0) >, layout::PitchLinearShape< Detail::ShapeVec::kContiguous/kThreads, Detail::ShapeVec::kStrided > >::type Iterations
Number of iterations by each thread.
Definition: pitch_linear_thread_map.h:108
cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::Shape
typename ThreadMap::Shape Shape
Tile shape.
Definition: pitch_linear_thread_map.h:721
cutlass::transform::PitchLinearStripminedThreadMap::Delta
typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< 1, kThreads/Detail::ShapeVec::kContiguous >, layout::PitchLinearShape< kThreads *kElementsPerAccess, 1 > >::type Delta
Definition: pitch_linear_thread_map.h:122
cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided
Definition: pitch_linear_thread_map.h:168
cutlass::transform::TransposePitchLinearThreadMap::TensorCoord
typename ThreadMap::TensorCoord TensorCoord
Tensor coordinate.
Definition: pitch_linear_thread_map.h:338
cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous
Definition: pitch_linear_thread_map.h:140
Basic include for CUTLASS.
cutlass::transform::PitchLinearStripminedThreadMap
Definition: pitch_linear_thread_map.h:59
cutlass::layout::PitchLinearCoord::strided
CUTLASS_HOST_DEVICE Index const & strided() const
Returns the column of the coordinate.
Definition: pitch_linear.h:97
typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< Shape::kContiguous, kThreads *ThreadAccessShape::kStrided/Detail::ShapeVec::kContiguous >, layout::PitchLinearShape< kThreads *ThreadAccessShape::kContiguous, 1 > >::type Delta
Definition: pitch_linear_thread_map.h:698
cutlass::transform::PitchLinearWarpStripedThreadMap::Detail
Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...
Definition: pitch_linear_thread_map.h:509
cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::TensorCoord
typename ThreadMap::TensorCoord TensorCoord
Tensor coordinate.
Definition: pitch_linear_thread_map.h:718
static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)
Definition: pitch_linear_thread_map.h:703
cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::ThreadMap
ThreadMap_ ThreadMap
Underlying ThreadMap.
Definition: pitch_linear_thread_map.h:715
cutlass::transform::TransposePitchLinearThreadMapSimt::ThreadAccessShape
typename ThreadMap::ThreadAccessShape ThreadAccessShape
Delta betweeen accesses (units of elements, concept: PitchLinearShape)
Definition: pitch_linear_thread_map.h:458
Generated by 1.8.11