CUTLASS: pitch_linear_thread_map.h Source File - Cutlass

| | CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers |

pitch_linear_thread_map.h

[Go to the documentation of this file.](pitch linear thread__map_8h.html)

1 /***************************************************************************************************

3 *

4 * Redistribution and use in source and binary forms, with or without modification, are permitted

5 * provided that the following conditions are met:

6 * * Redistributions of source code must retain the above copyright notice, this list of

7 * conditions and the following disclaimer.

8 * * Redistributions in binary form must reproduce the above copyright notice, this list of

9 * conditions and the following disclaimer in the documentation and/or other materials

10 * provided with the distribution.

11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used

12 * to endorse or promote products derived from this software without specific prior written

13 * permission.

14 *

15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR

16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE

18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;

20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

21 * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23 *

24 **************************************************************************************************/

30 #pragma once

32 #include "cutlass/cutlass.h"

33 #include "cutlass/array.h"

34 #include "cutlass/coord.h"

35 #include "cutlass/predicate_vector.h"

36 #include "cutlass/tensor_ref.h"

37 #include "cutlass/tensor_view.h"

38 #include "cutlass/layout/pitch_linear.h"

42 namespace cutlass {

43 namespace transform {

54 template <

55typename Shape_,

56int Threads,

57int ElementsPerAccess = 1

58 >

59 struct PitchLinearStripminedThreadMap {

62using TensorCoord = layout::PitchLinearCoord;

65using Shape = Shape_;

68static int const kThreads = Threads;

71static int const kElementsPerAccess = ElementsPerAccess;

74using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

77struct Detail {

79static_assert(!(Shape::kContiguous % kElementsPerAccess), "");

81static_assert(!((Shape::kContiguous * Shape::kStrided) % (kThreads * kElementsPerAccess)),

82"Shape must be divisible thread count.");

85using ShapeVec = layout::PitchLinearShape<

86 Shape::kContiguous / kElementsPerAccess,

87 Shape::kStrided

88 >;

90static_assert(

91 (Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||

92 (!(kThreads % ShapeVec::kContiguous) && !(ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous))),

93"Shape must be divisible by number of iterations of each thread."

94 );

95 };

98using Iterations = typename platform::conditional<

99 Threads >= Detail::ShapeVec::kContiguous,

100layout::PitchLinearShape<

101 1,

102 (Threads >= Detail::ShapeVec::kContiguous ? Detail::ShapeVec::kStrided / (kThreads / Detail::ShapeVec::kContiguous) : 0)

103 >,

104 layout::PitchLinearShape<

105Detail::ShapeVec::kContiguous / kThreads,

106Detail::ShapeVec::kStrided

107 >

108 >::type;

109

112using Delta = typename platform::conditional<

113 Threads >= Detail::ShapeVec::kContiguous,

114 layout::PitchLinearShape<

115 1,

116 kThreads / Detail::ShapeVec::kContiguous

117 >,

118 layout::PitchLinearShape<

119 kThreads * kElementsPerAccess,

120 1

121 >

122 >::type;

123

126CUTLASS_HOST_DEVICE

127static TensorCoord initial_offset(int thread_id) {

128

129return TensorCoord(

130 (thread_id % Detail::ShapeVec::kContiguous) * kElementsPerAccess,

131 thread_id / Detail::ShapeVec::kContiguous);

132 }

133 };

134

135 template <

136typename Shape,

137int Threads,

138int ElementsPerAccess = 1

139 >

140 struct PitchLinearTilePolicyStripminedThreadContiguous

141 {

142static_assert((Shape::kContiguous % (Threads * ElementsPerAccess)) == 0,

143"Contiguous shape must divide number of threads");

144

145using TensorCoord = layout::PitchLinearCoord;

146

147static int const kThreads = Threads;

148static int const kElementsPerAccess = ElementsPerAccess;

149

150using Iterations = layout::PitchLinearShape<

151 Shape::kContiguous / (kThreads * kElementsPerAccess),

152 Shape::kStrided>;

153

154using Delta = layout::PitchLinearShape<1, 1>;

155

156CUTLASS_HOST_DEVICE

157static TensorCoord initial_offset(int thread_id)

158 {

159return TensorCoord(thread_id * Iterations::kContiguous * kElementsPerAccess, 0);

160 }

161 };

162

163 template <

164typename Shape,

165int Threads,

166int ElementsPerAccess = 1

167 >

168 struct PitchLinearTilePolicyStripminedThreadStrided

169 {

170static_assert((Shape::kStrided % Threads == 0),

171"Strided shape must divide number of threads");

172

173using TensorCoord = layout::PitchLinearCoord;

174

175static int const kThreads = Threads;

176static int const kElementsPerAccess = ElementsPerAccess;

177

178using Iterations = layout::PitchLinearShape<

179 Shape::kContiguous / kElementsPerAccess,

180 Shape::kStrided / kThreads>;

181

182using Delta = layout::PitchLinearShape<1, 1>;

183

184using ShapeVec = Shape;

185

186CUTLASS_HOST_DEVICE

187static TensorCoord initial_offset(int thread_id)

188 {

189

190return TensorCoord(0, thread_id * Iterations::kStrided);

191 }

192 };

193

194

196

199 template <

200typename Shape_,

201int Threads,

202typename WarpThreadArrangement_,

203int ElementsPerAccess = 1

204 >

205 struct PitchLinearWarpRakedThreadMap {

206

208using TensorCoord = layout::PitchLinearCoord;

209

211using Shape = Shape_;

212

214static int const kThreads = Threads;

215

217static int const kElementsPerAccess = ElementsPerAccess;

218

220using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

221

223struct Detail {

224

226using WarpThreadArrangement = WarpThreadArrangement_;

227

229static int const kWarpSize = WarpThreadArrangement::kCount;

230

232static int const kWarpCount = kThreads / kWarpSize;

233

234static_assert(

235 !(Shape::kContiguous % kElementsPerAccess),

236"Shape must be divisible by vector length.");

237

239using ShapeInAccesses = layout::PitchLinearShape<

240 Shape::kContiguous / kElementsPerAccess,

241 Shape::kStrided

242 >;

243

244// compute number of warp-level accesses total

245using WarpAccessIterations = layout::PitchLinearShape<

246 ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,

247 ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided

248 >;

249

250// Divide it into the number of warps, first partitioning the strided dimension then the

251// contiguous.

252static int const kWarpsStrided =

253 (WarpAccessIterations::kStrided >= kWarpCount

254 ? kWarpCount

255 : WarpAccessIterations::kStrided);

256

257static int const kWarpsContiguous =

258 (kWarpCount > WarpAccessIterations::kStrided

259 ? kWarpCount / kWarpsStrided

260 : 1);

261

263using WarpArrangement = layout::PitchLinearShape<

264 kWarpsContiguous, kWarpsStrided

265 >;

266 };

267

269using Iterations = layout::PitchLinearShape<

270 Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,

271 Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided

272 >;

273

274static_assert(Iterations::kCount,

275"Number of iterations must be non-zero");

276

278using Delta = layout::PitchLinearShape<

279 Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,

280 Detail::WarpThreadArrangement::kStrided

281 >;

282

284CUTLASS_HOST_DEVICE

285static TensorCoord initial_offset(int thread_id) {

286

287int warp_id = (thread_id / Detail::kWarpSize);

288int lane_id = (thread_id % Detail::kWarpSize);

289

290//

291// compute warp-level offset

292//

293

294// This is the shape of the entire area covered by a warp's memory access (in units of vectors)

295layout::PitchLinearCoord warp_footprint{

296 Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,

297 Detail::WarpThreadArrangement::kStrided * Iterations::kStrided

298 };

299

300// This is the offset of a specific warp (in units of vectors)

301layout::PitchLinearCoord warp_offset{

302 (warp_id % Detail::kWarpsContiguous),

303 (warp_id / Detail::kWarpsContiguous)

304 };

305

306// This is the offset of a specific thread within a warp (units of vectors)

307layout::PitchLinearCoord thread_offset_in_warp{

308 lane_id % Detail::WarpThreadArrangement::kContiguous,

309 lane_id / Detail::WarpThreadArrangement::kContiguous

310 };

311

312// This is the offset of a thread within a threadblock tile (units of vectors)

313layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =

314 warp_footprint * warp_offset + thread_offset_in_warp;

315

316// This is the offset of a thread within a threadblock tile (units of elements)

317layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{

318 thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,

319 thread_offset_in_threadblock_tile_vec.strided()

320 };

321

322return thread_offset_in_threadblock_tile_base;

323 }

324 };

325

327

331

332 template <typename ThreadMap_, typename WarpThreadArrangement_>

333 struct TransposePitchLinearThreadMap {

335using ThreadMap = ThreadMap_;

336

338using TensorCoord = typename ThreadMap::TensorCoord;

339

341using Shape = typename ThreadMap::Shape;

342

344static int const kThreads = ThreadMap::kThreads;

345

347static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

348

350using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

351

353struct Detail {

355using WarpThreadArrangement = WarpThreadArrangement_;

356

358static int const kWarpSize = WarpThreadArrangement::kCount;

359

361static int const kWarpCount = kThreads / kWarpSize;

362

363static_assert(!(Shape::kContiguous % kElementsPerAccess),

364"Shape must be divisible by vector length.");

365

367using WarpArrangement =

368layout::PitchLinearShape<ThreadMap::Detail::kWarpsStrided,

369 ThreadMap::Detail::kWarpsContiguous>;

370 };

371

373using Iterations =

374layout::PitchLinearShape<ThreadMap::Iterations::kStrided,

375 ThreadMap::Iterations::kContiguous>;

376

377static_assert(Iterations::kCount, "Number of iterations must be non-zero");

378

380using Delta =

381layout::PitchLinearShape<Detail::WarpThreadArrangement::kContiguous *

382kElementsPerAccess,

383 Detail::WarpThreadArrangement::kStrided>;

384

388CUTLASS_HOST_DEVICE

389static TensorCoord initial_offset(int thread_id) {

390

391int warp_id = (thread_id / Detail::kWarpSize);

392int lane_id = (thread_id % Detail::kWarpSize);

393

394//

395// compute warp-level offset

396//

397

398// This is the shape of the entire area covered by a warp's memory access

399// (in units of vectors)

400layout::PitchLinearCoord warp_footprint{

401 Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,

402 Detail::WarpThreadArrangement::kStrided * Iterations::kStrided};

403

404// This is the offset of a specific warp (in units of vectors)

405// Note the order of / and %. Also the 2nd operand is kStrided.

406layout::PitchLinearCoord warp_offset{

407 (warp_id / Detail::WarpArrangement::kStrided),

408 (warp_id % Detail::WarpArrangement::kStrided)};

409

410// This is the offset of a specific thread within a warp (units of vectors)

411layout::PitchLinearCoord thread_offset_in_warp{

412 lane_id % Detail::WarpThreadArrangement::kContiguous,

413 lane_id / Detail::WarpThreadArrangement::kContiguous};

414

415// This is the offset of a thread within a threadblock tile (units of

416// vectors)

417layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =

418 warp_footprint * warp_offset + thread_offset_in_warp;

419

420// This is the offset of a thread within a threadblock tile (units of

421// elements)

422layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{

423 thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,

424 thread_offset_in_threadblock_tile_vec.strided()};

425

426return thread_offset_in_threadblock_tile_base;

427 }

428 };

429

430 template <typename ThreadMap_>

431 struct TransposePitchLinearThreadMapSimt {

433using ThreadMap = ThreadMap_;

434

436using TensorCoord = typename ThreadMap::TensorCoord;

437

439using Shape = typename ThreadMap::Shape;

440

442static int const kThreads = ThreadMap::kThreads;

443

445static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

446

447static_assert(kElementsPerAccess == 1 , "Simt transpose requires elements per access to be 1");

449using Iterations =

450layout::PitchLinearShape<ThreadMap::Iterations::kStrided,

451 ThreadMap::Iterations::kContiguous>;

452

453static_assert(Iterations::kCount, "Number of iterations must be non-zero");

454

456using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;

457

459using Delta =

460layout::PitchLinearShape<ThreadMap::Delta::kStrided,

461 ThreadMap::Delta::kContiguous>;

462

463

467CUTLASS_HOST_DEVICE

468static TensorCoord initial_offset(int thread_id) {

469

470TensorCoord coord = ThreadMap::initial_offset(thread_id);

471

472return TensorCoord(

473 coord.strided(),

474 coord.contiguous()

475 );

476 }

477 };

478

480

481

485 template <

486typename Shape_,

487int Threads,

488typename WarpThreadArrangement_,

489int ElementsPerAccess = 1

490 >

491 struct PitchLinearWarpStripedThreadMap {

492

494using TensorCoord = layout::PitchLinearCoord;

495

497using Shape = Shape_;

498

500static int const kThreads = Threads;

501

503static int const kElementsPerAccess = ElementsPerAccess;

504

506using ThreadAccessShape = layout::PitchLinearShape<kElementsPerAccess, 1>;

507

509struct Detail {

510

512using WarpThreadArrangement = WarpThreadArrangement_;

513

515static int const kWarpSize = WarpThreadArrangement::kCount;

516

518static int const kWarpCount = kThreads / kWarpSize;

519

520static_assert(

521 !(Shape::kContiguous % kElementsPerAccess),

522"Shape must be divisible by vector length.");

523

525using ShapeInAccesses = layout::PitchLinearShape<

526 Shape::kContiguous / kElementsPerAccess,

527 Shape::kStrided

528 >;

529

530// compute number of warp-level accesses total

531using WarpAccessIterations = layout::PitchLinearShape<

532 ShapeInAccesses::kContiguous / WarpThreadArrangement::kContiguous,

533 ShapeInAccesses::kStrided / WarpThreadArrangement::kStrided

534 >;

535

536// Divide it into the number of warps, first partitioning the strided dimension then the

537// contiguous.

538static int const kWarpsStrided =

539 (WarpAccessIterations::kStrided >= kWarpCount

540 ? kWarpCount : (kWarpCount / WarpAccessIterations::kStrided));

541

542static int const kWarpsContiguous =

543 (kWarpCount > WarpAccessIterations::kStrided ?

544 WarpAccessIterations::kContiguous / kWarpsStrided : 1);

545

547using WarpArrangement = layout::PitchLinearShape<

548 kWarpsContiguous, kWarpsStrided

549 >;

550 };

551

553using Iterations = layout::PitchLinearShape<

554 Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous,

555 Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided

556 >;

557

558static_assert(Iterations::kCount,

559"Number of iterations must be non-zero");

560

562using Delta = layout::PitchLinearShape<

563 Detail::WarpThreadArrangement::kContiguous * kElementsPerAccess,

564 Detail::WarpThreadArrangement::kStrided * Detail::WarpArrangement::kStrided

565 >;

566

568CUTLASS_HOST_DEVICE

569static TensorCoord initial_offset(int thread_id) {

570

571int warp_id = (thread_id / Detail::kWarpSize);

572int lane_id = (thread_id % Detail::kWarpSize);

573

574//

575// compute warp-level offset

576//

577

578// This is the shape of the entire area covered by a warp's memory access (in units of vectors)

579layout::PitchLinearCoord warp_footprint{

580 Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous,

581 Detail::WarpThreadArrangement::kStrided

582 };

583

584// This is the offset of a specific warp (in units of vectors)

585layout::PitchLinearCoord warp_offset{

586 (warp_id % Detail::kWarpsContiguous),

587 (warp_id / Detail::kWarpsContiguous)

588 };

589

590// This is the offset of a specific thread within a warp (units of vectors)

591layout::PitchLinearCoord thread_offset_in_warp{

592 lane_id % Detail::WarpThreadArrangement::kContiguous,

593 lane_id / Detail::WarpThreadArrangement::kContiguous

594 };

595

596// This is the offset of a thread within a threadblock tile (units of vectors)

597layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec =

598 warp_footprint * warp_offset + thread_offset_in_warp;

599

600// This is the offset of a thread within a threadblock tile (units of elements)

601layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{

602 thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess,

603 thread_offset_in_threadblock_tile_vec.strided()

604 };

605

606return thread_offset_in_threadblock_tile_base;

607 }

608 };

609

618 template <

619typename Shape_,

620int Threads,

621typename ThreadTileShape

622 >

623 struct PitchLinear2DThreadTileStripminedThreadMap;

624

625

626 template <

627typename Shape_,

628int Threads

629 >

630 struct PitchLinear2DThreadTileStripminedThreadMap <Shape_, Threads, cutlass::layout::PitchLinearShape<4, 4>>{

631

633using TensorCoord = layout::PitchLinearCoord;

634

636using Shape = Shape_;

637

639using ThreadAccessShape = cutlass::layout::PitchLinearShape<4, 4>;

640//using ThreadAccessShape = ThreadTileShape;

641

643static int const kThreads = Threads;

644

646static int const kElementsPerAccess = ThreadAccessShape::kContiguous;

647

648static_assert(!(kElementsPerAccess % 4) , "kElementsPerAccess, needs to be multiple of 4 (32bits)");

649

651struct Detail {

652

653static_assert(!(ThreadAccessShape::kContiguous % 4), "ThreadAccessShape, needs to be multiple of 4");

654

655static_assert(!(Shape::kContiguous % ThreadAccessShape::kContiguous), "");

656

657static_assert(!((Shape::kContiguous * Shape::kStrided) % (kThreads * ThreadAccessShape::kCount)),

658"Shape must be divisible thread count * accesses per thread.");

659

661using ShapeVec = layout::PitchLinearShape<

662 Shape::kContiguous / ThreadAccessShape::kContiguous,

663 Shape::kStrided / ThreadAccessShape::kStrided

664 >;

665

666static_assert(

667 (Threads < ShapeVec::kContiguous && !(ShapeVec::kContiguous % kThreads)) ||

668 (!(kThreads % ShapeVec::kContiguous) && !(ShapeVec::kStrided % (kThreads / ShapeVec::kContiguous))),

669"Shape must be divisible by number of iterations of each thread."

670 );

671 };

672

674using Iterations = typename platform::conditional<

675 Threads >= Detail::ShapeVec::kContiguous,

676layout::PitchLinearShape<

677 1,

678 (Threads >= Detail::ShapeVec::kContiguous ? Detail::ShapeVec::kStrided / (kThreads / Detail::ShapeVec::kContiguous) : 0)

679 >,

680 layout::PitchLinearShape<

681Detail::ShapeVec::kContiguous / kThreads,

682Detail::ShapeVec::kStrided

683 >

684 >::type;

685

688using Delta = typename platform::conditional<

689 Threads >= Detail::ShapeVec::kContiguous,

690 layout::PitchLinearShape<

691 Shape::kContiguous,

692 kThreads * ThreadAccessShape::kStrided / Detail::ShapeVec::kContiguous

693 >,

694 layout::PitchLinearShape<

695 kThreads * ThreadAccessShape::kContiguous,

696 1

697 >

698 >::type;

699

702CUTLASS_HOST_DEVICE

703static TensorCoord initial_offset(int thread_id) {

704

705return TensorCoord(

706 (thread_id % Detail::ShapeVec::kContiguous) * ThreadAccessShape::kContiguous,

707 (thread_id / Detail::ShapeVec::kContiguous) * ThreadAccessShape::kStrided);

708 }

709 };

710

712 template <typename ThreadMap_>

713 struct TransposePitchLinearThreadMap2DThreadTile {

715using ThreadMap = ThreadMap_;

716

718using TensorCoord = typename ThreadMap::TensorCoord;

719

721using Shape = typename ThreadMap::Shape;

722

724static int const kThreads = ThreadMap::kThreads;

725

727static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

728

729

730static_assert(kElementsPerAccess > 1 , "Simt transpose requires elements per access to be 1");

732using Iterations =

733layout::PitchLinearShape<ThreadMap::Iterations::kStrided,

734 ThreadMap::Iterations::kContiguous>;

735

736static_assert(Iterations::kCount, "Number of iterations must be non-zero");

737

739using ThreadAccessShape = typename ThreadMap::ThreadAccessShape;

740

742using Delta =

743layout::PitchLinearShape<ThreadMap::Delta::kStrided,

744 ThreadMap::Delta::kContiguous>;

745

746

750CUTLASS_HOST_DEVICE

751static TensorCoord initial_offset(int thread_id) {

752

753TensorCoord coord = ThreadMap::initial_offset(thread_id);

754return TensorCoord(

755 coord.strided(),

756 coord.contiguous()

757 );

758 }

759 };

760

761

763

764 } // namespace transform

765 } // namespace cutlass

766

cutlass::layout::PitchLinearShape::kCount

static int const kCount

Definition: pitch_linear.h:46

cutlass

Definition: aligned_buffer.h:35

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap

Definition: pitch_linear_thread_map.h:623

cutlass::layout::PitchLinearCoord

Coordinate in pitch-linear space.

Definition: pitch_linear.h:52

tensor_ref.h

Defines a structure containing strides, bounds, and a pointer to tensor data.

cutlass::transform::PitchLinearStripminedThreadMap::Shape

Shape_ Shape

Tile shape.

Definition: pitch_linear_thread_map.h:65

coord.h

A Coord is a coordinate of arbitrary rank into a tensor or matrix.

cutlass::transform::PitchLinearStripminedThreadMap::TensorCoord

layout::PitchLinearCoord TensorCoord

Tensor coordinate.

Definition: pitch_linear_thread_map.h:62

cutlass::transform::TransposePitchLinearThreadMapSimt::ThreadMap

ThreadMap_ ThreadMap

Underlying ThreadMap.

Definition: pitch_linear_thread_map.h:433

cutlass::transform::TransposePitchLinearThreadMapSimt

Definition: pitch_linear_thread_map.h:431

cutlass::transform::PitchLinearWarpStripedThreadMap::Detail::WarpThreadArrangement

WarpThreadArrangement_ WarpThreadArrangement

Fixed arrangement of threads within a warp (units of threads).

Definition: pitch_linear_thread_map.h:512

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::ThreadAccessShape

typename ThreadMap::ThreadAccessShape ThreadAccessShape

Delta betweeen accesses (units of elements, concept: PitchLinearShape)

Definition: pitch_linear_thread_map.h:741

cutlass::transform::PitchLinearStripminedThreadMap::kElementsPerAccess

static int const kElementsPerAccess

Extract vector length from Layout.

Definition: pitch_linear_thread_map.h:71

cutlass::transform::PitchLinearWarpStripedThreadMap::initial_offset

static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)

Maps thread ID to a coordinate offset within the tensor's logical coordinate space.

Definition: pitch_linear_thread_map.h:569

cutlass::transform::PitchLinearWarpStripedThreadMap::Shape

Shape_ Shape

Tile shape.

Definition: pitch_linear_thread_map.h:497

cutlass::transform::TransposePitchLinearThreadMapSimt::Shape

typename ThreadMap::Shape Shape

Tile shape.

Definition: pitch_linear_thread_map.h:439

tensor_view.h

Defines a structure containing strides and a pointer to tensor data.

cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided::ShapeVec

Shape ShapeVec

Definition: pitch_linear_thread_map.h:184

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap< Shape_, Threads, cutlass::layout::PitchLinearShape< 4, 4 > >::Shape

Shape_ Shape

Tile shape.

Definition: pitch_linear_thread_map.h:636

cutlass::transform::TransposePitchLinearThreadMapSimt::initial_offset

static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)

Definition: pitch_linear_thread_map.h:468

cutlass::layout::PitchLinearShape

Template defining a shape used by pitch-linear operators.

Definition: pitch_linear.h:43

array.h

Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...

cutlass::transform::TransposePitchLinearThreadMap

Definition: pitch_linear_thread_map.h:333

predicate_vector.h

Defines container classes and iterators for managing a statically sized vector of boolean predicates...

cutlass::transform::PitchLinearWarpRakedThreadMap::Shape

Shape_ Shape

Tile shape.

Definition: pitch_linear_thread_map.h:211

cutlass::layout::PitchLinearShape::kStrided

static int const kStrided

Definition: pitch_linear.h:45

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap< Shape_, Threads, cutlass::layout::PitchLinearShape< 4, 4 > >::Iterations

typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< 1,(Threads >=Detail::ShapeVec::kContiguous?Detail::ShapeVec::kStrided/(kThreads/Detail::ShapeVec::kContiguous):0) >, layout::PitchLinearShape< Detail::ShapeVec::kContiguous/kThreads, Detail::ShapeVec::kStrided > >::type Iterations

Number of iterations by each thread.

Definition: pitch_linear_thread_map.h:684

cutlass::layout::PitchLinearShape::kContiguous

static int const kContiguous

Definition: pitch_linear.h:44

cutlass::transform::PitchLinearWarpStripedThreadMap

Definition: pitch_linear_thread_map.h:491

cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided::initial_offset

static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)

Definition: pitch_linear_thread_map.h:187

cutlass::transform::PitchLinearWarpRakedThreadMap::Detail::WarpThreadArrangement

WarpThreadArrangement_ WarpThreadArrangement

Fixed arrangement of threads within a warp (units of threads).

Definition: pitch_linear_thread_map.h:226

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile

Thread Mapping a 2D threadtiled mapping as a transposed Pitchlinear2DThreadTile mapping.

Definition: pitch_linear_thread_map.h:713

cutlass::transform::TransposePitchLinearThreadMap::Detail::WarpThreadArrangement

WarpThreadArrangement_ WarpThreadArrangement

Fixed arrangement of threads within a warp (units of threads).

Definition: pitch_linear_thread_map.h:355

cutlass::transform::TransposePitchLinearThreadMap::Detail

Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...

Definition: pitch_linear_thread_map.h:353

cutlass::transform::PitchLinearWarpRakedThreadMap

Definition: pitch_linear_thread_map.h:205

cutlass::transform::TransposePitchLinearThreadMap::ThreadMap

ThreadMap_ ThreadMap

Underlying ThreadMap.

Definition: pitch_linear_thread_map.h:335

cutlass::transform::PitchLinearStripminedThreadMap::Detail

Internal implementation details.

Definition: pitch_linear_thread_map.h:77

cutlass::transform::TransposePitchLinearThreadMap::Shape

typename ThreadMap::Shape Shape

Tile shape.

Definition: pitch_linear_thread_map.h:341

CUTLASS_HOST_DEVICE

#define CUTLASS_HOST_DEVICE

Definition: cutlass.h:89

cutlass::layout::PitchLinearCoord::contiguous

CUTLASS_HOST_DEVICE Index const & contiguous() const

Returns the contiguous dimension.

Definition: pitch_linear.h:89

cutlass::platform::conditional

std::conditional (true specialization)

Definition: platform.h:325

static_assert

#define static_assert(__e, __m)

Definition: platform.h:153

cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous::initial_offset

static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)

Definition: pitch_linear_thread_map.h:157

cutlass::transform::TransposePitchLinearThreadMap::initial_offset

static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)

Definition: pitch_linear_thread_map.h:389

cutlass::transform::PitchLinearStripminedThreadMap::kThreads

static int const kThreads

Number of threads total.

Definition: pitch_linear_thread_map.h:68

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::initial_offset

static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)

Definition: pitch_linear_thread_map.h:751

cutlass::transform::PitchLinearWarpRakedThreadMap::initial_offset

static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)

Maps thread ID to a coordinate offset within the tensor's logical coordinate space.

Definition: pitch_linear_thread_map.h:285

cutlass::transform::PitchLinearStripminedThreadMap::initial_offset

static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)

Definition: pitch_linear_thread_map.h:127

cutlass::transform::PitchLinearWarpRakedThreadMap::Detail

Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...

Definition: pitch_linear_thread_map.h:223

cutlass::transform::TransposePitchLinearThreadMapSimt::TensorCoord

typename ThreadMap::TensorCoord TensorCoord

Tensor coordinate.

Definition: pitch_linear_thread_map.h:436

pitch_linear.h

Defines layout functions used by TensorRef and derived classes for pitch-linear memory.

cutlass::transform::PitchLinearStripminedThreadMap::Iterations

Number of iterations by each thread.

Definition: pitch_linear_thread_map.h:108

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::Shape

typename ThreadMap::Shape Shape

Tile shape.

Definition: pitch_linear_thread_map.h:721

cutlass::transform::PitchLinearStripminedThreadMap::Delta

typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< 1, kThreads/Detail::ShapeVec::kContiguous >, layout::PitchLinearShape< kThreads *kElementsPerAccess, 1 > >::type Delta

Definition: pitch_linear_thread_map.h:122

cutlass::transform::PitchLinearTilePolicyStripminedThreadStrided

Definition: pitch_linear_thread_map.h:168

cutlass::transform::TransposePitchLinearThreadMap::TensorCoord

typename ThreadMap::TensorCoord TensorCoord

Tensor coordinate.

Definition: pitch_linear_thread_map.h:338

cutlass::transform::PitchLinearTilePolicyStripminedThreadContiguous

Definition: pitch_linear_thread_map.h:140

cutlass.h

Basic include for CUTLASS.

cutlass::transform::PitchLinearStripminedThreadMap

Definition: pitch_linear_thread_map.h:59

cutlass::layout::PitchLinearCoord::strided

CUTLASS_HOST_DEVICE Index const & strided() const

Returns the column of the coordinate.

Definition: pitch_linear.h:97

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap< Shape_, Threads, cutlass::layout::PitchLinearShape< 4, 4 > >::Delta

typename platform::conditional< Threads >=Detail::ShapeVec::kContiguous, layout::PitchLinearShape< Shape::kContiguous, kThreads *ThreadAccessShape::kStrided/Detail::ShapeVec::kContiguous >, layout::PitchLinearShape< kThreads *ThreadAccessShape::kContiguous, 1 > >::type Delta

Definition: pitch_linear_thread_map.h:698

cutlass::transform::PitchLinearWarpStripedThreadMap::Detail

Internal details made public to facilitate introspection Iterations along each dimension (concept: Pi...

Definition: pitch_linear_thread_map.h:509

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::TensorCoord

typename ThreadMap::TensorCoord TensorCoord

Tensor coordinate.

Definition: pitch_linear_thread_map.h:718

cutlass::transform::PitchLinear2DThreadTileStripminedThreadMap< Shape_, Threads, cutlass::layout::PitchLinearShape< 4, 4 > >::initial_offset

static CUTLASS_HOST_DEVICE TensorCoord initial_offset(int thread_id)

Definition: pitch_linear_thread_map.h:703

cutlass::transform::TransposePitchLinearThreadMap2DThreadTile::ThreadMap