Back to Cutlass

CUTLASS: transpose.h Source File

docs/transpose_8h_source.html

4.4.29.0 KB
Original Source

| | CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers |

transpose.h

Go to the documentation of this file.

1 /***************************************************************************************************

2 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.

3 *

4 * Redistribution and use in source and binary forms, with or without modification, are permitted

5 * provided that the following conditions are met:

6 * * Redistributions of source code must retain the above copyright notice, this list of

7 * conditions and the following disclaimer.

8 * * Redistributions in binary form must reproduce the above copyright notice, this list of

9 * conditions and the following disclaimer in the documentation and/or other materials

10 * provided with the distribution.

11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used

12 * to endorse or promote products derived from this software without specific prior written

13 * permission.

14 *

15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR

16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE

18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;

20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

21 * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23 *

24 **************************************************************************************************/

25

30 namespace cutlass {

31 namespace transform {

32 namespace thread {

33

35 template <

36int ElementCount,

37typename TransposeShape,

38typename Element

39 > class Transpose;

40

42 template <int ElementCount_>

43 struct Transpose<ElementCount_, layout::PitchLinearShape<4,4> , int8_t> {

44

45static const int kElementCount = ElementCount_;

46using TransposeShape = layout::PitchLinearShape<4,4>;

47using Element = int8_t;

48using Fragment = cutlass::Array<Element, kElementCount>;

49

50static_assert(!(kElementCount % TransposeShape::kCount), "Shape needs to be multiple of 16 elements to do a 4x4 transpose");

51

52 CUTLASS_DEVICE

53void transform(Fragment& dst, Fragment& src) {

54

55// Expose src/dst as int arrays.

56int* src_int = reinterpret_cast<int*>(&src);

57int* dst_int = reinterpret_cast<int*>(&dst);

58

59CUTLASS_PRAGMA_UNROLL

60for (int i = 0; i < kElementCount / TransposeShape::kCount; i++){

61

62int const i0 = 4 * i + 0;

63int const i1 = 4 * i + 1;

64int const i2 = 4 * i + 2;

65int const i3 = 4 * i + 3;

66

67int a0 = src_int[i0];

68int a1 = src_int[i1];

69int a2 = src_int[i2];

70int a3 = src_int[i3];

71

72int b0, b1, b2, b3, c0;

73asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(b0) : "r"(a0), "r"(a1));

74asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(c0) : "r"(a2), "r"(a3));

75asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b0) : "r"(b0), "r"(c0));

76

77asm volatile("prmt.b32 %0, %1, %2, 0x0051;" : "=r"(b1) : "r"(a0), "r"(a1));

78asm volatile("prmt.b32 %0, %1, %2, 0x0051;" : "=r"(c0) : "r"(a2), "r"(a3));

79asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b1) : "r"(b1), "r"(c0));

80

81asm volatile("prmt.b32 %0, %1, %2, 0x0062;" : "=r"(b2) : "r"(a0), "r"(a1));

82asm volatile("prmt.b32 %0, %1, %2, 0x0062;" : "=r"(c0) : "r"(a2), "r"(a3));

83asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b2) : "r"(b2), "r"(c0));

84

85asm volatile("prmt.b32 %0, %1, %2, 0x0073;" : "=r"(b3) : "r"(a0), "r"(a1));

86asm volatile("prmt.b32 %0, %1, %2, 0x0073;" : "=r"(c0) : "r"(a2), "r"(a3));

87asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b3) : "r"(b3), "r"(c0));

88

89 dst_int[i0] = b0;

90 dst_int[i1] = b1;

91 dst_int[i2] = b2;

92 dst_int[i3] = b3;

93 }

94 }

95 };

96

97 } // namespace thread

98 } // namespace layout

99 } // namespace cutlass

cutlass

Definition: aligned_buffer.h:35

cutlass::transform::thread::Transpose< ElementCount_, layout::PitchLinearShape< 4, 4 >, int8_t >::Fragment

cutlass::Array< Element, kElementCount > Fragment

Definition: transpose.h:48

cutlass::transform::thread::Transpose< ElementCount_, layout::PitchLinearShape< 4, 4 >, int8_t >::transform

CUTLASS_DEVICE void transform(Fragment &dst, Fragment &src)

Definition: transpose.h:53

cutlass::transform::thread::Transpose< ElementCount_, layout::PitchLinearShape< 4, 4 >, int8_t >::Element

int8_t Element

Definition: transpose.h:47

cutlass::layout::PitchLinearShape

Template defining a shape used by pitch-linear operators.

Definition: pitch_linear.h:43

CUTLASS_PRAGMA_UNROLL

#define CUTLASS_PRAGMA_UNROLL

Definition: cutlass.h:110

cutlass::transform::thread::Transpose

Transforms a fragment by doing a transpose.

Definition: transpose.h:39

static_assert

#define static_assert(__e, __m)

Definition: platform.h:153


Generated by 1.8.11