docs/memory__sm75_8h_source.html
| | CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers |
memory_sm75.h
Go to the documentation of this file.
1 /***************************************************************************************************
2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted
5 * provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright notice, this list of
7 * conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright notice, this list of
9 * conditions and the following disclaimer in the documentation and/or other materials
10 * provided with the distribution.
11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12 * to endorse or promote products derived from this software without specific prior written
13 * permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 *
24 **************************************************************************************************/
29 #pragma once
30
31 #include "cutlass/array.h"
32 #include "cutlass/layout/matrix.h"
33
34 namespace cutlass {
35 namespace arch {
36
38
/// Loads one or more 8x8 matrices of 16-bit elements from shared memory into registers using
/// the ldmatrix instruction (activated only for SM75+ builds — see CUDA_LDMATRIX_ACTIVATED).
/// Layout selects the non-transposing (layout::RowMajor) or transposing (layout::ColumnMajor)
/// form; MatrixCount is the number of 8x8 matrices loaded (the specializations below cover
/// 1, 2, and 4). The primary template is declared only; all behavior lives in the explicit
/// specializations that follow.
template <
  /// Layout of the loaded fragment: layout::RowMajor or layout::ColumnMajor (transposed)
  typename Layout,
  /// Number of 8x8 matrices loaded: 1, 2, or 4
  int MatrixCount
>
inline __device__ void ldsm(Array<unsigned, MatrixCount> & D, void const* ptr);
46
48 //
49 // Specializations
50 //
52
//
// Feature-detection macros.
//
// The ldmatrix PTX instruction and the __nvvm_get_smem_pointer() intrinsic first shipped in
// CUDA 10.2 and remain available in later toolkits, so the check is "toolkit >= 10.2" rather
// than "toolkit == 10.2" (an exact-match test would silently disable these paths on CUDA 11+).
// The *_ENABLED macros may be pre-defined by the build to force a feature on or off
// independently of detection.
//

#if (__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2))
#define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 1
#else
#define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 0
#endif

#if ! defined(CUDA_NVVM_GET_SHARED_POINTER_ENABLED)
#define CUDA_NVVM_GET_SHARED_POINTER_ENABLED (CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED)
#endif

#if ! defined(CUDA_LDMATRIX_SUPPORTED)
#define CUDA_LDMATRIX_SUPPORTED \
  ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2)))
#endif

#if ! defined(CUDA_LDMATRIX_ENABLED)
#define CUDA_LDMATRIX_ENABLED (CUDA_LDMATRIX_SUPPORTED)
#endif

// ldmatrix is usable only when the toolkit supports it AND the device compilation pass targets
// Turing or newer (SM75+).
#if (CUDA_LDMATRIX_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
#define CUDA_LDMATRIX_ACTIVATED 1
#else
#define CUDA_LDMATRIX_ACTIVATED 0
#endif

#if defined(CUTLASS_GET_SMEM_POINTER)
// Use the existing implementation
#elif CUDA_NVVM_GET_SHARED_POINTER_ENABLED
#if ! defined(NVVM_GET_SMEM_POINTER)
#define NVVM_GET_SMEM_POINTER
extern "C" {
//
// This NVVM intrinsic converts a generic pointer to a 32-bit shared-memory address.
// It is subject to change in future versions of CUDA. Clients should not call it
// directly. Rather, they should use the cutlass::arch::ldsm<>() template.
//
__device__ uint32_t __nvvm_get_smem_pointer(void*);
}
#endif
#define CUTLASS_GET_SMEM_POINTER(ptr) __nvvm_get_smem_pointer((void*)ptr)
#endif
93
95
/// ldsm specialization: loads one 8x8 matrix of 16b elements, non-transposed form.
/// On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::RowMajor, 1>(
    Array<unsigned, 1> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0;
  asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(r0) : "r"(smem_addr));

  // Store the returned register into the destination fragment.
  reinterpret_cast<int &>(D) = r0;

#else

  assert(0);

#endif
}
115
117
/// ldsm specialization: loads two 8x8 matrices of 16b elements, non-transposed form.
/// On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::RowMajor, 2>(
    Array<unsigned, 2> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0, r1;
  asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(r0), "=r"(r1) : "r"(smem_addr));

  // Pack the two returned registers into the destination fragment.
  int2 frag = make_int2(r0, r1);
  reinterpret_cast<int2 &>(D) = frag;

#else

  assert(0);

#endif
}
137
139
/// ldsm specialization: loads four 8x8 matrices of 16b elements, non-transposed form.
/// On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::RowMajor, 4>(
    Array<unsigned, 4> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0, r1, r2, r3;
  asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) : "r"(smem_addr));

  // Pack the four returned registers into the destination fragment.
  int4 frag = make_int4(r0, r1, r2, r3);
  reinterpret_cast<int4 &>(D) = frag;

#else

  assert(0);

#endif
}
159
161 //
162 // Transpose on 16b granularity
163 //
165
/// ldsm specialization: loads one 8x8 matrix of 16b elements using the transposing
/// (.trans) form of ldmatrix. On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::ColumnMajor, 1>(
    Array<unsigned, 1> & D,
    void const* ptr) {
#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0;
  asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];" : "=r"(r0) : "r"(smem_addr));

  // Store the returned register into the destination fragment.
  reinterpret_cast<int &>(D) = r0;

#else

  assert(0);

#endif
}
184
186
/// ldsm specialization: loads two 8x8 matrices of 16b elements using the transposing
/// (.trans) form of ldmatrix. On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::ColumnMajor, 2>(
    Array<unsigned, 2> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0, r1;
  asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(r0), "=r"(r1) : "r"(smem_addr));

  // Pack the two returned registers into the destination fragment.
  int2 frag = make_int2(r0, r1);
  reinterpret_cast<int2 &>(D) = frag;

#else

  assert(0);

#endif
}
206
208
/// ldsm specialization: loads four 8x8 matrices of 16b elements using the transposing
/// (.trans) form of ldmatrix. On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::ColumnMajor, 4>(
    Array<unsigned, 4> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0, r1, r2, r3;
  asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) : "r"(smem_addr));

  // Pack the four returned registers into the destination fragment.
  int4 frag = make_int4(r0, r1, r2, r3);
  reinterpret_cast<int4 &>(D) = frag;

#else

  assert(0);

#endif
}
228
230 } // namespace arch
231 } // namespace cutlass
Definition: aligned_buffer.h:35
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...
__device__ void ldsm(Array< unsigned, MatrixCount > &D, void const *ptr)
Defines layout functions used by TensorRef and derived classes.
Generated by 1.8.11