docs/memory__sm75_8h_source.html
| | CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers |
memory_sm75.h
Go to the documentation of this file.
1 /***************************************************************************************************
2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted
5 * provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright notice, this list of
7 * conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright notice, this list of
9 * conditions and the following disclaimer in the documentation and/or other materials
10 * provided with the distribution.
11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12 * to endorse or promote products derived from this software without specific prior written
13 * permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 *
24 **************************************************************************************************/
29 #pragma once
30
31 #include "cutlass/array.h"
32 #include "cutlass/layout/matrix.h"
33
34 namespace cutlass {
35 namespace arch {
36
38
/// Loads one or more 8x8 matrices of 16-bit elements from shared memory into registers using
/// the ldmatrix instruction (activated only for SM75+ builds — see CUDA_LDMATRIX_ACTIVATED).
/// Layout selects the non-transposing (layout::RowMajor) or transposing (layout::ColumnMajor)
/// form; MatrixCount is the number of 8x8 matrices loaded (the specializations below cover
/// 1, 2, and 4). The primary template is declared only; all behavior lives in the explicit
/// specializations that follow.
template <
  /// Layout of the loaded fragment: layout::RowMajor or layout::ColumnMajor (transposed)
  typename Layout,
  /// Number of 8x8 matrices loaded: 1, 2, or 4
  int MatrixCount
>
inline __device__ void ldsm(Array<unsigned, MatrixCount> & D, void const* ptr);
46
48 //
49 // Specializations
50 //
52
//
// Feature-detection macros.
//
// The ldmatrix PTX instruction and the __nvvm_get_smem_pointer() intrinsic first shipped in
// CUDA 10.2 and remain available in later toolkits, so the check is "toolkit >= 10.2" rather
// than "toolkit == 10.2" (an exact-match test would silently disable these paths on CUDA 11+).
// The *_ENABLED macros may be pre-defined by the build to force a feature on or off
// independently of detection.
//

#if (__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2))
#define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 1
#else
#define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 0
#endif

#if ! defined(CUDA_NVVM_GET_SHARED_POINTER_ENABLED)
#define CUDA_NVVM_GET_SHARED_POINTER_ENABLED (CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED)
#endif

#if ! defined(CUDA_LDMATRIX_SUPPORTED)
#define CUDA_LDMATRIX_SUPPORTED \
  ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2)))
#endif

#if ! defined(CUDA_LDMATRIX_ENABLED)
#define CUDA_LDMATRIX_ENABLED (CUDA_LDMATRIX_SUPPORTED)
#endif

// ldmatrix is usable only when the toolkit supports it AND the device compilation pass targets
// Turing or newer (SM75+).
#if (CUDA_LDMATRIX_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
#define CUDA_LDMATRIX_ACTIVATED 1
#else
#define CUDA_LDMATRIX_ACTIVATED 0
#endif

#if defined(CUTLASS_GET_SMEM_POINTER)
// Use the existing implementation
#elif CUDA_NVVM_GET_SHARED_POINTER_ENABLED
#if ! defined(NVVM_GET_SMEM_POINTER)
#define NVVM_GET_SMEM_POINTER
extern "C" {
//
// This NVVM intrinsic converts a generic pointer to a 32-bit shared-memory address.
// It is subject to change in future versions of CUDA. Clients should not call it
// directly. Rather, they should use the cutlass::arch::ldsm<>() template.
//
__device__ uint32_t __nvvm_get_smem_pointer(void*);
}
#endif
#define CUTLASS_GET_SMEM_POINTER(ptr) __nvvm_get_smem_pointer((void*)ptr)
#endif
93
95
/// ldsm specialization: loads one 8x8 matrix of 16b elements, non-transposed form.
/// On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::RowMajor, 1>(
    Array<unsigned, 1> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0;
  asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(r0) : "r"(smem_addr));

  // Store the returned register into the destination fragment.
  reinterpret_cast<int &>(D) = r0;

#else

  assert(0);

#endif
}
115
117
/// ldsm specialization: loads two 8x8 matrices of 16b elements, non-transposed form.
/// On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::RowMajor, 2>(
    Array<unsigned, 2> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0, r1;
  asm volatile ("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(r0), "=r"(r1) : "r"(smem_addr));

  // Pack the two returned registers into the destination fragment.
  int2 frag = make_int2(r0, r1);
  reinterpret_cast<int2 &>(D) = frag;

#else

  assert(0);

#endif
}
137
139
/// ldsm specialization: loads four 8x8 matrices of 16b elements, non-transposed form.
/// On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::RowMajor, 4>(
    Array<unsigned, 4> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0, r1, r2, r3;
  asm volatile ("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) : "r"(smem_addr));

  // Pack the four returned registers into the destination fragment.
  int4 frag = make_int4(r0, r1, r2, r3);
  reinterpret_cast<int4 &>(D) = frag;

#else

  assert(0);

#endif
}
159
161 //
162 // Transpose on 16b granularity
163 //
165
/// ldsm specialization: loads one 8x8 matrix of 16b elements using the transposing
/// (.trans) form of ldmatrix. On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::ColumnMajor, 1>(
    Array<unsigned, 1> & D,
    void const* ptr) {
#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0;
  asm volatile ("ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];" : "=r"(r0) : "r"(smem_addr));

  // Store the returned register into the destination fragment.
  reinterpret_cast<int &>(D) = r0;

#else

  assert(0);

#endif
}
184
186
/// ldsm specialization: loads two 8x8 matrices of 16b elements using the transposing
/// (.trans) form of ldmatrix. On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::ColumnMajor, 2>(
    Array<unsigned, 2> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0, r1;
  asm volatile ("ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];" : "=r"(r0), "=r"(r1) : "r"(smem_addr));

  // Pack the two returned registers into the destination fragment.
  int2 frag = make_int2(r0, r1);
  reinterpret_cast<int2 &>(D) = frag;

#else

  assert(0);

#endif
}
206
208
/// ldsm specialization: loads four 8x8 matrices of 16b elements using the transposing
/// (.trans) form of ldmatrix. On targets without ldmatrix support this traps via assert.
template <>
inline __device__ void ldsm<layout::ColumnMajor, 4>(
    Array<unsigned, 4> & D,
    void const* ptr) {

#if CUDA_LDMATRIX_ACTIVATED

  // ldmatrix takes a 32-bit shared-memory address, not a generic pointer.
  unsigned smem_addr = CUTLASS_GET_SMEM_POINTER(ptr);

  int r0, r1, r2, r3;
  asm volatile ("ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3) : "r"(smem_addr));

  // Pack the four returned registers into the destination fragment.
  int4 frag = make_int4(r0, r1, r2, r3);
  reinterpret_cast<int4 &>(D) = frag;

#else

  assert(0);

#endif
}
228
230 } // namespace arch
231 } // namespace cutlass
Definition: aligned_buffer.h:35
Statically sized array of elements that accommodates all CUTLASS-supported numeric types and is safe ...
__device__ void ldsm(Array< unsigned, MatrixCount > &D, void const *ptr)
Defines layout functions used by TensorRef and derived classes.
Generated by 1.8.11