Back to Cutlass

CUTLASS: tensor.h Source File

docs/tensor_8h_source.html

4.4.241.3 KB
Original Source

| | CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers |

tensor.h

Go to the documentation of this file.

1 /***************************************************************************************************

2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.

3 *

4 * Redistribution and use in source and binary forms, with or without modification, are permitted

5 * provided that the following conditions are met:

6 * * Redistributions of source code must retain the above copyright notice, this list of

7 * conditions and the following disclaimer.

8 * * Redistributions in binary form must reproduce the above copyright notice, this list of

9 * conditions and the following disclaimer in the documentation and/or other materials

10 * provided with the distribution.

11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used

12 * to endorse or promote products derived from this software without specific prior written

13 * permission.

14 *

15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR

16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE

18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;

20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

21 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23 *

24 **************************************************************************************************/

35 #pragma once

36 #include "assert.h"

37 #include "cutlass/cutlass.h"

38 #include "cutlass/fast_math.h"

39 #include "cutlass/layout/matrix.h"

40 #include "cutlass/coord.h"

41 #include "cutlass/tensor_coord.h"

42

43 namespace cutlass {

44 namespace layout {

45

47 //

48 // Defines data layouts of various tensor formats usable by TensorRef and other classes.

49 //

51

53 class TensorNHWC {

54 public:

56static int const kRank = 4;

57

59static int const kStrideRank = 3;

60

62using Index = int32_t;

63

65using LongIndex = int64_t;

66

68using TensorCoord = Tensor4DCoord;

69

71using Stride = Coord<kStrideRank>;

72

73 private:

74//

75// Data members

76//

77

79Stride stride_;

80

81 public:

82//

83// Methods

84//

85

87CUTLASS_HOST_DEVICE

88TensorNHWC(Stride const &stride = Stride(0)): stride_(stride) { }

89

91CUTLASS_HOST_DEVICE

92TensorNHWC(typename Stride::Index c, typename Stride::Index wc, typename Stride::Index hwc): stride_(make_Coord(c, wc, hwc)) { }

93

95CUTLASS_HOST_DEVICE

96static TensorNHWC packed(TensorCoord const &extent) {

97return TensorNHWC(

98make_Coord(

99 extent.c(),

100 extent.w() * extent.c(),

101 extent.h() * extent.w() * extent.c()

102 )

103 );

104 }

105

107CUTLASS_HOST_DEVICE

108LongIndex operator()(TensorCoord const &coord) const {

109return coord.c() +

110LongIndex(stride_[0] * coord.w()) +

111LongIndex(stride_[1] * coord.h()) +

112LongIndex(stride_[2] * coord.n());

113 }

114

116CUTLASS_HOST_DEVICE

117explicit operator RowMajor() {

118return RowMajor(stride_[0]);

119 }

120

122CUTLASS_HOST_DEVICE

123TensorCoord inverse(LongIndex index) const {

124

125int n = 0, h = 0, w = 0, c = 0;

126

127 #if defined(__CUDA_ARCH__)

128int tmp = 0;

129 c = int(index % static_cast<int>(stride_[0]));

130

131unsigned int hw_mul, hw_shr, w_mul, w_shr, c_mul, c_shr;

132

133find_divisor(hw_mul, hw_shr, stride_[2]);

134find_divisor(w_mul, w_shr, stride_[1]);

135find_divisor(c_mul, c_shr, stride_[0]);

136

137fast_divmod(n, tmp, index, int(stride_[2]), hw_mul, hw_shr);

138fast_divmod(h, w, tmp, int(stride_[1]), w_mul, w_shr);

139fast_divmod(w, tmp, w, int(stride_[0]), c_mul, c_shr);

140 #else

141

142 n = int(index / (stride_[0] * stride_[1] * stride_[2]));

143LongIndex residual = index % (stride_[0] * stride_[1] * stride_[2]);

144

145 h = int(residual / (stride_[0] * stride_[1]));

146 residual = (residual % (stride_[0] * stride_[1]));

147

148 w = int(residual / stride_[0]);

149 c = int(residual % stride_[0]);

150

151 #endif

152return TensorCoord(n, h, w, c);

153 }

154

156CUTLASS_HOST_DEVICE

157Stride stride() const {

158return stride_;

159 }

160

162CUTLASS_HOST_DEVICE

163Stride & stride() {

164return stride_;

165 }

166

168CUTLASS_HOST_DEVICE

169LongIndex capacity(TensorCoord const &extent) const {

170// it does not make sense if the extent is larger than stride

171// and we could not rely on the capacity calculation in such cases

172// we could move this checkers to debug code only

173if ((extent.c() > stride_[0])

174 || (extent.w() * stride_[0] > stride_[1])

175 || (extent.h() * stride_[1] > stride_[2])) {

176 assert(0);

177 }

178return extent.n() * stride_[2];

179 }

180 };

181

182

184

186 class TensorNCHW {

187 public:

189static int const kRank = 4;

190

192static int const kStrideRank = 3;

193

195using Index = int32_t;

196

198using LongIndex = int64_t;

199

201using TensorCoord = Tensor4DCoord;

202

204using Stride = Coord<kStrideRank>;

205

206 private:

207//

208// Data members

209//

210

212Stride stride_;

213

214 public:

215//

216// Methods

217//

218

220CUTLASS_HOST_DEVICE

221TensorNCHW(Stride const &stride = Stride(0)): stride_(stride) { }

222

224CUTLASS_HOST_DEVICE

225static TensorNCHW packed(TensorCoord const &extent) {

226return TensorNCHW(

227make_Coord(

228 extent.w(),

229 extent.w() * extent.h(),

230 extent.h() * extent.w() * extent.c()

231 )

232 );

233 }

234

236CUTLASS_HOST_DEVICE

237LongIndex operator()(TensorCoord const &coord) const {

238return coord.w() +

239LongIndex(stride_[0] * coord.h()) +

240LongIndex(stride_[1] * coord.c()) +

241LongIndex(stride_[2] * coord.n());

242 }

243

245CUTLASS_HOST_DEVICE

246Stride stride() const {

247return stride_;

248 }

249

251CUTLASS_HOST_DEVICE

252Stride & stride() {

253return stride_;

254 }

255

257CUTLASS_HOST_DEVICE

258LongIndex capacity(TensorCoord const &extent) const {

259return extent.n() * stride_[2];

260 }

261 };

262

264

266 template <int Interleave>

267 class TensorNCxHWx {

268 public:

269

271static int const kInterleave = Interleave;

272

274static int const kRank = 4;

275

277static int const kStrideRank = 3;

278

280using Index = int32_t;

281

283using LongIndex = int64_t;

284

286using TensorCoord = Tensor4DCoord;

287

289using Stride = Coord<kStrideRank>;

290

291 private:

292//

293// Data members

294//

295

297Stride stride_;

298

299 public:

300//

301// Methods

302//

303

305CUTLASS_HOST_DEVICE

306TensorNCxHWx(Stride const &stride = Stride(0)): stride_(stride) { }

307

309CUTLASS_HOST_DEVICE

310static TensorNCxHWx packed(TensorCoord const &extent) {

311return TensorNCxHWx(

312make_Coord(

313 kInterleave * extent.w(),

314 kInterleave * extent.w() * extent.h(),

315 extent.h() * extent.w() * extent.c()

316 )

317 );

318 }

319

321CUTLASS_HOST_DEVICE

322LongIndex operator()(TensorCoord const &coord) const {

323

324Index c_minor = (coord.c() % kInterleave);

325Index c_major = (coord.c() / kInterleave);

326

327return c_minor +

328LongIndex(kInterleave * coord.w()) +

329LongIndex(stride_[0] * coord.h()) +

330LongIndex(stride_[1] * c_major) +

331LongIndex(stride_[2] * coord.n());

332 }

333

335CUTLASS_HOST_DEVICE

336Stride stride() const {

337return stride_;

338 }

339

341CUTLASS_HOST_DEVICE

342Stride & stride() {

343return stride_;

344 }

345

347CUTLASS_HOST_DEVICE

348LongIndex capacity(TensorCoord const &extent) const {

349return extent.n() * stride_[2];

350 }

351 };

352

354

356 template <int Interleave>

357 class TensorCxRSKx {

358 public:

359

361static int const kInterleave = Interleave;

362

364static int const kRank = 4;

365

367static int const kStrideRank = 3;

368

370using Index = int32_t;

371

373using LongIndex = int64_t;

374

376using TensorCoord = Tensor4DCoord;

377

379using Stride = Coord<kStrideRank>;

380

381 private:

382//

383// Data members

384//

385

387Stride stride_;

388

389 public:

390//

391// Methods

392//

393

395CUTLASS_HOST_DEVICE

396TensorCxRSKx(Stride const &stride = Stride(0)): stride_(stride) { }

397

399CUTLASS_HOST_DEVICE

400static TensorCxRSKx packed(TensorCoord const &extent) {

401return TensorCxRSKx(

402make_Coord(

403 kInterleave * extent.n(),

404 kInterleave * extent.n() * extent.w(),

405 kInterleave * extent.n() * extent.w() * extent.h()

406 )

407 );

408 }

409

411CUTLASS_HOST_DEVICE

412LongIndex operator()(TensorCoord const &coord) const {

413

414Index c_minor = (coord.c() % kInterleave);

415Index c_major = (coord.c() / kInterleave);

416

417return c_minor +

418LongIndex(kInterleave * coord.n()) +

419LongIndex(stride_[0] * coord.w()) +

420LongIndex(stride_[1] * coord.h()) +

421LongIndex(stride_[2] * c_major);

422 }

423

425CUTLASS_HOST_DEVICE

426Stride stride() const {

427return stride_;

428 }

429

431CUTLASS_HOST_DEVICE

432Stride & stride() {

433return stride_;

434 }

435

437CUTLASS_HOST_DEVICE

438LongIndex capacity(TensorCoord const &extent) const {

439return (extent.c() / kInterleave * stride_[2]);

440 }

441 };

442

444

445 } // namespace layout

446 } // namespace cutlass

cutlass::layout::TensorNHWC::Stride

Coord< kStrideRank > Stride

Stride vector.

Definition: tensor.h:71

cutlass::layout::TensorNCHW::stride

CUTLASS_HOST_DEVICE Stride stride() const

Returns the stride of the layout.

Definition: tensor.h:246

cutlass::Tensor4DCoord

Defines a canonical 4D coordinate used by tensor operations.

Definition: tensor_coord.h:38

cutlass::layout::TensorCxRSKx::TensorCxRSKx

CUTLASS_HOST_DEVICE TensorCxRSKx(Stride const &stride=Stride(0))

Constructor.

Definition: tensor.h:396

cutlass

Definition: aligned_buffer.h:35

cutlass::fast_divmod

CUTLASS_HOST_DEVICE void fast_divmod(int &quo, int &rem, int src, int div, unsigned int mul, unsigned int shr)

Definition: fast_math.h:176

cutlass::layout::TensorNCxHWx::capacity

CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const

Compute the number of contiguous elements needed to store a tensor with the given size...

Definition: tensor.h:348

cutlass::layout::TensorNCxHWx::TensorNCxHWx

CUTLASS_HOST_DEVICE TensorNCxHWx(Stride const &stride=Stride(0))

Constructor.

Definition: tensor.h:306

cutlass::layout::TensorNHWC::kStrideRank

static int const kStrideRank

Rank of stride vector.

Definition: tensor.h:59

cutlass::layout::TensorNCxHWx::packed

static CUTLASS_HOST_DEVICE TensorNCxHWx packed(TensorCoord const &extent)

Helper returns a layout to a tightly packed tensor.

Definition: tensor.h:310

coord.h

A Coord is a coordinate of arbitrary rank into a tensor or matrix.

cutlass::make_Coord

CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)

Helper to make a 1-element coordinate.

Definition: coord.h:387

cutlass::layout::TensorCxRSKx::operator()

CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const

Returns the offset of a coordinate in linear memory.

Definition: tensor.h:412

cutlass::layout::TensorNCxHWx::stride

CUTLASS_HOST_DEVICE Stride stride() const

Returns the stride of the layout.

Definition: tensor.h:336

cutlass::layout::TensorNHWC::capacity

CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const

Compute the number of contiguous elements needed to store a tensor with the given size...

Definition: tensor.h:169

cutlass::layout::TensorNHWC::Index

int32_t Index

Index type used for coordinates.

Definition: tensor.h:62

cutlass::layout::TensorNHWC::TensorCoord

Tensor4DCoord TensorCoord

Logical coordinate (n, h, w, c)

Definition: tensor.h:68

cutlass::layout::TensorNCxHWx

Mapping function for 4-D NC/xHWx tensors.

Definition: tensor.h:267

cutlass::layout::TensorNCHW::LongIndex

int64_t LongIndex

Long index type used for offsets.

Definition: tensor.h:198

cutlass::Tensor4DCoord::w

CUTLASS_HOST_DEVICE Index const & w() const

Returns the column of the coordinate.

Definition: tensor_coord.h:95

cutlass::layout::TensorNHWC::TensorNHWC

CUTLASS_HOST_DEVICE TensorNHWC(Stride const &stride=Stride(0))

Constructor.

Definition: tensor.h:88

cutlass::Coord< kStrideRank >::Index

int Index

Index type used to store elements.

Definition: coord.h:55

cutlass::layout::TensorNCHW::packed

static CUTLASS_HOST_DEVICE TensorNCHW packed(TensorCoord const &extent)

Helper returns a layout to a tightly packed tensor.

Definition: tensor.h:225

cutlass::layout::TensorNHWC::stride

CUTLASS_HOST_DEVICE Stride & stride()

Returns the stride of the layout.

Definition: tensor.h:163

cutlass::layout::TensorNCHW::TensorNCHW

CUTLASS_HOST_DEVICE TensorNCHW(Stride const &stride=Stride(0))

Constructor.

Definition: tensor.h:221

cutlass::layout::TensorNHWC::kRank

static int const kRank

Logical rank of tensor.

Definition: tensor.h:56

cutlass::layout::TensorNCxHWx::stride

CUTLASS_HOST_DEVICE Stride & stride()

Returns the stride of the layout.

Definition: tensor.h:342

cutlass::layout::TensorNCHW::capacity

CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const

Compute the number of contiguous elements needed to store a tensor with the given size...

Definition: tensor.h:258

cutlass::Tensor4DCoord::c

CUTLASS_HOST_DEVICE Index const & c() const

Returns the channel of the coordinate.

Definition: tensor_coord.h:103

cutlass::layout::TensorCxRSKx::Index

int32_t Index

Index type used for coordinates.

Definition: tensor.h:370

cutlass::layout::TensorNHWC::TensorNHWC

CUTLASS_HOST_DEVICE TensorNHWC(typename Stride::Index c, typename Stride::Index wc, typename Stride::Index hwc)

Constructor.

Definition: tensor.h:92

cutlass::layout::TensorCxRSKx::stride

CUTLASS_HOST_DEVICE Stride & stride()

Returns the stride of the layout.

Definition: tensor.h:432

tensor_coord.h

Defines a canonical coordinate for rank=4 tensors offering named indices.

cutlass::layout::TensorCxRSKx::stride

CUTLASS_HOST_DEVICE Stride stride() const

Returns the stride of the layout.

Definition: tensor.h:426

cutlass::layout::TensorCxRSKx

Mapping function for 4-D CxRSKx tensors.

Definition: tensor.h:357

cutlass::layout::TensorNHWC::LongIndex

int64_t LongIndex

Long index type used for offsets.

Definition: tensor.h:65

CUTLASS_HOST_DEVICE

#define CUTLASS_HOST_DEVICE

Definition: cutlass.h:89

cutlass::layout::TensorNHWC::packed

static CUTLASS_HOST_DEVICE TensorNHWC packed(TensorCoord const &extent)

Helper returns a layout to a tightly packed NHWC tensor.

Definition: tensor.h:96

cutlass::layout::TensorNCHW

Mapping function for 4-D NCHW tensors.

Definition: tensor.h:186

cutlass::layout::TensorNHWC::stride

CUTLASS_HOST_DEVICE Stride stride() const

Returns the stride of the layout.

Definition: tensor.h:157

cutlass::layout::TensorCxRSKx::capacity

CUTLASS_HOST_DEVICE LongIndex capacity(TensorCoord const &extent) const

Compute the number of contiguous elements needed to store a tensor with the given size...

Definition: tensor.h:438

cutlass::layout::TensorCxRSKx::packed

static CUTLASS_HOST_DEVICE TensorCxRSKx packed(TensorCoord const &extent)

Helper returns a layout to a tightly packed tensor.

Definition: tensor.h:400

cutlass::layout::TensorNCHW::Index

int32_t Index

Index type used for coordinates.

Definition: tensor.h:195

cutlass::layout::TensorNCHW::operator()

CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const

Returns the offset of a coordinate in linear memory.

Definition: tensor.h:237

cutlass::Coord< kStrideRank >

cutlass::layout::RowMajor

Mapping function for row-major matrices.

Definition: layout/matrix.h:50

cutlass::Tensor4DCoord::n

CUTLASS_HOST_DEVICE Index const & n() const

Returns the batch of the coordinate.

Definition: tensor_coord.h:79

cutlass::find_divisor

CUTLASS_HOST_DEVICE void find_divisor(unsigned int &mul, unsigned int &shr, unsigned int denom)

Definition: fast_math.h:159

cutlass::layout::TensorCxRSKx::LongIndex

int64_t LongIndex

Long index type used for offsets.

Definition: tensor.h:373

cutlass::layout::TensorNCHW::stride

CUTLASS_HOST_DEVICE Stride & stride()

Returns the stride of the layout.

Definition: tensor.h:252

matrix.h

Defines layout functions used by TensorRef and derived classes.

cutlass::layout::TensorNCxHWx::operator()

CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const

Returns the offset of a coordinate in linear memory.

Definition: tensor.h:322

fast_math.h

Math utilities.

cutlass::layout::TensorNCxHWx::LongIndex

int64_t LongIndex

Long index type used for offsets.

Definition: tensor.h:283

cutlass::Tensor4DCoord::h

CUTLASS_HOST_DEVICE Index const & h() const

Returns the row of the coordinate.

Definition: tensor_coord.h:87

cutlass::layout::TensorNHWC

Mapping function for 4-D NHWC tensors.

Definition: tensor.h:53

cutlass::layout::TensorNCxHWx::Index

int32_t Index

Index type used for coordinates.

Definition: tensor.h:280

cutlass::layout::TensorNHWC::inverse

CUTLASS_HOST_DEVICE TensorCoord inverse(LongIndex index) const

Returns the logical coordinate (n, h, w, c) from a given offset in linear memory. ...

Definition: tensor.h:123

cutlass.h

Basic include for CUTLASS.

cutlass::layout::TensorNHWC::operator()

CUTLASS_HOST_DEVICE LongIndex operator()(TensorCoord const &coord) const

Returns the offset of a coordinate (n, h, w, c) in linear memory.

Definition: tensor.h:108


Generated by 1.8.11