CUTLASS: half.h Source File - Cutlass

| | CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers |

half.h

Go to the documentation of this file.

1 /***************************************************************************************************

3 *

4 * Redistribution and use in source and binary forms, with or without modification, are permitted

5 * provided that the following conditions are met:

6 * * Redistributions of source code must retain the above copyright notice, this list of

7 * conditions and the following disclaimer.

8 * * Redistributions in binary form must reproduce the above copyright notice, this list of

9 * conditions and the following disclaimer in the documentation and/or other materials

10 * provided with the distribution.

11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used

12 * to endorse or promote products derived from this software without specific prior written

13 * permission.

14 *

15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR

16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND

17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE

18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;

20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,

21 * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23 *

24 **************************************************************************************************/

30 #pragma once

32 #ifndef CUTLASS_ENABLE_F16C

33 #define CUTLASS_ENABLE_F16C 0

34 #endif

36 #if defined(__CUDACC_RTC__)

37 /* All floating-point numbers can be put in one of these categories. */

38 enum

39 {

40 FP_NAN =

41 # define FP_NAN 0

42 FP_NAN,

43 FP_INFINITE =

44 # define FP_INFINITE 1

45 FP_INFINITE,

46 FP_ZERO =

47 # define FP_ZERO 2

48 FP_ZERO,

49 FP_SUBNORMAL =

50 # define FP_SUBNORMAL 3

51 FP_SUBNORMAL,

52 FP_NORMAL =

53 # define FP_NORMAL 4

54 FP_NORMAL

55 };

57 // F16C extensions are not meaningful when compiling for NVRTC which only accommodates device code.

58 #undef CUTLASS_ENABLE_F16C

59 #define CUTLASS_ENABLE_F16C 0

61 #else

62 #include <cmath>

63 #include <limits>

64 #include <cstdint>

65 #endif

69 #include <cuda_fp16.h>

71 #include "cutlass/cutlass.h"

75 // Optionally target F16C extentions to accelerate half-precision conversion.

76 #if !defined(__CUDA_ARCH__) && (CUTLASS_ENABLE_F16C)

77 #if defined(_MSC_VER)

79 #include <immintrin.h>

81 #define F16C_ROUND_NEAREST 0

83 #if !defined(__CUDA_ARCH__)

84 extern __inline float _cvtsh_ss (unsigned short __S) {

85 __m128i packed;

86 std::memcpy(&packed, &__S, sizeof(__S));

88 __m128 result = _mm_cvtph_ps(packed);

90float flt;

91 std::memcpy(&flt, &result, sizeof(flt));

93return flt;

94 }

96 __inline unsigned short _cvtss_sh (float __F, const int) {

97 __m128 packed;

98 std::memcpy(&packed, &__F, sizeof(__F));

100 __m128i result = _mm_cvtps_ph(packed, F16C_ROUND_NEAREST);

101

102unsigned short u;

103 std::memcpy(&u, &result, sizeof(u));

104

105return u;

106 }

107 #endif

108

109 #else

110

111 // Linux

112 #include <x86intrin.h>

113 #define F16C_ROUND_NEAREST (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)

114

115 #endif

116 #endif // !defined(__CUDA_ARCH__) && CUTLASS_ENABLE_F16C

117

119

120

121 namespace cutlass {

122

124

126 struct alignas(2) half_t {

127

128//

129// Data members

130//

131

133 uint16_t storage;

134

135//

136// Static conversion operators

137//

138

140CUTLASS_HOST_DEVICE

141static half_t bitcast(uint16_t x) {

142half_t h;

143 h.storage = x;

144return h;

145 }

146

148 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)

149// Avoid inlining in device code if no hardware support

150 __device__ __noinline__

151 #else

152CUTLASS_HOST_DEVICE

153 #endif

154static half_t convert(float const& flt) {

155 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

156return half_t(__float2half_rn(flt));

157 #elif !defined(__CUDA_ARCH__) && CUTLASS_ENABLE_F16C

158unsigned short u = _cvtss_sh(flt, F16C_ROUND_NEAREST);

159return bitcast(u);

160 #else

161// software implementation rounds toward nearest even

162unsigned const& s = reinterpret_cast<unsigned const &>(flt);

163 uint16_t sign = uint16_t((s >> 16) & 0x8000);

164 int16_t exp = uint16_t(((s >> 23) & 0xff) - 127);

165int mantissa = s & 0x7fffff;

166 uint16_t u = 0;

167

168if ((s & 0x7fffffff) == 0) {

169// sign-preserving zero

170return bitcast(sign);

171 }

172

173if (exp > 15) {

174if (exp == 128 && mantissa) {

175// not a number

176 u = 0x7fff;

177 } else {

178// overflow to infinity

179 u = sign | 0x7c00;

180 }

181return bitcast(u);

182 }

183

184int sticky_bit = 0;

185

186if (exp >= -14) {

187// normal fp32 to normal fp16

188 exp = uint16_t(exp + uint16_t(15));

189 u = uint16_t(((exp & 0x1f) << 10));

190 u = uint16_t(u | (mantissa >> 13));

191 } else {

192// normal single-precision to subnormal half_t-precision representation

193int rshift = (-14 - exp);

194if (rshift < 32) {

195 mantissa |= (1 << 23);

196

197 sticky_bit = ((mantissa & ((1 << rshift) - 1)) != 0);

198

199 mantissa = (mantissa >> rshift);

200 u = (uint16_t(mantissa >> 13) & 0x3ff);

201 } else {

202 mantissa = 0;

203 u = 0;

204 }

205 }

206

207// round to nearest even

208int round_bit = ((mantissa >> 12) & 1);

209 sticky_bit |= ((mantissa & ((1 << 12) - 1)) != 0);

210

211if ((round_bit && sticky_bit) || (round_bit && (u & 1))) {

212 u = uint16_t(u + 1);

213 }

214

215 u |= sign;

216

217return bitcast(u);

218 #endif

219 }

220

222CUTLASS_HOST_DEVICE

223static half_t convert(int const& n) {

224 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

225return half_t(__int2half_rn(n));

226 #else

227return convert(float(n));

228 #endif

229 }

230

232CUTLASS_HOST_DEVICE

233static half_t convert(unsigned const& n) {

234 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

235return half_t(__uint2half_rn(n));

236 #else

237return convert(float(n));

238 #endif

239 }

240

242 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)

243// Avoid inlining in device code if no hardware support

244 __device__ __noinline__

245 #else

246CUTLASS_HOST_DEVICE

247 #endif

248static float convert(half_t const& x) {

249 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

250return __half2float(x.to_half());

251 #elif !defined(__CUDA_ARCH__) && CUTLASS_ENABLE_F16C

252unsigned short u = x.storage;

253return _cvtsh_ss(u);

254 #else

255 uint16_t const &h = x.storage;

256int sign = ((h >> 15) & 1);

257int exp = ((h >> 10) & 0x1f);

258int mantissa = (h & 0x3ff);

259unsigned f = 0;

260

261if (exp > 0 && exp < 31) {

262// normal

263 exp += 112;

264 f = (sign << 31) | (exp << 23) | (mantissa << 13);

265 } else if (exp == 0) {

266if (mantissa) {

267// subnormal

268 exp += 113;

269while ((mantissa & (1 << 10)) == 0) {

270 mantissa <<= 1;

271 exp--;

272 }

273 mantissa &= 0x3ff;

274 f = (sign << 31) | (exp << 23) | (mantissa << 13);

275 } else {

276// sign-preserving zero

277 f = (sign << 31);

278 }

279 } else if (exp == 31) {

280if (mantissa) {

281 f = 0x7fffffff; // not a number

282 } else {

283 f = (0xff << 23) | (sign << 31); // inf

284 }

285 }

286return reinterpret_cast<float const&>(f);

287 #endif

288 }

289

290//

291// Methods

292//

293

295CUTLASS_HOST_DEVICE

296 half_t() { }

297

299CUTLASS_HOST_DEVICE

300explicit half_t(half const & x): storage(reinterpret_cast<uint16_t const &>(x)) {

301

302 }

303

305CUTLASS_HOST_DEVICE

306explicit half_t(float x) {

307 storage = convert(x).storage;

308 }

309

311CUTLASS_HOST_DEVICE

312explicit half_t(double x): half_t(float(x)) {

313

314 }

315

317CUTLASS_HOST_DEVICE

318explicit half_t(int x) {

319 storage = convert(x).storage;

320 }

321

323CUTLASS_HOST_DEVICE

324explicit half_t(unsigned x) {

325 storage = convert(x).storage;

326 }

327

329CUTLASS_HOST_DEVICE

330 half_t & operator=(half const &x) {

331 storage = reinterpret_cast<uint16_t const &>(x);

332return *this;

333 }

334

336CUTLASS_HOST_DEVICE

337operator float() const {

338return convert(*this);

339 }

340

342CUTLASS_HOST_DEVICE

343operator double() const {

344return double(convert(*this));

345 }

346

348CUTLASS_HOST_DEVICE

349explicit operator int() const {

350return int(convert(*this));

351 }

352

354CUTLASS_HOST_DEVICE

355operator bool() const {

356return (convert(*this) != 0.0f);

357 }

358

360CUTLASS_HOST_DEVICE

361 half to_half() const {

362return reinterpret_cast<half const &>(storage);

363 }

364

366CUTLASS_HOST_DEVICE

367 uint16_t& raw() {

368return storage;

369 }

370

372CUTLASS_HOST_DEVICE

373 uint16_t raw() const {

374return storage;

375 }

376

378CUTLASS_HOST_DEVICE

379bool signbit() const {

380return ((storage & 0x8000) != 0);

381 }

382

384CUTLASS_HOST_DEVICE

385int exponent_biased() const {

386return int((storage >> 10) & 0x1f);

387 }

388

390CUTLASS_HOST_DEVICE

391int exponent() const {

392return exponent_biased() - 15;

393 }

394

396CUTLASS_HOST_DEVICE

397int mantissa() const {

398return int(storage & 0x3ff);

399 }

400 };

401

403

404 CUTLASS_HOST_DEVICE

405 bool signbit(cutlass::half_t const& h) {

406return ((h.raw() & 0x8000) != 0);

407 }

408

409 CUTLASS_HOST_DEVICE

410 cutlass::half_t abs(cutlass::half_t const& h) {

411return cutlass::half_t::bitcast(h.raw() & 0x7fff);

412 }

413

414 CUTLASS_HOST_DEVICE

415 bool isnan(cutlass::half_t const& h) {

416return (h.exponent_biased() == 0x1f) && h.mantissa();

417 }

418

419 CUTLASS_HOST_DEVICE

420 bool isfinite(cutlass::half_t const& h) {

421return (h.exponent_biased() != 0x1f);

422 }

423

424 CUTLASS_HOST_DEVICE

425 cutlass::half_t nanh(const char*) {

426// NVIDIA canonical NaN

427return cutlass::half_t::bitcast(0x7fff);

428 }

429

430 CUTLASS_HOST_DEVICE

431 bool isinf(cutlass::half_t const& h) {

432return (h.exponent_biased() == 0x1f) && !h.mantissa();

433 }

434

435 CUTLASS_HOST_DEVICE

436 bool isnormal(cutlass::half_t const& h) {

437return h.exponent_biased() && h.exponent_biased() != 0x1f;

438 }

439

440 CUTLASS_HOST_DEVICE

441 int fpclassify(cutlass::half_t const& h) {

442int exp = h.exponent_biased();

443int mantissa = h.mantissa();

444if (exp == 0x1f) {

445if (mantissa) {

446return FP_NAN;

447 }

448else {

449return FP_INFINITE;

450 }

451 }

452else if (!exp) {

453if (mantissa) {

454return FP_SUBNORMAL;

455 }

456else {

457return FP_ZERO;

458 }

459 }

460return FP_NORMAL;

461 }

462

463 CUTLASS_HOST_DEVICE

464 cutlass::half_t sqrt(cutlass::half_t const& h) {

465 #if defined(__CUDACC_RTC__)

466return cutlass::half_t(sqrtf(float(h)));

467 #else

468return cutlass::half_t(std::sqrt(float(h)));

469 #endif

470 }

471

472 CUTLASS_HOST_DEVICE

473 half_t copysign(half_t const& a, half_t const& b) {

474

475 uint16_t a_mag = (reinterpret_cast<uint16_t const &>(a) & 0x7fff);

476 uint16_t b_sign = (reinterpret_cast<uint16_t const &>(b) & 0x8000);

477 uint16_t result = (a_mag | b_sign);

478

479return reinterpret_cast<cutlass::half_t const &>(result);

480 }

481

483

484 } // namespace cutlass

485

487 //

488 // Standard Library operations and definitions

489 //

491

492 namespace std {

493

494 #if !defined(__CUDACC_RTC__)

495 template <>

[497](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html) struct numeric_limits<cutlass::half_t> {

[498](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ade4affb586360c5356a7939c1b343a40)static bool const is_specialized = true;

[499](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ab3a169117baca2e7fae33846caa5dbfd)static bool const is_signed = true;

[500](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a2a2ec168a6f0e9f55dc42d3b3e5fff25)static bool const is_integer = false;

[501](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a7413e9cd24eb03a86cc5f2d47c49db3e)static bool const is_exact = false;

[502](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a6f7f2fbe6cd7a04803b90b8fa9172098)static bool const has_infinity = true;

[503](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a3d75832e46bc154758e35a03a624ccf8)static bool const has_quiet_NaN = true;

[504](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a68a0d0f6ecc2f3b84f2e71475b2c48bd)static bool const has_signaling_NaN = false;

[505](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#aaf46b5d03403828c1e6633fb714ffd84)static std::float_denorm_style const has_denorm = std::denorm_present;

[506](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a8b371f82151fd0238b7da083fa2b87a9)static bool const has_denorm_loss = true;

[507](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ab0af85c1d7c83ca03ed0c083fe22262f)static std::float_round_style const round_style = std::round_to_nearest;

[508](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#afc86b85a1fe209658a50d8c06e54cb77)static bool const is_iec559 = true;

[509](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a59737f49161b87c259683c26737f42c2)static bool const is_bounded = true;

[510](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a640a034527a3577039053113bc1c5e46)static bool const is_modulo = false;

[511](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a92152311525685a53c6a0db4cb74f193)static int const digits = 10;

512

[514](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ad9175b4d7b32fe18cf9c07e4f559b32c)static cutlass::half_t min() { return cutlass::half_t::bitcast(0x0001); }

515

[517](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ac2ae43139037875e38056a675ae1f6c4)static cutlass::half_t lowest() { return cutlass::half_t::bitcast(0xfbff); }

518

[520](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a39a5774583daedbb5ac4aaaaa8034883)static cutlass::half_t max() { return cutlass::half_t::bitcast(0x7bff); }

521

[523](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ab9fc3a009eaff0c922307f2780ee3fc0)static cutlass::half_t epsilon() { return cutlass::half_t::bitcast(0x1800); }

524

[526](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ab0a036db7a1ad11c65e876020c78b1a5)static cutlass::half_t round_error() { return cutlass::half_t(0.5f); }

527

[529](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ab7a40820e64282376a050095d5004b74)static cutlass::half_t infinity() { return cutlass::half_t::bitcast(0x7c00); }

530

[532](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a8c7eafdd3b121353c0914dc6e1c0d108)static cutlass::half_t quiet_NaN() { return cutlass::half_t::bitcast(0x7fff); }

533

[535](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a423fb5b95e6071e832d40918e597f63f)static cutlass::half_t signaling_NaN() { return cutlass::half_t::bitcast(0x7fff); }

536

[538](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a2c05c19022c183e8734ada65c8970af5)static cutlass::half_t denorm_min() { return cutlass::half_t::bitcast(0x0001); }

539 };

540 #endif

541

542 } // namespace std

543

545 //

546 // Arithmetic operators

547 //

549

550 namespace cutlass {

551

553

554 CUTLASS_HOST_DEVICE

555 bool operator==(half_t const& lhs, half_t const& rhs) {

556 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

557return __heq(lhs.to_half(), rhs.to_half());

558 #else

559return float(lhs) == float(rhs);

560 #endif

561 }

562

563 CUTLASS_HOST_DEVICE

564 bool operator!=(half_t const& lhs, half_t const& rhs) {

565 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

566return __hne(lhs.to_half(), rhs.to_half());

567 #else

568return float(lhs) != float(rhs);

569 #endif

570 }

571

572 CUTLASS_HOST_DEVICE

573 bool operator<(half_t const& lhs, half_t const& rhs) {

574 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

575return __hlt(lhs.to_half(), rhs.to_half());

576 #else

577return float(lhs) < float(rhs);

578 #endif

579 }

580

581 CUTLASS_HOST_DEVICE

582 bool operator<=(half_t const& lhs, half_t const& rhs) {

583 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

584return __hle(lhs.to_half(), rhs.to_half());

585 #else

586return float(lhs) <= float(rhs);

587 #endif

588 }

589

590 CUTLASS_HOST_DEVICE

591 bool operator>(half_t const& lhs, half_t const& rhs) {

592 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

593return __hgt(lhs.to_half(), rhs.to_half());

594 #else

595return float(lhs) > float(rhs);

596 #endif

597 }

598

599 CUTLASS_HOST_DEVICE

600 bool operator>=(half_t const& lhs, half_t const& rhs) {

601 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

602return __hge(lhs.to_half(), rhs.to_half());

603 #else

604return float(lhs) >= float(rhs);

605 #endif

606 }

607

608 CUTLASS_HOST_DEVICE

609 half_t operator+(half_t const& lhs, half_t const& rhs) {

610 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

611return half_t(__hadd(lhs.to_half(), rhs.to_half()));

612 #else

613return half_t(float(lhs) + float(rhs));

614 #endif

615 }

616

617 CUTLASS_HOST_DEVICE

618 half_t operator-(half_t const& lhs) {

619 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

620return half_t(__hneg(lhs.to_half()));

621 #else

622return half_t(-float(lhs));

623 #endif

624 }

625

626 CUTLASS_HOST_DEVICE

627 half_t operator-(half_t const& lhs, half_t const& rhs) {

628 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

629return half_t(__hsub(lhs.to_half(), rhs.to_half()));

630 #else

631return half_t(float(lhs) - float(rhs));

632 #endif

633 }

634

635 CUTLASS_HOST_DEVICE

636 half_t operator*(half_t const& lhs, half_t const& rhs) {

637 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

638return half_t(__hmul(lhs.to_half(), rhs.to_half()));

639 #else

640return half_t(float(lhs) * float(rhs));

641 #endif

642 }

643

644 CUTLASS_HOST_DEVICE

645 half_t operator/(half_t const& lhs, half_t const& rhs) {

646 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

647return half_t(__hdiv(lhs.to_half(), rhs.to_half()));

648 #else

649return half_t(float(lhs) / float(rhs));

650 #endif

651 }

652

653 CUTLASS_HOST_DEVICE

654 half_t& operator+=(half_t & lhs, half_t const& rhs) {

655 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

656 lhs = half_t(__hadd(lhs.to_half(), rhs.to_half()));

657 #else

658 lhs = half_t(float(lhs) + float(rhs));

659 #endif

660return lhs;

661 }

662

663 CUTLASS_HOST_DEVICE

664 half_t& operator-=(half_t & lhs, half_t const& rhs) {

665 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

666 lhs = half_t(__hsub(lhs.to_half(), rhs.to_half()));

667 #else

668 lhs = half_t(float(lhs) - float(rhs));

669 #endif

670return lhs;

671 }

672

673 CUTLASS_HOST_DEVICE

674 half_t& operator*=(half_t & lhs, half_t const& rhs) {

675 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

676 lhs = half_t(__hmul(lhs.to_half(), rhs.to_half()));

677 #else

678 lhs = half_t(float(lhs) * float(rhs));

679 #endif

680return lhs;

681 }

682

683 CUTLASS_HOST_DEVICE

684 half_t& operator/=(half_t & lhs, half_t const& rhs) {

685 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

686 lhs = half_t(__hdiv(lhs.to_half(), rhs.to_half()));

687 #else

688 lhs = half_t(float(lhs) / float(rhs));

689 #endif

690return lhs;

691 }

692

693 CUTLASS_HOST_DEVICE

694 half_t& operator++(half_t & lhs) {

695 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

696 lhs = half_t(__hadd(lhs.to_half(), half_t(1.0f).to_half()));

697 #else

698float tmp(lhs);

699 ++tmp;

700 lhs = half_t(tmp);

701 #endif

702return lhs;

703 }

704

705 CUTLASS_HOST_DEVICE

706 half_t& operator--(half_t & lhs) {

707 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

708 lhs = half_t(__hsub(lhs.to_half(), half_t(1.0f).to_half()));

709 #else

710float tmp(lhs);

711 --tmp;

712 lhs = half_t(tmp);

713 #endif

714return lhs;

715 }

716

717 CUTLASS_HOST_DEVICE

718 half_t operator++(half_t & lhs, int) {

719half_t ret(lhs);

720 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

721 lhs = half_t(__hadd(lhs.to_half(), half_t(1.0f).to_half()));

722 #else

723float tmp(lhs);

724 tmp++;

725 lhs = half_t(tmp);

726 #endif

727return ret;

728 }

729

730 CUTLASS_HOST_DEVICE

731 half_t operator--(half_t & lhs, int) {

732half_t ret(lhs);

733 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)

734 lhs = half_t(__hsub(lhs.to_half(), half_t(1.0f).to_half()));

735 #else

736float tmp(lhs);

737 tmp--;

738 lhs = half_t(tmp);

739 #endif

740return ret;

741 }

742

744

745 } // namespace cutlass

746

748

749 //

750 // User-defined literals

751 //

752

753 CUTLASS_HOST_DEVICE

754 cutlass::half_t operator "" _hf(long double x) {

755return cutlass::half_t(float(x));

756 }

757

758 CUTLASS_HOST_DEVICE

759 cutlass::half_t operator "" _hf(unsigned long long int x) {

760return cutlass::half_t(int(x));

761 }

762

[std::numeric_limits< cutlass::half_t >::max](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a39a5774583daedbb5ac4aaaaa8034883)

static cutlass::half_t max()

Maximum finite value.

Definition: half.h:520

cutlass::half_t::bitcast

static CUTLASS_HOST_DEVICE half_t bitcast(uint16_t x)

Constructs from an unsigned short.

Definition: half.h:141

cutlass

Definition: aligned_buffer.h:35

cutlass::half_t::half_t

CUTLASS_HOST_DEVICE half_t(int x)

Integer conversion - round to nearest even.

Definition: half.h:318

cutlass::abs

CUTLASS_HOST_DEVICE T abs(complex< T > const &z)

Returns the magnitude of the complex number.

Definition: complex.h:313

[std::numeric_limits< cutlass::half_t >::signaling_NaN](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a423fb5b95e6071e832d40918e597f63f)

static cutlass::half_t signaling_NaN()

Returns smallest finite value.

Definition: half.h:535

[std::numeric_limits< cutlass::half_t >::infinity](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ab7a40820e64282376a050095d5004b74)

static cutlass::half_t infinity()

Returns smallest finite value.

Definition: half.h:529

cutlass::half_t::half_t

CUTLASS_HOST_DEVICE half_t()

Default constructor.

Definition: half.h:296

cutlass::half_t::convert

static CUTLASS_HOST_DEVICE half_t convert(float const &flt)

FP32 -> FP16 conversion - rounds to nearest even.

Definition: half.h:154

cutlass::operator/=

CUTLASS_HOST_DEVICE half_t & operator/=(half_t &lhs, half_t const &rhs)

Definition: half.h:684

cutlass::half_t::storage

uint16_t storage

Storage type.

Definition: half.h:133

cutlass::half_t

IEEE half-precision floating-point type.

Definition: half.h:126

cutlass::half_t::convert

static CUTLASS_HOST_DEVICE half_t convert(int const &n)

FP32 -> FP16 conversion - rounds to nearest even.

Definition: half.h:223

cutlass::isnormal

CUTLASS_HOST_DEVICE bool isnormal(cutlass::half_t const &h)

Definition: half.h:436

std

STL namespace.

cutlass::operator<=

CUTLASS_HOST_DEVICE bool operator<=(half_t const &lhs, half_t const &rhs)

Definition: half.h:582

[std::numeric_limits< cutlass::half_t >::denorm_min](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a2c05c19022c183e8734ada65c8970af5)

static cutlass::half_t denorm_min()

Returns smallest finite value.

Definition: half.h:538

cutlass::exp

CUTLASS_HOST_DEVICE complex< T > exp(complex< T > const &z)

Computes the complex exponential of z.

Definition: complex.h:375

cutlass::half_t::convert

static CUTLASS_HOST_DEVICE float convert(half_t const &x)

Converts a half-precision value stored as a uint16_t to a float.

Definition: half.h:248

cutlass::operator+=

CUTLASS_HOST_DEVICE half_t & operator+=(half_t &lhs, half_t const &rhs)

Definition: half.h:654

cutlass::half_t::half_t

CUTLASS_HOST_DEVICE half_t(unsigned x)

Integer conversion - round toward zero.

Definition: half.h:324

cutlass::operator+

CUTLASS_HOST_DEVICE half_t operator+(half_t const &lhs, half_t const &rhs)

Definition: half.h:609

cutlass::operator-=

CUTLASS_HOST_DEVICE half_t & operator-=(half_t &lhs, half_t const &rhs)

Definition: half.h:664

[std::numeric_limits< cutlass::half_t >::round_error](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ab0a036db7a1ad11c65e876020c78b1a5)

static cutlass::half_t round_error()

Returns smallest finite value.

Definition: half.h:526

cutlass::operator++

CUTLASS_HOST_DEVICE half_t & operator++(half_t &lhs)

Definition: half.h:694

cutlass::half_t::signbit

CUTLASS_HOST_DEVICE bool signbit() const

Returns the sign bit.

Definition: half.h:379

cutlass::sqrt

CUTLASS_HOST_DEVICE cutlass::half_t sqrt(cutlass::half_t const &h)

Definition: half.h:464

cutlass::fpclassify

CUTLASS_HOST_DEVICE int fpclassify(cutlass::half_t const &h)

Definition: half.h:441

cutlass::operator!=

CUTLASS_HOST_DEVICE bool operator!=(half_t const &lhs, half_t const &rhs)

Definition: half.h:564

cutlass::half_t::to_half

CUTLASS_HOST_DEVICE half to_half() const

Bitcasts to CUDA's half type.

Definition: half.h:361

cutlass::half_t::operator=

CUTLASS_HOST_DEVICE half_t & operator=(half const &x)

Assignment.

Definition: half.h:330

cutlass::half_t::half_t

CUTLASS_HOST_DEVICE half_t(half const &x)

Reinterpret cast from CUDA's half type.

Definition: half.h:300

cutlass::half_t::raw

CUTLASS_HOST_DEVICE uint16_t raw() const

Accesses raw internal state.

Definition: half.h:373

cutlass::isinf

CUTLASS_HOST_DEVICE bool isinf(cutlass::half_t const &h)

Definition: half.h:431

cutlass::copysign

CUTLASS_HOST_DEVICE half_t copysign(half_t const &a, half_t const &b)

Definition: half.h:473

cutlass::operator--

CUTLASS_HOST_DEVICE half_t & operator--(half_t &lhs)

Definition: half.h:706

CUTLASS_HOST_DEVICE

#define CUTLASS_HOST_DEVICE

Definition: cutlass.h:89

cutlass::nanh

CUTLASS_HOST_DEVICE cutlass::half_t nanh(const char *)

Definition: half.h:425

cutlass::half_t::half_t

CUTLASS_HOST_DEVICE half_t(float x)

Floating point conversion.

Definition: half.h:306

cutlass::operator>

CUTLASS_HOST_DEVICE bool operator>(half_t const &lhs, half_t const &rhs)

Definition: half.h:591

cutlass::operator-

CUTLASS_HOST_DEVICE half_t operator-(half_t const &lhs)

Definition: half.h:618

cutlass::isfinite

CUTLASS_HOST_DEVICE bool isfinite(cutlass::half_t const &h)

Definition: half.h:420

cutlass::operator*=

CUTLASS_HOST_DEVICE half_t & operator*=(half_t &lhs, half_t const &rhs)

Definition: half.h:674

[std::numeric_limits< cutlass::half_t >::min](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ad9175b4d7b32fe18cf9c07e4f559b32c)

static cutlass::half_t min()

Least positive value.

Definition: half.h:514

cutlass::FloatRoundStyle::round_to_nearest

round to nearest even

cutlass::half_t::mantissa

CUTLASS_HOST_DEVICE int mantissa() const

Returns the mantissa.

Definition: half.h:397

[std::numeric_limits< cutlass::half_t >::lowest](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ac2ae43139037875e38056a675ae1f6c4)

static cutlass::half_t lowest()

Minimum finite value.

Definition: half.h:517

[std::numeric_limits< cutlass::half_t >::epsilon](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#ab9fc3a009eaff0c922307f2780ee3fc0)

static cutlass::half_t epsilon()

Returns smallest finite value.

Definition: half.h:523

cutlass::operator==

CUTLASS_HOST_DEVICE bool operator==(half_t const &lhs, half_t const &rhs)

Definition: half.h:555

cutlass::operator/

CUTLASS_HOST_DEVICE Coord< Rank, Index > operator/(Index s, Coord< Rank, Index > coord)

Scalar division.

Definition: coord.h:360

cutlass::operator>=

CUTLASS_HOST_DEVICE bool operator>=(half_t const &lhs, half_t const &rhs)

Definition: half.h:600

cutlass::half_t::half_t

CUTLASS_HOST_DEVICE half_t(double x)

Floating point conversion.

Definition: half.h:312

cutlass::half_t::convert

static CUTLASS_HOST_DEVICE half_t convert(unsigned const &n)

FP32 -> FP16 conversion - rounds to nearest even.

Definition: half.h:233

cutlass::operator<

CUTLASS_HOST_DEVICE bool operator<(half_t const &lhs, half_t const &rhs)

Definition: half.h:573

cutlass.h

Basic include for CUTLASS.

cutlass::sqrt

CUTLASS_HOST_DEVICE complex< T > sqrt(complex< T > const &z)

Computes the square root of complex number z.

Definition: complex.h:393

cutlass::half_t::raw

CUTLASS_HOST_DEVICE uint16_t & raw()

Accesses raw internal state.

Definition: half.h:367

[std::numeric_limits< cutlass::half_t >::quiet_NaN](structstd_1_1numeric limits_3_01cutlass_1_1half t_01_4.html#a8c7eafdd3b121353c0914dc6e1c0d108)

static cutlass::half_t quiet_NaN()

Returns smallest finite value.

Definition: half.h:532

cutlass::half_t::exponent

CUTLASS_HOST_DEVICE int exponent() const

Returns the unbiased exponent.

Definition: half.h:391

cutlass::half_t::exponent_biased

CUTLASS_HOST_DEVICE int exponent_biased() const

Returns the biased exponent.

Definition: half.h:385

cutlass::operator*