docs/device_2tensor__foreach_8h_source.html
| | CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers |
device/tensor_foreach.h
Go to the documentation of this file.
1 /***************************************************************************************************
2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted
5 * provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright notice, this list of
7 * conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright notice, this list of
9 * conditions and the following disclaimer in the documentation and/or other materials
10 * provided with the distribution.
11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12 * to endorse or promote products derived from this software without specific prior written
13 * permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 21 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 *
24 **************************************************************************************************/
25 #pragma once
26
27 #include <stdexcept>
28 #include "cutlass/cutlass.h"
29 #include "cutlass/util/reference/device/kernel/tensor_foreach.h"
30
31 namespace cutlass {
32 namespace reference {
33 namespace device {
34
36
/// Launches a kernel invoking a functor for each element in a tensor's index space.
template <typename Func, int Rank, typename Params>
struct TensorForEach {

  /// Constructor performs the operation.
  ///
  /// If either grid_size or block_size is zero, the launch configuration is chosen
  /// with the CUDA Occupancy API, and the block size is capped at 128 threads so each
  /// thread processes more items and initialization overhead is amortized.
  ///
  /// Throws std::runtime_error if the occupancy query fails or the kernel launch is
  /// rejected (e.g. invalid configuration).
  TensorForEach(Coord<Rank> size, Params params = Params(), int grid_size = 0, int block_size = 0) {

    if (!grid_size || !block_size) {

      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
        &grid_size,
        &block_size,
        reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));

      if (result != cudaSuccess) {
        throw std::runtime_error("Failed to query occupancy.");
      }

      // Limit block size. This has the effect of increasing the number of items processed by a
      // single thread and reduces the impact of initialization overhead.
      block_size = (block_size < 128 ? block_size : 128);
    }

    dim3 grid(grid_size, 1, 1);
    dim3 block(block_size, 1, 1);

    kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);

    // Kernel launches do not return errors directly; surface launch-configuration
    // errors here rather than letting them appear at an unrelated later call.
    cudaError_t launch_result = cudaGetLastError();
    if (launch_result != cudaSuccess) {
      throw std::runtime_error("Failed to launch TensorForEach kernel.");
    }
  }
};
67
69
/// Launches a kernel invoking a functor for each element along a tensor's diagonal.
template <typename Func, int Rank, typename Params>
struct TensorDiagonalForEach {

  /// Constructor performs the operation over diagonal indices [start, end).
  ///
  /// A negative 'end' (the default) is replaced by the length of the shortest
  /// extent, size.min(), so the whole diagonal is covered. The launch uses a 1-D
  /// grid of ceil((end - start) / block_size) blocks.
  ///
  /// Throws std::runtime_error if the kernel launch is rejected.
  TensorDiagonalForEach(Coord<Rank> size, Params params = Params(), int start = 0, int end = -1, int block_size = 128) {

    if (end < 0) {
      end = size.min();
    }

    // An empty range would produce a zero-sized grid, which is an invalid launch
    // configuration. There is no work to do, so return without launching.
    if (end <= start) {
      return;
    }

    dim3 block(block_size, 1, 1);
    dim3 grid((end - start + block_size - 1) / block_size, 1, 1);

    kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block >>>(size, params, start, end);

    // Kernel launches do not return errors directly; surface launch-configuration
    // errors here rather than letting them appear at an unrelated later call.
    cudaError_t launch_result = cudaGetLastError();
    if (launch_result != cudaSuccess) {
      throw std::runtime_error("Failed to launch TensorDiagonalForEach kernel.");
    }
  }
};
87
88
90
/// Launches a kernel invoking a functor for each element of a linear block of memory.
template <typename Element, typename Func>
struct BlockForEach {

  /// Constructor performs the operation on 'capacity' elements starting at 'ptr'.
  ///
  /// If either grid_size or block_size is zero, the launch configuration is chosen
  /// with the CUDA Occupancy API, and the block size is capped at 128 threads so each
  /// thread processes more items and initialization overhead is amortized.
  ///
  /// 'ptr' must be a device pointer valid for 'capacity' elements.
  ///
  /// Throws std::runtime_error if the occupancy query fails or the kernel launch is
  /// rejected (e.g. invalid configuration).
  BlockForEach(
    Element *ptr,
    size_t capacity,
    typename Func::Params params = typename Func::Params(),
    int grid_size = 0,
    int block_size = 0) {

    if (!grid_size || !block_size) {

      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
        &grid_size,
        &block_size,
        reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));

      if (result != cudaSuccess) {
        throw std::runtime_error("Failed to query occupancy.");
      }

      // Limit block size. This has the effect of increasing the number of items processed by a
      // single thread and reduces the impact of initialization overhead.
      block_size = (block_size < 128 ? block_size : 128);
    }

    dim3 grid(grid_size, 1, 1);
    dim3 block(block_size, 1, 1);

    kernel::BlockForEach<Element, Func><<< grid, block >>>(ptr, capacity, params);

    // Kernel launches do not return errors directly; surface launch-configuration
    // errors here rather than letting them appear at an unrelated later call.
    cudaError_t launch_result = cudaGetLastError();
    if (launch_result != cudaSuccess) {
      throw std::runtime_error("Failed to launch BlockForEach kernel.");
    }
  }
};
125
127
128 } // namespace device
129 } // namespace reference
130 } // namespace cutlass
Definition: aligned_buffer.h:35
cutlass::reference::device::TensorDiagonalForEach::TensorDiagonalForEach
TensorDiagonalForEach(Coord< Rank > size, Params params=Params(), int start=0, int end=-1, int block_size=128)
Constructor performs the operation.
Definition: device/tensor_foreach.h:75
cutlass::reference::device::TensorForEach::TensorForEach
TensorForEach(Coord< Rank > size, Params params=Params(), int grid_size=0, int block_size=0)
Constructor performs the operation.
Definition: device/tensor_foreach.h:42
cutlass::reference::device::TensorDiagonalForEach
Launches a kernel calling a functor for each element along a tensor's diagonal.
Definition: device/tensor_foreach.h:72
cutlass::reference::device::BlockForEach::BlockForEach
BlockForEach(Element *ptr, size_t capacity, typename Func::Params params=typename Func::Params(), int grid_size=0, int block_size=0)
Constructor performs the operation.
Definition: device/tensor_foreach.h:95
cutlass::reference::device::TensorForEach
Launches a kernel calling a functor for each element in a tensor's index space.
Definition: device/tensor_foreach.h:39
Statically-sized array specifying Coords within a tensor.
Definition: coord.h:43
cutlass::reference::device::BlockForEach
Definition: device/tensor_foreach.h:92
Basic include for CUTLASS.
Generated by 1.8.11