docs/device_2tensor__foreach_8h_source.html
| | CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers |
device/tensor_foreach.h
Go to the documentation of this file.
1 /***************************************************************************************************
2 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted
5 * provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright notice, this list of
7 * conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright notice, this list of
9 * conditions and the following disclaimer in the documentation and/or other materials
10 * provided with the distribution.
11 * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12 * to endorse or promote products derived from this software without specific prior written
13 * permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 21 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 *
24 **************************************************************************************************/
25 #pragma once
26
27 #include <stdexcept>
28 #include "cutlass/cutlass.h"
29 #include "cutlass/util/reference/device/kernel/tensor_foreach.h"
30
31 namespace cutlass {
32 namespace reference {
33 namespace device {
34
36
/// Launches a kernel invoking a functor for each element in a tensor's index space.
template <typename Func, int Rank, typename Params>
struct TensorForEach {

  /// Constructor performs the operation.
  ///
  /// If either grid_size or block_size is zero, the launch configuration is chosen
  /// with the CUDA Occupancy API, and the block size is capped at 128 threads so each
  /// thread processes more items and initialization overhead is amortized.
  ///
  /// Throws std::runtime_error if the occupancy query fails or the kernel launch is
  /// rejected (e.g. invalid configuration).
  TensorForEach(Coord<Rank> size, Params params = Params(), int grid_size = 0, int block_size = 0) {

    if (!grid_size || !block_size) {

      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
        &grid_size,
        &block_size,
        reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));

      if (result != cudaSuccess) {
        throw std::runtime_error("Failed to query occupancy.");
      }

      // Limit block size. This has the effect of increasing the number of items processed by a
      // single thread and reduces the impact of initialization overhead.
      block_size = (block_size < 128 ? block_size : 128);
    }

    dim3 grid(grid_size, 1, 1);
    dim3 block(block_size, 1, 1);

    kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);

    // Kernel launches do not return errors directly; surface launch-configuration
    // errors here rather than letting them appear at an unrelated later call.
    cudaError_t launch_result = cudaGetLastError();
    if (launch_result != cudaSuccess) {
      throw std::runtime_error("Failed to launch TensorForEach kernel.");
    }
  }
};
67
69
/// Launches a kernel invoking a functor for each element along a tensor's diagonal.
template <typename Func, int Rank, typename Params>
struct TensorDiagonalForEach {

  /// Constructor performs the operation over diagonal indices [start, end).
  ///
  /// A negative 'end' (the default) is replaced by the length of the shortest
  /// extent, size.min(), so the whole diagonal is covered. The launch uses a 1-D
  /// grid of ceil((end - start) / block_size) blocks.
  ///
  /// Throws std::runtime_error if the kernel launch is rejected.
  TensorDiagonalForEach(Coord<Rank> size, Params params = Params(), int start = 0, int end = -1, int block_size = 128) {

    if (end < 0) {
      end = size.min();
    }

    // An empty range would produce a zero-sized grid, which is an invalid launch
    // configuration. There is no work to do, so return without launching.
    if (end <= start) {
      return;
    }

    dim3 block(block_size, 1, 1);
    dim3 grid((end - start + block_size - 1) / block_size, 1, 1);

    kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block >>>(size, params, start, end);

    // Kernel launches do not return errors directly; surface launch-configuration
    // errors here rather than letting them appear at an unrelated later call.
    cudaError_t launch_result = cudaGetLastError();
    if (launch_result != cudaSuccess) {
      throw std::runtime_error("Failed to launch TensorDiagonalForEach kernel.");
    }
  }
};
87
88
90
/// Launches a kernel invoking a functor for each element of a linear block of memory.
template <typename Element, typename Func>
struct BlockForEach {

  /// Constructor performs the operation on 'capacity' elements starting at 'ptr'.
  ///
  /// If either grid_size or block_size is zero, the launch configuration is chosen
  /// with the CUDA Occupancy API, and the block size is capped at 128 threads so each
  /// thread processes more items and initialization overhead is amortized.
  ///
  /// 'ptr' must be a device pointer valid for 'capacity' elements.
  ///
  /// Throws std::runtime_error if the occupancy query fails or the kernel launch is
  /// rejected (e.g. invalid configuration).
  BlockForEach(
    Element *ptr,
    size_t capacity,
    typename Func::Params params = typename Func::Params(),
    int grid_size = 0,
    int block_size = 0) {

    if (!grid_size || !block_size) {

      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
        &grid_size,
        &block_size,
        reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));

      if (result != cudaSuccess) {
        throw std::runtime_error("Failed to query occupancy.");
      }

      // Limit block size. This has the effect of increasing the number of items processed by a
      // single thread and reduces the impact of initialization overhead.
      block_size = (block_size < 128 ? block_size : 128);
    }

    dim3 grid(grid_size, 1, 1);
    dim3 block(block_size, 1, 1);

    kernel::BlockForEach<Element, Func><<< grid, block >>>(ptr, capacity, params);

    // Kernel launches do not return errors directly; surface launch-configuration
    // errors here rather than letting them appear at an unrelated later call.
    cudaError_t launch_result = cudaGetLastError();
    if (launch_result != cudaSuccess) {
      throw std::runtime_error("Failed to launch BlockForEach kernel.");
    }
  }
};
125
127
128 } // namespace device
129 } // namespace reference
130 } // namespace cutlass
Definition: aligned_buffer.h:35
cutlass::reference::device::TensorDiagonalForEach::TensorDiagonalForEach
TensorDiagonalForEach(Coord< Rank > size, Params params=Params(), int start=0, int end=-1, int block_size=128)
Constructor performs the operation.
Definition: device/tensor_foreach.h:75
cutlass::reference::device::TensorForEach::TensorForEach
TensorForEach(Coord< Rank > size, Params params=Params(), int grid_size=0, int block_size=0)
Constructor performs the operation.
Definition: device/tensor_foreach.h:42
cutlass::reference::device::TensorDiagonalForEach
Launches a kernel calling a functor for each element along a tensor's diagonal.
Definition: device/tensor_foreach.h:72
cutlass::reference::device::BlockForEach::BlockForEach
BlockForEach(Element *ptr, size_t capacity, typename Func::Params params=typename Func::Params(), int grid_size=0, int block_size=0)
Constructor performs the operation.
Definition: device/tensor_foreach.h:95
cutlass::reference::device::TensorForEach
Launches a kernel calling a functor for each element in a tensor's index space.
Definition: device/tensor_foreach.h:39
Statically-sized array specifying Coords within a tensor.
Definition: coord.h:43
cutlass::reference::device::BlockForEach
Definition: device/tensor_foreach.h:92
Basic include for CUTLASS.
Generated by 1.8.11