docs/classcutlass_1_1reduction_1_1kernel_1_1ReduceSplitK.html
| | CUTLASS
CUDA Templates for Linear Algebra Subroutines and Solvers |
Classes | Public Types | Public Member Functions | Static Public Member Functions | Static Public Attributes | List of all members
cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage > Class Template Reference
#include <reduce_split_k.h>
|
|
| struct | Params |
| | Params structure. More...
|
| |
| struct | SharedStorage |
| |
|
| | using | Shape = Shape_ | | | | using | ReductionOp = ReductionOp_ | | | | using | OutputOp = OutputOp_ | | | | using | ElementWorkspace = typename ReductionOp::Element | | | | using | ElementAccumulator = typename ReductionOp::ElementAccumulator | | | | using | ElementOutput = typename OutputOp::ElementOutput | | | | using | WorkspaceTensorRef = TensorRef< ElementWorkspace, layout::RowMajor > | | | | using | OutputTensorRef = TensorRef< ElementOutput, layout::RowMajor > | | | | using | FragmentWorkspace = AlignedArray< ElementWorkspace, kElementsPerAccess > | | | | using | FragmentAccumulator = Array< ElementAccumulator, kElementsPerAccess > | | | | using | FragmentOutput = AlignedArray< ElementOutput, kElementsPerAccess > | | |
|
|
| CUTLASS_DEVICE void | operator() (Params const &params, SharedStorage &storage) |
| | Perform a reduction. More...
|
| |
|
|
| static CUTLASS_HOST_DEVICE dim3 | grid_shape (cutlass::MatrixCoord problem_size) |
| | Computes the grid size given a chosen threadblock shape. More...
|
| |
| static CUTLASS_HOST_DEVICE dim3 | block_shape () |
| | Determines the threadblock shape. More...
|
| |
|
| | static int const | kElementsPerAccess = OutputOp::kCount | | | | static int const | kPartitionsPerStage = PartitionsPerStage | | |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::ElementAccumulator = typename ReductionOp::ElementAccumulator |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::ElementOutput = typename OutputOp::ElementOutput |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::ElementWorkspace = typename ReductionOp::Element |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess> |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::FragmentOutput = AlignedArray<ElementOutput, kElementsPerAccess> |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::FragmentWorkspace = AlignedArray<ElementWorkspace, kElementsPerAccess> |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::OutputOp = OutputOp_ |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::OutputTensorRef = TensorRef<ElementOutput, layout::RowMajor> |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::ReductionOp = ReductionOp_ |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::Shape = Shape_ |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
| using cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::WorkspaceTensorRef = TensorRef<ElementWorkspace, layout::RowMajor> |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
|
| static CUTLASS_HOST_DEVICE dim3 cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::block_shape | ( | | ) | |
| inline static |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
|
| static CUTLASS_HOST_DEVICE dim3 cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::grid_shape | ( | cutlass::MatrixCoord | problem_size | ) | |
| inline static |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
|
| CUTLASS_DEVICE void cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::operator() | ( | Params const & | params, | | | | SharedStorage & | storage | | | ) | | |
| inline |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
|
| int const cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::kElementsPerAccess = OutputOp::kCount |
| static |
template<typename Shape_ , typename OutputOp_ , typename ReductionOp_ , int PartitionsPerStage = 4>
|
| int const cutlass::reduction::kernel::ReduceSplitK< Shape_, OutputOp_, ReductionOp_, PartitionsPerStage >::kPartitionsPerStage = PartitionsPerStage |
| static |
The documentation for this class was generated from the following file: reduce_split_k.h
Generated by Doxygen 1.8.11