Back to Cutlass

CUTLASS: cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ > Class Template Reference

docs/classcutlass_1_1gemm_1_1device_1_1GemmSplitKParallel.html

4.4.285.5 KB
Original Source

| | CUTLASS

CUDA Templates for Linear Algebra Subroutines and Solvers |

Classes | Public Types | Public Member Functions | Static Public Member Functions | Static Public Attributes | List of all members

cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ > Class Template Reference

#include <gemm_splitk_parallel.h>

|

Classes

| | struct | Arguments | | | Argument structure. More...
| | |

|

Public Types

| | using | ElementA = ElementA_ | | | | using | LayoutA = LayoutA_ | | | | using | ElementB = ElementB_ | | | | using | LayoutB = LayoutB_ | | | | using | ElementC = ElementC_ | | | | using | LayoutC = LayoutC_ | | | | using | ElementAccumulator = ElementAccumulator_ | | | | using | OperatorClass = OperatorClass_ | | | | using | ArchTag = ArchTag_ | | | | using | ThreadblockShape = ThreadblockShape_ | | | | using | WarpShape = WarpShape_ | | | | using | InstructionShape = InstructionShape_ | | | | using | ConvertScaledOp = ConvertScaledOp_ | | | | using | EpilogueOutputOp = EpilogueOutputOp_ | | | | using | ReductionOp = ReductionOp_ | | | | using | ThreadblockSwizzle = ThreadblockSwizzle_ | | | | using | Operator = Operator_ | | | | using | GemmKernel = typename kernel::DefaultGemmSplitKParallel< ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, LayoutC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, ConvertScaledOp, ThreadblockSwizzle, kStages, Operator >::GemmKernel | | | GEMM kernel. More...
| | | | using | ReductionKernel = cutlass::reduction::kernel::ReduceSplitK< cutlass::MatrixShape< 4, 32 *EpilogueOutputOp::kCount >, EpilogueOutputOp, ReductionOp > | | | Reduction kernel. More...
| | |

|

Public Member Functions

| | | GemmSplitKParallel () | | | Constructs the GEMM. More...
| | | | Status | initialize (Arguments const &args, void *workspace) | | | Initializes GEMM state from arguments. More...
| | | | Status | update (Arguments const &args, void *workspace=nullptr) | | | Lightweight update given a subset of arguments. More...
| | | | Status | run (cudaStream_t stream=nullptr) | | | Runs the kernel using initialized state. More...
| | | | Status | operator() (cudaStream_t stream=nullptr) | | | Runs the kernel using initialized state. More...
| | | | Status | operator() (Arguments const &args, void *workspace=nullptr, cudaStream_t stream=nullptr) | | | Runs the kernel using initialized state. More...
| | |

|

Static Public Member Functions

| | static Status | can_implement (Arguments const &args) | | | Determines whether the GEMM can execute the given problem. More...
| | | | static size_t | get_workspace_size (Arguments const &args) | | | Gets the workspace size. More...
| | |

|

Static Public Attributes

| | static int const | kStages = Stages | | |

Detailed Description

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator> class cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >

Gemm device-level operator performing parallel reduction over the K partition.

Member Typedef Documentation

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ArchTag = ArchTag_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ConvertScaledOp = ConvertScaledOp_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ElementA = ElementA_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ElementAccumulator = ElementAccumulator_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ElementB = ElementB_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ElementC = ElementC_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::EpilogueOutputOp = EpilogueOutputOp_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::GemmKernel = typename kernel::DefaultGemmSplitKParallel< ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, LayoutC, ElementAccumulator, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape, ConvertScaledOp, ThreadblockSwizzle, kStages, Operator >::GemmKernel |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::InstructionShape = InstructionShape_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::LayoutA = LayoutA_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::LayoutB = LayoutB_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::LayoutC = LayoutC_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::Operator = Operator_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::OperatorClass = OperatorClass_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ReductionKernel = cutlass::reduction::kernel::ReduceSplitK< cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, EpilogueOutputOp, ReductionOp > |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ReductionOp = ReductionOp_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ThreadblockShape = ThreadblockShape_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::ThreadblockSwizzle = ThreadblockSwizzle_ |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

| using cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::WarpShape = WarpShape_ |

Constructor & Destructor Documentation

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

|

| cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::GemmSplitKParallel | ( | | ) | |

| inline |

Member Function Documentation

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

|

| static Status cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::can_implement | ( | Arguments const & | args | ) | |

| inlinestatic |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

|

| static size_t cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::get_workspace_size | ( | Arguments const & | args | ) | |

| inlinestatic |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

|

| Status cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::initialize | ( | Arguments const & | args, | | | | void * | workspace | | | ) | | |

| inline |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

|

| Status cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::operator() | ( | cudaStream_t | stream = nullptr | ) | |

| inline |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

|

| Status cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::operator() | ( | Arguments const & | args, | | | | void * | workspace = nullptr, | | | | cudaStream_t | stream = nullptr | | | ) | | |

| inline |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

|

| Status cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::run | ( | cudaStream_t | stream = nullptr | ) | |

| inline |

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

|

| Status cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::update | ( | Arguments const & | args, | | | | void * | workspace = nullptr | | | ) | | |

| inline |

Member Data Documentation

template<typename ElementA_, typename LayoutA_, typename ElementB_, typename LayoutB_, typename ElementC_, typename LayoutC_, typename ElementAccumulator_ = ElementC_, typename OperatorClass_ = arch::OpClassSimt, typename ArchTag_ = arch::Sm70, typename ThreadblockShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::ThreadblockShape, typename WarpShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::WarpShape, typename InstructionShape_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::InstructionShape, typename EpilogueOutputOp_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, typename ConvertScaledOp_ = cutlass::epilogue::thread::Convert< ElementAccumulator_, DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementAccumulator_, ElementAccumulator_>::EpilogueOutputOp::kCount, ElementAccumulator_>, typename ReductionOp_ = cutlass::reduction::thread::ReduceAdd< ElementAccumulator_, typename EpilogueOutputOp_::ElementAccumulator, EpilogueOutputOp_::kCount>, typename ThreadblockSwizzle_ = threadblock::GemmSplitKHorizontalThreadblockSwizzle, int Stages = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kStages, int kAlignmentA = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentA, int kAlignmentB = DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::kAlignmentB, typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::Operator>

|

| int const cutlass::gemm::device::GemmSplitKParallel< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, LayoutC_, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ConvertScaledOp_, ReductionOp_, ThreadblockSwizzle_, Stages, kAlignmentA, kAlignmentB, Operator_ >::kStages = Stages |

| static |


The documentation for this class was generated from the following file:

  • [device/gemm_splitk_parallel.h](device_2gemm splitk parallel_8h_source.html)

Generated by 1.8.11