Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
List of all members
cutlass::gemm::HgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Struct Template Reference

#include <hgemm_traits.h>

Inheritance diagram for cutlass::gemm::HgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ >:
cutlass::gemm::GemmConfig< half, half, half, half, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, half, half, half >, kScalarsPerLdgA_, kScalarsPerLdgA_, 8, kScalarsPerLdgB_, kScalarsPerLdgB_, 8, 2, 8, 2, 2, false, true, false >

Additional Inherited Members

- Public Types inherited from cutlass::gemm::GemmConfig< half, half, half, half, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, half, half, half >, kScalarsPerLdgA_, kScalarsPerLdgA_, 8, kScalarsPerLdgB_, kScalarsPerLdgB_, 8, 2, 8, 2, 2, false, true, false >
typedef half ScalarA
 The scalar for A. More...
 
typedef half ScalarB
 The scalar for B. More...
 
typedef half ScalarC
 The scalar for C. More...
 
typedef half ScalarD
 The scalar for D. More...
 
typedef OutputTile_ OutputTile
 The tile. More...
 
typedef ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, half, half, half > MultiplyAdd
 The functor to do D = A*B + C. More...
 
typedef MultiplyAdd::InstructionShape InstructionShape
 The shape of the instruction. More...
 
typedef MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
 The shape of warp-level GEMM. More...
 
typedef MultiplyAdd::Accumulators Accumulators
 The accumulators. More...
 
typedef ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
 The number of warps. More...
 
- Static Public Attributes inherited from cutlass::gemm::GemmConfig< half, half, half, half, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, half, half, half >, kScalarsPerLdgA_, kScalarsPerLdgA_, 8, kScalarsPerLdgB_, kScalarsPerLdgB_, 8, 2, 8, 2, 2, false, true, false >
static int const kWarpSize
 The default warp size (32 threads per warp). More...
 
static int const kThreads
 The number of threads. More...
 
static int const kScalarsPerLdgA
 The number of scalars per LDG/STS/LDS for A. More...
 
static int const kScalarsPerStsA
 
static int const kScalarsPerLdsA
 
static int const kScalarsPerLdgB
 The number of scalars per LDG/STS/LDS for B. More...
 
static int const kScalarsPerStsB
 
static int const kScalarsPerLdsB
 
static int const kScalarsPerLdgC
 The number of scalars per LDG for C. More...
 
static int const kScalarsPerStgD
 The number of scalars per STS/LDS/STG for D. More...
 
static int const kScalarsPerStsD
 
static int const kScalarsPerLdsD
 
static int const kAccumulatorsPerLdsA
 The number of accumulators that are going to be fed from one LDS A/B. More...
 
static int const kAccumulatorsPerLdsB
 
static int const kStages
 The number of stages in shared memory, used to implement double, triple, or deeper buffering. More...
 
static bool const kResidueSeparate
 If true, mainloop is instantiated twice. The first instantiation contains no predicate. More...
 
static bool const kResidueInProlog
 If true, residue is computed in the prologue. More...
 
static bool const kLaunchBounds
 If true, kernel is launched with launch bounds specified. More...
 

The documentation for this struct was generated from the following file: