Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
Classes | Public Types | Static Public Attributes | List of all members
cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ > Struct Template Reference

#include <gemm_shared_tile.h>

Classes

struct  ThreadOffset
 Computes the thread offset in (H, W) based on thread ID. More...
 

Public Types

typedef platform::remove_const< Scalar_ >::type Scalar
 The scalar. More...
 
typedef Scalar_ * Pointer
 The pointer. More...
 
typedef OutputTile_ OutputTile
 The dimension of the output tile. More...
 
typedef Warps_ Warps
 The warps in the tile. More...
 
typedef ThreadsPerWarp_ ThreadsPerWarp
 The threads in the warps. More...
 
typedef Shape< 1, 2, kScalarsPerRow/kAccessSize, kAccessSize > Tile
 
typedef Shape< kIterationsD, kIterationsH, OutputTile::kW/kWarpSize/kAccessSize, Warps::kD > Iterations
 The number of iterations needed to store the tile. More...
 
typedef Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSize, kSplitK > ImmediateOffsetStrides
 The strides in each dimension between different loads/stores. More...
 
typedef Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSize, kSplitK > Delta
 The strides in each dimension between different loads/stores. More...
 

Static Public Attributes

static int const kAccessSize = kScalarsPerLds_
 The number of scalars per shared-memory load (LDS). More...
 
static int const kSkew = kSkew_
 The skew. More...
 
static MemorySpace::Kind const kMemorySpace = MemorySpace::kShared
 The memory space. More...
 
static int const kScalarsPerThread = OutputTile_::kW / Warps::kW / ThreadsPerWarp::kW
 The number of scalars per thread. More...
 
static int const kThreads = ShapeCount<Warps>::kCount * kWarpSize
 The number of threads. More...
 
static int const kScalarsPerRow = kThreads / 2 * kScalarsPerThread + kSkew
 The number of scalars per row. We build a tile with 2 rows (to avoid bank conflicts). More...
 
static int const kIterationsInHPerWarp = kTileH_ / ShapeCount<Warps>::kCount
 
static int const kIterationsH = kIterationsInHPerWarp == 1 ? 1 : 2
 
static int const kIterationsD = kIterationsInHPerWarp / kIterationsH
 
static int const kSplitK = OutputTile::kW * ThreadsPerWarp::kH / 2 * Warps::kH
 

Member Typedef Documentation

◆ Delta

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
typedef Shape<OutputTile::kW, kScalarsPerRow, kWarpSize * kAccessSize, kSplitK> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Delta

◆ ImmediateOffsetStrides

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
typedef Shape<OutputTile::kW, kScalarsPerRow, kWarpSize * kAccessSize, kSplitK> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::ImmediateOffsetStrides

◆ Iterations

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
typedef Shape<kIterationsD, kIterationsH, OutputTile::kW / kWarpSize / kAccessSize, Warps::kD> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Iterations

◆ OutputTile

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
typedef OutputTile_ cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::OutputTile

◆ Pointer

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
typedef Scalar_* cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Pointer

◆ Scalar

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
typedef platform::remove_const<Scalar_>::type cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Scalar

◆ ThreadsPerWarp

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
typedef ThreadsPerWarp_ cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::ThreadsPerWarp

◆ Tile

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
typedef Shape<1, 2, kScalarsPerRow / kAccessSize, kAccessSize> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Tile

The tile. We have 2 rows of scalars. We use those two rows to make sure we do not have bank conflicts in the epilogue.

◆ Warps

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
typedef Warps_ cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Warps

Member Data Documentation

◆ kAccessSize

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kAccessSize = kScalarsPerLds_
static

◆ kIterationsD

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kIterationsD = kIterationsInHPerWarp / kIterationsH
static

◆ kIterationsH

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kIterationsH = kIterationsInHPerWarp == 1 ? 1 : 2
static

◆ kIterationsInHPerWarp

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kIterationsInHPerWarp = kTileH_ / ShapeCount<Warps>::kCount
static

◆ kMemorySpace

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
MemorySpace::Kind const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kMemorySpace = MemorySpace::kShared
static

◆ kScalarsPerRow

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kScalarsPerRow = kThreads / 2 * kScalarsPerThread + kSkew
static

◆ kScalarsPerThread

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kScalarsPerThread = OutputTile_::kW / Warps::kW / ThreadsPerWarp::kW
static

◆ kSkew

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kSkew = kSkew_
static

◆ kSplitK

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kSplitK = OutputTile::kW * ThreadsPerWarp::kH / 2 * Warps::kH
static

◆ kThreads

template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kThreads = ShapeCount<Warps>::kCount * kWarpSize
static

The documentation for this struct was generated from the following file: