56 typename ThreadGemmShape_>
69 ThreadMultiplyAdd<ThreadGemmShape_, Shape<1, 4, 8>, int8_t, int8_t, int>,
99 template <
typename OutputTile_,
typename ThreadGemmShape_>
113 ThreadMultiplyAdd<ThreadGemmShape_, Shape<1, 4, 8>, int8_t, int8_t, int>,
143 template <enum MatrixLayout::Kind kLayout_,
typename GemmConfig_,
typename Index_>
148 template <
typename GemmConfig_,
typename Index_>
155 static int const kScalarsPerStsA = 16;
169 GemmConfig_::kScalarsPerLdgA>
180 Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / 4, GemmConfig_::OutputTile::kW * 4>,
190 template <
typename GemmConfig_,
typename Index_>
201 static int const kScalarsPerStsA = 16;
215 GemmConfig_::kScalarsPerLdgA>
226 Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / 4, GemmConfig_::OutputTile::kW * 4>,
240 typename GemmConfig_::OutputTile,
242 typename GemmConfig_::Warps,
244 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
246 typename GemmConfig_::InstructionShape,
248 GemmConfig_::kStages,
252 SharedStoreTileTraits::kSkew>
258 template <enum MatrixLayout::Kind kLayout_,
typename GemmConfig_,
typename Index_>
263 template <
typename GemmConfig_,
typename Index_>
274 static int const kScalarsPerStsB = 16;
288 GemmConfig_::kScalarsPerLdgB>
299 Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / 4, GemmConfig_::OutputTile::kH * 4>,
313 typename GemmConfig_::OutputTile,
315 typename GemmConfig_::Warps,
317 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
319 typename GemmConfig_::InstructionShape,
321 GemmConfig_::kStages,
325 SharedStoreTileTraits::kSkew>
331 template <
typename GemmConfig_,
typename Index_>
338 static int const kScalarsPerStsB = 16;
352 GemmConfig_::kScalarsPerLdgB>
363 Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / 4, GemmConfig_::OutputTile::kH * 4>,
373 template <enum MatrixLayout::Kind kLayout_,
typename Iterator_>
376 template <
typename Iterator_>
381 template <
typename Iterator_>
388 template <enum MatrixLayout::Kind kLayout_,
typename Iterator_>
391 template <
typename Iterator_>
396 template <
typename Iterator_>
409 typename OutputTile_,
413 typename EpilogueFunctor_,
417 typename Index_ =
int>
433 typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar,
451 typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar,
463 typedef TileLoadIterator<
typename GemmTileTraitsHelperA::SharedLoadTileTraits,
464 typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar,
472 typedef TileLoadIterator<
typename GemmTileTraitsHelperB::SharedLoadTileTraits,
473 typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar,
492 template <
typename ScalarD_>
512 typename ScalarD_ = int,
518 typename Index_ = int,
529 typename Helper_::GemmConfig,
531 typename Helper_::GlobalLoadStreamA,
533 typename Helper_::GlobalLoadStreamB,
535 typename Helper_::SharedLoadStreamA,
537 typename Helper_::SharedLoadStreamB,
539 typename Helper_::Epilogue,
541 IdentityBlockSwizzle,
545 typename Helper_::ClearAccumulators> {};
IgemmTransformerB< GemmTileTraitsHelperB::kLayout, GlobalLoadIteratorB >::Transformer GlobalTransformerB
The default transformer for B.
Definition: igemm_traits.h:448
Definition: load_store.h:41
GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ > Base
The base config.
Definition: igemm_traits.h:335
Definition: gemm_shared_tile.h:128
Base::Threads Threads
The threads.
Definition: igemm_global_tile.h:66
IgemmTileTraitsHelperB< kLayoutB_, GemmConfig, Index_ > GemmTileTraitsHelperB
The GEMM config for B.
Definition: igemm_traits.h:424
Definition: gemm_shared_tile.h:80
Defines iterators for efficiently loading and storing to global memory.
Transposes a fragment of data containing packed 8-bit integer elements.
GemmSharedStoreWithSkewTileAbTraits< int8_t, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/4, GemmConfig_::OutputTile::kW *4 >, typename GlobalTileTraits::Threads, kScalarsPerStsA, 16 > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for A^N.
Definition: igemm_traits.h:233
IgemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, int8_t const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^N.
Definition: igemm_traits.h:289
Defines structural properties of complete GEMM computation.
IgemmGlobalIteratorAb< GlobalTileTraits, Index_ > GlobalLoadIterator
The global load iterator.
Definition: igemm_traits.h:219
Definition: igemm_traits.h:144
Definition: igemm_epilogue.h:290
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159
IgemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kRowMajor, int8_t const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kH >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^T.
Definition: igemm_traits.h:353
Definition: gemm_shared_tile.h:38
Definition: tile_iterator.h:65
int8_t MultiplyAddScalar
The scalar stored in shared memory.
Definition: igemm_traits.h:198
GemmTileTraitsHelperB::GlobalLoadIterator GlobalLoadIteratorB
The iterator to load B from global memory.
Definition: igemm_traits.h:445
Implements matrix multiply accumulate operation of 8-bit integer data using DP4A instruction.
Definition: gemm_shared_tile.h:200
TileStoreIterator< typename GemmTileTraitsHelperB::SharedStoreTileTraits, typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorB
The iterator to store B to shared memory.
Definition: igemm_traits.h:454
GemmSharedLoadTileBTraits< int8_t const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 16, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for B^N.
Definition: igemm_traits.h:326
Definition: gemm_global_tile.h:163
int8_t MultiplyAddScalar
The scalar stored in shared memory.
Definition: igemm_traits.h:271
Implements the epilogue phase of the GEMM kernel that efficiently updates global memory with the comp...
IgemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, int8_t const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^T.
Definition: igemm_traits.h:216
Definition: gemm_global_stream.h:52
Definition: gemm_traits.h:191
IgemmEpilogue< IgemmEpilogueTraits< GemmConfig, EpilogueFunctor_ > > Epilogue
The epilogue.
Definition: igemm_traits.h:487
int Scalar
Definition: igemm_traits.h:499
GemmSharedStoreTileAbTraits< int8_t, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/4, GemmConfig_::OutputTile::kW *4 >, typename GlobalTileTraits::Threads, kScalarsPerStsA > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for A^N.
Definition: igemm_traits.h:185
Definition: igemm_swizzle.h:38
Definition: igemm_traits.h:259
Definition: igemm_traits.h:418
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:402
IgemmTransformerA< GemmTileTraitsHelperA::kLayout, GlobalLoadIteratorA >::Transformer GlobalTransformerA
The default transformer for A.
Definition: igemm_traits.h:430
Defines iterators for efficiently loading and storing tiles to and from shared memory.
GlobalLoadStream< GemmOperand::kB, GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB > GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: igemm_traits.h:460
Definition: gemm_shared_stream.h:45
Definition: igemm_global_tile.h:50
Defines a type for restructuring a tile.
GemmTileTraitsHelperA::GlobalLoadIterator GlobalLoadIteratorA
The iterator to load A from global memory.
Definition: igemm_traits.h:427
Definition: gemm_config.h:76
TileStoreIterator< typename GemmTileTraitsHelperA::SharedStoreTileTraits, typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorA
The iterator to store A to shared memory.
Definition: igemm_traits.h:436
Definition: gemm_traits.h:52
Definition: matrix_traits.h:357
Definition: igemm_traits.h:57
IgemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kColumnMajor, int8_t const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kW >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^N.
Definition: igemm_traits.h:170
Definition: igemm_global_tile.h:95
float Scalar
Definition: igemm_traits.h:494
Definition: gemm_traits.h:349
Definition: igemm_traits.h:527
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
Definition: matrix_traits.h:159
TileLoadIterator< typename GemmTileTraitsHelperB::SharedLoadTileTraits, typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorB
The iterator to load B from shared memory.
Definition: igemm_traits.h:476
GemmSharedStoreTileAbTraits< int8_t, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/4, GemmConfig_::OutputTile::kH *4 >, typename GlobalTileTraits::Threads, kScalarsPerStsB > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for B^N.
Definition: igemm_traits.h:368
Definition: matrix_traits.h:159
ReshapeThreads< VectorizedTile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:88
Definition: gemm_traits.h:267
Template performing matrix multiply-add operation within a thread.
Definition: thread_multiply_add.h:44
Definition: gemm_traits.h:57
int8_t Scalar
The input scalar.
Definition: igemm_traits.h:196
IgemmConfig< OutputTile_, ScalarD_, ThreadGemmShape_ > GemmConfig
The IGEMM config.
Definition: igemm_traits.h:420
IgemmGlobalIteratorAb< GlobalTileTraits, Index_ > GlobalLoadIterator
The global load iterator.
Definition: igemm_traits.h:292
GemmGlobalIteratorAb< GlobalTileTraits, Index_ > GlobalLoadIterator
The global load iterator.
Definition: igemm_traits.h:173
GemmGlobalIteratorAb< GlobalTileTraits, Index_ > GlobalLoadIterator
The global load iterator.
Definition: igemm_traits.h:356
GemmConfig::MultiplyAdd MultiplyAdd
The multiply-add functor.
Definition: igemm_traits.h:482
Functor to compute linear combination of fragments.
Definition: linear_scaling.h:51
SharedLoadStream< SharedLoadIteratorA, Copy< typename SharedLoadIteratorA::Fragment > > SharedLoadStreamA
The stream to load A from shared memory.
Definition: igemm_traits.h:470
Definition: matrix_traits.h:357
GlobalLoadStream< GemmOperand::kA, GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA > GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: igemm_traits.h:442
IgemmTileTraitsHelperA< kLayoutA_, GemmConfig, Index_ > GemmTileTraitsHelperA
The GEMM config for A.
Definition: igemm_traits.h:422
Implements a software-pipelined efficient GEMM.
GemmSharedLoadTileATraits< int8_t const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 16, SharedStoreTileTraits::kSkew > SharedLoadTileTraits
The traits class to build the iterator to load from shared memory for A^N.
Definition: igemm_traits.h:253
SharedLoadStream< SharedLoadIteratorB, Copy< typename SharedLoadIteratorB::Fragment > > SharedLoadStreamB
The stream to load B from shared memory.
Definition: igemm_traits.h:479
Defines structural properties of the GEMM epilogue.
Definition: igemm_traits.h:493
Defines the epilogue phase of the GEMM computation for IGEMM, supporting integer and floating-point o...
Defines conversion operations among Fragments of different base type.
GemmSharedStoreWithSkewTileAbTraits< int8_t, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/4, GemmConfig_::OutputTile::kH *4 >, typename GlobalTileTraits::Threads, kScalarsPerStsB, 16 > SharedStoreTileTraits
The traits class to build the iterator to store data to shared memory for B^N.
Definition: igemm_traits.h:306
int8_t Scalar
The input scalar.
Definition: igemm_traits.h:269
Implements tile iterators to partition the thread block tile into 2D subtiles and efficiently load ea...
TileLoadIterator< typename GemmTileTraitsHelperA::SharedLoadTileTraits, typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorA
The iterator to load A from shared memory.
Definition: igemm_traits.h:467
GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ > Base
The base config.
Definition: igemm_traits.h:152
An iterator implementing Tile Store Iterator Concept for storing a tile to memory.
Definition: tile_iterator.h:841
ClearAccumulators< typename MultiplyAdd::ScalarC > ClearAccumulators
The object to clear accumulators.
Definition: igemm_traits.h:484