52 typename ThreadGemmShape_,
54 int kScalarsPerLdgA_ = 2,
56 int kScalarsPerLdgB_ = 2>
69 ThreadMultiplyAdd<ThreadGemmShape_, Shape<1, 4, 8>, half, half, half>,
100 template <enum MatrixLayout::Kind kLayout_,
typename Iterator_>
103 template <
typename Iterator_>
108 template <
typename Iterator_>
115 template <enum MatrixLayout::Kind kLayout_,
typename Iterator_>
118 template <
typename Iterator_>
123 template <
typename Iterator_>
130 template <enum MatrixLayout::Kind kLayout_,
typename GemmConfig_>
135 template <
typename GemmConfig_>
151 Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
153 GemmConfig_::kScalarsPerLdgA>
156 static int const kSkewA = 128 /
sizeof(half) / GlobalTileTraits::Threads::kW / 2;
163 Shape<GemmConfig_::kStages,
164 GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
165 GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>,
171 kSkewA<GemmConfig_::kScalarsPerLdsA ? GemmConfig_::kScalarsPerLdsA : kSkewA>
172 SharedStoreTileTraits;
179 typename GemmConfig_::OutputTile,
181 typename GemmConfig_::Warps,
183 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
185 typename GemmConfig_::InstructionShape,
187 GemmConfig_::kStages,
191 SharedStoreTileTraits::kSkew>
192 SharedLoadTileTraits;
197 template <enum MatrixLayout::Kind kLayout_,
typename GemmConfig_>
202 template <
typename GemmConfig_>
218 Shape<1, GemmConfig_::kThreads / GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD>,
220 GemmConfig_::kScalarsPerLdgB>
223 static int const kSkewB = 128 /
sizeof(half) / GlobalTileTraits::Threads::kW / 2;
230 Shape<GemmConfig_::kStages,
231 GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD,
232 GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>,
238 kSkewB<GemmConfig_::kScalarsPerLdsB ? GemmConfig_::kScalarsPerLdsB : kSkewB>
239 SharedStoreTileTraits;
246 typename GemmConfig_::OutputTile,
248 typename GemmConfig_::Warps,
250 typename GemmConfig_::MultiplyAdd::ThreadsPerWarp,
252 typename GemmConfig_::InstructionShape,
254 GemmConfig_::kStages,
258 SharedStoreTileTraits::kSkew>
259 SharedLoadTileTraits;
270 typename OutputTile_,
272 typename EpilogueFunctor_,
274 typename ThreadGemmShape_,
276 int kScalarsPerLdgA_ = 2,
278 int kScalarsPerLdgB_ = 2,
280 typename Index_ =
int>
297 typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar,
316 typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar,
328 typedef TileLoadIterator<
typename GemmTileTraitsHelperA::SharedLoadTileTraits,
329 typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar,
336 typedef TileLoadIterator<
typename GemmTileTraitsHelperB::SharedLoadTileTraits,
337 typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar,
369 int kScalarsPerLdgA_ = 2,
371 int kScalarsPerLdgB_ = 2,
373 typename Index_ = int,
385 typename Helper_::GemmConfig,
387 typename Helper_::GlobalLoadStreamA,
389 typename Helper_::GlobalLoadStreamB,
391 typename Helper_::SharedLoadStreamA,
393 typename Helper_::SharedLoadStreamB,
395 typename Helper_::Epilogue,
397 IdentityBlockSwizzle,
401 typename Helper_::ClearAccumulators> {};
SharedLoadStream< SharedLoadIteratorB > SharedLoadStreamB
The stream to load B from shared memory.
Definition: hgemm_traits.h:342
GemmGlobalIteratorAb< typename GemmTileTraitsHelperB::GlobalTileTraits, Index_ > GlobalLoadIteratorB
The iterator to load B from global memory.
Definition: hgemm_traits.h:310
Definition: load_store.h:41
HgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ > GemmConfig
The HGEMM config.
Definition: hgemm_traits.h:283
Definition: gemm_shared_tile.h:128
Definition: gemm_shared_tile.h:80
Definition: gemm_epilogue.h:42
Defines iterators for efficiently loading and storing to global memory.
SimplifiedGemmEpilogueTraits< GemmConfig, EpilogueFunctor_, Index_ > GemmEpilogueTraits
The traits class for the epilogue.
Definition: hgemm_traits.h:350
Defines structural properties of complete GEMM computation.
HgemmCrosswiseGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, half const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
The traits class to build the iterator to load data from global memory for B^N.
Definition: hgemm_traits.h:221
GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > Base
The base config.
Definition: hgemm_traits.h:206
GemmEpilogue< GemmEpilogueTraits > Epilogue
The epilogue.
Definition: hgemm_traits.h:352
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159
GlobalLoadStream< GemmOperand::kA, GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA > GlobalLoadStreamA
The stream to load A from global memory to shared memory.
Definition: hgemm_traits.h:306
Definition: hgemm_traits.h:383
TileLoadIterator< typename GemmTileTraitsHelperB::SharedLoadTileTraits, typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorB
The iterator to load B from shared memory.
Definition: hgemm_traits.h:340
HgemmTransformerA< GemmTileTraitsHelperA::kLayout, GlobalLoadIteratorA >::Transformer GlobalTransformerA
The default transformer for A.
Definition: hgemm_traits.h:294
Definition: tile_iterator.h:65
Definition: gemm_shared_tile.h:200
Definition: gemm_global_tile.h:163
Implements the epilogue phase of the GEMM kernel that efficiently updates global memory with the comp...
Definition: gemm_global_stream.h:52
Definition: gemm_traits.h:191
Definition: hgemm_traits.h:131
HgemmTileTraitsHelperA< kLayoutA_, GemmConfig > GemmTileTraitsHelperA
The GEMM config for A.
Definition: hgemm_traits.h:285
GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > Base
The base config.
Definition: hgemm_traits.h:139
An iterator implementing Tile Load Iterator Concept for loading a tile from memory.
Definition: tile_iterator.h:402
Defines iterators for efficiently loading and storing tiles to and from shared memory.
Definition: gemm_traits.h:120
Definition: gemm_shared_stream.h:45
HgemmTransformerB< GemmTileTraitsHelperB::kLayout, GlobalLoadIteratorB >::Transformer GlobalTransformerB
Definition: hgemm_traits.h:313
Defines a type for restructuring a tile.
ClearAccumulators< typename MultiplyAdd::ScalarC > ClearAccumulators
The object to clear accumulators.
Definition: hgemm_traits.h:347
Specialization implementing multiply-add operation on half-precision floating point fragments...
Definition: gemm_config.h:76
TileLoadIterator< typename GemmTileTraitsHelperA::SharedLoadTileTraits, typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedLoadIteratorA
The iterator to load A from shared memory.
Definition: hgemm_traits.h:332
Transposes a tile of 16b elements. Used by HGEMM to construct a K-strided layout in shared memory for...
Definition: gemm_traits.h:52
Definition: matrix_traits.h:357
Definition: hgemm_traits.h:198
GemmConfig::MultiplyAdd MultiplyAdd
The functor to do the multiply-add in the main loop.
Definition: hgemm_traits.h:345
Definition: gemm_traits.h:349
HgemmTileTraitsHelperB< kLayoutB_, GemmConfig > GemmTileTraitsHelperB
The GEMM config for B.
Definition: hgemm_traits.h:287
Definition: hgemm_global_tile.h:48
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
Definition: matrix_traits.h:159
Definition: gemm_epilogue_traits.h:340
Definition: matrix_traits.h:159
ReshapeThreads< VectorizedTile, Threads_ >::Threads Threads
The threads shape.
Definition: gemm_global_tile.h:88
Template performing matrix multiply-add operation within a thread.
Definition: thread_multiply_add.h:44
GemmGlobalIteratorAb< typename GemmTileTraitsHelperA::GlobalTileTraits, Index_ > GlobalLoadIteratorA
The iterator to load A from global memory.
Definition: hgemm_traits.h:291
Definition: gemm_traits.h:196
Definition: hgemm_traits.h:281
HgemmCrosswiseGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, half const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
The traits class to build the iterator to load data from global memory for A^T.
Definition: hgemm_traits.h:154
Tile traits used to construct global tile iterator for HGEMM. This is intended to partition the threa...
Functor to compute linear combination of fragments.
Definition: linear_scaling.h:51
Definition: matrix_traits.h:357
Implements a software-pipelined efficient GEMM.
GlobalLoadStream< GemmOperand::kB, GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB > GlobalLoadStreamB
The stream to load B from global memory to shared memory.
Definition: hgemm_traits.h:325
SharedLoadStream< SharedLoadIteratorA > SharedLoadStreamA
The stream to load A from shared memory.
Definition: hgemm_traits.h:334
Defines structural properties of the GEMM epilogue.
TileStoreIterator< typename GemmTileTraitsHelperB::SharedStoreTileTraits, typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorB
The iterator to store B to shared memory.
Definition: hgemm_traits.h:319
TileStoreIterator< typename GemmTileTraitsHelperA::SharedStoreTileTraits, typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared > SharedStoreIteratorA
The iterator to store A to shared memory.
Definition: hgemm_traits.h:300
Definition: hgemm_swizzle.h:40
Defines conversion operations among Fragments of different base type.
Definition: hgemm_traits.h:57
An iterator implementing Tile Store Iterator Concept for storing a tile to memory.
Definition: tile_iterator.h:841