Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
|
Template performing a matrix multiply-add operation within a thread (specialization for int8_t A and B operands with int accumulators).
#include <igemm_multiply_add.h>
Public Types | |
typedef Shape< 4, 1, 1 > | InstructionShape |
The shape of the instruction. More... | |
typedef ThreadGemmShape_ | ThreadGemmShape |
Shape of the thread-level GEMM (K-by-N-by-M) More... | |
typedef ThreadGemmShape | AccumulatorsPerThread |
Aliased for compatibility. Will be removed in CUTLASS v2.0. More... | |
typedef ThreadsPerWarp_ | ThreadsPerWarp |
The number of threads per warp. More... | |
typedef ShapeMul< ThreadGemmShape, ThreadsPerWarp >::Shape | AccumulatorsPerWarp |
The number of accumulators per warp. More... | |
typedef int8_t | ScalarA |
The type for A. More... | |
typedef Fragment< ScalarA, AccumulatorsPerThread::kW *4 > | FragmentA |
The fragment for A. More... | |
typedef int8_t | ScalarB |
The type for B. More... | |
typedef Fragment< ScalarB, AccumulatorsPerThread::kH *4 > | FragmentB |
The fragment for B. More... | |
typedef int | ScalarC |
The type for C and D. More... | |
typedef Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW > | Accumulators |
The accumulators. More... | |
Public Member Functions | |
CUTLASS_DEVICE | ThreadMultiplyAdd () |
Ctor. More... | |
CUTLASS_DEVICE void | multiply_add (FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d) |
Multiply-add: d = a*b + c. More... | |
typedef Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::Accumulators |
typedef ThreadGemmShape cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::AccumulatorsPerThread |
typedef ShapeMul<ThreadGemmShape, ThreadsPerWarp>::Shape cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::AccumulatorsPerWarp |
typedef Fragment<ScalarA, AccumulatorsPerThread::kW * 4> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::FragmentA |
typedef Fragment<ScalarB, AccumulatorsPerThread::kH * 4> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::FragmentB |
typedef Shape<4, 1, 1> cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::InstructionShape |
typedef int8_t cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ScalarA |
typedef int8_t cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ScalarB |
typedef int cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ScalarC |
typedef ThreadGemmShape_ cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ThreadGemmShape |
typedef ThreadsPerWarp_ cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ThreadsPerWarp |
|
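CUTLASS_DEVICE cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::ThreadMultiplyAdd () | inline |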
|
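CUTLASS_DEVICE void cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int >::multiply_add (FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d) | inline |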