38 template <
typename ThreadGemmShape_,
39 typename ThreadsPerWarp_,
77 for (
int j = 0; j < AccumulatorsPerThread::kH; ++j) {
78 for (
int i = 0; i < AccumulatorsPerThread::kW; ++i) {
79 d[j * AccumulatorsPerThread::kW + i] = a[i] * b[j] + c[j * AccumulatorsPerThread::kW + i];
84 for(
int i = 0; i < AccumulatorsPerThread::kW; ++i) {
85 for(
int j = 0; j < AccumulatorsPerThread::kH; ++j) {
86 d[i * AccumulatorsPerThread::kH + j] = a[i] * b[j] + c[i * AccumulatorsPerThread::kH + j];
Fragment< ScalarA, AccumulatorsPerThread::kW > FragmentA
The fragment for A.
Definition: thread_multiply_add.h:58
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159
Shape< A_::kD *B_::kD, A_::kH *B_::kH, A_::kW *B_::kW, A_::kC *B_::kC > Shape
Definition: shape.h:119
A template defining Fragment Concept.
Definition: fragment.h:99
ShapeMul< ThreadGemmShape, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: thread_multiply_add.h:54
ScalarB_ ScalarB
The type for B.
Definition: thread_multiply_add.h:60
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Multiply-add: d = a*b + c.
Definition: thread_multiply_add.h:72
CUTLASS_DEVICE ThreadMultiplyAdd()
Ctor.
Definition: thread_multiply_add.h:69
ThreadGemmShape_ ThreadGemmShape
The shape of a thread-level matrix multiply-accumulate.
Definition: thread_multiply_add.h:48
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
Definition: matrix_traits.h:159
Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW, 16 > Accumulators
The accumulators.
Definition: thread_multiply_add.h:66
ScalarA_ ScalarA
The type for A.
Definition: thread_multiply_add.h:56
Template performing matrix multiply-add operation within a thread.
Definition: thread_multiply_add.h:44
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: thread_multiply_add.h:52
ScalarC_ ScalarC
The type for C and D.
Definition: thread_multiply_add.h:64
ThreadGemmShape AccumulatorsPerThread
Aliased to "AccumulatorsPerThread" for compatibility. Expect to be renamed in CUTLASS v2...
Definition: thread_multiply_add.h:50
Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: thread_multiply_add.h:46
Defines Fragment, a statically-sized array for storing parts of matrices within a thread's registers...
Fragment< ScalarB, AccumulatorsPerThread::kH > FragmentB
The fragment for B.
Definition: thread_multiply_add.h:62