31 #include <cuda_fp16.h> 39 template <
typename GlobalIterator_>
44 typedef typename GlobalIterator::Fragment
Fragment;
65 int const* src_int =
reinterpret_cast<int const*
>(&src[0]);
66 int* dst_int =
reinterpret_cast<int*
>(&dst[0]);
69 for (
int d = 0; d < FragmentShape::kD; ++d) {
71 int const i0 = 2 * d + 0;
72 int const i1 = 2 * d + 1;
78 asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" :
"=r"(b0) :
"r"(a0),
"r"(a1));
79 asm volatile(
"prmt.b32 %0, %1, %2, 0x7632;" :
"=r"(b1) :
"r"(a0),
"r"(a1));
GlobalIterator_ GlobalIterator
The global iterator.
Definition: hgemm_swizzle.h:42
CUTLASS_DEVICE HgemmSwizzle()
Constructor. The src and dst fragments must be half fragments.
Definition: hgemm_swizzle.h:60
CUTLASS_DEVICE void transform(Fragment const &src, Fragment &dst)
Transform a fragment.
Definition: hgemm_swizzle.h:63
Fragment InputFragment
The input fragment.
Definition: hgemm_swizzle.h:49
Fragment OutputFragment
The output fragment.
Definition: hgemm_swizzle.h:51
GlobalIterator::Fragment Fragment
The source fragment.
Definition: hgemm_swizzle.h:44
Defines Fragment, a statically-sized array for storing parts of matrices within a thread's registers...
GlobalIterator::FragmentShape FragmentShape
The shape of the source fragment.
Definition: hgemm_swizzle.h:46
Computes derived counts of a Layout Concept-based class.
Definition: shape.h:79
Definition: hgemm_swizzle.h:40