37   typename TransposeShape, 
    42 template <
int ElementCount_>
    43 struct Transpose<ElementCount_, layout::PitchLinearShape<4,4> , int8_t> {
    45     static const int kElementCount = ElementCount_;
    48     using Fragment = cutlass::Array<Element, kElementCount>;
    50     static_assert(!(kElementCount % TransposeShape::kCount), 
"Shape needs to be multiple of 16 elements to do a 4x4 transpose");
    56     int* src_int = 
reinterpret_cast<int*
>(&src);
    57     int* dst_int = 
reinterpret_cast<int*
>(&dst);
    60     for (
int i = 0; i < kElementCount / TransposeShape::kCount; i++){
    62       int const i0 = 4 * i + 0;
    63       int const i1 = 4 * i + 1;
    64       int const i2 = 4 * i + 2;
    65       int const i3 = 4 * i + 3;
    72       int b0, b1, b2, b3, c0;
    73       asm volatile(
"prmt.b32 %0, %1, %2, 0x0040;" : 
"=r"(b0) : 
"r"(a0), 
"r"(a1));
    74       asm volatile(
"prmt.b32 %0, %1, %2, 0x0040;" : 
"=r"(c0) : 
"r"(a2), 
"r"(a3));
    75       asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" : 
"=r"(b0) : 
"r"(b0), 
"r"(c0));
    77       asm volatile(
"prmt.b32 %0, %1, %2, 0x0051;" : 
"=r"(b1) : 
"r"(a0), 
"r"(a1));
    78       asm volatile(
"prmt.b32 %0, %1, %2, 0x0051;" : 
"=r"(c0) : 
"r"(a2), 
"r"(a3));
    79       asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" : 
"=r"(b1) : 
"r"(b1), 
"r"(c0));
    81       asm volatile(
"prmt.b32 %0, %1, %2, 0x0062;" : 
"=r"(b2) : 
"r"(a0), 
"r"(a1));
    82       asm volatile(
"prmt.b32 %0, %1, %2, 0x0062;" : 
"=r"(c0) : 
"r"(a2), 
"r"(a3));
    83       asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" : 
"=r"(b2) : 
"r"(b2), 
"r"(c0));
    85       asm volatile(
"prmt.b32 %0, %1, %2, 0x0073;" : 
"=r"(b3) : 
"r"(a0), 
"r"(a1));
    86       asm volatile(
"prmt.b32 %0, %1, %2, 0x0073;" : 
"=r"(c0) : 
"r"(a2), 
"r"(a3));
    87       asm volatile(
"prmt.b32 %0, %1, %2, 0x5410;" : 
"=r"(b3) : 
"r"(b3), 
"r"(c0));
 Definition: aligned_buffer.h:35
Template defining a shape used by pitch-linear operators. 
Definition: pitch_linear.h:43
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110