// Primary template: cooperative warp-wide load of MatrixCount 8x8 b16 matrix
// fragments from shared memory into per-thread registers via the ldmatrix
// PTX instruction. Specializations below cover layout::RowMajor and
// layout::ColumnMajor for MatrixCount in {1, 2, 4}.
// NOTE(review): the `template <typename Layout, int MatrixCount>` header lies
// above this extract — confirm against the full source. D receives one 32-bit
// register per matrix; ptr must point into __shared__ memory.
45 inline __device__ 
void ldsm(Array<unsigned, MatrixCount> & D, 
void const* ptr);
    53 #if (__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ == 2)    54   #define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 1    56   #define CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED 0    59 #if ! defined(CUDA_NVVM_GET_SHARED_POINTER_ENABLED)    60   #define CUDA_NVVM_GET_SHARED_POINTER_ENABLED (CUDA_NVVM_GET_SHARED_POINTER_SUPPORTED)    63 #if ! defined(CUDA_LDMATRIX_SUPPORTED)    64   #define CUDA_LDMATRIX_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2))    67 #if ! defined(CUDA_LDMATRIX_ENABLED)    68   #define CUDA_LDMATRIX_ENABLED (CUDA_LDMATRIX_SUPPORTED)    71 #if (CUDA_LDMATRIX_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))    72   #define CUDA_LDMATRIX_ACTIVATED 1    74   #define CUDA_LDMATRIX_ACTIVATED 0    77 #if defined(CUTLASS_GET_SMEM_POINTER)    79 #elif CUDA_NVVM_GET_SHARED_POINTER_ENABLED    80   #if ! defined(NVVM_GET_SMEM_POINTER)    81     #define NVVM_GET_SMEM_POINTER    88     __device__ uint32_t __nvvm_get_smem_pointer(
void*);
    91   #define CUTLASS_GET_SMEM_POINTER(ptr) __nvvm_get_smem_pointer((void*)ptr)    97 inline __device__ 
/// Loads one 8x8 b16 matrix fragment (row-major, no transpose) from shared
/// memory; each of the 32 threads receives one 32-bit register in D.
///
/// D   - destination fragment (one unsigned per thread).
/// ptr - generic pointer into __shared__ memory supplying the row address
///       for this thread's lane (per ldmatrix addressing rules).
///
/// Requires SM75+ (ldmatrix). The extraction this block was recovered from
/// dropped the function header, the `ptr` parameter, the register
/// declaration, and the closing brace; all are restored here.
/// NOTE(review): no pre-SM75 fallback is visible in this extract — D is left
/// unmodified when ldmatrix is unavailable; confirm against the full source.
template <>
inline __device__
void ldsm<layout::RowMajor, 1>(
    Array<unsigned, 1> & D,
    void const* ptr) {

  #if CUDA_LDMATRIX_ACTIVATED

    // Convert the generic pointer to a 32-bit shared-state-space address.
    unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);

    int x;
    asm volatile ("ldmatrix.sync.aligned.x1.m8n8.shared.b16 {%0}, [%1];" : "=r"(x) : "r"(addr));

    reinterpret_cast<int &>(D) = x;

  #endif
}
   119 inline __device__ 
void ldsm<layout::RowMajor, 2>(
   120     Array<unsigned, 2> & D,
   123   #if CUDA_LDMATRIX_ACTIVATED   125     unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);
   128     asm volatile (
"ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];" : 
"=r"(x), 
"=r"(y) : 
"r"(addr));
   129     reinterpret_cast<int2 &
>(D) = make_int2(x, y);
   141 inline __device__ 
void ldsm<layout::RowMajor, 4>(
   142     Array<unsigned, 4> & D,
   145   #if CUDA_LDMATRIX_ACTIVATED   147     unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);
   150     asm volatile (
"ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : 
"=r"(x), 
"=r"(y), 
"=r"(z), 
"=r"(w) : 
"r"(addr));
   151     reinterpret_cast<int4 &
>(D) = make_int4(x, y, z, w);
   167 inline __device__ 
void ldsm<layout::ColumnMajor, 1>(
   168     Array<unsigned, 1> & D,
   170   #if CUDA_LDMATRIX_ACTIVATED   172     unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);
   175     asm volatile (
"ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 {%0}, [%1];" : 
"=r"(x) : 
"r"(addr));
   176     reinterpret_cast<int &
>(D) = x;
   188 inline __device__ 
void ldsm<layout::ColumnMajor, 2>(
   189     Array<unsigned, 2> & D,
   192   #if CUDA_LDMATRIX_ACTIVATED   194     unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);
   197     asm volatile (
"ldmatrix.sync.aligned.x2.trans.m8n8.shared.b16 {%0, %1}, [%2];" : 
"=r"(x), 
"=r"(y) : 
"r"(addr));
   198     reinterpret_cast<int2 &
>(D) = make_int2(x, y);
   210 inline __device__ 
void ldsm<layout::ColumnMajor, 4>(
   211     Array<unsigned, 4> & D,
   214   #if CUDA_LDMATRIX_ACTIVATED   216     unsigned addr = CUTLASS_GET_SMEM_POINTER(ptr);
   219     asm volatile (
"ldmatrix.sync.aligned.x4.trans.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];" : 
"=r"(x), 
"=r"(y), 
"=r"(z), 
"=r"(w) : 
"r"(addr));
   220     reinterpret_cast<int4 &
>(D) = make_int4(x, y, z, w);
// --- Documentation cross-references (residue from the original doc build) ---
// Array: statically sized array of elements that accommodates all
//   CUTLASS-supported numeric types and is safe to use in a union
//   (see aligned_buffer.h).
// ldsm: __device__ void ldsm(Array<unsigned, MatrixCount>& D, void const* ptr)
// layout: defines layout functions used by TensorRef and derived classes.