40 namespace threadblock {
    46   typename ThreadblockShape_,
    71       !(ThreadblockShape::kM % WarpShape::kM) &&
    72       !(ThreadblockShape::kM % WarpShape::kM), 
"Divisibility");
    76       ThreadblockShape::kM / WarpShape::kM,
    77       ThreadblockShape::kN / WarpShape::kN,
   102 template <
typename ThreadblockShape_, 
typename WarpShape_, 
int PartitionsK,
   103           typename Element_, 
int ElementsPerAccess, 
int InterleavedK>
   107   static int const kPartitionsK = PartitionsK;
   109   static int const kElementsPerAccess = ElementsPerAccess;
   110   static int const kInterleavedK = InterleavedK;
   118     static int const kTensorOpRows = 8;
   119     static int const kWarpSize = 32;
   122                       !(ThreadblockShape::kM % WarpShape::kM),
   128                         ThreadblockShape::kN / WarpShape::kN, kPartitionsK>;
   143                                WarpShape::kN / InterleavedK>,
 Definition: default_thread_map_tensor_op.h:64
Definition: output_tile_thread_map.h:228
static int const kWarpSize
Definition: default_thread_map_tensor_op.h:68
Definition: aligned_buffer.h:35
Tuple defining point in output tile. 
Definition: output_tile_thread_map.h:57
Epilogue for threadblock scoped GEMMs using Tensor Ops. 
Definition: default_thread_map_tensor_op.h:116
static int const kPartitionsK
Definition: default_thread_map_tensor_op.h:56
Defines common types used for all GEMM-like operators. 
Element_ Element
Definition: default_thread_map_tensor_op.h:108
static int const kCount
Definition: include/cutlass/gemm/gemm.h:67
Definition: output_tile_thread_map.h:442
Template defining a shape used by pitch-linear operators. 
Definition: pitch_linear.h:43
static int const kThreads
Number of participating threads. 
Definition: default_thread_map_tensor_op.h:82
Defines the optimal thread map for TensorOp accumulator layouts. 
Definition: default_thread_map_tensor_op.h:104
Defines the size of an element in bits. 
Definition: numeric_types.h:42
Defines the optimal thread map for TensorOp accumulator layouts. 
Definition: default_thread_map_tensor_op.h:52
static int const kTensorOpRows
Tensor Operations fundamentally perform operations on 8 rows. 
Definition: default_thread_map_tensor_op.h:67
Shape of a matrix multiply-add operation. 
Definition: include/cutlass/gemm/gemm.h:57
ThreadblockShape_ ThreadblockShape
Definition: default_thread_map_tensor_op.h:105
Element_ Element
Definition: default_thread_map_tensor_op.h:57
WarpShape_ WarpShape
Definition: default_thread_map_tensor_op.h:106
WarpShape_ WarpShape
Definition: default_thread_map_tensor_op.h:55
ThreadblockShape_ ThreadblockShape
Definition: default_thread_map_tensor_op.h:54
Defines layout functions used by TensorRef and derived classes for pitch-linear memory. 
static int const kElementsPerAccess
Definition: default_thread_map_tensor_op.h:58