38 namespace threadblock {
    49 template <
typename Shape_, 
typename Element_, 
int AdvanceRank,
    50           typename ThreadMap_, 
int Alignment>
    53     layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
    54                                           int(128 / sizeof(Element_))>,
    55     AdvanceRank, ThreadMap_, Alignment> {
    59     "Specialization for pitch-linear iterator may along advance along the "    60     "contiguous(rank=0) or strided(rank=1) dimension.");
    67   static int const kAdvanceRank = AdvanceRank;
    68   static int const kAlignment = Alignment;
    82     static int const kAccessSizeInBits = 128;
    86       "This iterator requires a policy whose access size is 128bs");
    92   using AccessType = Array<Element, Layout::kElementsPerAccess>;
    97   using Fragment = Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
   119       : address_iterator_(ref, thread_id) {}
   124     address_iterator_.add_pointer_offset(pointer_offset);
   130     address_iterator_.add_tile_offset({0, 1});
   146     address_iterator_.add_tile_offset(coord);
   152     address_iterator_.set_iteration_index(0);
   153     AccessType *frag_ptr = 
reinterpret_cast<AccessType *
>(&frag);
   156     for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
   158       for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
   159         int access_idx = c + s * ThreadMap::Iterations::kContiguous;
   160         frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
   169     load_with_pointer_offset(frag, 0);
   175     address_iterator_.set_iteration_index(0);
   176     AccessType 
const *frag_ptr = 
reinterpret_cast<AccessType 
const *
>(&frag);
   179     for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
   181       for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
   182         int access_idx = c + s * ThreadMap::Iterations::kContiguous;
   183         *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
   192     store_with_pointer_offset(frag, 0);
   205 template <
typename Shape_, 
typename Element_, 
int AdvanceRank,
   206           typename ThreadMap_, 
int Alignment>
   209     layout::ColumnMajorTensorOpMultiplicandCongruous<
   210         sizeof_bits<Element_>::value, int(128 / sizeof(Element_))>,
   211     AdvanceRank, ThreadMap_, Alignment> {
   215     "Specialization for column-major iterator may along advance along the "   216     "columns(rank=0) or rows(rank=1) dimension.");
   222   static int const kAdvanceRank = AdvanceRank;
   223   static int const kAlignment = Alignment;
   237                                             int(128 / 
sizeof(Element))>,
   238       (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
   243   using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
   257   ): iterator_({ref.
data(), ref.
stride()}, thread_id) {
   264     iterator_.add_pointer_offset(pointer_offset);
   270     iterator_.add_tile_offset({coord.row(), coord.column()});
   292     iterator_.load_with_pointer_offset(frag, pointer_offset);
   298     load_with_pointer_offset(frag, 0);
   305     Index pointer_offset) {
   307     iterator_.store_with_pointer_offset(frag, pointer_offset);
   313     store_with_pointer_offset(frag, 0);
   326 template <
typename Shape_, 
typename Element_, 
int AdvanceRank,
   327           typename ThreadMap_, 
int Alignment>
   330     layout::RowMajorTensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
   331                                                   int(128 / sizeof(Element_))>,
   332     AdvanceRank, ThreadMap_, Alignment> {
   336     "Specialization for row-major iterator may along advance along the "   337     "columns(rank=0) or rows(rank=1) dimension.");
   340   using Element = Element_;
   343   static int const kAdvanceRank = AdvanceRank;
   344   static int const kAlignment = Alignment;
   357       layout::TensorOpMultiplicandCongruous<sizeof_bits<Element_>::value,
   358                                             int(128 / 
sizeof(Element))>,
   359       (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
   364   using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
   378   ): iterator_({ref.
data(), ref.
stride()}, thread_id) {
   385     iterator_.add_pointer_offset(pointer_offset);
   391     iterator_.add_tile_offset({coord.column(), coord.row()});
   415     iterator_.load_with_pointer_offset(frag, pointer_offset);
   421     load_with_pointer_offset(frag, 0);
   428     Index pointer_offset) {
   430     iterator_.store_with_pointer_offset(frag, pointer_offset);
   436     store_with_pointer_offset(frag, 0);
   449 template <
typename Shape_, 
typename Element_, 
int AdvanceRank,
   450           typename ThreadMap_, 
int Alignment, 
int Crosswise>
   452                           layout::TensorOpMultiplicandCrosswise<
   453                               sizeof_bits<Element_>::value, Crosswise>,
   454                           AdvanceRank, ThreadMap_, Alignment> {
   457       AdvanceRank == 0 || AdvanceRank == 1,
   458       "Specialization for pitch-linear iterator may along advance along the "   459       "contiguous(rank=0) or strided(rank=1) dimension.");
   462   using Element = Element_;
   467   static int const kAdvanceRank = AdvanceRank;
   468   static int const kAlignment = Alignment;
   482     static int const kAccessSizeInBits = 128;
   486                   "This iterator requires a policy whose access size is 128bs");
   491   using AccessType = Array<Element, Layout::kElementsPerAccess>;
   496       Array<Element, ThreadMap::Iterations::kCount * Layout::kElementsPerAccess>;
   516       : address_iterator_(ref, thread_id) {}
   521     address_iterator_.add_pointer_offset(pointer_offset);
   527     address_iterator_.add_tile_offset({1, 0});
   543     address_iterator_.add_tile_offset(coord);
   549     address_iterator_.set_iteration_index(0);
   550     AccessType *frag_ptr = 
reinterpret_cast<AccessType *
>(&frag);
   553     for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
   555       for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
   556         int access_idx = c + s * ThreadMap::Iterations::kContiguous;
   557         frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset);
   570     address_iterator_.set_iteration_index(0);
   571     AccessType 
const *frag_ptr = 
reinterpret_cast<AccessType 
const *
>(&frag);
   574     for (
int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
   576       for (
int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
   577         int access_idx = c + s * ThreadMap::Iterations::kContiguous;
   578         *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx];
   598 template <
typename Shape_, 
typename Element_, 
int AdvanceRank,
   599           typename ThreadMap_, 
int Alignment, 
int Crosswise>
   601                           layout::ColumnMajorTensorOpMultiplicandCrosswise<
   602                               sizeof_bits<Element_>::value, Crosswise>,
   603                           AdvanceRank, ThreadMap_, Alignment> {
   606       AdvanceRank == 0 || AdvanceRank == 1,
   607       "Specialization for column-major iterator may along advance along the "   608       "columns(rank=0) or rows(rank=1) dimension.");
   611   using Element = Element_;
   614   static int const kAdvanceRank = AdvanceRank;
   615   static int const kAlignment = Alignment;
   627       layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
   630       (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>;
   634   using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
   646       : iterator_({ref.
data(), ref.
stride()}, thread_id) {}
   651     iterator_.add_pointer_offset(pointer_offset);
   657     iterator_.add_tile_offset({coord.row(), coord.column()});
   679     iterator_.load_with_pointer_offset(frag, pointer_offset);
   689     iterator_.store_with_pointer_offset(frag, pointer_offset);
   706 template <
typename Shape_, 
typename Element_, 
int AdvanceRank,
   707           typename ThreadMap_, 
int Alignment, 
int Crosswise>
   709                           layout::RowMajorTensorOpMultiplicandCrosswise<
   710                               sizeof_bits<Element_>::value, Crosswise>,
   711                           AdvanceRank, ThreadMap_, Alignment> {
   714       AdvanceRank == 0 || AdvanceRank == 1,
   715       "Specialization for row-major iterator may along advance along the "   716       "columns(rank=0) or rows(rank=1) dimension.");
   719   using Element = Element_;
   722   static int const kAdvanceRank = AdvanceRank;
   723   static int const kAlignment = Alignment;
   735       layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
   738       (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>;
   742   using Fragment = Array<Element, UnderlyingIterator::Fragment::kElements>;
   754       : iterator_({ref.
data(), ref.
stride()}, thread_id) {}
   759     iterator_.add_pointer_offset(pointer_offset);
   765     iterator_.add_tile_offset({coord.column(), coord.row()});
   787     iterator_.load_with_pointer_offset(frag, pointer_offset);
   797     iterator_.store_with_pointer_offset(frag, pointer_offset);
 
int64_t LongIndex
Long index type used for offsets. 
Definition: tensor_op_multiplicand_sm75.h:434
int32_t Index
Index type used for coordinates. 
Definition: tensor_op_multiplicand_sm75.h:640
Definition: aligned_buffer.h:35
Coordinate in pitch-linear space. 
Definition: pitch_linear.h:52
int32_t Index
Index type used for coordinates. 
Definition: tensor_op_multiplicand_sm75.h:431
int32_t Index
Index type used for coordinates. 
Definition: tensor_op_multiplicand_sm75.h:536
Definition: tensor_op_multiplicand_sm75.h:734
CUTLASS_HOST_DEVICE Element * data() const 
Returns the pointer to referenced data. 
Definition: tensor_ref.h:254
int64_t LongIndex
Long index type used for offsets. 
Definition: tensor_op_multiplicand_sm75.h:539
int32_t Index
Index type used for coordinates. 
Definition: tensor_op_multiplicand_sm75.h:221
Definition: tensor_op_multiplicand_sm75.h:422
int32_t Index
Index type used for coordinates. 
Definition: tensor_op_multiplicand_sm75.h:843
int32_t Index
Index type used for coordinates. 
Definition: tensor_op_multiplicand_sm75.h:742
Definition: tensor_op_multiplicand_sm75.h:835
Definition: tensor_op_multiplicand_sm75.h:213
int64_t LongIndex
Long index type used for offsets. 
Definition: tensor_op_multiplicand_sm75.h:224
Template defining a shape used by pitch-linear operators. 
Definition: pitch_linear.h:43
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
CUTLASS_HOST_DEVICE half_t & operator++(half_t &lhs)
Definition: half.h:694
int64_t LongIndex
Long index type used for offsets. 
Definition: tensor_op_multiplicand_sm75.h:846
CUTLASS_HOST_DEVICE Stride stride() const 
Returns the layout object's stride vector. 
Definition: tensor_ref.h:277
int64_t LongIndex
Long index type used for offsets. 
Definition: tensor_op_multiplicand_sm75.h:745
Defines the size of an element in bits. 
Definition: numeric_types.h:42
Templates that compute the addresses used when storing tiles from pitch-linear rank=2 tensors...
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:89
int64_t LongIndex
Long index type used for offsets. 
Definition: tensor_op_multiplicand_sm75.h:643
Templates that implement the storing of tiles from pitch-linear rank=2 tensors. 
Definition: tensor_op_multiplicand_sm75.h:632
Definition: matrix_coord.h:39
Definition: tensor_op_multiplicand_sm75.h:527