|  | CUTLASS
    CUDA Templates for Linear Algebra Subroutines and Solvers | 
#include <mma_simt_tile_iterator.h>
| Public Types | |
| using | Shape = Shape_ | 
| Shape of tile to load (concept: MatrixShape)  More... | |
| using | Element = Element_ | 
| Element type.  More... | |
| using | Layout = layout::RowMajorInterleaved< 4 > | 
| Layout of policy.  More... | |
| using | Policy = Policy_ | 
| Decomposition of elements among threads.  More... | |
| using | TensorRef = TensorRef< Element, Layout > | 
| TensorRef type for loading element from a tensor.  More... | |
| using | Index = typename TensorRef::Index | 
| Index type.  More... | |
| using | LongIndex = typename TensorRef::LongIndex | 
| Long Index type.  More... | |
| using | TensorCoord = typename TensorRef::TensorCoord | 
| Coordinate for an element in the tensor.  More... | |
| using | ThreadShape = MatrixShape< Shape::kRow, Shape::kColumn/Policy::WarpShape::kColumn > | 
| Thread-level shape of a fragment.  More... | |
| using | Iterations = MatrixShape< ThreadShape::kRow/Policy::LaneMmaShape::kK, ThreadShape::kColumn/Policy::LaneMmaShape::kN > | 
| Number of individual loads.  More... | |
| using | Fragment = Array< Element, ThreadShape::kCount > | 
| Fragment object holding a thread's part of a tile.  More... | |
| Public Member Functions | |
| CUTLASS_HOST_DEVICE | MmaSimtTileIterator () | 
| Default ctor constructs null iterator.  More... | |
| CUTLASS_HOST_DEVICE | MmaSimtTileIterator (TensorRef ref, int lane_id) | 
| Constructor from TensorRef.  More... | |
| CUTLASS_HOST_DEVICE MmaSimtTileIterator & | add_pointer_offset (LongIndex offset) | 
| Adds a pointer offset to internal pointer(s) to advance through memory.  More... | |
| CUTLASS_HOST_DEVICE MmaSimtTileIterator & | add_tile_offset (TensorCoord const &coord) | 
| Advances an iterator along logical dimensions of matrix in units of whole tiles.  More... | |
| CUTLASS_HOST_DEVICE MmaSimtTileIterator & | operator++ () | 
| Advances the iterator along the advance dimension.  More... | |
| CUTLASS_HOST_DEVICE MmaSimtTileIterator & | operator-- () | 
| Moves the iterator backward along the advance dimension.  More... | |
| CUTLASS_HOST_DEVICE void | load_with_pointer_offset (Fragment &frag, Index pointer_offset) const | 
| Loads a fragment from memory at the location pointed to by the iterator.  More... | |
| CUTLASS_HOST_DEVICE void | load (Fragment &frag) const | 
| Loads a fragment from memory at the location pointed to by the iterator.  More... | |
| CUTLASS_HOST_DEVICE void | store_with_pointer_offset (Fragment const &frag, Index pointer_offset) const | 
| Stores a fragment to memory at the location pointed to by the iterator.  More... | |
| CUTLASS_HOST_DEVICE void | store (Fragment const &frag, Index pointer_offset) const | 
| Stores a fragment to memory at the location pointed to by the iterator.  More... | |
| CUTLASS_DEVICE void | set_kgroup_index (int k_group) | 
| Static Public Attributes | |
| static Operand const | kOperand = Operand::kB | 
| Operand tag.  More... | |
| static const int | kInterleave = 4 | 
| Interleave factor.  More... | |
| static const int | kPartitionsK = PartitionsK | 
| Number of partitions along K dimension.  More... | |
| static const int | kGroupPerTile = PartitionGroupSize / Shape::kRow | 
| Number of KGroups per kPartition.  More... | |
Specialization for B operands of row-major k-interleaved layouts.
Concept: MutableRandomAccessContiguousTileIteratorConcept
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::Element = Element_ | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::Fragment = Array<Element, ThreadShape::kCount> | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::Index = typename TensorRef::Index | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::Iterations = MatrixShape< ThreadShape::kRow / Policy::LaneMmaShape::kK, ThreadShape::kColumn / Policy::LaneMmaShape::kN > | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::Layout = layout::RowMajorInterleaved<4> | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::LongIndex = typename TensorRef::LongIndex | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::Policy = Policy_ | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::Shape = Shape_ | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::TensorCoord = typename TensorRef::TensorCoord | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::TensorRef = TensorRef<Element, Layout> | 
| using cutlass::gemm::warp::MmaSimtTileIterator< Shape_, Operand::kB, Element_, layout::RowMajorInterleaved< 4 >, Policy_, PartitionsK, PartitionGroupSize >::ThreadShape = MatrixShape< Shape::kRow, Shape::kColumn / Policy::WarpShape::kColumn > | 
| 
 | inline | 
| 
 | inline | 
| 
 | inline | 
| 
 | inline | 
| 
 | inline | 
| 
 | inline | 
| 
 | inline | 
| 
 | inline | 
| 
 | inline | 
Notify the iterator which k-group it is currently pointing to.
This does not advance the iterator. Rather, it overrides the iterator's internal tracking with a constant-valued k-group index, enabling the compiler to fold constants and generate more efficient code.
This is used by some nontrivial permuted layouts.
| 
 | inline | 
| 
 | inline | 
| 
 | static | 
| 
 | static | 
| 
 | static | 
| 
 | static | 
 1.8.11
 1.8.11