CUTLASS — CUDA Templates for Linear Algebra Subroutines and Solvers
#include <mma_tensor_op_tile_iterator_sm70.h>
| Public Types | |
| using | Shape = Shape_ | 
| Shape of tile to load (concept: PitchLinearShape)  More... | |
| using | Element = Element_ | 
| Element type.  More... | |
| using | Layout = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, kKBlock > | 
| Layout of source tile.  More... | |
| using | InstructionShape = InstructionShape_ | 
| Shape of one matrix product operation (concept: MatrixShape)  More... | |
| using | TensorRef = TensorRef< Element, Layout > | 
| TensorRef type for loading element from a tensor.  More... | |
| using | Index = typename TensorRef::Index | 
| Index type.  More... | |
| using | LongIndex = typename TensorRef::LongIndex | 
| Long Index type.  More... | |
| using | TensorCoord = typename TensorRef::TensorCoord | 
| Coordinate for an element in the tensor.  More... | |
| using | Base = MmaVoltaTensorOpMultiplicandTileIterator< layout::PitchLinearShape< Shape::kRow, Shape::kColumn >, kOperand, Element, layout::VoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, kKBlock >, layout::PitchLinearShape< InstructionShape::kRow, InstructionShape::kColumn >, kOpDelta, kThreads > | 
| Underlying tile iterator implementation.  More... | |
| using | Fragment = Array< Element, Shape::kCount/kThreads *2 > | 
| Fragment object holding a thread's part of a tile.  More... | |
| Public Member Functions | |
| CUTLASS_HOST_DEVICE | MmaVoltaTensorOpMultiplicandTileIterator () | 
| Default ctor constructs null iterator.  More... | |
| CUTLASS_HOST_DEVICE | MmaVoltaTensorOpMultiplicandTileIterator (TensorRef const &ref, int lane_id) | 
| Constructor from TensorRef.  More... | |
| CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & | add_pointer_offset (LongIndex offset) | 
| Adds a pointer offset to internal pointer(s) to advance through memory.  More... | |
| CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & | add_tile_offset (TensorCoord const &tile_offset) | 
| CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & | operator++ () | 
| Advances the iterator along the advance dimension.  More... | |
| CUTLASS_HOST_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & | operator-- () | 
| Advances the iterator along the advance dimension.  More... | |
| CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & | operator+= (TensorCoord const &tile_offset) | 
| CUTLASS_DEVICE MmaVoltaTensorOpMultiplicandTileIterator & | operator-= (TensorCoord const &tile_offset) | 
| CUTLASS_HOST_DEVICE void | load (Fragment &frag) const | 
| Loads a fragment from memory at the location pointed to by the iterator.  More... | |
| CUTLASS_DEVICE void | load_with_pointer_offset (Fragment &frag, Index pointer_offset) const | 
| Loads a fragment from memory with additional logical offset.  More... | |
| CUTLASS_DEVICE void | load_with_byte_offset (Fragment &frag, Index byte_offset) const | 
| Loads a fragment from memory with additional logical offset.  More... | |
| CUTLASS_DEVICE void | load (Fragment &frag, TensorCoord const &tile_offset) const | 
| Loads a fragment from memory with logical offset in units of whole tiles.  More... | |
| CUTLASS_DEVICE void | load (Fragment &frag, TensorCoord const &tile_offset, Index pointer_offset) const | 
| Loads a fragment from memory with logical offset in units of whole tiles.  More... | |
| CUTLASS_DEVICE void | load_with_byte_offset (Fragment &frag, TensorCoord const &tile_offset, Index byte_offset) const | 
| Loads a fragment from memory with logical offset in units of whole tiles.  More... | |
| CUTLASS_DEVICE void | set_kgroup_index (int k_group) | 
| Static Public Attributes | |
| static Operand const | kOperand = Operand_ | 
| Operand tag.  More... | |
| static int const | kKBlock = KBlock | 
| KBlock size.  More... | |
| static int const | kOpDelta = OpDelta_ | 
| static int const | kThreads = 32 | 
| Number of participating threads.  More... | |
This tile iterator is specialized for 32-thread TensorOps. It uses LDS to load from shared memory and therefore must be initialized with a TensorRef to shared memory.
Satisfies: ReadableRandomAccessContiguousTileIteratorConcept
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Base = MmaVoltaTensorOpMultiplicandTileIterator< layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, kOperand, Element, layout::VoltaTensorOpMultiplicandCrosswise<sizeof_bits<Element_>::value, kKBlock>, layout::PitchLinearShape<InstructionShape::kRow, InstructionShape::kColumn>, kOpDelta, kThreads> | 
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Element = Element_ | 
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Fragment = Array<Element, Shape::kCount / kThreads * 2> | 
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Index = typename TensorRef::Index | 
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::InstructionShape = InstructionShape_ | 
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Layout = cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits<Element_>::value, kKBlock> | 
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::LongIndex = typename TensorRef::LongIndex | 
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::Shape = Shape_ | 
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::TensorCoord = typename TensorRef::TensorCoord | 
| using cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< Shape_, Operand_, Element_, cutlass::layout::ColumnMajorVoltaTensorOpMultiplicandCrosswise< sizeof_bits< Element_ >::value, KBlock >, InstructionShape_, OpDelta_, 32 >::TensorRef = TensorRef<Element, Layout> | 
| 
 | inline | 
| 
 | inline | 
| 
 | inline | 
| 
 | inline | 
Advances an iterator along logical dimensions of matrix in units of whole tiles
| 
 | inline | 
| 
 | inline | 
| frag | fragment to load from the tensor | 
| tile_offset | loads a tile with a logical offset in units of whole tiles | 
| 
 | inline | 
| frag | fragment to load from the tensor | 
| tile_offset | loads a tile with a logical offset in units of whole tiles | 
| pointer_offset | loads a tile with a logical offset AND a pointer offset | 
| 
 | inline | 
| frag | fragment to load from the tensor | 
| byte_offset | loads a tile with a linear offset | 
| 
 | inline | 
| frag | fragment to load from the tensor | 
| tile_offset | loads a tile with a logical offset in units of whole tiles | 
| byte_offset | loads a tile with a logical offset AND a pointer offset | 
| 
 | inline | 
| frag | fragment to load from the tensor | 
| pointer_offset | loads a tile with a linear offset | 
| 
 | inline | 
| 
 | inline | 
advances in units of whole tiles along the logical coordinate space of the tensor
| 
 | inline | 
advances in units of whole tiles along the logical coordinate space of the tensor
| 
 | inline | 
| 
 | inline | 
Notify the iterator which k-group it is currently pointing to.
This does not advance the iterator. Rather, it overrides its internal tracking with constant-valued k-group index to enable the compiler to fold constants and achieve more efficient code.
This is used by some nontrivial permuted layouts.
| 
 | static | 
| 
 | static | 
Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
| 
 | static | 
| 
 | static | 
Generated by Doxygen 1.8.11