S<1, 0, 2>, // ABlockTransfer SrcAccessOrder !!!!!! Determined by Layout , Always 1-0-2 If A is row major ,
// ABlockTransferThreadClusterLengths_K0_M_K1 S<8,32,1> (row-col-row; the A tensor is row major). Calculation:
//   First number:  8  = KPerBlock / ABlockTransferSrcScalarPerVector
//   Second number: 32 = BlockSize / first number (the first number is 8)
//   Third number:  1
2, // ABlockTransfer SrcVectorDim !! If A is row major this is always 2
8, // ABlockTransfer SrcScalar PerVector // How you read 'A tensor' data from global memory
8, // ABlockTransfer DstScalar PerVector_K1 // How you write 'A tensor' data to shared memory
0, // ABlockLds AddExtraM
// Tensor A
// For tensor B, these parameters define how data is copied from global memory to shared memory