// Adapted from https://github.com/vllm-project/vllm/blob/v0.7.1/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
TensortCsScaleAViewAsC=tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC);// (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C.
// We release buffers to producer warps(dma load) with some mmas in flight
PipelineStatesmem_pipe_release=smem_pipe_read;
// Per block scale values for operand A and B
usingRegLayoutScaleAViewAsC=decltype(make_layout_like(tCsScaleAViewAsC(_,_,_,0).layout()));// `make_layout_like` makes a compact layout.
usingRegLayoutScaleAEssential=decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(),RegLayoutScaleAViewAsC{}.shape()));// an interface to traverse the underlying storage for the compact layout mentioned above
tCrScaleAViewAsC.data()[0]=__shfl_sync(0xffffffff,tCrScaleAViewAsC.data()[0]*scale_b,0);// `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
tCrScaleAViewAsC.data()[0]=__shfl_sync(0xffffffff,tCrScaleAViewAsC.data()[0]*scale_b,0);// `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.