// Debug trace, emitted by each thread that reaches this statement. Fields, in order:
//   tid - threadIdx.x of the calling thread
//   num - result of invoking ordered_src_access_idx[Number<ordered_gather_dim>{}]
//   off - src_coord_.GetOffset(), followed by gather_offset
// NOTE(review): every field is formatted with %d; confirm that each argument is
// int-sized after default promotion (the argument types are declared outside this view).
// NOTE(review): presumably temporary debugging output - confirm whether it should ship.
printf("use tid %d num %d off %d %d\n",
       threadIdx.x,
       ordered_src_access_idx[Number<ordered_gather_dim>{}](),
       src_coord_.GetOffset(),
       gather_offset);
// printf("use tid %d num %d off %d %d\n", threadIdx.x, ordered_src_access_idx[Number<ordered_gather_dim>{}](), src_coord_.GetOffset(), gather_offset );