#pragma once

// NOTE(review): the two include targets below were reconstructed from the
// names this header uses (`__CUDA_ALIGN__`, `ulonglong4`,
// `cute::float_e4m3_t`, `cute::float_e5m2_t`) -- confirm against the build.
#include <cuda_fp8.h>
#include <cute/numeric/numeric_types.hpp>

// Scalar FP8 element types, aliased from CuTe.
using fp8_e4_t = cute::float_e4m3_t;
using fp8_e5_t = cute::float_e5m2_t;

// ---------------------------------------------------------------------------
// Packed fp8_e4_t vectors.
//
// Each pack of N elements requests N-byte alignment. Packs of 8 and more are
// built from two halves (`x` = first half, `y` = second half) of the next
// smaller pack.
// ---------------------------------------------------------------------------

// Two fp8_e4_t values.
struct __CUDA_ALIGN__(2) fp8_e4_2_t {
  fp8_e4_t x;
  fp8_e4_t y;
};

// Four fp8_e4_t values.
struct __CUDA_ALIGN__(4) fp8_e4_4_t {
  fp8_e4_t x;
  fp8_e4_t y;
  fp8_e4_t z;
  fp8_e4_t w;
};

// Eight fp8_e4_t values, stored as two packs of four.
struct __CUDA_ALIGN__(8) fp8_e4_8_t {
  fp8_e4_4_t x;
  fp8_e4_4_t y;
};

// Sixteen fp8_e4_t values, stored as two packs of eight.
struct __CUDA_ALIGN__(16) fp8_e4_16_t {
  fp8_e4_8_t x;
  fp8_e4_8_t y;
};

// Thirty-two fp8_e4_t values, stored as two packs of sixteen.
struct __CUDA_ALIGN__(32) fp8_e4_32_t {
  fp8_e4_16_t x;
  fp8_e4_16_t y;

  // Assigns from a `ulonglong4` by reinterpreting each of its four lanes as
  // one pack of eight elements (no numeric conversion is performed):
  //   rhs.x -> x.x,  rhs.y -> x.y,  rhs.z -> y.x,  rhs.w -> y.y
  // Returns *this.
  __device__ __forceinline__ fp8_e4_32_t &operator=(const ulonglong4 &rhs) {
    // The lane-wise reinterpretation below is only meaningful when one pack
    // of eight elements is exactly as wide as one lane of `rhs`.
    static_assert(sizeof(fp8_e4_8_t) == sizeof(unsigned long long),
                  "fp8_e4_8_t must be the size of one ulonglong4 lane");
    // reinterpret_cast to a const pointer keeps the const-ness of `rhs`.
    x.x = *reinterpret_cast<const fp8_e4_8_t *>(&rhs.x);
    x.y = *reinterpret_cast<const fp8_e4_8_t *>(&rhs.y);
    y.x = *reinterpret_cast<const fp8_e4_8_t *>(&rhs.z);
    y.y = *reinterpret_cast<const fp8_e4_8_t *>(&rhs.w);
    return *this;
  }
};

// ---------------------------------------------------------------------------
// Packed fp8_e5_t vectors; same layout scheme as the fp8_e4_t packs above.
// ---------------------------------------------------------------------------

// Two fp8_e5_t values.
struct __CUDA_ALIGN__(2) fp8_e5_2_t {
  fp8_e5_t x;
  fp8_e5_t y;
};

// Four fp8_e5_t values.
struct __CUDA_ALIGN__(4) fp8_e5_4_t {
  fp8_e5_t x;
  fp8_e5_t y;
  fp8_e5_t z;
  fp8_e5_t w;
};

// Eight fp8_e5_t values, stored as two packs of four.
struct __CUDA_ALIGN__(8) fp8_e5_8_t {
  fp8_e5_4_t x;
  fp8_e5_4_t y;
};

// Sixteen fp8_e5_t values, stored as two packs of eight.
struct __CUDA_ALIGN__(16) fp8_e5_16_t {
  fp8_e5_8_t x;
  fp8_e5_8_t y;
};

// Thirty-two fp8_e5_t values, stored as two packs of sixteen.
struct __CUDA_ALIGN__(32) fp8_e5_32_t {
  fp8_e5_16_t x;
  fp8_e5_16_t y;

  // Assigns from a `ulonglong4` by reinterpreting each of its four lanes as
  // one pack of eight elements (no numeric conversion is performed):
  //   rhs.x -> x.x,  rhs.y -> x.y,  rhs.z -> y.x,  rhs.w -> y.y
  // Returns *this.
  __device__ __forceinline__ fp8_e5_32_t &operator=(const ulonglong4 &rhs) {
    // The lane-wise reinterpretation below is only meaningful when one pack
    // of eight elements is exactly as wide as one lane of `rhs`.
    static_assert(sizeof(fp8_e5_8_t) == sizeof(unsigned long long),
                  "fp8_e5_8_t must be the size of one ulonglong4 lane");
    // reinterpret_cast to a const pointer keeps the const-ness of `rhs`.
    x.x = *reinterpret_cast<const fp8_e5_8_t *>(&rhs.x);
    x.y = *reinterpret_cast<const fp8_e5_8_t *>(&rhs.y);
    y.x = *reinterpret_cast<const fp8_e5_8_t *>(&rhs.z);
    y.y = *reinterpret_cast<const fp8_e5_8_t *>(&rhs.w);
    return *this;
  }
};