Unverified commit 26c8fcc9, authored by Xiaowei Ren, committed by GitHub

Add FP8 support to CP implementation with KV P2P (#1114)



* add window_size to AttnFuncWithCP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add seq_offsets_qkvo for cudnn thd
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add seq_offsets_qkvo to AttnFuncWithCP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix seq_offsets calculation of cudnn thd
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* remove a thd assert
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix bias for thd test
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add thd test for cudnn FA with CP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* skip GQA/MQA test for cuDNN THD
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* make sure seq_offsets are computed with qkv_group of hd_hd_hd while CP>1
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix seq_offsets inputs
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* remove two comments
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix attn mask type for cudnn thd with cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix attn_mask_type check
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix attn_mask_type for cudnn fa with thd
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix a typo
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix out dout in bwd
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* assert cudnn+thd does not support attn bias
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* check if attn_mask_type has padding
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor change
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* change cp test batch size to 2
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix code format
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix two assert info
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix assert comment
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix assert comments
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix assert comments
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* assert swa+CP cannot work with thd format
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add a new CP function for swa
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add a missing dgrad
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor change
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add draft fwd function for swa+cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor change
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* enable flash attention for swa+cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* remove an assert of swa+cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* call SWAFuncWithCP for swa+cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* typo fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* use 2hd layout
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* change qkv_format check
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add a code comment
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* tensor shape bug fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* tensor shape fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add function to compute cu_seqlens of a cp rank
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add cu_seqlens and cu_seqlens_padded to context parallelism
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
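The sharding behind these cu_seqlens commits can be sketched in isolation. The helper below is a simplified, hypothetical illustration (the PR's actual logic is `get_cu_seqlens_on_cp_rank`, which also accounts for padded tokens): under the load-balanced layout each sequence is split into 2*cp_size chunks and every rank keeps two of them, so each rank holds seqlen/cp_size tokens per sequence.

```python
import torch

def cu_seqlens_per_cp_rank(cu_seqlens, cp_size):
    """Cumulative sequence lengths as seen by one CP rank.

    Assumes every sequence length is divisible by 2*cp_size; each rank
    keeps chunks r and (2*cp_size - 1 - r), i.e. 1/cp_size of the tokens,
    so the result is the same on every rank.
    """
    seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
    assert torch.all(seqlens % (2 * cp_size) == 0), "seqlen must divide 2*cp_size"
    per_rank = seqlens // cp_size  # two chunks of seqlen/(2*cp_size) each
    zero = torch.zeros(1, dtype=cu_seqlens.dtype)
    return torch.cat([zero, torch.cumsum(per_rank, dim=0)])
```

For example, with global `cu_seqlens = [0, 8, 24]` and `cp_size = 2`, each rank sees `[0, 4, 12]`.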

* typo fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor change
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix FlashAttention output sequence length
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix cu_seqlens_kv_per_step calculation
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* zero dQKV for ending padded tokens
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* zero dQKV tensors of FlashAttention
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix softmax_lse correction
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
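The softmax_lse correction being fixed here is the standard log-sum-exp merge used whenever attention is computed over the keys in pieces, as each CP step sees only part of the KV. A self-contained sketch of the merge rule (not the PR's code, which works on the ring-attention partials):

```python
import torch

def merge_attn_outputs(out_a, lse_a, out_b, lse_b):
    """Merge partial attention outputs over two disjoint key sets.

    out_*: [s, d] partial outputs; lse_*: [s] log-sum-exp of the attention
    logits over each key subset. Each partial output is reweighted by the
    fraction of total softmax mass its key subset contributes.
    """
    lse = lse_a + torch.log1p(torch.exp(lse_b - lse_a))  # log(exp(a) + exp(b))
    out = (torch.exp(lse_a - lse).unsqueeze(-1) * out_a
           + torch.exp(lse_b - lse).unsqueeze(-1) * out_b)
    return out, lse
```

Splitting the keys in half, attending to each half, and merging reproduces full attention exactly, which is what the CP loop relies on.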

* remove padded tokens of KV to save communication
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* do not need to zero dkv for FlashAttention any more
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* zero out tensors
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* remove redundant code
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix CP unit test
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix kv shape of cp test with thd format
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* update cp unit test
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* add simple code framework
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* try not to have a separate CP function for SWA
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* backup some code change
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* back up code
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* clean up fwd implementation of SWAFuncWithCP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* remove redundant code
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* code cleaning
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix assert info
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* reduce kv chunk concat overheads
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor change
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* make AttnFuncWithCP and SWAFuncWithCP have same API
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add a docstring
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* preliminary implementation of SWAFuncWithCP forward seems working
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix output shape of SWAFuncWithCP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* code refactoring for FlashAttention and add a code placeholder for bwd
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* use gather_along_first_dim
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* finish the preliminary implementation of bwd
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* remove redundant code
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix assert condition
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add draft implementation of SWA+CP with FusedAttention
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix attention mask type of swa+cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* code cleaning
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add qkv_layout
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add missing window_size argument
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* typo fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix kv shape of swa+cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* bug and typo fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix dout shape
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add multi stream in fwd of swa+cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* save chunk_ids_to_kv_ag in fwd
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
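`chunk_ids_to_kv_ag` presumably indexes the all-gathered KV back into global sequence order. A hypothetical reconstruction under the two-chunks-per-rank layout (rank r holds global chunks r and 2*cp_size-1-r, so the flattened all-gather order interleaves front and back chunks):

```python
def chunk_ids_to_reorder_kv_ag(cp_size):
    """Indices that restore global chunk order after a KV all-gather.

    Hypothetical sketch: the gathered buffer is [rank0 slot0, rank0 slot1,
    rank1 slot0, rank1 slot1, ...], where slot0 of rank r is global chunk r
    and slot1 is global chunk 2*cp_size-1-r.
    """
    ids = []
    for i in range(2 * cp_size):
        if i < cp_size:
            ids.append(2 * i)  # chunk i is slot 0 of rank i
        else:
            ids.append(2 * (2 * cp_size - 1 - i) + 1)  # slot 1 of rank 2*cp_size-1-i
    return ids
```

Indexing the gathered chunk dimension with this list yields chunks 0..2*cp_size-1 in order; caching it in fwd avoids recomputing the permutation in bwd.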

* add multi stream in bwd of swa+cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor fix to cp stream sync
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* rename AttnFuncWithCP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* check if window size is None
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix docstring of AttnFuncWithCP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add env var for users to choose KV ag or KV p2p
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* update cp tests
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix window size in cp unit test
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix pytest skip messages
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add cp_comm_type into API
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* code cleaning
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* add deterministic knob in cuDNN fused attn backend
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* pass fp8 and fp8_meta to attn_func_with_cp
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* assert only Fused Attn can support FP8+CP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* remove redundant assert
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add a fwd draft implementation of FP8 + CP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* save fp8 and fp8_meta
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* assert sequence length divisible requirements
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* remove a redundant qkv_layout compute
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* if condition change
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* some typo fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add support table of context parallelism
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* typo and code format fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* do not print multiple disabling messages
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* bug fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix aux_ctx_tensors of FP8
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* bug fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix device in torch.arange and adjust code for the PR of MLA
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* commit code change for FP8+CP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* commit more code change for FP8+CP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* commit more fp8 code for FP8+CP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* bug fixes
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* bug fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* cast merged CP results from FP32 to BF16
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* typo fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor change
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix softmax_lse
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix some bugs of FP8 dkv exchange
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* typo fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add FP8 unit test
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
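The FP8 test path uses delayed scaling (`DelayedScaling(fp8_dpa=True)` in the diff). As a rough conceptual illustration only, not Transformer Engine's implementation: delayed scaling tracks a history of absolute-max (amax) values per tensor and derives the quantization scale from it, rather than recomputing the scale from the current tensor alone.

```python
import torch

FP8_E4M3_MAX = 448.0  # max representable magnitude in the E4M3 format

class ToyDelayedScalingMeta:
    """Toy stand-in for fp8_meta state: amax history plus derived scale.

    Simplification: real delayed scaling uses the scale derived from
    *previous* steps' amax history; here the current amax is included.
    """
    def __init__(self, history_len=16):
        self.amax_history = torch.zeros(history_len)

    def update_and_scale(self, x):
        self.amax_history = torch.roll(self.amax_history, 1)
        self.amax_history[0] = x.abs().max()
        amax = self.amax_history.max().clamp(min=1e-12)
        return FP8_E4M3_MAX / amax

def fake_quantize_fp8(x, meta):
    # Scale into FP8 range, clamp as a proxy for saturation, scale back.
    scale = meta.update_and_scale(x)
    return (x * scale).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX) / scale
```

This is why the PR asserts amax reduction across the CP group: each rank only sees its own shard, so the amax (and hence the scale) must be agreed on collectively or ranks would quantize with inconsistent scales.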

* fix typos and clean asserts
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix get_p2p_comm_info
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* fix dkv p2p exchange
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* minor fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* change FP8 dkv P2P to A2A
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* add FP8+CP unit test
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* typo fix
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* assert amax reduction is needed for FP8+CP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* remove duplicated code
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* destroy process group in CP unit test
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* remove interval from fp8_recipe because it has been deprecated
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* try to fix the failed CP test with the latest CI pipeline
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* remove redundant f before string
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

* change META_O_CP
Signed-off-by: Xiaowei Ren <xren@nvidia.com>

---------
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
Co-authored-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Xiaowei Ren <xren@cs-cw-dfw-login-01.cm.cluster>
parent 525de6cc
@@ -2,15 +2,18 @@
#
# See LICENSE for license information.
import os, sys
import os, sys, logging
from contextlib import nullcontext
import torch
import torch.distributed as dist
from transformer_engine.pytorch.attention import DotProductAttention
from transformer_engine.pytorch.attention import get_cu_seqlens_on_cp_rank
import transformer_engine_torch as tex
from test_fused_attn_with_cp import model_configs_flash_attn, model_configs_fused_attn
from transformer_engine.pytorch.fp8 import fp8_autocast
from transformer_engine.common.recipe import DelayedScaling
dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16}
dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.bfloat16}
def run_dpa_with_cp(
@@ -57,6 +60,9 @@ def run_dpa_with_cp(
assert rank in cp_comm_ranks
cp_comm_group = dist.new_group(cp_comm_ranks, backend="nccl")
if dtype == "fp8":
fp8_recipe = DelayedScaling(fp8_dpa=True)
# instantiate core attn module
core_attn = DotProductAttention(
config.num_heads,
@@ -171,6 +177,13 @@ def run_dpa_with_cp(
# run core_attn without CP
for x in [q, k, v]:
x.requires_grad = True
if dtype == "fp8":
fp8_context = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=cp_comm_group)
else:
fp8_context = nullcontext()
with fp8_context:
out = core_attn(
q,
k,
@@ -180,7 +193,9 @@
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
cu_seqlens_q_padded=None if cu_seqlens_q_padded is None else cu_seqlens_q_padded[:-1],
cu_seqlens_kv_padded=None if cu_seqlens_kv_padded is None else cu_seqlens_kv_padded[:-1],
cu_seqlens_kv_padded=(
None if cu_seqlens_kv_padded is None else cu_seqlens_kv_padded[:-1]
),
)
out.backward(dout)
@@ -226,6 +241,14 @@ def run_dpa_with_cp(
core_attn.set_context_parallel_group(
cp_comm_group, cp_comm_ranks, torch.cuda.Stream(), cp_comm_type
)
if dtype == "fp8":
core_attn.reset_fp8_meta_tensors()
fp8_context = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe, fp8_group=cp_comm_group)
else:
fp8_context = nullcontext()
with fp8_context:
out_ = core_attn(
q_,
k_,
@@ -235,7 +258,9 @@
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
cu_seqlens_q_padded=None if cu_seqlens_q_padded is None else cu_seqlens_q_padded[:-1],
cu_seqlens_kv_padded=None if cu_seqlens_kv_padded is None else cu_seqlens_kv_padded[:-1],
cu_seqlens_kv_padded=(
None if cu_seqlens_kv_padded is None else cu_seqlens_kv_padded[:-1]
),
)
out_.backward(dout_)
@@ -244,13 +269,6 @@ def run_dpa_with_cp(
assert torch.all(~torch.isinf(x))
# compare results with and without CP
tols = dict(atol=5e-3, rtol=5e-3)
if dtype == "bf16":
if config.num_heads == config.num_gqa_groups:
tols = dict(atol=2.5e-2, rtol=2.5e-2)
else:
tols = dict(atol=3.5e-2, rtol=3.5e-2)
if qkv_format == "bshd" or qkv_format == "sbhd":
dq, dk, dv, out = [
x.view(
@@ -309,32 +327,55 @@ def run_dpa_with_cp(
else:
assert False, f"{qkv_format} is an unsupported qkv_format!"
if dtype == "bf16":
if config.num_heads == config.num_gqa_groups:
tols = dict(atol=2.5e-2, rtol=2.5e-2)
else:
tols = dict(atol=3.5e-2, rtol=3.5e-2)
elif dtype == "fp16":
tols = dict(atol=5e-3, rtol=5e-3)
elif dtype == "fp8":
tols = dict(atol=5e-1, rtol=5e-1)
rmse_tol = 0.1
else:
assert False, f"{dtype} is an unsupported dtype!"
def _rmse(a, b):
return torch.sqrt((a - b).square().mean()).item()
def _error(a, b):
if dtype != "fp8":
torch.testing.assert_close(a, b, **tols)
else:
try:
torch.testing.assert_close(a, b, **tols)
except Exception as e:
logging.debug(e)
rmse = _rmse(a, b)
rmse_range = max(a.max().item(), b.max().item()) - min(a.min().item(), b.min().item())
assert (
rmse < rmse_tol * rmse_range
), "RMSE {:.5f} is over tolerance {:.5f} ({:.5f} * {:.5f})".format(
rmse, rmse_tol * rmse_range, rmse_tol, rmse_range
)
if qkv_format == "bshd":
torch.testing.assert_close(out_[:, 0], out[:, 0], **tols)
torch.testing.assert_close(dq_[:, 0], dq[:, 0], **tols)
torch.testing.assert_close(dk_[:, 0], dk[:, 0], **tols)
torch.testing.assert_close(dv_[:, 0], dv[:, 0], **tols)
torch.testing.assert_close(out_[:, 1], out[:, 1], **tols)
torch.testing.assert_close(dq_[:, 1], dq[:, 1], **tols)
torch.testing.assert_close(dk_[:, 1], dk[:, 1], **tols)
torch.testing.assert_close(dv_[:, 1], dv[:, 1], **tols)
for a, b in zip([out_, dq_, dk_, dv_], [out, dq, dk, dv]):
_error(a[:, 0], b[:, 0])
_error(a[:, 1], b[:, 1])
elif qkv_format == "sbhd":
torch.testing.assert_close(out_[0], out[0], **tols)
torch.testing.assert_close(dq_[0], dq[0], **tols)
torch.testing.assert_close(dk_[0], dk[0], **tols)
torch.testing.assert_close(dv_[0], dv[0], **tols)
torch.testing.assert_close(out_[1], out[1], **tols)
torch.testing.assert_close(dq_[1], dq[1], **tols)
torch.testing.assert_close(dk_[1], dk[1], **tols)
torch.testing.assert_close(dv_[1], dv[1], **tols)
for a, b in zip([out_, dq_, dk_, dv_], [out, dq, dk, dv]):
_error(a[0], b[0])
_error(a[1], b[1])
elif qkv_format == "thd":
torch.testing.assert_close(out_, out, **tols)
torch.testing.assert_close(dq_, dq, **tols)
torch.testing.assert_close(dk_, dk, **tols)
torch.testing.assert_close(dv_, dv, **tols)
for a, b in zip([out_, dq_, dk_, dv_], [out, dq, dk, dv]):
_error(a, b)
else:
assert False, f"{qkv_format} is an unsupported qkv_format!"
dist.destroy_process_group()
def main(**kwargs):
run_dpa_with_cp(**kwargs)
@@ -90,7 +90,7 @@ model_configs_fused_attn = {
@pytest.mark.skipif(get_cudnn_version() < (8, 9, 7), reason="cuDNN 8.9.7+ is required.")
@pytest.mark.skipif(get_device_compute_capability() < (8, 0), reason="CP tests require sm80+.")
@pytest.mark.parametrize("dtype", ["bf16", "fp16"])
@pytest.mark.parametrize("dtype", ["bf16", "fp16", "fp8"])
@pytest.mark.parametrize("model", model_configs_fused_attn.keys())
@pytest.mark.parametrize("qkv_format", ["bshd", "sbhd", "thd"])
@pytest.mark.parametrize("cp_comm_type", ["p2p", "all_gather"])
@@ -121,8 +121,16 @@ def test_cp_with_fused_attention(dtype, model, qkv_format, cp_comm_type):
)
if config.window_size != (-1, 0) and config.window_size != (-1, -1):
pytest.skip(
f"Fused attention does not support sliding window attention + context parallelism yet!"
"Fused attention does not support sliding window attention + context parallelism yet!"
)
if cp_comm_type == "all_gather" and dtype == "fp8":
pytest.skip(
"CP implementation with KV all-gather does not support FP8 + context parallelism yet!"
)
if dtype == "fp8" and qkv_format == "thd":
pytest.skip("FP8 attention cannot work with THD format yet!")
if dtype == "fp8" and config.attn_bias_type != "no_bias":
pytest.skip("FP8 attention cannot work with bias yet!")
subprocess.run(
get_bash_arguments(