INFO 05-28 17:52:54 __init__.py:193] Automatically detected platform rocm. INFO 05-28 17:52:54 __init__.py:193] Automatically detected platform rocm. INFO 05-28 17:52:54 __init__.py:193] Automatically detected platform rocm. INFO 05-28 17:52:54 __init__.py:193] Automatically detected platform rocm. INFO 05-28 17:52:54 __init__.py:193] Automatically detected platform rocm. INFO 05-28 17:52:54 __init__.py:193] Automatically detected platform rocm. INFO 05-28 17:52:54 __init__.py:193] Automatically detected platform rocm. INFO 05-28 17:52:54 __init__.py:193] Automatically detected platform rocm. Could not load Sliding Tile Attention. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Could not load Sliding Tile Attention. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Could not load Sliding Tile Attention. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Could not load Sliding Tile Attention. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Could not load Sliding Tile Attention. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Could not load Sliding Tile Attention. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Could not load Sliding Tile Attention. --> loading model from /home/model/HunyuanVideo/hunyuan-video-t2v-720p <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Could not load Sliding Tile Attention. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Total training parameters = 12821.012544 M --> Initializing FSDP with sharding strategy: full >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> --> applying fdsp activation checkpointing... --> applying fdsp activation checkpointing... --> applying fdsp activation checkpointing... --> applying fdsp activation checkpointing... --> applying fdsp activation checkpointing... --> model loaded --> applying fdsp activation checkpointing... FullyShardedDataParallel( (_fsdp_wrapped_module): HYVideoDiffusionTransformer( (img_in): PatchEmbed( (proj): Conv3d(16, 3072, kernel_size=(1, 2, 2), stride=(1, 2, 2)) (norm): Identity() ) (txt_in): SingleTokenRefiner( (input_embedder): Linear(in_features=4096, out_features=3072, bias=True) (t_embedder): TimestepEmbedder( (mlp): Sequential( (0): Linear(in_features=256, out_features=3072, bias=True) (1): SiLU() (2): Linear(in_features=3072, out_features=3072, bias=True) ) ) (c_embedder): TextProjection( (linear_1): Linear(in_features=4096, out_features=3072, bias=True) (act_1): SiLU() (linear_2): Linear(in_features=3072, out_features=3072, bias=True) ) (individual_token_refiner): IndividualTokenRefiner( (blocks): ModuleList( (0-1): 2 x IndividualTokenRefinerBlock( (norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=True) (self_attn_qkv): Linear(in_features=3072, out_features=9216, bias=True) (self_attn_q_norm): Identity() (self_attn_k_norm): Identity() (self_attn_proj): Linear(in_features=3072, out_features=3072, bias=True) (norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (fc1): Linear(in_features=3072, out_features=12288, bias=True) (act): SiLU() (drop1): Dropout(p=0.0, inplace=False) (norm): Identity() (fc2): Linear(in_features=12288, out_features=3072, bias=True) (drop2): Dropout(p=0.0, inplace=False) ) (adaLN_modulation): Sequential( (0): SiLU() (1): Linear(in_features=3072, out_features=6144, bias=True) ) ) ) ) ) (time_in): TimestepEmbedder( (mlp): Sequential( (0): Linear(in_features=256, out_features=3072, bias=True) (1): SiLU() (2): Linear(in_features=3072, out_features=3072, bias=True) ) ) (vector_in): MLPEmbedder( (in_layer): Linear(in_features=768, out_features=3072, bias=True) (silu): SiLU() (out_layer): Linear(in_features=3072, out_features=3072, bias=True) ) (guidance_in): TimestepEmbedder( (mlp): Sequential( (0): Linear(in_features=256, out_features=3072, bias=True) (1): SiLU() (2): Linear(in_features=3072, out_features=3072, bias=True) ) ) (double_blocks): ModuleList( (0-19): 20 x FullyShardedDataParallel( (_fsdp_wrapped_module): CheckpointWrapper( (_checkpoint_wrapped_module): MMDoubleStreamBlock( (img_mod): ModulateDiT( (act): SiLU() (linear): Linear(in_features=3072, out_features=18432, bias=True) ) (img_norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (img_attn_qkv): Linear(in_features=3072, out_features=9216, bias=True) (img_attn_q_norm): RMSNorm() (img_attn_k_norm): RMSNorm() (img_attn_proj): Linear(in_features=3072, out_features=3072, bias=True) (img_norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (img_mlp): MLP( (fc1): Linear(in_features=3072, out_features=12288, bias=True) (act): GELU(approximate='tanh') (drop1): Dropout(p=0.0, inplace=False) (norm): Identity() (fc2): Linear(in_features=12288, out_features=3072, bias=True) (drop2): Dropout(p=0.0, inplace=False) ) (txt_mod): ModulateDiT( (act): SiLU() (linear): Linear(in_features=3072, out_features=18432, bias=True) ) (txt_norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (txt_attn_qkv): Linear(in_features=3072, out_features=9216, bias=True) (txt_attn_q_norm): RMSNorm() (txt_attn_k_norm): RMSNorm() (txt_attn_proj): Linear(in_features=3072, out_features=3072, bias=True) (txt_norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (txt_mlp): MLP( (fc1): Linear(in_features=3072, out_features=12288, bias=True) (act): GELU(approximate='tanh') (drop1): Dropout(p=0.0, inplace=False) (norm): Identity() (fc2): Linear(in_features=12288, out_features=3072, bias=True) (drop2): Dropout(p=0.0, inplace=False) ) ) ) ) ) (single_blocks): ModuleList( (0-39): 40 x FullyShardedDataParallel( (_fsdp_wrapped_module): CheckpointWrapper( (_checkpoint_wrapped_module): MMSingleStreamBlock( (linear1): Linear(in_features=3072, out_features=21504, bias=True) (linear2): Linear(in_features=15360, out_features=3072, bias=True) (q_norm): RMSNorm() (k_norm): RMSNorm() (pre_norm): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (mlp_act): GELU(approximate='tanh') (modulation): ModulateDiT( (act): SiLU() (linear): Linear(in_features=3072, out_features=9216, bias=True) ) ) ) ) ) (final_layer): FinalLayer( (norm_final): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (linear): Linear(in_features=3072, out_features=64, bias=True) (adaLN_modulation): Sequential( (0): SiLU() (1): Linear(in_features=3072, out_features=6144, bias=True) ) ) ) ) optimizer: AdamW ( Parameter Group 0 amsgrad: False betas: (0.9, 0.999) capturable: False differentiable: False eps: 1e-08 foreach: None fused: None lr: 1e-05 maximize: False weight_decay: 0.01 ) ***** Running training ***** Num examples = 101 Dataloader size = 13 Num Epochs = 1 Resume training from step 0 Instantaneous batch size per device = 1 Total train batch size (w. data & sequence parallel, accumulation) = 2.0 Gradient Accumulation steps = 1 Total optimization steps = 7 Total training parameters per FSDP shard = 1.602626568 B Master weight dtype: torch.float32 --> applying fdsp activation checkpointing... --> applying fdsp activation checkpointing... zll step_time: 134.47s avg_step_time: 134.46719479560852 zll step_time: 120.07s avg_step_time: 127.27018117904663 zll step_time: 120.99s avg_step_time: 125.17721494038899 zll step_time: 119.39s avg_step_time: 123.72936379909515 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ hipDeviceSynchronize 84.17% 100.669s 84.17% 100.669s 139.624ms 0.000us 0.00% 35.552ms 49.309us 721 hipMemcpyWithStream 13.02% 15.574s 13.02% 15.575s 31.338ms 0.000us 0.00% 16.812ms 33.826us 497 aten::copy_ 0.61% 723.697ms 12.56% 15.023s 2.012ms 4.218s 3.48% 4.234s 567.014us 7468 hipLaunchKernel 0.21% 254.906ms 0.21% 257.146ms 17.724us 0.000us 0.00% 0.000us 0.000us 14508 record_param_comms 0.16% 191.936ms 0.22% 258.760ms 125.734us 3.354s 2.77% 3.356s 1.631ms 2058 SeqAllToAll4D 0.13% 160.703ms 84.60% 101.188s 140.539ms 0.000us 0.00% 1.990s 2.764ms 720 FullyShardedDataParallel.forward 0.13% 160.570ms 38.82% 46.431s 761.167ms 0.000us 0.00% 49.040s 803.937ms 61 MulBackward0 0.13% 150.497ms 21.22% 25.382s 103.599ms 0.000us 0.00% 23.817s 97.213ms 245 aten::addmm 0.11% 129.307ms 0.12% 145.753ms 219.839us 9.688s 8.00% 9.688s 14.613ms 663 aten::cat 0.10% 114.163ms 0.10% 118.092ms 103.862us 1.513s 1.25% 1.513s 1.331ms 1137 aten::empty_strided 0.07% 86.079ms 0.07% 86.179ms 17.229us 0.000us 0.00% 0.000us 0.000us 5002 hipStreamWaitEvent 0.07% 79.675ms 0.07% 79.675ms 25.693us 2.587ms 0.00% 2.587ms 0.834us 3101 aten::empty 0.06% 73.401ms 0.06% 73.406ms 14.103us 0.000us 0.00% 0.000us 0.000us 5205 aten::mul 0.06% 70.853ms 0.06% 75.138ms 75.744us 524.286ms 0.43% 524.286ms 528.514us 992 aten::mm 0.06% 69.226ms 0.06% 71.864ms 105.682us 5.081s 4.20% 5.081s 7.472ms 680 FullyShardedDataParallel._post_backward_hook 0.05% 62.576ms 0.09% 103.313ms 1.694ms 0.000us 0.00% 942.268ms 15.447ms 61 CompiledFunctionBackward 0.05% 55.039ms 0.07% 78.289ms 355.858us 0.000us 0.00% 887.844ms 4.036ms 220 FullyShardedDataParallel._pre_forward 0.05% 54.081ms 0.07% 88.569ms 1.452ms 0.000us 0.00% 768.969ms 12.606ms 61 FullyShardedDataParallel._pre_backward_prefetch 0.04% 49.068ms 0.06% 74.882ms 1.228ms 0.000us 0.00% 479.663ms 7.863ms 61 aten::sum 0.04% 48.855ms 0.05% 64.011ms 66.195us 211.543ms 0.17% 212.407ms 219.655us 967 FullyShardedDataParallel._post_forward 0.04% 48.429ms 0.04% 50.913ms 834.635us 0.000us 0.00% 0.000us 0.000us 61 Torch-Compiled Region 0.04% 42.560ms 12.51% 14.964s 33.780ms 0.000us 0.00% 848.329ms 1.915ms 443 autograd::engine::evaluate_function: ToCopyBackward0... 0.03% 38.424ms 0.09% 107.529ms 97.842us 0.000us 0.00% 387.258ms 352.373us 1099 aten::view 0.03% 37.252ms 0.03% 37.282ms 4.034us 0.000us 0.00% 0.000us 0.000us 9243 aten::cos 0.03% 36.933ms 0.03% 36.933ms 12.311ms 0.000us 0.00% 0.000us 0.000us 3 aten::fill_ 0.03% 34.353ms 0.03% 35.624ms 21.590us 427.635ms 0.35% 427.635ms 259.173us 1650 autograd::engine::evaluate_function: SiluBackward0 0.03% 32.563ms 0.03% 40.804ms 453.380us 0.000us 0.00% 1.159ms 12.880us 90 aten::slice 0.03% 31.365ms 0.03% 37.865ms 5.520us 0.000us 0.00% 0.000us 0.000us 6859 _AllGather 0.03% 30.101ms 0.09% 106.934ms 891.117us 0.000us 0.00% 13.714ms 114.284us 120 aten::add 0.02% 27.042ms 0.02% 29.512ms 43.851us 522.337ms 0.43% 522.337ms 776.132us 673 aten::sin 0.02% 26.174ms 0.02% 26.190ms 8.730ms 0.000us 0.00% 120.322us 40.107us 3 aten::_to_copy 0.02% 25.695ms 0.15% 180.799ms 49.643us 0.000us 0.00% 1.144s 314.141us 3642 FlashAttnVarlenQKVPackedFunc 0.02% 25.084ms 0.03% 35.592ms 291.737us 30.005s 24.78% 30.005s 245.939ms 122 c10d::alltoall_base_ 0.02% 24.680ms 0.14% 166.255ms 230.910us 0.000us 0.00% 1.295s 1.798ms 720 aten::as_strided 0.02% 22.408ms 0.02% 22.413ms 1.221us 0.000us 0.00% 0.000us 0.000us 18355 aten::native_layer_norm 0.02% 19.702ms 0.04% 43.696ms 178.350us 161.866ms 0.13% 1.021s 4.169ms 245 aten::transpose 0.01% 17.624ms 0.02% 25.553ms 6.562us 0.000us 0.00% 0.000us 0.000us 3894 hipMemcpyAsync 0.01% 17.259ms 0.01% 17.532ms 9.019us 0.000us 0.00% 0.000us 0.000us 1944 hipExtModuleLaunchKernel 0.01% 16.958ms 0.01% 17.750ms 13.168us 0.000us 0.00% 0.000us 0.000us 1348 hipExtLaunchKernel 0.01% 16.930ms 0.01% 16.930ms 16.453us 0.000us 0.00% 0.000us 0.000us 1029 aten::reshape 0.01% 16.312ms 0.04% 50.454ms 8.016us 0.000us 0.00% 119.156ms 18.932us 6294 FlashAttnVarlenQKVPackedFuncBackward 0.01% 15.858ms 0.02% 23.675ms 381.850us 62.035s 51.24% 62.035s 1.001s 62 aten::silu 0.01% 15.726ms 0.01% 17.393ms 102.311us 176.068ms 0.15% 176.068ms 1.036ms 170 SeqAllToAll4DBackward 0.01% 15.420ms 58.25% 69.674s 290.310ms 0.000us 0.00% 865.868ms 3.608ms 240 triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_0 0.01% 13.937ms 0.01% 17.359ms 54.247us 172.325ms 0.14% 172.325ms 538.517us 320 triton_poi_fused__to_copy_add_mul_0 0.01% 13.831ms 0.01% 17.311ms 96.170us 171.884ms 0.14% 171.884ms 954.911us 180 aten::nonzero 0.01% 13.214ms 0.58% 697.830ms 5.673ms 5.169ms 0.00% 5.176ms 42.084us 123 autograd::engine::evaluate_function: ViewBackward0 0.01% 12.922ms 0.02% 26.886ms 14.668us 0.000us 0.00% 39.840ms 21.735us 1833 hipModuleLaunchKernel 0.01% 12.778ms 0.01% 12.778ms 15.526us 0.000us 0.00% 0.000us 0.000us 823 TorchDynamo Cache Lookup 0.01% 12.355ms 0.01% 12.355ms 27.890us 0.000us 0.00% 0.000us 0.000us 443 autograd::engine::evaluate_function: SplitWithSizesB... 0.01% 12.328ms 0.02% 28.375ms 176.243us 0.000us 0.00% 327.384ms 2.033ms 161 IndexFirstAxis 0.01% 12.234ms 0.02% 22.471ms 184.189us 0.000us 0.00% 252.336ms 2.068ms 122 autograd::engine::evaluate_function: CompiledFunctio... 0.01% 12.120ms 0.08% 92.165ms 418.932us 0.000us 0.00% 887.844ms 4.036ms 220 FullyShardedDataParallel._pre_backward_hook 0.01% 11.743ms 0.07% 88.988ms 1.459ms 0.000us 0.00% 479.663ms 7.863ms 61 aten::gelu 0.01% 11.488ms 0.01% 12.689ms 79.307us 626.050ms 0.52% 626.050ms 3.913ms 160 aten::empty_like 0.01% 10.780ms 0.05% 57.682ms 19.795us 0.000us 0.00% 0.000us 0.000us 2914 autograd::engine::evaluate_function: SliceBackward0 0.01% 10.111ms 0.08% 93.638ms 76.627us 0.000us 0.00% 822.836ms 673.352us 1222 aten::clone 0.01% 9.851ms 0.16% 189.594ms 115.818us 0.000us 0.00% 2.078s 1.269ms 1637 autograd::engine::evaluate_function: torch::autograd... 0.01% 9.445ms 0.10% 113.658ms 1.863ms 0.000us 0.00% 942.268ms 15.447ms 61 aten::split_with_sizes 0.01% 9.365ms 0.01% 10.823ms 28.334us 0.000us 0.00% 0.000us 0.000us 382 aten::select 0.01% 8.969ms 0.01% 10.807ms 6.320us 0.000us 0.00% 0.000us 0.000us 1710 aten::linear 0.01% 8.641ms 0.35% 421.174ms 317.627us 0.000us 0.00% 19.784s 14.920ms 1326 aten::t 0.01% 7.810ms 0.01% 16.990ms 7.895us 0.000us 0.00% 0.000us 0.000us 2152 aten::add_ 0.01% 7.807ms 0.01% 8.418ms 20.092us 108.564ms 0.09% 108.564ms 259.103us 419 aten::narrow 0.01% 7.701ms 0.02% 21.681ms 8.260us 0.000us 0.00% 0.000us 0.000us 2625 aten::unsqueeze 0.01% 7.339ms 0.01% 8.495ms 8.529us 0.000us 0.00% 0.000us 0.000us 996 AddmmBackward0 0.01% 7.234ms 0.08% 91.333ms 266.277us 0.000us 0.00% 5.081s 14.814ms 343 hipPointerGetAttribute 0.01% 7.193ms 0.01% 7.193ms 1.890us 0.000us 0.00% 0.000us 0.000us 3806 aten::to 0.01% 6.860ms 0.16% 188.490ms 46.029us 0.000us 0.00% 1.144s 279.390us 4095 triton_red_fused__to_copy_mul_sum_0 0.01% 6.717ms 0.01% 8.204ms 51.278us 741.703ms 0.61% 741.703ms 4.636ms 160 hipMemsetAsync 0.01% 6.682ms 0.01% 6.682ms 11.249us 0.000us 0.00% 0.000us 0.000us 594 c10d::allgather_ 0.01% 6.670ms 0.05% 55.681ms 464.008us 0.000us 0.00% 11.372ms 94.763us 120 IndexFirstAxisBackward 0.01% 6.549ms 0.01% 16.586ms 267.509us 0.000us 0.00% 171.469ms 2.766ms 62 detach 0.01% 6.502ms 0.01% 6.502ms 2.906us 0.000us 0.00% 0.000us 0.000us 2237 autograd::engine::evaluate_function: AddmmBackward0 0.01% 6.448ms 0.10% 120.042ms 349.977us 0.000us 0.00% 5.193s 15.140ms 343 aten::cumsum 0.01% 6.343ms 0.01% 7.118ms 56.493us 765.929us 0.00% 790.570us 6.274us 126 aten::gather 0.01% 6.290ms 0.01% 6.871ms 56.323us 252.336ms 0.21% 252.336ms 2.068ms 122 aten::zero_ 0.01% 6.257ms 0.03% 35.239ms 25.063us 0.000us 0.00% 426.693ms 303.480us 1406 autograd::engine::evaluate_function: SeqAllToAll4DBa... 0.00% 5.910ms 58.26% 69.680s 290.335ms 0.000us 0.00% 865.868ms 3.608ms 240 FullyShardedDataParallel._post_backward_prefetch 0.00% 5.902ms 0.00% 5.902ms 96.750us 0.000us 0.00% 0.000us 0.000us 61 aten::_local_scalar_dense 0.00% 5.815ms 0.13% 155.651ms 563.951us 1.914ms 0.00% 1.914ms 6.933us 276 autograd::engine::evaluate_function: AddBackward0 0.00% 5.724ms 0.09% 102.964ms 279.036us 0.000us 0.00% 492.652ms 1.335ms 369 IndexPutFirstAxis 0.00% 5.608ms 0.02% 20.935ms 171.602us 0.000us 0.00% 152.980ms 1.254ms 122 aten::max 0.00% 5.441ms 0.01% 7.271ms 59.602us 1.394ms 0.00% 1.394ms 11.430us 122 triton_red_fused__to_copy_add_div_mul_pow_sum_1 0.00% 5.400ms 0.01% 6.159ms 38.493us 82.836ms 0.07% 82.836ms 517.723us 160 aten::zeros 0.00% 5.230ms 0.05% 55.406ms 39.407us 0.000us 0.00% 426.693ms 303.480us 1406 FullyShardedDataParallel._pre_forward_prefetch 0.00% 5.220ms 0.00% 5.220ms 85.574us 0.000us 0.00% 0.000us 0.000us 61 FullyShardedDataParallel.rate_limiter 0.00% 5.200ms 0.00% 5.870ms 48.508us 0.000us 0.00% 12.803ms 105.813us 121 aten::index 0.00% 5.054ms 0.00% 5.522ms 86.274us 54.427ms 0.04% 54.427ms 850.425us 64 hipEventDestroy 0.00% 5.046ms 0.00% 5.046ms 1.565us 102.326ms 0.08% 102.326ms 31.749us 3223 aten::_index_put_impl_ 0.00% 4.724ms 0.00% 5.518ms 45.228us 107.095ms 0.09% 107.095ms 877.829us 122 aten::div_ 0.00% 4.576ms 0.00% 4.772ms 39.117us 107.464ms 0.09% 107.464ms 880.855us 122 aten::expand 0.00% 4.516ms 0.00% 5.448ms 5.890us 0.000us 0.00% 0.000us 0.000us 925 aten::native_layer_norm_backward 0.00% 4.351ms 0.01% 8.985ms 71.878us 142.020ms 0.12% 575.991ms 4.608ms 125 ViewBackward0 0.00% 4.287ms 0.01% 13.931ms 7.600us 0.000us 0.00% 39.840ms 21.735us 1833 IndexPutFirstAxisBackward 0.00% 4.100ms 0.01% 9.721ms 156.787us 0.000us 0.00% 54.421ms 877.755us 62 NativeLayerNormBackward0 0.00% 4.026ms 0.01% 13.462ms 107.694us 0.000us 0.00% 575.991ms 4.608ms 125 aten::split 0.00% 3.777ms 0.01% 10.301ms 45.987us 0.000us 0.00% 0.000us 0.000us 224 _AllGatherBackward 0.00% 3.660ms 0.00% 4.762ms 79.375us 0.000us 0.00% 0.000us 0.000us 60 aten::silu_backward 0.00% 3.489ms 0.00% 3.662ms 40.689us 594.725us 0.00% 594.725us 6.608us 90 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 119.608s Self CUDA time total: 121.065s ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ hipDeviceSynchronize 84.34% 100.874s 84.34% 100.874s 139.909ms 0.000us 0.00% 24.750ms 34.327us 721 hipMemcpyWithStream 12.87% 15.398s 12.87% 15.398s 30.982ms 0.000us 0.00% 37.278ms 75.006us 497 aten::copy_ 0.39% 470.019ms 12.42% 14.849s 1.988ms 4.315s 3.57% 4.352s 582.774us 7468 hipLaunchKernel 0.23% 273.186ms 0.23% 276.039ms 19.048us 0.000us 0.00% 0.000us 0.000us 14492 record_param_comms 0.15% 185.043ms 0.21% 255.320ms 124.062us 3.328s 2.75% 3.330s 1.618ms 2058 MulBackward0 0.14% 170.507ms 21.09% 25.223s 102.952ms 0.000us 0.00% 23.703s 96.748ms 245 SeqAllToAll4D 0.14% 166.285ms 84.78% 101.398s 140.831ms 0.000us 0.00% 2.198s 3.052ms 720 aten::cat 0.10% 114.532ms 0.10% 117.869ms 103.666us 1.526s 1.26% 1.526s 1.342ms 1137 aten::addmm 0.10% 114.191ms 0.11% 128.915ms 194.443us 9.409s 7.79% 9.409s 14.191ms 663 FullyShardedDataParallel.forward 0.09% 107.907ms 38.81% 46.416s 760.918ms 0.000us 0.00% 54.027s 885.688ms 61 hipStreamWaitEvent 0.07% 83.513ms 0.07% 83.513ms 26.931us 18.123ms 0.02% 18.123ms 5.844us 3101 aten::empty_strided 0.07% 80.149ms 0.07% 80.413ms 16.102us 0.000us 0.00% 0.000us 0.000us 4994 aten::mm 0.06% 76.452ms 0.07% 79.348ms 116.688us 5.038s 4.17% 5.038s 7.408ms 680 aten::empty 0.06% 74.737ms 0.06% 74.743ms 14.360us 0.000us 0.00% 1.410ms 0.271us 5205 aten::mul 0.06% 71.322ms 0.06% 75.360ms 75.968us 522.945ms 0.43% 522.945ms 527.162us 992 FullyShardedDataParallel._post_backward_hook 0.06% 71.071ms 0.10% 118.335ms 1.940ms 0.000us 0.00% 659.092ms 10.805ms 61 CompiledFunctionBackward 0.05% 65.442ms 0.08% 90.612ms 411.872us 0.000us 0.00% 887.629ms 4.035ms 220 FullyShardedDataParallel._pre_backward_prefetch 0.05% 53.838ms 0.08% 89.791ms 1.472ms 0.000us 0.00% 479.762ms 7.865ms 61 aten::sum 0.04% 51.546ms 0.05% 65.168ms 67.884us 221.470ms 0.18% 222.217ms 231.476us 960 autograd::engine::evaluate_function: ToCopyBackward0... 0.03% 40.798ms 0.10% 116.159ms 105.695us 0.000us 0.00% 424.719ms 386.459us 1099 aten::view 0.03% 39.252ms 0.03% 39.284ms 4.254us 0.000us 0.00% 0.000us 0.000us 9235 aten::cos 0.03% 37.595ms 0.03% 37.595ms 12.532ms 0.000us 0.00% 0.000us 0.000us 3 Torch-Compiled Region 0.03% 37.273ms 12.35% 14.774s 33.349ms 0.000us 0.00% 888.850ms 2.006ms 443 aten::fill_ 0.03% 35.838ms 0.03% 36.891ms 22.358us 447.575ms 0.37% 447.575ms 271.257us 1650 FullyShardedDataParallel._pre_forward 0.03% 34.556ms 0.05% 57.712ms 946.096us 0.000us 0.00% 797.142ms 13.068ms 61 FullyShardedDataParallel._post_forward 0.03% 33.746ms 0.03% 35.436ms 580.920us 0.000us 0.00% 0.000us 0.000us 61 aten::slice 0.03% 30.247ms 0.03% 36.853ms 5.392us 0.000us 0.00% 0.000us 0.000us 6835 autograd::engine::evaluate_function: SiluBackward0 0.02% 27.480ms 0.03% 37.096ms 412.178us 0.000us 0.00% 912.176us 10.135us 90 _AllGather 0.02% 27.112ms 0.09% 105.043ms 875.357us 0.000us 0.00% 15.267ms 127.226us 120 c10d::alltoall_base_ 0.02% 27.012ms 0.13% 160.987ms 223.593us 0.000us 0.00% 1.510s 2.097ms 720 aten::_to_copy 0.02% 26.293ms 0.15% 176.841ms 48.556us 0.000us 0.00% 1.227s 336.996us 3642 aten::add 0.02% 24.730ms 0.02% 26.316ms 39.103us 556.948ms 0.46% 556.948ms 827.559us 673 aten::as_strided 0.02% 22.725ms 0.02% 22.776ms 1.244us 0.000us 0.00% 0.000us 0.000us 18315 hipExtLaunchKernel 0.02% 21.978ms 0.02% 21.978ms 21.358us 0.000us 0.00% 0.000us 0.000us 1029 FlashAttnVarlenQKVPackedFuncBackward 0.02% 21.750ms 0.03% 30.578ms 493.199us 61.959s 51.28% 61.959s 999.332ms 62 FlashAttnVarlenQKVPackedFunc 0.02% 21.687ms 0.03% 30.772ms 252.226us 30.004s 24.83% 30.004s 245.934ms 122 aten::transpose 0.02% 20.748ms 0.02% 28.924ms 7.428us 0.000us 0.00% 0.000us 0.000us 3894 aten::reshape 0.01% 17.483ms 0.05% 54.504ms 8.671us 0.000us 0.00% 119.745ms 19.049us 6286 hipMemcpyAsync 0.01% 17.180ms 0.01% 17.496ms 9.000us 0.000us 0.00% 0.000us 0.000us 1944 SeqAllToAll4DBackward 0.01% 17.097ms 58.29% 69.723s 290.511ms 0.000us 0.00% 945.056ms 3.938ms 240 aten::native_layer_norm 0.01% 16.975ms 0.03% 39.369ms 160.689us 156.084ms 0.13% 1.018s 4.156ms 245 hipExtModuleLaunchKernel 0.01% 16.967ms 0.01% 17.735ms 13.157us 0.000us 0.00% 0.000us 0.000us 1348 autograd::engine::evaluate_function: SplitWithSizesB... 0.01% 14.421ms 0.03% 32.365ms 201.026us 0.000us 0.00% 337.643ms 2.097ms 161 aten::silu 0.01% 14.013ms 0.01% 15.206ms 89.448us 170.886ms 0.14% 170.886ms 1.005ms 170 autograd::engine::evaluate_function: ViewBackward0 0.01% 13.863ms 0.03% 29.939ms 16.333us 0.000us 0.00% 40.116ms 21.886us 1833 aten::sin 0.01% 13.499ms 0.01% 13.499ms 4.500ms 0.000us 0.00% 0.000us 0.000us 3 FullyShardedDataParallel._pre_backward_hook 0.01% 13.386ms 0.09% 105.679ms 1.732ms 0.000us 0.00% 479.762ms 7.865ms 61 autograd::engine::evaluate_function: CompiledFunctio... 0.01% 13.350ms 0.09% 106.064ms 482.110us 0.000us 0.00% 887.629ms 4.035ms 220 TorchDynamo Cache Lookup 0.01% 13.305ms 0.01% 13.305ms 30.033us 0.000us 0.00% 0.000us 0.000us 443 triton_poi_fused__to_copy_add_mul_0 0.01% 13.267ms 0.01% 16.583ms 92.128us 174.771ms 0.14% 174.771ms 970.947us 180 aten::nonzero 0.01% 12.377ms 0.59% 700.100ms 5.692ms 5.191ms 0.00% 5.191ms 42.202us 123 triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_0 0.01% 12.189ms 0.01% 14.780ms 46.188us 172.164ms 0.14% 172.164ms 538.013us 320 hipModuleLaunchKernel 0.01% 11.967ms 0.01% 11.967ms 14.541us 0.000us 0.00% 0.000us 0.000us 823 autograd::engine::evaluate_function: torch::autograd... 0.01% 11.378ms 0.11% 130.830ms 2.145ms 0.000us 0.00% 659.092ms 10.805ms 61 aten::gelu 0.01% 11.181ms 0.01% 12.218ms 76.362us 600.131ms 0.50% 600.131ms 3.751ms 160 IndexFirstAxis 0.01% 11.156ms 0.02% 20.487ms 167.925us 0.000us 0.00% 265.784ms 2.179ms 122 aten::empty_like 0.01% 11.050ms 0.05% 60.395ms 20.726us 0.000us 0.00% 0.000us 0.000us 2914 aten::clone 0.01% 11.016ms 0.16% 196.145ms 119.820us 0.000us 0.00% 2.073s 1.267ms 1637 autograd::engine::evaluate_function: SliceBackward0 0.01% 10.676ms 0.09% 105.181ms 86.073us 0.000us 0.00% 844.795ms 691.322us 1222 aten::split_with_sizes 0.01% 9.496ms 0.01% 11.130ms 29.136us 0.000us 0.00% 0.000us 0.000us 382 aten::select 0.01% 9.223ms 0.01% 11.006ms 6.436us 0.000us 0.00% 0.000us 0.000us 1710 aten::add_ 0.01% 8.742ms 0.01% 9.247ms 22.498us 112.105ms 0.09% 112.105ms 272.762us 411 AddmmBackward0 0.01% 8.306ms 0.09% 101.951ms 297.235us 0.000us 0.00% 5.038s 14.687ms 343 c10d::allgather_ 0.01% 8.213ms 0.05% 55.967ms 466.392us 0.000us 0.00% 12.355ms 102.957us 120 aten::t 0.01% 7.960ms 0.02% 19.557ms 9.088us 0.000us 0.00% 0.000us 0.000us 2152 detach 0.01% 7.923ms 0.01% 7.923ms 3.554us 0.000us 0.00% 0.000us 0.000us 2229 autograd::engine::evaluate_function: AddmmBackward0 0.01% 7.661ms 0.11% 132.821ms 387.232us 0.000us 0.00% 5.154s 15.026ms 343 aten::narrow 0.01% 7.552ms 0.02% 21.454ms 8.173us 0.000us 0.00% 0.000us 0.000us 2625 aten::unsqueeze 0.01% 7.312ms 0.01% 8.478ms 8.581us 0.000us 0.00% 0.000us 0.000us 988 triton_red_fused__to_copy_mul_sum_0 0.01% 7.240ms 0.01% 8.937ms 55.855us 739.728ms 0.61% 739.728ms 4.623ms 160 IndexFirstAxisBackward 0.01% 7.154ms 0.02% 18.750ms 302.421us 0.000us 0.00% 171.658ms 2.769ms 62 aten::to 0.01% 7.147ms 0.15% 184.687ms 45.189us 0.000us 0.00% 1.227s 300.304us 4087 hipPointerGetAttribute 0.01% 7.085ms 0.01% 7.085ms 1.861us 0.000us 0.00% 0.000us 0.000us 3806 autograd::engine::evaluate_function: SeqAllToAll4DBa... 0.01% 7.069ms 58.30% 69.730s 290.541ms 0.000us 0.00% 945.056ms 3.938ms 240 FullyShardedDataParallel._post_backward_prefetch 0.01% 6.769ms 0.01% 6.769ms 110.964us 0.000us 0.00% 0.000us 0.000us 61 aten::zero_ 0.01% 6.589ms 0.03% 37.442ms 26.630us 0.000us 0.00% 446.634ms 317.663us 1406 aten::linear 0.01% 6.487ms 0.32% 379.426ms 286.143us 0.000us 0.00% 19.236s 14.507ms 1326 hipMemsetAsync 0.01% 6.358ms 0.01% 6.358ms 10.757us 0.000us 0.00% 0.000us 0.000us 591 autograd::engine::evaluate_function: AddBackward0 0.01% 6.151ms 0.10% 121.073ms 328.110us 0.000us 0.00% 492.153ms 1.334ms 369 aten::zeros 0.01% 6.049ms 0.05% 58.812ms 41.829us 0.000us 0.00% 446.634ms 317.663us 1406 aten::index 0.00% 5.955ms 0.01% 6.475ms 101.171us 67.835ms 0.06% 67.835ms 1.060ms 64 aten::cumsum 0.00% 5.870ms 0.01% 6.377ms 51.016us 770.567us 0.00% 789.287us 6.314us 125 aten::_local_scalar_dense 0.00% 5.849ms 0.12% 146.023ms 561.628us 1.952ms 0.00% 1.952ms 7.508us 260 triton_red_fused__to_copy_add_div_mul_pow_sum_1 0.00% 5.777ms 0.01% 6.589ms 41.179us 83.000ms 0.07% 83.000ms 518.752us 160 aten::gather 0.00% 5.706ms 0.01% 6.093ms 49.944us 265.784ms 0.22% 265.784ms 2.179ms 122 aten::max 0.00% 5.351ms 0.01% 6.896ms 56.524us 1.386ms 0.00% 1.386ms 11.361us 122 aten::div_ 0.00% 5.139ms 0.00% 5.409ms 44.336us 109.135ms 0.09% 109.135ms 894.547us 122 hipEventDestroy 0.00% 5.004ms 0.00% 5.004ms 1.553us 86.195ms 0.07% 86.195ms 26.744us 3223 aten::native_layer_norm_backward 0.00% 4.993ms 0.01% 10.272ms 82.177us 143.927ms 0.12% 568.432ms 4.547ms 125 ViewBackward0 0.00% 4.966ms 0.01% 16.044ms 8.753us 0.000us 0.00% 40.116ms 21.886us 1833 IndexPutFirstAxis 0.00% 4.778ms 0.02% 19.170ms 157.129us 0.000us 0.00% 153.055ms 1.255ms 122 FullyShardedDataParallel.rate_limiter 0.00% 4.648ms 0.00% 5.123ms 42.335us 0.000us 0.00% 8.688ms 71.801us 121 aten::_index_put_impl_ 0.00% 4.476ms 0.00% 5.205ms 42.667us 107.175ms 0.09% 107.175ms 878.487us 122 NativeLayerNormBackward0 0.00% 4.462ms 0.01% 15.502ms 124.015us 0.000us 0.00% 568.432ms 4.547ms 125 SliceBackward0 0.00% 4.406ms 0.07% 88.611ms 72.513us 0.000us 0.00% 773.724ms 633.162us 1222 IndexPutFirstAxisBackward 0.00% 4.353ms 0.01% 10.930ms 176.283us 0.000us 0.00% 67.829ms 1.094ms 62 aten::expand 0.00% 4.254ms 0.00% 5.111ms 5.526us 0.000us 0.00% 0.000us 0.000us 925 _AllGatherBackward 0.00% 4.086ms 0.00% 5.406ms 90.098us 0.000us 0.00% 0.000us 0.000us 60 aten::silu_backward 0.00% 4.006ms 0.00% 4.293ms 47.701us 560.491us 0.00% 560.491us 6.228us 90 ToCopyBackward0 0.00% 3.954ms 0.05% 55.103ms 50.139us 0.000us 0.00% 384.098ms 349.498us 1099 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 119.604s Self CUDA time total: 120.816s ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ hipDeviceSynchronize 85.21% 101.924s 85.21% 101.924s 141.365ms 0.000us 0.00% 43.724ms 60.643us 721 hipMemcpyWithStream 11.94% 14.280s 11.94% 14.281s 28.734ms 0.000us 0.00% 18.427ms 37.077us 497 aten::copy_ 1.48% 1.769s 11.49% 13.741s 1.840ms 4.207s 3.48% 4.225s 565.770us 7468 hipLaunchKernel 0.21% 254.242ms 0.21% 256.925ms 17.392us 0.000us 0.00% 0.000us 0.000us 14773 record_param_comms 0.16% 193.482ms 0.22% 261.265ms 126.951us 4.571s 3.78% 4.573s 2.222ms 2058 FullyShardedDataParallel.forward 0.14% 167.552ms 38.83% 46.444s 761.375ms 0.000us 0.00% 47.723s 782.337ms 61 SeqAllToAll4D 0.14% 166.631ms 85.66% 102.455s 142.299ms 0.000us 0.00% 3.250s 4.514ms 720 MulBackward0 0.13% 151.091ms 21.23% 25.389s 103.628ms 0.000us 0.00% 23.873s 97.440ms 245 aten::addmm 0.11% 129.069ms 0.12% 146.035ms 220.263us 8.557s 7.07% 8.557s 12.906ms 663 aten::cat 0.09% 113.382ms 0.10% 117.692ms 103.511us 1.550s 1.28% 1.551s 1.364ms 1137 aten::empty_strided 0.07% 85.226ms 0.07% 85.429ms 16.640us 0.000us 0.00% 0.000us 0.000us 5134 hipStreamWaitEvent 0.07% 80.428ms 0.07% 80.428ms 25.936us 4.430ms 0.00% 4.430ms 1.429us 3101 aten::mul 0.06% 72.538ms 0.06% 77.198ms 77.820us 574.958ms 0.48% 574.958ms 579.595us 992 aten::mm 0.06% 72.228ms 0.06% 74.987ms 110.274us 5.055s 4.18% 5.055s 7.434ms 680 aten::empty 0.06% 71.854ms 0.06% 71.864ms 13.807us 0.000us 0.00% 0.000us 0.000us 5205 FullyShardedDataParallel._post_backward_hook 0.06% 66.167ms 0.09% 109.344ms 1.793ms 0.000us 0.00% 978.005ms 16.033ms 61 CompiledFunctionBackward 0.05% 56.229ms 0.07% 78.955ms 358.884us 0.000us 0.00% 877.270ms 3.988ms 220 FullyShardedDataParallel._pre_forward 0.05% 54.734ms 0.07% 88.337ms 1.448ms 0.000us 0.00% 845.205ms 13.856ms 61 FullyShardedDataParallel._pre_backward_prefetch 0.04% 49.085ms 0.06% 76.325ms 1.251ms 0.000us 0.00% 447.509ms 7.336ms 61 aten::sum 0.04% 48.708ms 0.05% 64.897ms 66.904us 221.518ms 0.18% 222.451ms 229.331us 970 FullyShardedDataParallel._post_forward 0.04% 48.408ms 0.04% 51.486ms 844.030us 0.000us 0.00% 0.000us 0.000us 61 Torch-Compiled Region 0.03% 41.506ms 11.44% 13.684s 30.889ms 0.000us 0.00% 800.928ms 1.808ms 443 aten::view 0.03% 39.621ms 0.03% 39.656ms 4.230us 0.000us 0.00% 0.000us 0.000us 9375 autograd::engine::evaluate_function: ToCopyBackward0... 0.03% 37.777ms 0.09% 106.974ms 97.338us 0.000us 0.00% 414.852ms 377.481us 1099 aten::cos 0.03% 36.692ms 0.03% 36.692ms 12.231ms 0.000us 0.00% 0.000us 0.000us 3 aten::slice 0.03% 33.674ms 0.03% 41.061ms 5.660us 0.000us 0.00% 0.000us 0.000us 7255 aten::fill_ 0.03% 33.553ms 0.03% 34.717ms 21.041us 428.657ms 0.35% 428.657ms 259.792us 1650 autograd::engine::evaluate_function: SiluBackward0 0.03% 32.542ms 0.03% 40.940ms 454.892us 0.000us 0.00% 963.525us 10.706us 90 _AllGather 0.03% 30.460ms 0.09% 107.792ms 898.268us 0.000us 0.00% 13.186ms 109.887us 120 aten::_to_copy 0.02% 27.360ms 0.15% 182.368ms 50.074us 0.000us 0.00% 1.169s 320.921us 3642 aten::add 0.02% 27.065ms 0.02% 29.408ms 43.697us 626.019ms 0.52% 626.019ms 930.191us 673 FlashAttnVarlenQKVPackedFunc 0.02% 26.802ms 0.03% 37.341ms 306.071us 30.004s 24.79% 30.004s 245.937ms 122 c10d::alltoall_base_ 0.02% 25.732ms 0.14% 169.395ms 235.271us 0.000us 0.00% 2.544s 3.534ms 720 aten::as_strided 0.02% 24.090ms 0.02% 24.141ms 1.270us 0.000us 0.00% 0.000us 0.000us 19015 aten::native_layer_norm 0.02% 19.620ms 0.04% 44.107ms 180.030us 177.916ms 0.15% 1.051s 4.289ms 245 hipMemcpyAsync 0.02% 18.528ms 0.02% 18.807ms 9.674us 0.000us 0.00% 0.000us 0.000us 1944 aten::transpose 0.01% 17.847ms 0.02% 26.055ms 6.691us 0.000us 0.00% 0.000us 0.000us 3894 aten::reshape 0.01% 17.581ms 0.04% 53.127ms 8.267us 0.000us 0.00% 119.875ms 18.655us 6426 hipExtModuleLaunchKernel 0.01% 17.395ms 0.02% 18.239ms 13.531us 0.000us 0.00% 0.000us 0.000us 1348 hipExtLaunchKernel 0.01% 17.275ms 0.01% 17.275ms 16.788us 0.000us 0.00% 0.000us 0.000us 1029 aten::silu 0.01% 15.799ms 0.01% 17.457ms 102.688us 40.812ms 0.03% 40.812ms 240.072us 170 FlashAttnVarlenQKVPackedFuncBackward 0.01% 15.688ms 0.02% 23.329ms 376.280us 61.975s 51.20% 61.975s 999.597ms 62 SeqAllToAll4DBackward 0.01% 15.406ms 58.24% 69.659s 290.244ms 0.000us 0.00% 833.080ms 3.471ms 240 triton_poi_fused__to_copy_add_mul_0 0.01% 14.288ms 0.01% 17.867ms 99.260us 171.166ms 0.14% 171.166ms 950.923us 180 triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_0 0.01% 13.864ms 0.01% 17.292ms 54.037us 172.222ms 0.14% 172.222ms 538.194us 320 aten::sin 0.01% 13.583ms 0.01% 13.596ms 4.532ms 0.000us 0.00% 0.000us 0.000us 3 IndexFirstAxis 0.01% 13.098ms 0.02% 23.301ms 190.994us 0.000us 0.00% 252.516ms 2.070ms 122 aten::nonzero 0.01% 13.066ms 0.58% 696.951ms 5.666ms 5.139ms 0.00% 5.172ms 42.053us 123 hipModuleLaunchKernel 0.01% 12.840ms 0.01% 12.843ms 15.606us 0.000us 0.00% 0.000us 0.000us 823 autograd::engine::evaluate_function: ViewBackward0 0.01% 12.442ms 0.02% 27.282ms 14.884us 0.000us 0.00% 40.170ms 21.915us 1833 TorchDynamo Cache Lookup 0.01% 12.400ms 0.01% 12.400ms 27.992us 0.000us 0.00% 0.000us 0.000us 443 autograd::engine::evaluate_function: CompiledFunctio... 0.01% 12.031ms 0.08% 92.566ms 420.753us 0.000us 0.00% 877.270ms 3.988ms 220 aten::gelu 0.01% 11.897ms 0.01% 13.175ms 82.346us 465.737ms 0.38% 465.737ms 2.911ms 160 autograd::engine::evaluate_function: SplitWithSizesB... 0.01% 11.568ms 0.02% 27.950ms 173.602us 0.000us 0.00% 327.354ms 2.033ms 161 FullyShardedDataParallel._pre_backward_hook 0.01% 11.522ms 0.08% 90.272ms 1.480ms 0.000us 0.00% 447.509ms 7.336ms 61 aten::empty_like 0.01% 11.357ms 0.05% 57.407ms 19.700us 0.000us 0.00% 0.000us 0.000us 2914 aten::clone 0.01% 10.212ms 0.16% 189.522ms 115.774us 0.000us 0.00% 2.087s 1.275ms 1637 aten::split_with_sizes 0.01% 9.761ms 0.01% 11.447ms 29.965us 0.000us 0.00% 0.000us 0.000us 382 autograd::engine::evaluate_function: SliceBackward0 0.01% 9.594ms 0.08% 91.960ms 75.254us 0.000us 0.00% 827.660ms 677.299us 1222 autograd::engine::evaluate_function: torch::autograd... 0.01% 9.585ms 0.10% 119.849ms 1.965ms 0.000us 0.00% 978.005ms 16.033ms 61 aten::linear 0.01% 9.400ms 0.35% 424.332ms 320.009us 0.000us 0.00% 17.550s 13.236ms 1326 aten::select 0.01% 9.244ms 0.01% 11.143ms 6.516us 0.000us 0.00% 0.000us 0.000us 1710 detach 0.01% 8.616ms 0.01% 8.624ms 3.640us 0.000us 0.00% 0.000us 0.000us 2369 aten::t 0.01% 8.275ms 0.01% 17.808ms 8.275us 0.000us 0.00% 0.000us 0.000us 2152 aten::unsqueeze 0.01% 8.143ms 0.01% 9.494ms 8.416us 0.000us 0.00% 0.000us 0.000us 1128 aten::linalg_vector_norm 0.01% 7.992ms 0.01% 9.924ms 47.710us 8.306ms 0.01% 8.306ms 39.931us 208 aten::narrow 0.01% 7.918ms 0.02% 22.432ms 8.546us 0.000us 0.00% 0.000us 0.000us 2625 aten::add_ 0.01% 7.651ms 0.01% 8.293ms 15.051us 112.570ms 0.09% 112.570ms 204.301us 551 aten::to 0.01% 7.590ms 0.16% 190.747ms 45.126us 0.000us 0.00% 1.169s 276.507us 4227 hipMemsetAsync 0.01% 7.417ms 0.01% 7.417ms 11.393us 0.000us 0.00% 0.000us 0.000us 651 AddmmBackward0 0.01% 7.292ms 0.08% 95.218ms 277.603us 0.000us 0.00% 5.055s 14.738ms 343 hipPointerGetAttribute 0.01% 7.196ms 0.01% 7.196ms 1.891us 0.000us 0.00% 0.000us 0.000us 3806 IndexFirstAxisBackward 0.01% 6.728ms 0.01% 16.592ms 267.619us 0.000us 0.00% 171.516ms 2.766ms 62 c10d::allgather_ 0.01% 6.722ms 0.05% 56.436ms 470.301us 0.000us 0.00% 10.836ms 90.301us 120 triton_red_fused__to_copy_mul_sum_0 0.01% 6.495ms 0.01% 7.894ms 49.337us 729.490ms 0.60% 729.490ms 4.559ms 160 aten::cumsum 0.01% 6.260ms 0.01% 7.064ms 56.062us 763.850us 0.00% 788.010us 6.254us 126 aten::gather 0.01% 6.227ms 0.01% 6.797ms 55.710us 252.516ms 0.21% 252.516ms 2.070ms 122 autograd::engine::evaluate_function: AddmmBackward0 0.01% 6.133ms 0.10% 123.182ms 359.131us 0.000us 0.00% 5.172s 15.079ms 343 hipEventDestroy 0.01% 6.105ms 0.01% 6.107ms 1.895us 174.757ms 0.14% 174.757ms 54.222us 3223 FullyShardedDataParallel._post_backward_prefetch 0.01% 6.038ms 0.01% 6.038ms 98.979us 0.000us 0.00% 0.000us 0.000us 61 aten::_local_scalar_dense 0.00% 5.850ms 0.12% 142.022ms 263.003us 1.881ms 0.00% 1.881ms 3.484us 540 autograd::engine::evaluate_function: AddBackward0 0.00% 5.822ms 0.09% 104.546ms 283.323us 0.000us 0.00% 461.879ms 1.252ms 369 aten::zeros 0.00% 5.677ms 0.04% 53.774ms 38.246us 0.000us 0.00% 427.724ms 304.214us 1406 IndexPutFirstAxis 0.00% 5.666ms 0.02% 21.254ms 174.215us 0.000us 0.00% 153.060ms 1.255ms 122 aten::zero_ 0.00% 5.609ms 0.03% 33.789ms 24.032us 0.000us 0.00% 427.724ms 304.214us 1406 autograd::engine::evaluate_function: SeqAllToAll4DBa... 0.00% 5.506ms 58.24% 69.664s 290.267ms 0.000us 0.00% 833.080ms 3.471ms 240 aten::max 0.00% 5.315ms 0.01% 7.101ms 58.207us 1.385ms 0.00% 1.385ms 11.356us 122 FullyShardedDataParallel.rate_limiter 0.00% 5.249ms 0.00% 5.852ms 48.365us 0.000us 0.00% 3.942ms 32.580us 121 triton_red_fused__to_copy_add_div_mul_pow_sum_1 0.00% 5.233ms 0.01% 6.005ms 37.532us 83.171ms 0.07% 83.171ms 519.821us 160 FullyShardedDataParallel._pre_forward_prefetch 0.00% 4.979ms 0.00% 4.979ms 81.619us 0.000us 0.00% 0.000us 0.000us 61 Optimizer.step#AdamW.step 0.00% 4.963ms 0.11% 132.474ms 132.474ms 0.000us 0.00% 188.764ms 188.764ms 1 aten::_index_put_impl_ 0.00% 4.906ms 0.00% 5.711ms 46.811us 107.178ms 0.09% 107.178ms 878.510us 122 aten::index 0.00% 4.848ms 0.00% 5.350ms 83.590us 54.505ms 0.05% 54.505ms 851.640us 64 aten::expand 0.00% 4.612ms 0.00% 5.615ms 6.070us 0.000us 0.00% 0.000us 0.000us 925 aten::div_ 0.00% 4.578ms 0.00% 4.832ms 39.606us 172.789ms 0.14% 172.789ms 1.416ms 122 ViewBackward0 0.00% 4.346ms 0.01% 14.786ms 8.067us 0.000us 0.00% 40.170ms 21.915us 1833 aten::native_layer_norm_backward 0.00% 4.296ms 0.01% 11.187ms 89.495us 144.034ms 0.12% 571.646ms 4.573ms 125 aten::detach 0.00% 4.209ms 0.01% 12.833ms 5.417us 0.000us 0.00% 0.000us 0.000us 2369 IndexPutFirstAxisBackward 0.00% 4.198ms 0.01% 9.627ms 155.268us 0.000us 0.00% 54.499ms 879.012us 62 NativeLayerNormBackward0 0.00% 4.003ms 0.01% 15.677ms 125.420us 0.000us 0.00% 571.646ms 4.573ms 125 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 119.609s Self CUDA time total: 121.038s ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ hipDeviceSynchronize 84.26% 100.788s 84.26% 100.788s 139.789ms 0.000us 0.00% 36.200ms 50.208us 721 hipMemcpyWithStream 12.98% 15.523s 12.98% 15.524s 31.235ms 0.000us 0.00% 1.097s 2.208ms 497 aten::copy_ 0.33% 395.468ms 12.50% 14.957s 2.003ms 4.225s 3.49% 5.322s 712.630us 7468 hipLaunchKernel 0.20% 237.425ms 0.20% 239.753ms 16.408us 0.000us 0.00% 0.000us 0.000us 14612 record_param_comms 0.16% 188.615ms 0.21% 252.423ms 122.654us 3.437s 2.84% 3.441s 1.672ms 2058 FullyShardedDataParallel.forward 0.13% 161.153ms 38.81% 46.428s 761.115ms 0.000us 0.00% 52.804s 865.636ms 61 SeqAllToAll4D 0.13% 160.025ms 84.69% 101.302s 140.697ms 0.000us 0.00% 2.137s 2.967ms 720 MulBackward0 0.12% 142.327ms 21.24% 25.408s 103.707ms 0.000us 0.00% 23.869s 97.425ms 245 aten::addmm 0.11% 131.100ms 0.12% 146.762ms 221.361us 9.625s 7.95% 9.625s 14.518ms 663 aten::cat 0.10% 114.312ms 0.10% 118.754ms 104.445us 1.512s 1.25% 1.512s 1.330ms 1137 aten::empty_strided 0.07% 83.582ms 0.07% 83.930ms 16.607us 0.000us 0.00% 0.000us 0.000us 5054 hipStreamWaitEvent 0.07% 78.121ms 0.07% 78.121ms 25.192us 6.723ms 0.01% 6.723ms 2.168us 3101 aten::mul 0.06% 71.910ms 0.06% 76.511ms 77.128us 556.532ms 0.46% 556.532ms 561.020us 992 aten::empty 0.06% 71.472ms 0.06% 71.515ms 13.740us 0.000us 0.00% 0.000us 0.000us 5205 aten::mm 0.05% 65.419ms 0.06% 68.045ms 100.066us 5.045s 4.17% 5.045s 7.419ms 680 FullyShardedDataParallel._post_backward_hook 0.05% 60.062ms 0.09% 103.015ms 1.689ms 0.000us 0.00% 904.802ms 14.833ms 61 CompiledFunctionBackward 0.05% 58.706ms 0.07% 79.072ms 359.420us 0.000us 0.00% 895.266ms 4.069ms 220 FullyShardedDataParallel._pre_forward 0.04% 53.345ms 0.07% 86.934ms 1.425ms 0.000us 0.00% 2.083s 34.151ms 61 FullyShardedDataParallel._post_forward 0.04% 49.262ms 0.04% 52.247ms 856.513us 0.000us 0.00% 0.000us 0.000us 61 aten::sum 0.04% 48.151ms 0.05% 63.399ms 65.495us 220.450ms 0.18% 221.342ms 228.659us 968 FullyShardedDataParallel._pre_backward_prefetch 0.04% 44.390ms 0.06% 69.640ms 1.142ms 0.000us 0.00% 473.323ms 7.759ms 61 aten::cos 0.03% 40.084ms 0.03% 40.095ms 13.365ms 0.000us 0.00% 5.600us 1.867us 3 Torch-Compiled Region 0.03% 39.930ms 12.46% 14.904s 33.643ms 0.000us 0.00% 1.848s 4.172ms 443 aten::view 0.03% 39.441ms 0.03% 39.482ms 4.248us 0.000us 0.00% 0.000us 0.000us 9295 autograd::engine::evaluate_function: ToCopyBackward0... 0.03% 37.498ms 0.09% 102.112ms 92.914us 0.000us 0.00% 431.187ms 392.345us 1099 aten::fill_ 0.03% 32.539ms 0.03% 33.816ms 20.495us 433.696ms 0.36% 433.696ms 262.846us 1650 aten::slice 0.03% 31.933ms 0.03% 38.721ms 5.520us 0.000us 0.00% 0.000us 0.000us 7015 _AllGather 0.02% 29.883ms 0.09% 109.193ms 909.940us 0.000us 0.00% 11.454ms 95.449us 120 aten::add 0.02% 27.065ms 0.02% 29.374ms 43.647us 566.207ms 0.47% 566.207ms 841.319us 673 c10d::alltoall_base_ 0.02% 26.745ms 0.14% 163.090ms 226.513us 0.000us 0.00% 1.438s 1.997ms 720 aten::sin 0.02% 26.507ms 0.02% 26.507ms 8.836ms 0.000us 0.00% 0.000us 0.000us 3 aten::_to_copy 0.02% 25.691ms 0.15% 174.045ms 47.788us 0.000us 0.00% 1.238s 339.900us 3642 autograd::engine::evaluate_function: SiluBackward0 0.02% 25.273ms 0.03% 33.477ms 371.965us 0.000us 0.00% 900.492us 10.005us 90 FlashAttnVarlenQKVPackedFunc 0.02% 24.961ms 0.03% 35.411ms 290.257us 30.004s 24.80% 30.012s 245.997ms 122 aten::as_strided 0.02% 23.084ms 0.02% 23.127ms 1.242us 0.000us 0.00% 0.000us 0.000us 18615 aten::native_layer_norm 0.02% 19.704ms 0.04% 43.799ms 178.771us 156.194ms 0.13% 1.014s 4.137ms 245 aten::transpose 0.01% 17.700ms 0.02% 25.765ms 6.617us 0.000us 0.00% 0.000us 0.000us 3894 aten::reshape 0.01% 17.293ms 0.04% 52.929ms 8.341us 0.000us 0.00% 119.761ms 18.872us 6346 hipMemcpyAsync 0.01% 16.227ms 0.01% 16.484ms 8.479us 0.000us 0.00% 0.000us 0.000us 1944 hipExtLaunchKernel 0.01% 15.702ms 0.01% 15.702ms 15.260us 0.000us 0.00% 0.000us 0.000us 1029 FlashAttnVarlenQKVPackedFuncBackward 0.01% 15.336ms 0.02% 22.799ms 367.720us 61.910s 51.16% 61.910s 998.547ms 62 aten::silu 0.01% 15.281ms 0.01% 16.855ms 99.147us 150.898ms 0.12% 150.898ms 887.635us 170 hipExtModuleLaunchKernel 0.01% 15.208ms 0.01% 16.017ms 11.882us 0.000us 0.00% 0.000us 0.000us 1348 aten::nonzero 0.01% 14.269ms 0.58% 695.256ms 5.652ms 5.076ms 0.00% 5.081ms 41.308us 123 SeqAllToAll4DBackward 0.01% 14.189ms 58.26% 69.692s 290.384ms 0.000us 0.00% 912.214ms 3.801ms 240 triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_0 0.01% 13.436ms 0.01% 16.413ms 51.291us 172.484ms 0.14% 172.484ms 539.014us 320 IndexFirstAxis 0.01% 12.383ms 0.02% 22.669ms 185.810us 0.000us 0.00% 252.494ms 2.070ms 122 triton_poi_fused__to_copy_add_mul_0 0.01% 12.324ms 0.01% 15.316ms 85.088us 171.952ms 0.14% 171.952ms 955.287us 180 TorchDynamo Cache Lookup 0.01% 12.240ms 0.01% 12.240ms 27.630us 0.000us 0.00% 0.000us 0.000us 443 autograd::engine::evaluate_function: ViewBackward0 0.01% 12.207ms 0.02% 25.937ms 14.150us 0.000us 0.00% 40.065ms 21.858us 1833 autograd::engine::evaluate_function: SplitWithSizesB... 0.01% 11.801ms 0.02% 27.372ms 170.014us 0.000us 0.00% 327.096ms 2.032ms 161 aten::gelu 0.01% 11.616ms 0.01% 12.852ms 80.324us 628.496ms 0.52% 628.496ms 3.928ms 160 autograd::engine::evaluate_function: CompiledFunctio... 0.01% 11.503ms 0.08% 92.023ms 418.289us 0.000us 0.00% 895.266ms 4.069ms 220 FullyShardedDataParallel._pre_backward_hook 0.01% 11.401ms 0.07% 83.406ms 1.367ms 0.000us 0.00% 473.323ms 7.759ms 61 hipModuleLaunchKernel 0.01% 10.926ms 0.01% 10.926ms 13.276us 0.000us 0.00% 0.000us 0.000us 823 aten::empty_like 0.01% 10.751ms 0.05% 56.430ms 19.365us 0.000us 0.00% 0.000us 0.000us 2914 aten::linear 0.01% 10.633ms 0.35% 423.541ms 319.413us 0.000us 0.00% 19.666s 14.831ms 1326 aten::split_with_sizes 0.01% 9.871ms 0.01% 11.485ms 30.067us 0.000us 0.00% 0.000us 0.000us 382 aten::clone 0.01% 9.671ms 0.15% 183.458ms 112.070us 0.000us 0.00% 2.072s 1.266ms 1637 autograd::engine::evaluate_function: SliceBackward0 0.01% 9.390ms 0.07% 88.612ms 72.514us 0.000us 0.00% 830.460ms 679.591us 1222 aten::select 0.01% 9.342ms 0.01% 11.148ms 6.519us 0.000us 0.00% 0.000us 0.000us 1710 autograd::engine::evaluate_function: torch::autograd... 0.01% 9.068ms 0.09% 112.929ms 1.851ms 0.000us 0.00% 904.802ms 14.833ms 61 AddmmBackward0 0.01% 7.979ms 0.07% 86.524ms 252.256us 0.000us 0.00% 5.045s 14.708ms 343 c10d::allgather_ 0.01% 7.895ms 0.05% 57.818ms 481.814us 0.000us 0.00% 9.660ms 80.497us 120 aten::narrow 0.01% 7.848ms 0.02% 21.865ms 8.330us 0.000us 0.00% 0.000us 0.000us 2625 aten::unsqueeze 0.01% 7.737ms 0.01% 8.990ms 8.578us 0.000us 0.00% 0.000us 0.000us 1048 aten::t 0.01% 7.666ms 0.01% 16.573ms 7.701us 0.000us 0.00% 0.000us 0.000us 2152 aten::to 0.01% 7.546ms 0.15% 182.266ms 43.951us 0.000us 0.00% 1.238s 298.508us 4147 aten::add_ 0.01% 7.418ms 0.01% 8.124ms 17.249us 112.621ms 0.09% 112.621ms 239.111us 471 aten::split 0.01% 7.134ms 0.01% 13.547ms 60.477us 0.000us 0.00% 0.000us 0.000us 224 detach 0.01% 6.853ms 0.01% 6.853ms 2.994us 0.000us 0.00% 0.000us 0.000us 2289 IndexFirstAxisBackward 0.01% 6.393ms 0.01% 16.253ms 262.140us 0.000us 0.00% 171.590ms 2.768ms 62 hipMemsetAsync 0.01% 6.317ms 0.01% 6.317ms 10.338us 0.000us 0.00% 0.000us 0.000us 611 aten::cumsum 0.01% 6.314ms 0.01% 7.129ms 56.581us 765.130us 0.00% 790.090us 6.271us 126 aten::gather 0.01% 6.296ms 0.01% 6.888ms 56.462us 252.494ms 0.21% 252.494ms 2.070ms 122 autograd::engine::evaluate_function: SeqAllToAll4DBa... 0.01% 6.158ms 58.27% 69.698s 290.410ms 0.000us 0.00% 912.214ms 3.801ms 240 hipPointerGetAttribute 0.01% 6.128ms 0.01% 6.128ms 1.610us 0.000us 0.00% 0.000us 0.000us 3806 IndexPutFirstAxis 0.01% 6.036ms 0.02% 21.367ms 175.136us 0.000us 0.00% 153.039ms 1.254ms 122 triton_red_fused__to_copy_mul_sum_0 0.01% 6.015ms 0.01% 7.151ms 44.694us 747.025ms 0.62% 747.025ms 4.669ms 160 aten::zero_ 0.00% 5.962ms 0.03% 33.169ms 23.591us 0.000us 0.00% 432.768ms 307.801us 1406 aten::_local_scalar_dense 0.00% 5.668ms 0.14% 162.691ms 428.133us 1.831ms 0.00% 1.831ms 4.819us 380 autograd::engine::evaluate_function: AddmmBackward0 0.00% 5.654ms 0.09% 113.288ms 330.287us 0.000us 0.00% 5.161s 15.046ms 343 aten::max 0.00% 5.628ms 0.01% 7.470ms 61.233us 1.377ms 0.00% 1.377ms 11.287us 122 FullyShardedDataParallel._post_backward_prefetch 0.00% 5.431ms 0.00% 5.431ms 89.035us 0.000us 0.00% 0.000us 0.000us 61 autograd::engine::evaluate_function: AddBackward0 0.00% 5.344ms 0.08% 96.288ms 260.942us 0.000us 0.00% 489.476ms 1.326ms 369 aten::zeros 0.00% 5.295ms 0.04% 52.615ms 37.422us 0.000us 0.00% 432.768ms 307.801us 1406 aten::linalg_vector_norm 0.00% 5.189ms 0.01% 6.446ms 50.357us 6.687ms 0.01% 6.687ms 52.239us 128 aten::index 0.00% 5.120ms 0.00% 5.578ms 87.152us 54.361ms 0.04% 54.361ms 849.395us 64 aten::_index_put_impl_ 0.00% 4.958ms 0.00% 5.766ms 47.266us 107.108ms 0.09% 107.108ms 877.935us 122 triton_red_fused__to_copy_add_div_mul_pow_sum_1 0.00% 4.908ms 0.00% 5.568ms 34.800us 83.488ms 0.07% 83.488ms 521.799us 160 FullyShardedDataParallel.rate_limiter 0.00% 4.907ms 0.00% 5.409ms 44.706us 0.000us 0.00% 1.309s 10.814ms 121 FullyShardedDataParallel._pre_forward_prefetch 0.00% 4.888ms 0.00% 4.888ms 80.137us 0.000us 0.00% 0.000us 0.000us 61 aten::expand 0.00% 4.669ms 0.00% 5.578ms 6.030us 0.000us 0.00% 0.000us 0.000us 925 hipEventDestroy 0.00% 4.602ms 0.00% 4.602ms 1.428us 1.232s 1.02% 1.232s 382.276us 3223 aten::div_ 0.00% 4.275ms 0.00% 4.499ms 36.875us 106.972ms 0.09% 106.972ms 876.822us 122 aten::native_layer_norm_backward 0.00% 4.261ms 0.01% 8.740ms 69.922us 142.876ms 0.12% 570.958ms 4.568ms 125 ViewBackward0 0.00% 4.117ms 0.01% 13.703ms 7.476us 0.000us 0.00% 40.065ms 21.858us 1833 NativeLayerNormBackward0 0.00% 3.937ms 0.01% 13.061ms 104.488us 0.000us 0.00% 570.958ms 4.568ms 125 IndexPutFirstAxisBackward 0.00% 3.914ms 0.01% 9.545ms 153.953us 0.000us 0.00% 54.355ms 876.687us 62 Optimizer.step#AdamW.step 0.00% 3.617ms 0.11% 129.649ms 129.649ms 0.000us 0.00% 192.587ms 192.587ms 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 119.617s Self CUDA time total: 121.008s ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ hipDeviceSynchronize 84.39% 100.947s 84.40% 100.947s 140.010ms 0.000us 0.00% 22.695ms 31.477us 721 hipMemcpyWithStream 12.75% 15.248s 12.75% 15.248s 30.681ms 0.000us 0.00% 289.679ms 582.855us 497 aten::copy_ 0.47% 557.461ms 12.29% 14.701s 1.969ms 4.209s 3.49% 4.499s 602.405us 7468 hipLaunchKernel 0.22% 258.406ms 0.22% 260.750ms 17.855us 0.000us 0.00% 2.880us 0.000us 14604 record_param_comms 0.17% 197.487ms 0.22% 264.506ms 128.526us 3.278s 2.71% 3.280s 1.594ms 2058 SeqAllToAll4D 0.14% 164.261ms 84.84% 101.479s 140.943ms 0.000us 0.00% 2.205s 3.062ms 720 FullyShardedDataParallel.forward 0.14% 163.525ms 38.80% 46.414s 760.885ms 0.000us 0.00% 50.589s 829.320ms 61 MulBackward0 0.13% 153.208ms 21.13% 25.277s 103.172ms 0.000us 0.00% 23.731s 96.860ms 245 aten::addmm 0.11% 130.625ms 0.12% 147.483ms 222.448us 9.535s 7.90% 9.535s 14.382ms 663 aten::cat 0.10% 119.669ms 0.10% 123.811ms 108.893us 1.512s 1.25% 1.512s 1.330ms 1137 aten::empty_strided 0.07% 87.012ms 0.07% 87.122ms 17.252us 0.000us 0.00% 0.000us 0.000us 5050 hipStreamWaitEvent 0.07% 79.904ms 0.07% 79.904ms 25.767us 25.324ms 0.02% 25.324ms 8.166us 3101 aten::empty 0.06% 73.305ms 0.06% 73.309ms 14.084us 0.000us 0.00% 0.000us 0.000us 5205 aten::mm 0.06% 72.259ms 0.06% 74.953ms 110.225us 5.059s 4.19% 5.059s 7.440ms 680 aten::mul 0.06% 69.652ms 0.06% 74.141ms 74.739us 505.159ms 0.42% 505.159ms 509.233us 992 FullyShardedDataParallel._post_backward_hook 0.05% 64.519ms 0.09% 106.795ms 1.751ms 0.000us 0.00% 714.963ms 11.721ms 61 CompiledFunctionBackward 0.05% 56.192ms 0.07% 79.485ms 361.296us 0.000us 0.00% 875.612ms 3.980ms 220 FullyShardedDataParallel._pre_forward 0.04% 53.278ms 0.07% 86.754ms 1.422ms 0.000us 0.00% 737.419ms 12.089ms 61 aten::sum 0.04% 49.696ms 0.05% 65.222ms 67.447us 211.664ms 0.18% 212.531ms 219.784us 967 FullyShardedDataParallel._pre_backward_prefetch 0.04% 48.120ms 0.06% 75.780ms 1.242ms 0.000us 0.00% 479.883ms 7.867ms 61 FullyShardedDataParallel._post_forward 0.04% 47.755ms 0.04% 50.440ms 826.878us 0.000us 0.00% 0.000us 0.000us 61 Torch-Compiled Region 0.03% 40.873ms 12.24% 14.641s 33.050ms 0.000us 0.00% 1.095s 2.472ms 443 aten::view 0.03% 39.465ms 0.03% 39.502ms 4.252us 0.000us 0.00% 0.000us 0.000us 9291 autograd::engine::evaluate_function: ToCopyBackward0... 0.03% 38.473ms 0.09% 109.679ms 99.799us 0.000us 0.00% 388.637ms 353.628us 1099 aten::cos 0.03% 36.409ms 0.03% 36.409ms 12.136ms 0.000us 0.00% 0.000us 0.000us 3 aten::fill_ 0.03% 34.626ms 0.03% 35.851ms 21.728us 427.565ms 0.35% 427.565ms 259.130us 1650 aten::slice 0.03% 33.081ms 0.03% 40.522ms 5.786us 0.000us 0.00% 0.000us 0.000us 7003 autograd::engine::evaluate_function: SiluBackward0 0.03% 30.990ms 0.03% 39.789ms 442.099us 0.000us 0.00% 998.567us 11.095us 90 _AllGather 0.03% 30.557ms 0.09% 109.153ms 909.607us 0.000us 0.00% 17.857ms 148.809us 120 aten::add 0.02% 27.689ms 0.03% 30.078ms 44.693us 545.004ms 0.45% 545.004ms 809.813us 673 aten::_to_copy 0.02% 27.209ms 0.15% 182.563ms 50.127us 0.000us 0.00% 1.164s 319.572us 3642 aten::sin 0.02% 27.014ms 0.02% 27.024ms 9.008ms 0.000us 0.00% 0.000us 0.000us 3 FlashAttnVarlenQKVPackedFunc 0.02% 26.943ms 0.03% 37.729ms 309.254us 30.004s 24.84% 30.004s 245.935ms 122 c10d::alltoall_base_ 0.02% 25.590ms 0.14% 171.406ms 238.063us 0.000us 0.00% 1.508s 2.095ms 720 aten::as_strided 0.02% 23.856ms 0.02% 23.921ms 1.286us 0.000us 0.00% 0.000us 0.000us 18595 aten::native_layer_norm 0.02% 20.044ms 0.04% 44.733ms 182.582us 155.344ms 0.13% 1.004s 4.100ms 245 aten::silu 0.02% 18.431ms 0.02% 20.073ms 118.078us 102.938ms 0.09% 102.938ms 605.518us 170 aten::reshape 0.02% 18.358ms 0.05% 53.950ms 8.507us 0.000us 0.00% 118.785ms 18.730us 6342 aten::transpose 0.02% 18.043ms 0.02% 26.141ms 6.713us 0.000us 0.00% 0.000us 0.000us 3894 hipMemcpyAsync 0.02% 17.999ms 0.02% 18.308ms 9.418us 0.000us 0.00% 0.000us 0.000us 1944 hipExtLaunchKernel 0.01% 17.173ms 0.01% 17.173ms 16.689us 0.000us 0.00% 0.000us 0.000us 1029 hipExtModuleLaunchKernel 0.01% 16.862ms 0.01% 17.669ms 13.108us 0.000us 0.00% 0.000us 0.000us 1348 FlashAttnVarlenQKVPackedFuncBackward 0.01% 15.920ms 0.02% 23.768ms 383.355us 62.116s 51.44% 62.116s 1.002s 62 SeqAllToAll4DBackward 0.01% 14.952ms 58.33% 69.774s 290.725ms 0.000us 0.00% 900.677ms 3.753ms 240 triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_0 0.01% 13.995ms 0.01% 17.146ms 53.581us 172.262ms 0.14% 172.262ms 538.319us 320 triton_poi_fused__to_copy_add_mul_0 0.01% 13.973ms 0.01% 17.207ms 95.593us 169.853ms 0.14% 169.853ms 943.630us 180 aten::nonzero 0.01% 13.142ms 0.58% 692.571ms 5.631ms 5.123ms 0.00% 5.123ms 41.651us 123 IndexFirstAxis 0.01% 12.792ms 0.02% 23.121ms 189.519us 0.000us 0.00% 252.362ms 2.069ms 122 hipModuleLaunchKernel 0.01% 12.584ms 0.01% 12.584ms 15.290us 0.000us 0.00% 0.000us 0.000us 823 autograd::engine::evaluate_function: ViewBackward0 0.01% 12.165ms 0.02% 26.348ms 14.374us 0.000us 0.00% 39.654ms 21.634us 1833 TorchDynamo Cache Lookup 0.01% 12.161ms 0.01% 12.161ms 27.453us 0.000us 0.00% 0.000us 0.000us 443 autograd::engine::evaluate_function: CompiledFunctio... 0.01% 12.031ms 0.08% 93.227ms 423.760us 0.000us 0.00% 875.612ms 3.980ms 220 aten::gelu 0.01% 11.978ms 0.01% 13.156ms 82.225us 584.944ms 0.48% 584.944ms 3.656ms 160 autograd::engine::evaluate_function: SplitWithSizesB... 0.01% 11.927ms 0.02% 28.581ms 177.519us 0.000us 0.00% 329.352ms 2.046ms 161 FullyShardedDataParallel._pre_backward_hook 0.01% 11.885ms 0.08% 90.091ms 1.477ms 0.000us 0.00% 479.883ms 7.867ms 61 aten::empty_like 0.01% 10.928ms 0.05% 57.438ms 19.711us 0.000us 0.00% 0.000us 0.000us 2914 aten::split_with_sizes 0.01% 10.453ms 0.01% 12.167ms 31.851us 0.000us 0.00% 0.000us 0.000us 382 aten::clone 0.01% 10.109ms 0.16% 189.225ms 115.592us 0.000us 0.00% 2.073s 1.266ms 1637 autograd::engine::evaluate_function: SliceBackward0 0.01% 9.843ms 0.08% 95.828ms 78.419us 0.000us 0.00% 822.904ms 673.408us 1222 aten::linear 0.01% 9.554ms 0.36% 429.295ms 323.752us 0.000us 0.00% 19.479s 14.690ms 1326 autograd::engine::evaluate_function: torch::autograd... 0.01% 9.452ms 0.10% 117.091ms 1.920ms 0.000us 0.00% 714.963ms 11.721ms 61 aten::select 0.01% 9.201ms 0.01% 11.173ms 6.534us 0.000us 0.00% 0.000us 0.000us 1710 aten::add_ 0.01% 8.133ms 0.01% 8.775ms 18.791us 108.768ms 0.09% 108.768ms 232.907us 467 aten::t 0.01% 7.981ms 0.01% 17.309ms 8.043us 0.000us 0.00% 0.000us 0.000us 2152 aten::narrow 0.01% 7.952ms 0.02% 22.491ms 8.568us 0.000us 0.00% 0.000us 0.000us 2625 aten::to 0.01% 7.842ms 0.16% 191.188ms 46.147us 0.000us 0.00% 1.164s 280.927us 4143 aten::unsqueeze 0.01% 7.479ms 0.01% 8.722ms 8.354us 0.000us 0.00% 0.000us 0.000us 1044 AddmmBackward0 0.01% 7.359ms 0.08% 94.651ms 275.951us 0.000us 0.00% 5.059s 14.749ms 343 hipMemsetAsync 0.01% 6.938ms 0.01% 6.938ms 11.374us 0.000us 0.00% 0.000us 0.000us 610 detach 0.01% 6.908ms 0.01% 6.908ms 3.023us 0.000us 0.00% 0.000us 0.000us 2285 hipPointerGetAttribute 0.01% 6.903ms 0.01% 6.903ms 1.814us 0.000us 0.00% 0.000us 0.000us 3806 triton_red_fused__to_copy_mul_sum_0 0.01% 6.727ms 0.01% 8.328ms 52.050us 729.750ms 0.60% 729.750ms 4.561ms 160 IndexFirstAxisBackward 0.01% 6.727ms 0.01% 17.087ms 275.600us 0.000us 0.00% 171.603ms 2.768ms 62 c10d::allgather_ 0.01% 6.705ms 0.05% 56.677ms 472.312us 0.000us 0.00% 15.971ms 133.095us 120 aten::zero_ 0.01% 6.352ms 0.03% 35.676ms 25.374us 0.000us 0.00% 426.623ms 303.430us 1406 aten::gather 0.01% 6.311ms 0.01% 6.866ms 56.276us 252.362ms 0.21% 252.362ms 2.069ms 122 aten::cumsum 0.01% 6.310ms 0.01% 6.916ms 56.226us 768.333us 0.00% 774.413us 6.296us 123 FullyShardedDataParallel._post_backward_prefetch 0.01% 6.010ms 0.01% 6.010ms 98.524us 0.000us 0.00% 0.000us 0.000us 61 autograd::engine::evaluate_function: AddmmBackward0 0.00% 5.789ms 0.10% 122.986ms 358.561us 0.000us 0.00% 5.171s 15.075ms 343 aten::_local_scalar_dense 0.00% 5.701ms 0.13% 157.012ms 422.075us 1.907ms 0.00% 1.907ms 5.127us 372 IndexPutFirstAxis 0.00% 5.663ms 0.02% 21.144ms 173.312us 0.000us 0.00% 153.130ms 1.255ms 122 aten::zeros 0.00% 5.571ms 0.05% 56.419ms 40.127us 0.000us 0.00% 426.623ms 303.430us 1406 autograd::engine::evaluate_function: SeqAllToAll4DBa... 0.00% 5.481ms 58.34% 69.779s 290.748ms 0.000us 0.00% 900.677ms 3.753ms 240 autograd::engine::evaluate_function: AddBackward0 0.00% 5.428ms 0.09% 104.505ms 283.211us 0.000us 0.00% 493.195ms 1.337ms 369 triton_red_fused__to_copy_add_div_mul_pow_sum_1 0.00% 5.393ms 0.01% 6.184ms 38.651us 82.538ms 0.07% 82.538ms 515.863us 160 aten::max 0.00% 5.386ms 0.01% 7.181ms 58.861us 1.383ms 0.00% 1.383ms 11.333us 122 hipEventDestroy 0.00% 5.092ms 0.00% 5.092ms 1.580us 385.146ms 0.32% 385.146ms 119.499us 3223 FullyShardedDataParallel.rate_limiter 0.00% 5.048ms 0.00% 5.637ms 46.588us 0.000us 0.00% 3.053ms 25.233us 121 aten::index 0.00% 5.046ms 0.00% 5.527ms 86.354us 54.621ms 0.05% 54.621ms 853.456us 64 FullyShardedDataParallel._pre_forward_prefetch 0.00% 4.868ms 0.00% 4.872ms 79.876us 0.000us 0.00% 0.000us 0.000us 61 aten::linalg_vector_norm 0.00% 4.777ms 0.00% 5.979ms 48.215us 6.667ms 0.01% 6.667ms 53.770us 124 aten::_index_put_impl_ 0.00% 4.764ms 0.00% 5.589ms 45.814us 107.247ms 0.09% 107.247ms 879.071us 122 aten::expand 0.00% 4.576ms 0.00% 5.549ms 5.999us 0.000us 0.00% 0.000us 0.000us 925 aten::div_ 0.00% 4.561ms 0.00% 4.801ms 39.354us 107.217ms 0.09% 107.217ms 878.827us 122 aten::native_layer_norm_backward 0.00% 4.375ms 0.01% 9.263ms 74.102us 144.461ms 0.12% 571.313ms 4.571ms 125 IndexPutFirstAxisBackward 0.00% 4.253ms 0.01% 9.841ms 158.724us 0.000us 0.00% 54.615ms 880.881us 62 aten::split 0.00% 4.110ms 0.01% 10.937ms 48.824us 0.000us 0.00% 0.000us 0.000us 224 ViewBackward0 0.00% 3.948ms 0.01% 14.153ms 7.721us 0.000us 0.00% 39.654ms 21.634us 1833 NativeLayerNormBackward0 0.00% 3.907ms 0.01% 13.667ms 109.332us 0.000us 0.00% 571.313ms 4.571ms 125 _AllGatherBackward 0.00% 3.739ms 0.00% 4.963ms 82.720us 0.000us 0.00% 0.000us 0.000us 60 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 119.613s Self CUDA time total: 120.765s ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ hipDeviceSynchronize 84.29% 100.815s 84.29% 100.815s 139.827ms 0.000us 0.00% 11.784ms 16.344us 721 hipMemcpyWithStream 12.64% 15.119s 12.64% 15.119s 30.421ms 0.000us 0.00% 54.022ms 108.697us 497 aten::copy_ 0.37% 445.284ms 12.22% 14.615s 1.957ms 4.319s 3.58% 4.373s 585.533us 7468 hipLaunchKernel 0.22% 268.445ms 0.23% 270.948ms 18.423us 0.000us 0.00% 1.377ms 0.094us 14707 record_param_comms 0.18% 210.239ms 0.23% 280.909ms 136.496us 3.142s 2.61% 3.144s 1.528ms 2058 SeqAllToAll4D 0.15% 179.284ms 84.78% 101.391s 140.821ms 0.000us 0.00% 2.162s 3.003ms 720 FullyShardedDataParallel.forward 0.15% 178.636ms 38.80% 46.401s 760.674ms 0.000us 0.00% 47.949s 786.055ms 61 MulBackward0 0.14% 163.895ms 21.11% 25.242s 103.029ms 0.000us 0.00% 23.721s 96.822ms 245 aten::addmm 0.12% 138.023ms 0.13% 156.645ms 236.266us 9.019s 7.48% 9.019s 13.604ms 663 aten::cat 0.10% 119.951ms 0.10% 124.906ms 109.856us 1.514s 1.26% 1.514s 1.332ms 1137 aten::empty_strided 0.08% 91.475ms 0.08% 91.560ms 17.949us 0.000us 0.00% 0.000us 0.000us 5101 hipStreamWaitEvent 0.07% 82.342ms 0.07% 82.342ms 26.553us 256.703ms 0.21% 256.703ms 82.781us 3101 aten::empty 0.07% 82.087ms 0.07% 82.091ms 15.771us 0.000us 0.00% 0.000us 0.000us 5205 aten::mul 0.07% 78.171ms 0.07% 82.778ms 83.446us 857.156ms 0.71% 857.156ms 864.069us 992 aten::mm 0.06% 73.790ms 0.06% 76.596ms 112.641us 5.047s 4.19% 5.047s 7.422ms 680 FullyShardedDataParallel._post_backward_hook 0.06% 70.646ms 0.10% 115.973ms 1.901ms 0.000us 0.00% 654.530ms 10.730ms 61 FullyShardedDataParallel._pre_forward 0.05% 60.434ms 0.08% 98.634ms 1.617ms 0.000us 0.00% 939.898ms 15.408ms 61 CompiledFunctionBackward 0.05% 59.356ms 0.07% 83.468ms 379.399us 0.000us 0.00% 884.686ms 4.021ms 220 FullyShardedDataParallel._post_forward 0.04% 53.518ms 0.05% 56.883ms 932.508us 0.000us 0.00% 0.000us 0.000us 61 aten::sum 0.04% 53.185ms 0.06% 69.853ms 72.386us 220.715ms 0.18% 221.503ms 229.536us 965 FullyShardedDataParallel._pre_backward_prefetch 0.04% 52.385ms 0.07% 80.737ms 1.324ms 0.000us 0.00% 468.482ms 7.680ms 61 Torch-Compiled Region 0.04% 44.803ms 12.16% 14.545s 32.832ms 0.000us 0.00% 893.348ms 2.017ms 443 aten::view 0.04% 44.421ms 0.04% 44.458ms 4.759us 0.000us 0.00% 0.000us 0.000us 9342 aten::slice 0.03% 37.753ms 0.04% 45.785ms 6.398us 0.000us 0.00% 0.000us 0.000us 7156 autograd::engine::evaluate_function: ToCopyBackward0... 0.03% 37.094ms 0.10% 117.912ms 107.290us 0.000us 0.00% 426.388ms 387.978us 1099 aten::cos 0.03% 37.060ms 0.03% 37.092ms 12.364ms 0.000us 0.00% 9.920us 3.307us 3 aten::fill_ 0.03% 36.417ms 0.03% 37.900ms 22.970us 446.123ms 0.37% 446.123ms 270.378us 1650 _AllGather 0.03% 33.180ms 0.10% 120.552ms 1.005ms 0.000us 0.00% 13.376ms 111.464us 120 aten::sin 0.03% 33.150ms 0.03% 33.150ms 11.050ms 0.000us 0.00% 0.000us 0.000us 3 c10d::alltoall_base_ 0.03% 32.510ms 0.15% 184.724ms 256.562us 0.000us 0.00% 1.481s 2.057ms 720 aten::_to_copy 0.02% 29.698ms 0.17% 202.827ms 55.691us 0.000us 0.00% 1.235s 339.178us 3642 aten::add 0.02% 29.617ms 0.03% 32.248ms 47.917us 591.601ms 0.49% 591.601ms 879.051us 673 FlashAttnVarlenQKVPackedFunc 0.02% 27.531ms 0.03% 39.165ms 321.022us 30.004s 24.89% 30.004s 245.936ms 122 autograd::engine::evaluate_function: SiluBackward0 0.02% 26.916ms 0.03% 35.840ms 398.218us 0.000us 0.00% 933.289us 10.370us 90 aten::as_strided 0.02% 26.165ms 0.02% 26.193ms 1.390us 0.000us 0.00% 0.000us 0.000us 18850 aten::native_layer_norm 0.02% 21.612ms 0.04% 47.641ms 194.453us 164.670ms 0.14% 1.027s 4.190ms 245 aten::transpose 0.02% 20.405ms 0.02% 29.470ms 7.568us 0.000us 0.00% 0.000us 0.000us 3894 aten::reshape 0.02% 19.193ms 0.05% 59.393ms 9.290us 0.000us 0.00% 119.734ms 18.729us 6393 hipMemcpyAsync 0.02% 18.353ms 0.02% 18.669ms 9.603us 0.000us 0.00% 0.000us 0.000us 1944 hipExtModuleLaunchKernel 0.02% 18.092ms 0.02% 18.868ms 13.997us 0.000us 0.00% 0.000us 0.000us 1348 hipExtLaunchKernel 0.01% 17.813ms 0.01% 17.813ms 17.311us 0.000us 0.00% 0.000us 0.000us 1029 FlashAttnVarlenQKVPackedFuncBackward 0.01% 17.030ms 0.02% 25.480ms 410.975us 61.978s 51.40% 61.978s 999.644ms 62 aten::silu 0.01% 16.550ms 0.02% 18.324ms 107.787us 151.786ms 0.13% 151.786ms 892.860us 170 SeqAllToAll4DBackward 0.01% 16.211ms 58.32% 69.746s 290.608ms 0.000us 0.00% 940.326ms 3.918ms 240 aten::nonzero 0.01% 14.979ms 0.58% 690.356ms 5.613ms 5.146ms 0.00% 6.519ms 53.004us 123 triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_0 0.01% 14.638ms 0.02% 18.477ms 57.740us 172.633ms 0.14% 172.633ms 539.478us 320 triton_poi_fused__to_copy_add_mul_0 0.01% 13.873ms 0.01% 17.429ms 96.826us 172.593ms 0.14% 172.593ms 958.850us 180 IndexFirstAxis 0.01% 13.746ms 0.02% 25.290ms 207.293us 0.000us 0.00% 252.225ms 2.067ms 122 TorchDynamo Cache Lookup 0.01% 13.733ms 0.01% 13.733ms 31.001us 0.000us 0.00% 0.000us 0.000us 443 autograd::engine::evaluate_function: ViewBackward0 0.01% 13.057ms 0.02% 28.437ms 15.514us 0.000us 0.00% 40.108ms 21.881us 1833 aten::gelu 0.01% 12.731ms 0.01% 14.110ms 88.186us 582.548ms 0.48% 582.548ms 3.641ms 160 autograd::engine::evaluate_function: SplitWithSizesB... 0.01% 12.668ms 0.03% 30.123ms 187.102us 0.000us 0.00% 327.378ms 2.033ms 161 hipModuleLaunchKernel 0.01% 12.599ms 0.01% 12.599ms 15.309us 0.000us 0.00% 0.000us 0.000us 823 aten::empty_like 0.01% 12.391ms 0.05% 63.495ms 21.790us 0.000us 0.00% 0.000us 0.000us 2914 autograd::engine::evaluate_function: CompiledFunctio... 0.01% 12.192ms 0.08% 97.510ms 443.227us 0.000us 0.00% 884.686ms 4.021ms 220 FullyShardedDataParallel._pre_backward_hook 0.01% 11.988ms 0.08% 95.101ms 1.559ms 0.000us 0.00% 468.482ms 7.680ms 61 aten::split_with_sizes 0.01% 11.232ms 0.01% 13.131ms 34.374us 0.000us 0.00% 0.000us 0.000us 382 autograd::engine::evaluate_function: torch::autograd... 0.01% 11.012ms 0.11% 127.965ms 2.098ms 0.000us 0.00% 654.530ms 10.730ms 61 aten::clone 0.01% 10.719ms 0.17% 201.914ms 123.344us 0.000us 0.00% 2.081s 1.271ms 1637 aten::select 0.01% 10.365ms 0.01% 12.382ms 7.241us 0.000us 0.00% 0.000us 0.000us 1710 autograd::engine::evaluate_function: SliceBackward0 0.01% 10.157ms 0.09% 101.854ms 83.350us 0.000us 0.00% 842.437ms 689.392us 1222 aten::linear 0.01% 10.068ms 0.38% 456.123ms 343.984us 0.000us 0.00% 18.456s 13.919ms 1326 autograd::engine::evaluate_function: AddmmBackward0 0.01% 9.902ms 0.11% 130.544ms 380.593us 0.000us 0.00% 5.163s 15.053ms 343 aten::narrow 0.01% 8.988ms 0.02% 25.457ms 9.698us 0.000us 0.00% 0.000us 0.000us 2625 detach 0.01% 8.769ms 0.01% 8.769ms 3.754us 0.000us 0.00% 0.000us 0.000us 2336 aten::unsqueeze 0.01% 8.629ms 0.01% 9.968ms 9.104us 0.000us 0.00% 0.000us 0.000us 1095 aten::t 0.01% 8.606ms 0.02% 19.329ms 8.982us 0.000us 0.00% 0.000us 0.000us 2152 c10d::allgather_ 0.01% 8.552ms 0.05% 63.839ms 531.992us 0.000us 0.00% 11.043ms 92.023us 120 aten::add_ 0.01% 8.488ms 0.01% 9.062ms 17.495us 112.333ms 0.09% 112.333ms 216.859us 518 AddmmBackward0 0.01% 8.080ms 0.08% 97.569ms 284.457us 0.000us 0.00% 5.047s 14.714ms 343 aten::to 0.01% 7.743ms 0.18% 211.501ms 50.429us 0.000us 0.00% 1.235s 294.537us 4194 hipPointerGetAttribute 0.01% 7.667ms 0.01% 7.672ms 2.016us 0.000us 0.00% 0.000us 0.000us 3806 IndexFirstAxisBackward 0.01% 7.232ms 0.02% 18.514ms 298.613us 0.000us 0.00% 171.718ms 2.770ms 62 aten::gather 0.01% 7.100ms 0.01% 7.749ms 63.513us 252.225ms 0.21% 252.225ms 2.067ms 122 aten::cumsum 0.01% 7.042ms 0.01% 7.892ms 63.133us 768.331us 0.00% 786.892us 6.295us 125 triton_red_fused__to_copy_mul_sum_0 0.01% 6.934ms 0.01% 8.453ms 52.832us 736.670ms 0.61% 736.670ms 4.604ms 160 hipMemsetAsync 0.01% 6.823ms 0.01% 6.823ms 11.094us 0.000us 0.00% 0.000us 0.000us 615 aten::linalg_vector_norm 0.01% 6.638ms 0.01% 8.100ms 46.288us 7.442ms 0.01% 7.442ms 42.524us 175 FullyShardedDataParallel._post_backward_prefetch 0.01% 6.619ms 0.01% 6.619ms 108.504us 0.000us 0.00% 0.000us 0.000us 61 autograd::engine::evaluate_function: SeqAllToAll4DBa... 0.01% 6.540ms 58.32% 69.752s 290.635ms 0.000us 0.00% 940.326ms 3.918ms 240 aten::_local_scalar_dense 0.01% 6.354ms 0.12% 138.818ms 292.865us 2.121ms 0.00% 2.121ms 4.476us 474 aten::zero_ 0.01% 6.323ms 0.03% 36.857ms 26.214us 0.000us 0.00% 445.188ms 316.634us 1406 IndexPutFirstAxis 0.01% 6.298ms 0.02% 23.451ms 192.220us 0.000us 0.00% 152.956ms 1.254ms 122 hipEventDestroy 0.01% 6.277ms 0.01% 6.277ms 1.948us 159.248ms 0.13% 159.248ms 49.410us 3223 aten::zeros 0.01% 6.231ms 0.05% 61.787ms 43.946us 0.000us 0.00% 445.188ms 316.634us 1406 aten::max 0.01% 6.098ms 0.01% 8.167ms 66.943us 1.391ms 0.00% 1.391ms 11.403us 122 autograd::engine::evaluate_function: AddBackward0 0.00% 5.917ms 0.09% 110.475ms 299.390us 0.000us 0.00% 486.866ms 1.319ms 369 aten::index 0.00% 5.783ms 0.01% 6.309ms 98.570us 54.637ms 0.05% 54.637ms 853.701us 64 triton_red_fused__to_copy_add_div_mul_pow_sum_1 0.00% 5.627ms 0.01% 6.473ms 40.456us 83.179ms 0.07% 83.179ms 519.867us 160 FullyShardedDataParallel._pre_forward_prefetch 0.00% 5.573ms 0.00% 5.573ms 91.369us 0.000us 0.00% 0.000us 0.000us 61 FullyShardedDataParallel.rate_limiter 0.00% 5.573ms 0.01% 6.152ms 50.844us 0.000us 0.00% 3.563ms 29.450us 121 aten::expand 0.00% 5.553ms 0.01% 6.616ms 7.152us 0.000us 0.00% 0.000us 0.000us 925 aten::_index_put_impl_ 0.00% 5.448ms 0.01% 6.343ms 51.991us 107.077ms 0.09% 107.077ms 877.681us 122 aten::div_ 0.00% 4.893ms 0.00% 5.079ms 41.633us 109.444ms 0.09% 109.444ms 897.081us 122 aten::native_layer_norm_backward 0.00% 4.839ms 0.01% 9.662ms 77.296us 144.927ms 0.12% 570.384ms 4.563ms 125 Optimizer.step#AdamW.step 0.00% 4.712ms 0.12% 141.030ms 141.030ms 0.000us 0.00% 192.205ms 192.205ms 1 IndexPutFirstAxisBackward 0.00% 4.313ms 0.01% 10.717ms 172.849us 0.000us 0.00% 54.630ms 881.131us 62 aten::split 0.00% 4.201ms 0.01% 11.844ms 52.875us 0.000us 0.00% 0.000us 0.000us 224 ViewBackward0 0.00% 4.197ms 0.01% 15.340ms 8.369us 0.000us 0.00% 40.108ms 21.881us 1833 NativeLayerNormBackward0 0.00% 4.040ms 0.01% 14.413ms 115.301us 0.000us 0.00% 570.384ms 4.563ms 125 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 119.599s Self CUDA time total: 120.571s ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ hipDeviceSynchronize 84.13% 100.632s 84.13% 100.633s 139.574ms 0.000us 0.00% 44.802ms 62.139us 721 hipMemcpyWithStream 13.00% 15.548s 13.00% 15.548s 31.284ms 0.000us 0.00% 38.850ms 78.169us 497 aten::copy_ 0.48% 571.017ms 12.54% 14.995s 2.008ms 4.309s 3.56% 4.349s 582.307us 7468 hipLaunchKernel 0.21% 248.859ms 0.21% 251.254ms 17.110us 0.000us 0.00% 0.000us 0.000us 14685 record_param_comms 0.16% 194.944ms 0.22% 263.480ms 128.027us 3.302s 2.73% 3.304s 1.605ms 2058 FullyShardedDataParallel.forward 0.15% 179.267ms 38.82% 46.439s 761.289ms 0.000us 0.00% 52.677s 863.562ms 61 SeqAllToAll4D 0.14% 164.052ms 84.58% 101.163s 140.504ms 0.000us 0.00% 1.979s 2.748ms 720 MulBackward0 0.13% 152.332ms 21.22% 25.385s 103.611ms 0.000us 0.00% 23.870s 97.427ms 245 aten::addmm 0.11% 132.979ms 0.13% 150.635ms 227.201us 9.635s 7.96% 9.635s 14.533ms 663 aten::cat 0.10% 116.317ms 0.10% 121.361ms 106.738us 1.527s 1.26% 1.527s 1.343ms 1137 aten::empty_strided 0.08% 90.520ms 0.08% 90.668ms 17.813us 0.000us 0.00% 0.000us 0.000us 5090 hipStreamWaitEvent 0.07% 79.311ms 0.07% 79.311ms 25.576us 7.844ms 0.01% 7.844ms 2.529us 3101 aten::empty 0.06% 74.226ms 0.06% 74.230ms 14.261us 0.000us 0.00% 0.000us 0.000us 5205 aten::mm 0.06% 72.799ms 0.06% 75.579ms 111.146us 5.064s 4.18% 5.064s 7.448ms 680 aten::mul 0.06% 68.475ms 0.06% 73.025ms 73.614us 597.022ms 0.49% 597.022ms 601.837us 992 FullyShardedDataParallel._post_backward_hook 0.05% 64.741ms 0.09% 107.566ms 1.763ms 0.000us 0.00% 924.418ms 15.154ms 61 CompiledFunctionBackward 0.05% 56.354ms 0.07% 79.313ms 360.515us 0.000us 0.00% 876.167ms 3.983ms 220 FullyShardedDataParallel._pre_forward 0.05% 54.796ms 0.08% 89.985ms 1.475ms 0.000us 0.00% 1.786s 29.277ms 61 aten::sum 0.04% 50.336ms 0.06% 66.195ms 68.383us 221.204ms 0.18% 222.071ms 229.412us 968 FullyShardedDataParallel._post_forward 0.04% 48.639ms 0.04% 51.511ms 844.448us 0.000us 0.00% 0.000us 0.000us 61 FullyShardedDataParallel._pre_backward_prefetch 0.04% 48.075ms 0.06% 75.906ms 1.244ms 0.000us 0.00% 473.773ms 7.767ms 61 aten::view 0.03% 40.847ms 0.03% 40.880ms 4.381us 0.000us 0.00% 0.000us 0.000us 9331 Torch-Compiled Region 0.03% 40.291ms 12.49% 14.937s 33.719ms 0.000us 0.00% 874.884ms 1.975ms 443 autograd::engine::evaluate_function: ToCopyBackward0... 0.03% 37.509ms 0.09% 107.883ms 98.164us 0.000us 0.00% 414.760ms 377.397us 1099 aten::slice 0.03% 35.601ms 0.04% 43.350ms 6.086us 0.000us 0.00% 0.000us 0.000us 7123 aten::fill_ 0.03% 35.225ms 0.03% 36.585ms 22.173us 433.568ms 0.36% 433.568ms 262.769us 1650 aten::cos 0.03% 34.245ms 0.03% 34.267ms 11.422ms 0.000us 0.00% 9.761us 3.254us 3 autograd::engine::evaluate_function: SiluBackward0 0.03% 31.852ms 0.03% 40.426ms 449.178us 0.000us 0.00% 927.526us 10.306us 90 _AllGather 0.03% 30.930ms 0.09% 110.970ms 924.751us 0.000us 0.00% 11.608ms 96.737us 120 aten::_to_copy 0.02% 27.653ms 0.16% 187.876ms 51.586us 0.000us 0.00% 1.227s 336.790us 3642 aten::add 0.02% 27.628ms 0.03% 30.014ms 44.597us 560.307ms 0.46% 560.307ms 832.551us 673 FlashAttnVarlenQKVPackedFunc 0.02% 27.086ms 0.03% 38.023ms 311.666us 30.005s 24.79% 30.005s 245.939ms 122 c10d::alltoall_base_ 0.02% 25.647ms 0.14% 168.940ms 234.638us 0.000us 0.00% 1.272s 1.766ms 720 aten::as_strided 0.02% 24.681ms 0.02% 24.721ms 1.315us 0.000us 0.00% 0.000us 0.000us 18795 aten::native_layer_norm 0.02% 20.070ms 0.04% 44.874ms 183.158us 156.692ms 0.13% 1.021s 4.169ms 245 aten::transpose 0.02% 19.176ms 0.02% 27.479ms 7.057us 0.000us 0.00% 0.000us 0.000us 3894 hipMemcpyAsync 0.02% 18.440ms 0.02% 18.745ms 9.643us 0.000us 0.00% 0.000us 0.000us 1944 aten::reshape 0.01% 17.789ms 0.05% 54.178ms 8.489us 0.000us 0.00% 119.990ms 18.801us 6382 hipExtModuleLaunchKernel 0.01% 17.126ms 0.02% 17.946ms 13.313us 0.000us 0.00% 0.000us 0.000us 1348 hipExtLaunchKernel 0.01% 17.119ms 0.01% 17.119ms 16.636us 0.000us 0.00% 0.000us 0.000us 1029 aten::silu 0.01% 15.803ms 0.01% 17.426ms 102.506us 126.236ms 0.10% 126.236ms 742.564us 170 FlashAttnVarlenQKVPackedFuncBackward 0.01% 15.710ms 0.02% 23.556ms 379.933us 61.961s 51.19% 61.961s 999.366ms 62 SeqAllToAll4DBackward 0.01% 15.203ms 58.24% 69.658s 290.240ms 0.000us 0.00% 851.375ms 3.547ms 240 triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_0 0.01% 14.437ms 0.01% 17.849ms 55.779us 172.431ms 0.14% 172.431ms 538.846us 320 aten::sin 0.01% 14.082ms 0.01% 14.092ms 4.697ms 0.000us 0.00% 4.800us 1.600us 3 triton_poi_fused__to_copy_add_mul_0 0.01% 14.050ms 0.01% 17.374ms 96.525us 171.418ms 0.14% 171.418ms 952.322us 180 aten::nonzero 0.01% 13.300ms 0.58% 694.937ms 5.650ms 5.141ms 0.00% 5.167ms 42.006us 123 IndexFirstAxis 0.01% 13.133ms 0.02% 23.456ms 192.259us 0.000us 0.00% 252.274ms 2.068ms 122 hipModuleLaunchKernel 0.01% 12.788ms 0.01% 12.788ms 15.538us 0.000us 0.00% 0.000us 0.000us 823 autograd::engine::evaluate_function: ViewBackward0 0.01% 12.520ms 0.02% 26.815ms 14.629us 0.000us 0.00% 40.239ms 21.953us 1833 TorchDynamo Cache Lookup 0.01% 12.248ms 0.01% 12.248ms 27.648us 0.000us 0.00% 0.000us 0.000us 443 autograd::engine::evaluate_function: SplitWithSizesB... 0.01% 12.001ms 0.02% 28.555ms 177.358us 0.000us 0.00% 327.970ms 2.037ms 161 autograd::engine::evaluate_function: CompiledFunctio... 0.01% 11.995ms 0.08% 93.070ms 423.047us 0.000us 0.00% 876.167ms 3.983ms 220 aten::gelu 0.01% 11.808ms 0.01% 13.157ms 82.233us 613.952ms 0.51% 613.952ms 3.837ms 160 FullyShardedDataParallel._pre_backward_hook 0.01% 11.563ms 0.08% 89.873ms 1.473ms 0.000us 0.00% 473.773ms 7.767ms 61 aten::empty_like 0.01% 11.224ms 0.05% 57.993ms 19.901us 0.000us 0.00% 0.000us 0.000us 2914 aten::split_with_sizes 0.01% 11.062ms 0.01% 13.017ms 34.076us 0.000us 0.00% 0.000us 0.000us 382 aten::clone 0.01% 10.456ms 0.16% 190.503ms 116.373us 0.000us 0.00% 2.083s 1.272ms 1637 aten::linear 0.01% 10.229ms 0.37% 443.489ms 334.456us 0.000us 0.00% 19.688s 14.848ms 1326 autograd::engine::evaluate_function: SliceBackward0 0.01% 9.902ms 0.08% 96.433ms 78.914us 0.000us 0.00% 829.633ms 678.914us 1222 aten::select 0.01% 9.805ms 0.01% 11.765ms 6.880us 0.000us 0.00% 0.000us 0.000us 1710 autograd::engine::evaluate_function: torch::autograd... 0.01% 9.610ms 0.10% 118.063ms 1.935ms 0.000us 0.00% 924.418ms 15.154ms 61 aten::t 0.01% 8.418ms 0.02% 18.295ms 8.501us 0.000us 0.00% 0.000us 0.000us 2152 aten::narrow 0.01% 8.141ms 0.02% 23.071ms 8.789us 0.000us 0.00% 0.000us 0.000us 2625 aten::add_ 0.01% 8.038ms 0.01% 8.664ms 17.088us 112.618ms 0.09% 112.618ms 222.126us 507 aten::unsqueeze 0.01% 7.903ms 0.01% 9.223ms 8.508us 0.000us 0.00% 0.000us 0.000us 1084 aten::to 0.01% 7.402ms 0.16% 196.030ms 46.864us 0.000us 0.00% 1.227s 293.232us 4183 AddmmBackward0 0.01% 7.379ms 0.08% 95.766ms 279.202us 0.000us 0.00% 5.064s 14.765ms 343 detach 0.01% 7.063ms 0.01% 7.063ms 3.038us 0.000us 0.00% 0.000us 0.000us 2325 c10d::allgather_ 0.01% 6.917ms 0.05% 58.231ms 485.256us 0.000us 0.00% 9.255ms 77.121us 120 IndexFirstAxisBackward 0.01% 6.847ms 0.01% 17.133ms 276.343us 0.000us 0.00% 171.664ms 2.769ms 62 hipMemsetAsync 0.01% 6.831ms 0.01% 6.831ms 11.199us 0.000us 0.00% 0.000us 0.000us 610 hipPointerGetAttribute 0.01% 6.757ms 0.01% 6.757ms 1.775us 0.000us 0.00% 0.000us 0.000us 3806 triton_red_fused__to_copy_mul_sum_0 0.01% 6.673ms 0.01% 8.058ms 50.364us 728.273ms 0.60% 728.273ms 4.552ms 160 aten::zero_ 0.01% 6.468ms 0.03% 36.351ms 25.854us 0.000us 0.00% 432.630ms 307.702us 1406 aten::linalg_vector_norm 0.01% 6.379ms 0.01% 7.886ms 48.088us 7.243ms 0.01% 7.243ms 44.166us 164 aten::cumsum 0.01% 6.367ms 0.01% 7.156ms 57.249us 763.532us 0.00% 782.412us 6.259us 125 aten::gather 0.01% 6.329ms 0.01% 6.937ms 56.863us 252.274ms 0.21% 252.274ms 2.068ms 122 autograd::engine::evaluate_function: AddmmBackward0 0.01% 6.311ms 0.10% 124.696ms 363.544us 0.000us 0.00% 5.181s 15.105ms 343 IndexPutFirstAxis 0.00% 5.891ms 0.02% 21.759ms 178.354us 0.000us 0.00% 153.046ms 1.254ms 122 aten::_local_scalar_dense 0.00% 5.817ms 0.14% 163.121ms 360.887us 1.946ms 0.00% 1.946ms 4.304us 452 FullyShardedDataParallel._post_backward_prefetch 0.00% 5.778ms 0.00% 5.778ms 94.729us 0.000us 0.00% 0.000us 0.000us 61 aten::max 0.00% 5.570ms 0.01% 7.416ms 60.787us 1.397ms 0.00% 1.397ms 11.447us 122 autograd::engine::evaluate_function: SeqAllToAll4DBa... 0.00% 5.566ms 58.24% 69.663s 290.263ms 0.000us 0.00% 851.375ms 3.547ms 240 aten::zeros 0.00% 5.475ms 0.05% 57.092ms 40.606us 0.000us 0.00% 432.630ms 307.702us 1406 triton_red_fused__to_copy_add_div_mul_pow_sum_1 0.00% 5.450ms 0.01% 6.209ms 38.804us 83.265ms 0.07% 83.265ms 520.405us 160 autograd::engine::evaluate_function: AddBackward0 0.00% 5.261ms 0.09% 104.464ms 283.099us 0.000us 0.00% 489.298ms 1.326ms 369 FullyShardedDataParallel.rate_limiter 0.00% 5.117ms 0.00% 5.725ms 47.315us 0.000us 0.00% 6.412ms 52.991us 121 FullyShardedDataParallel._pre_forward_prefetch 0.00% 4.997ms 0.00% 4.997ms 81.923us 0.000us 0.00% 0.000us 0.000us 61 aten::index 0.00% 4.986ms 0.00% 5.473ms 85.515us 54.407ms 0.04% 54.407ms 850.116us 64 hipEventDestroy 0.00% 4.964ms 0.00% 4.983ms 1.546us 1.181s 0.98% 1.181s 366.501us 3223 aten::_index_put_impl_ 0.00% 4.921ms 0.00% 5.835ms 47.829us 107.164ms 0.09% 107.164ms 878.396us 122 aten::expand 0.00% 4.660ms 0.00% 5.680ms 6.141us 0.000us 0.00% 0.000us 0.000us 925 aten::div_ 0.00% 4.590ms 0.00% 4.786ms 39.232us 107.484ms 0.09% 107.484ms 881.013us 122 aten::native_layer_norm_backward 0.00% 4.562ms 0.01% 9.190ms 73.522us 144.258ms 0.12% 575.273ms 4.602ms 125 Optimizer.step#AdamW.step 0.00% 4.297ms 0.11% 125.816ms 125.816ms 0.000us 0.00% 194.580ms 194.580ms 1 IndexPutFirstAxisBackward 0.00% 4.276ms 0.01% 9.828ms 158.519us 0.000us 0.00% 54.401ms 877.435us 62 aten::split 0.00% 4.113ms 0.01% 11.164ms 49.838us 0.000us 0.00% 0.000us 0.000us 224 aten::detach 0.00% 3.958ms 0.01% 11.021ms 4.740us 0.000us 0.00% 0.000us 0.000us 2325 _AllGatherBackward 0.00% 3.753ms 0.00% 5.006ms 83.431us 0.000us 0.00% 0.000us 0.000us 60 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 119.610s Self CUDA time total: 121.035s ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ hipDeviceSynchronize 84.39% 100.929s 84.39% 100.930s 139.986ms 0.000us 0.00% 100.221ms 139.003us 721 hipMemcpyWithStream 12.74% 15.237s 12.74% 15.238s 30.659ms 0.000us 0.00% 307.614ms 618.941us 497 aten::copy_ 0.49% 581.287ms 12.29% 14.697s 1.968ms 4.260s 3.53% 4.316s 577.969us 7468 hipLaunchKernel 0.19% 231.868ms 0.20% 234.117ms 15.585us 0.000us 0.00% 0.000us 0.000us 15022 record_param_comms 0.17% 198.539ms 0.22% 265.703ms 129.107us 3.285s 2.72% 3.287s 1.597ms 2058 SeqAllToAll4D 0.14% 161.529ms 84.84% 101.468s 140.928ms 0.000us 0.00% 2.313s 3.212ms 720 FullyShardedDataParallel.forward 0.13% 161.187ms 38.80% 46.407s 760.774ms 0.000us 0.00% 48.077s 788.152ms 61 MulBackward0 0.13% 153.535ms 21.12% 25.260s 103.102ms 0.000us 0.00% 23.724s 96.832ms 245 aten::addmm 0.11% 130.876ms 0.12% 148.089ms 223.362us 9.421s 7.80% 9.421s 14.210ms 663 aten::cat 0.10% 116.946ms 0.10% 121.291ms 106.676us 1.509s 1.25% 1.509s 1.327ms 1137 aten::empty_strided 0.07% 89.148ms 0.07% 89.261ms 16.989us 0.000us 0.00% 0.000us 0.000us 5254 hipStreamWaitEvent 0.07% 84.079ms 0.07% 84.079ms 27.113us 15.072ms 0.01% 15.072ms 4.860us 3101 aten::empty 0.06% 74.372ms 0.06% 74.381ms 14.290us 0.000us 0.00% 0.000us 0.000us 5205 aten::mm 0.06% 72.923ms 0.06% 75.641ms 111.236us 5.063s 4.19% 5.063s 7.446ms 680 aten::mul 0.06% 69.748ms 0.06% 74.225ms 74.824us 505.835ms 0.42% 505.835ms 509.915us 992 FullyShardedDataParallel._post_backward_hook 0.06% 66.994ms 0.09% 111.980ms 1.836ms 0.000us 0.00% 708.647ms 11.617ms 61 CompiledFunctionBackward 0.05% 56.094ms 0.07% 79.062ms 359.373us 0.000us 0.00% 873.281ms 3.969ms 220 FullyShardedDataParallel._pre_forward 0.05% 54.196ms 0.07% 87.254ms 1.430ms 0.000us 0.00% 694.515ms 11.385ms 61 aten::sum 0.04% 49.796ms 0.06% 65.789ms 67.894us 212.749ms 0.18% 213.594ms 220.428us 969 FullyShardedDataParallel._pre_backward_prefetch 0.04% 49.746ms 0.06% 77.689ms 1.274ms 0.000us 0.00% 473.979ms 7.770ms 61 FullyShardedDataParallel._post_forward 0.04% 48.606ms 0.04% 52.170ms 855.244us 0.000us 0.00% 0.000us 0.000us 61 Torch-Compiled Region 0.03% 40.261ms 12.23% 14.632s 33.028ms 0.000us 0.00% 875.950ms 1.977ms 443 aten::view 0.03% 39.866ms 0.03% 39.932ms 4.206us 0.000us 0.00% 0.000us 0.000us 9495 aten::cos 0.03% 39.515ms 0.03% 39.515ms 13.172ms 0.000us 0.00% 0.000us 0.000us 3 autograd::engine::evaluate_function: ToCopyBackward0... 0.03% 38.601ms 0.09% 110.336ms 100.396us 0.000us 0.00% 387.852ms 352.914us 1099 aten::slice 0.03% 35.944ms 0.04% 43.853ms 5.759us 0.000us 0.00% 0.000us 0.000us 7615 aten::fill_ 0.03% 34.608ms 0.03% 35.874ms 21.742us 438.021ms 0.36% 438.021ms 265.467us 1650 autograd::engine::evaluate_function: SiluBackward0 0.03% 30.353ms 0.03% 38.994ms 433.268us 0.000us 0.00% 1.058ms 11.760us 90 _AllGather 0.03% 30.205ms 0.09% 109.960ms 916.333us 0.000us 0.00% 17.641ms 147.007us 120 aten::add 0.02% 27.587ms 0.03% 29.909ms 44.441us 561.057ms 0.46% 561.057ms 833.666us 673 aten::sin 0.02% 27.535ms 0.02% 27.535ms 9.178ms 0.000us 0.00% 0.000us 0.000us 3 aten::_to_copy 0.02% 26.984ms 0.15% 182.106ms 50.002us 0.000us 0.00% 1.189s 326.513us 3642 FlashAttnVarlenQKVPackedFunc 0.02% 26.603ms 0.03% 37.512ms 307.472us 30.004s 24.86% 30.004s 245.937ms 122 c10d::alltoall_base_ 0.02% 25.266ms 0.14% 168.907ms 234.592us 0.000us 0.00% 1.552s 2.155ms 720 aten::as_strided 0.02% 24.830ms 0.02% 24.883ms 1.269us 0.000us 0.00% 0.000us 0.000us 19615 aten::native_layer_norm 0.02% 19.671ms 0.04% 44.504ms 181.651us 175.209ms 0.15% 1.043s 4.257ms 245 aten::transpose 0.02% 18.745ms 0.02% 26.986ms 6.930us 0.000us 0.00% 0.000us 0.000us 3894 aten::reshape 0.02% 18.040ms 0.05% 54.017ms 8.252us 0.000us 0.00% 118.759ms 18.142us 6546 hipMemcpyAsync 0.01% 17.691ms 0.02% 17.964ms 9.241us 0.000us 0.00% 0.000us 0.000us 1944 hipExtModuleLaunchKernel 0.01% 16.863ms 0.01% 17.670ms 13.109us 0.000us 0.00% 0.000us 0.000us 1348 hipExtLaunchKernel 0.01% 16.638ms 0.01% 16.642ms 16.173us 0.000us 0.00% 0.000us 0.000us 1029 FlashAttnVarlenQKVPackedFuncBackward 0.01% 15.905ms 0.02% 23.585ms 380.398us 62.039s 51.40% 62.039s 1.001s 62 aten::silu 0.01% 15.690ms 0.01% 17.300ms 101.767us 131.724ms 0.11% 131.724ms 774.849us 170 SeqAllToAll4DBackward 0.01% 15.300ms 58.34% 69.776s 290.732ms 0.000us 0.00% 953.845ms 3.974ms 240 triton_poi_fused__to_copy_add_mul_0 0.01% 13.976ms 0.01% 17.342ms 96.343us 169.595ms 0.14% 169.595ms 942.192us 180 triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_0 0.01% 13.796ms 0.01% 16.980ms 53.062us 172.156ms 0.14% 172.156ms 537.989us 320 aten::nonzero 0.01% 13.245ms 0.58% 692.721ms 5.632ms 5.086ms 0.00% 256.475ms 2.085ms 123 autograd::engine::evaluate_function: ViewBackward0 0.01% 12.544ms 0.02% 27.285ms 14.885us 0.000us 0.00% 39.653ms 21.633us 1833 IndexFirstAxis 0.01% 12.378ms 0.02% 22.734ms 186.346us 0.000us 0.00% 252.197ms 2.067ms 122 FullyShardedDataParallel._pre_backward_hook 0.01% 12.352ms 0.08% 92.446ms 1.516ms 0.000us 0.00% 473.979ms 7.770ms 61 hipModuleLaunchKernel 0.01% 12.320ms 0.01% 12.320ms 14.970us 0.000us 0.00% 0.000us 0.000us 823 autograd::engine::evaluate_function: CompiledFunctio... 0.01% 12.232ms 0.08% 93.028ms 422.853us 0.000us 0.00% 873.281ms 3.969ms 220 autograd::engine::evaluate_function: SplitWithSizesB... 0.01% 12.186ms 0.02% 28.534ms 177.230us 0.000us 0.00% 327.363ms 2.033ms 161 TorchDynamo Cache Lookup 0.01% 12.169ms 0.01% 12.169ms 27.469us 0.000us 0.00% 0.000us 0.000us 443 aten::gelu 0.01% 11.666ms 0.01% 12.823ms 80.146us 582.606ms 0.48% 582.606ms 3.641ms 160 aten::linalg_vector_norm 0.01% 11.341ms 0.01% 13.895ms 42.364us 8.760ms 0.01% 8.760ms 26.708us 328 aten::empty_like 0.01% 11.224ms 0.05% 58.231ms 19.983us 0.000us 0.00% 0.000us 0.000us 2914 aten::split_with_sizes 0.01% 10.281ms 0.01% 12.055ms 31.558us 0.000us 0.00% 0.000us 0.000us 382 aten::cumsum 0.01% 10.076ms 0.01% 10.933ms 86.768us 768.013us 0.00% 792.973us 6.293us 126 aten::clone 0.01% 10.039ms 0.17% 198.376ms 121.182us 0.000us 0.00% 2.087s 1.275ms 1637 autograd::engine::evaluate_function: SliceBackward0 0.01% 10.006ms 0.08% 95.722ms 78.332us 0.000us 0.00% 831.983ms 680.837us 1222 aten::linear 0.01% 9.792ms 0.36% 429.874ms 324.188us 0.000us 0.00% 19.274s 14.535ms 1326 aten::select 0.01% 9.595ms 0.01% 11.557ms 6.758us 0.000us 0.00% 0.000us 0.000us 1710 autograd::engine::evaluate_function: torch::autograd... 0.01% 9.531ms 0.10% 122.360ms 2.006ms 0.000us 0.00% 708.647ms 11.617ms 61 aten::add_ 0.01% 8.324ms 0.01% 8.925ms 13.301us 108.856ms 0.09% 108.856ms 162.229us 671 aten::narrow 0.01% 8.176ms 0.02% 23.042ms 8.778us 0.000us 0.00% 0.000us 0.000us 2625 aten::unsqueeze 0.01% 8.036ms 0.01% 9.385ms 7.520us 0.000us 0.00% 0.000us 0.000us 1248 aten::t 0.01% 8.021ms 0.01% 17.846ms 8.293us 0.000us 0.00% 0.000us 0.000us 2152 AddmmBackward0 0.01% 7.654ms 0.08% 95.965ms 279.783us 0.000us 0.00% 5.063s 14.762ms 343 aten::to 0.01% 7.450ms 0.16% 190.371ms 43.794us 0.000us 0.00% 1.189s 273.559us 4347 detach 0.01% 7.311ms 0.01% 7.311ms 2.937us 0.000us 0.00% 0.000us 0.000us 2489 hipMemsetAsync 0.01% 7.279ms 0.01% 7.279ms 11.165us 0.000us 0.00% 0.000us 0.000us 652 Optimizer.step#AdamW.step 0.01% 7.108ms 0.10% 119.669ms 119.669ms 0.000us 0.00% 199.712ms 199.712ms 1 hipPointerGetAttribute 0.01% 6.961ms 0.01% 6.961ms 1.829us 0.000us 0.00% 0.000us 0.000us 3806 IndexFirstAxisBackward 0.01% 6.803ms 0.01% 17.130ms 276.285us 0.000us 0.00% 171.592ms 2.768ms 62 c10d::allgather_ 0.01% 6.781ms 0.05% 57.338ms 477.816us 0.000us 0.00% 15.784ms 131.531us 120 triton_red_fused__to_copy_mul_sum_0 0.01% 6.673ms 0.01% 8.144ms 50.898us 727.124ms 0.60% 727.124ms 4.545ms 160 aten::gather 0.01% 6.353ms 0.01% 6.927ms 56.782us 252.197ms 0.21% 252.197ms 2.067ms 122 autograd::engine::evaluate_function: AddmmBackward0 0.01% 6.234ms 0.10% 124.590ms 363.236us 0.000us 0.00% 5.176s 15.089ms 343 aten::zero_ 0.01% 6.024ms 0.03% 35.423ms 25.195us 0.000us 0.00% 437.085ms 310.872us 1406 FullyShardedDataParallel._post_backward_prefetch 0.00% 5.906ms 0.00% 5.906ms 96.822us 0.000us 0.00% 0.000us 0.000us 61 aten::_local_scalar_dense 0.00% 5.838ms 0.13% 158.109ms 202.704us 1.893ms 0.00% 1.893ms 2.427us 780 IndexPutFirstAxis 0.00% 5.708ms 0.02% 21.272ms 174.359us 0.000us 0.00% 153.006ms 1.254ms 122 autograd::engine::evaluate_function: SeqAllToAll4DBa... 0.00% 5.705ms 58.34% 69.781s 290.756ms 0.000us 0.00% 953.845ms 3.974ms 240 aten::zeros 0.00% 5.639ms 0.05% 56.239ms 39.999us 0.000us 0.00% 437.085ms 310.872us 1406 autograd::engine::evaluate_function: AddBackward0 0.00% 5.585ms 0.09% 106.274ms 288.005us 0.000us 0.00% 494.156ms 1.339ms 369 triton_red_fused__to_copy_add_div_mul_pow_sum_1 0.00% 5.391ms 0.01% 6.145ms 38.406us 82.745ms 0.07% 82.745ms 517.159us 160 aten::max 0.00% 5.275ms 0.01% 7.107ms 58.257us 1.369ms 0.00% 1.369ms 11.222us 122 FullyShardedDataParallel.rate_limiter 0.00% 5.248ms 0.00% 5.831ms 48.191us 0.000us 0.00% 23.894ms 197.472us 121 aten::index 0.00% 5.096ms 0.00% 5.563ms 86.928us 54.600ms 0.05% 54.600ms 853.126us 64 hipEventDestroy 0.00% 5.082ms 0.00% 5.084ms 1.577us 445.166ms 0.37% 445.166ms 138.122us 3223 aten::_index_put_impl_ 0.00% 4.888ms 0.00% 5.691ms 46.646us 107.122ms 0.09% 107.124ms 878.069us 122 FullyShardedDataParallel._pre_forward_prefetch 0.00% 4.870ms 0.00% 4.870ms 79.842us 0.000us 0.00% 0.000us 0.000us 61 aten::expand 0.00% 4.657ms 0.00% 5.696ms 6.157us 0.000us 0.00% 0.000us 0.000us 925 aten::div_ 0.00% 4.618ms 0.00% 4.840ms 39.669us 109.093ms 0.09% 109.093ms 894.202us 122 aten::mul_ 0.00% 4.538ms 0.01% 6.591ms 20.155us 22.174ms 0.02% 22.174ms 67.809us 327 aten::native_layer_norm_backward 0.00% 4.473ms 0.01% 9.062ms 72.498us 143.190ms 0.12% 578.460ms 4.628ms 125 IndexPutFirstAxisBackward 0.00% 4.323ms 0.01% 9.977ms 160.924us 0.000us 0.00% 54.594ms 880.545us 62 ViewBackward0 0.00% 4.071ms 0.01% 14.714ms 8.027us 0.000us 0.00% 39.653ms 21.633us 1833 aten::split 0.00% 4.007ms 0.01% 11.047ms 49.316us 0.000us 0.00% 0.000us 0.000us 224 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 119.604s Self CUDA time total: 120.708s zll step_time: 133.97s avg_step_time: 125.77657899856567 zll step_time: 119.83s avg_step_time: 124.78558504581451 zll step_time: 119.08s avg_step_time: 123.97018020493644 --> saving checkpoint at step 7 --> checkpoint saved at step 7