Llama_pretraining_8ka.log
+ export HSA_FORCE_FINE_GRAIN_PCIE=1
+ HSA_FORCE_FINE_GRAIN_PCIE=1
+ export OMP_NUM_THREADS=1
+ OMP_NUM_THREADS=1
+ export HSA_FORCE_FINE_GRAIN_PCIE=1
+ HSA_FORCE_FINE_GRAIN_PCIE=1
+ export OMP_NUM_THREADS=1
+ OMP_NUM_THREADS=1
+ export NCCL_P2P_LEVEL=PXB
+ NCCL_P2P_LEVEL=PXB
+ export NCCL_ALGO=Ring
+ NCCL_ALGO=Ring
+ export NCCL_NCHANNELS_PER_PEER=16
+ NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_P2P_LEVEL=PXB
+ NCCL_P2P_LEVEL=PXB
+ export NCCL_ALGO=Ring
+ NCCL_ALGO=Ring
+ export NCCL_NCHANNELS_PER_PEER=16
+ NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_MIN_NCHANNELS=32
+ NCCL_MIN_NCHANNELS=32
+ export NCCL_MIN_NCHANNELS=32
+ NCCL_MIN_NCHANNELS=32
+ export NCCL_MAX_NCHANNELS=32
+ NCCL_MAX_NCHANNELS=32
+ export NCCL_MAX_NCHANNELS=32
+ NCCL_MAX_NCHANNELS=32
+ export NCCL_IB_TIMEOUT=22
+ NCCL_IB_TIMEOUT=22
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_IB_TIMEOUT=22
+ NCCL_IB_TIMEOUT=22
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ export NCCL_NET_GDR_LEVEL=7
+ NCCL_NET_GDR_LEVEL=7
+ export NCCL_NET_GDR_LEVEL=7
+ NCCL_NET_GDR_LEVEL=7
+ export NCCL_NET_GDR_READ=1
+ export NCCL_NET_GDR_READ=1
+ NCCL_NET_GDR_READ=1
+ NCCL_NET_GDR_READ=1
+ export RCCL_SDMA_COPY_ENABLE=0
+ RCCL_SDMA_COPY_ENABLE=0
+ export RCCL_SDMA_COPY_ENABLE=0
+ RCCL_SDMA_COPY_ENABLE=0
+ export GLOG_minloglevel=3
+ GLOG_minloglevel=3
+ export GLOG_minloglevel=3
+ GLOG_minloglevel=3
+ export ALLREDUCE_STREAM_WITH_COMPUTE=1
+ ALLREDUCE_STREAM_WITH_COMPUTE=1
+ export SENDRECV_STREAM_WITH_COMPUTE=1
+ export ALLREDUCE_STREAM_WITH_COMPUTE=1
+ ALLREDUCE_STREAM_WITH_COMPUTE=1
+ export SENDRECV_STREAM_WITH_COMPUTE=1
+ SENDRECV_STREAM_WITH_COMPUTE=1
+ SENDRECV_STREAM_WITH_COMPUTE=1
+ export cache_size_limit=64
+ cache_size_limit=64
+ export cache_size_limit=64
+ cache_size_limit=64
+ SAVE_PATH=./tmp_7b
+ TENSORBOARD_LOGS_PATH=./tmp_7b
+ SAVE_PATH=./tmp_7b
+ TENSORBOARD_LOGS_PATH=./tmp_7b
+ DATA_PATH=/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document
+ DATA_PATH=/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document
+ GPT_MODEL_ARGS=(--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights)
+ export HSA_FORCE_FINE_GRAIN_PCIE=1
+ HSA_FORCE_FINE_GRAIN_PCIE=1
+ export OMP_NUM_THREADS=1
+ OMP_NUM_THREADS=1
+ export NCCL_P2P_LEVEL=PXB
+ GPT_MODEL_ARGS=(--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights)
+ NCCL_P2P_LEVEL=PXB
+ export NCCL_ALGO=Ring
+ NCCL_ALGO=Ring
+ export NCCL_NCHANNELS_PER_PEER=16
+ NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_MIN_NCHANNELS=32
+ NCCL_MIN_NCHANNELS=32
+ export NVTE_FLASH_ATTN=1
+ NVTE_FLASH_ATTN=1
+ export NVTE_FLASH_ATTN=1
+ NVTE_FLASH_ATTN=1
+ export NCCL_MAX_NCHANNELS=32
+ NCCL_MAX_NCHANNELS=32
+ export NCCL_IB_TIMEOUT=22
+ NCCL_IB_TIMEOUT=22
+ TRAINING_ARGS=(--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass)
+ TRAINING_ARGS=(--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass)
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ export NCCL_NET_GDR_LEVEL=7
+ NCCL_NET_GDR_LEVEL=7
+ MODEL_PARALLEL_ARGS=(--sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)
+ DATA_ARGS=(--data-path $DATA_PATH --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model)
+ MODEL_PARALLEL_ARGS=(--sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)
+ DATA_ARGS=(--data-path $DATA_PATH --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model)
+ export NCCL_NET_GDR_READ=1
+ NCCL_NET_GDR_READ=1
+ export RCCL_SDMA_COPY_ENABLE=0
+ RCCL_SDMA_COPY_ENABLE=0
+ export GLOG_minloglevel=3
+ GLOG_minloglevel=3
+ EVAL_AND_LOGGING_ARGS=(--log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH)
+ EVAL_AND_LOGGING_ARGS=(--log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH)
+ export ALLREDUCE_STREAM_WITH_COMPUTE=1
+ ALLREDUCE_STREAM_WITH_COMPUTE=1
+ export SENDRECV_STREAM_WITH_COMPUTE=1
+ SENDRECV_STREAM_WITH_COMPUTE=1
+ export cache_size_limit=64
+ PROFILE_ARGS=(--profile --profile-step-start 4 --profile-step-end 5 --use-pytorch-profiler --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data)
+ PROFILE_ARGS=(--profile --profile-step-start 4 --profile-step-end 5 --use-pytorch-profiler --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data)
+ cache_size_limit=64
+ SAVE_PATH=./tmp_7b
+ TENSORBOARD_LOGS_PATH=./tmp_7b
+ RANK=1
+ LOCAL_RANK=1
+ WORLD_SIZE=8
+ RANK=0
+ LOCAL_RANK=0
+ WORLD_SIZE=8
+ DATA_PATH=/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document
+ GPT_MODEL_ARGS=(--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights)
+ DIST_URL=localhost
+ DIST_PORT=34577
+ DIST_URL=localhost
+ DIST_PORT=34577
+ DISTRIBUTED_ARGS=(--rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url tcp://${DIST_URL}:${DIST_PORT})
+ DISTRIBUTED_ARGS=(--rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url tcp://${DIST_URL}:${DIST_PORT})
+ export NVTE_FLASH_ATTN=1
+ NVTE_FLASH_ATTN=1
+ APP='python -u pretrain_gpt.py         --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights         --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass         --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2         --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model         --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b         --rank 0 --world-size 8 --local-rank 0 --dist-url tcp://localhost:34577 '
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ APP='python -u pretrain_gpt.py         --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights         --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass         --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2         --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model         --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b         --rank 1 --world-size 8 --local-rank 1 --dist-url tcp://localhost:34577 '
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ TRAINING_ARGS=(--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass)
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ case ${LOCAL_RANK} in
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ case ${LOCAL_RANK} in
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ MODEL_PARALLEL_ARGS=(--sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)
+ DATA_ARGS=(--data-path $DATA_PATH --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model)
+ EVAL_AND_LOGGING_ARGS=(--log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH)
+ numactl --cpunodebind=1 --membind=1 python -u pretrain_gpt.py --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b --rank 1 --world-size 8 --local-rank 1 --dist-url tcp://localhost:34577
+ numactl --cpunodebind=0 --membind=0 python -u pretrain_gpt.py --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b --rank 0 --world-size 8 --local-rank 0 --dist-url tcp://localhost:34577
+ PROFILE_ARGS=(--profile --profile-step-start 4 --profile-step-end 5 --use-pytorch-profiler --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data)
+ RANK=3
+ LOCAL_RANK=3
+ WORLD_SIZE=8
+ DIST_URL=localhost
+ DIST_PORT=34577
+ DISTRIBUTED_ARGS=(--rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url tcp://${DIST_URL}:${DIST_PORT})
+ APP='python -u pretrain_gpt.py         --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights         --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass         --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2         --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model         --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b         --rank 3 --world-size 8 --local-rank 3 --dist-url tcp://localhost:34577 '
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ case ${LOCAL_RANK} in
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ numactl --cpunodebind=3 --membind=3 python -u pretrain_gpt.py --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b --rank 3 --world-size 8 --local-rank 3 --dist-url tcp://localhost:34577
+ export HSA_FORCE_FINE_GRAIN_PCIE=1
+ HSA_FORCE_FINE_GRAIN_PCIE=1
+ export HSA_FORCE_FINE_GRAIN_PCIE=1
+ HSA_FORCE_FINE_GRAIN_PCIE=1
+ export OMP_NUM_THREADS=1
+ OMP_NUM_THREADS=1
+ export OMP_NUM_THREADS=1
+ OMP_NUM_THREADS=1
+ export NCCL_P2P_LEVEL=PXB
+ NCCL_P2P_LEVEL=PXB
+ export NCCL_P2P_LEVEL=PXB
+ NCCL_P2P_LEVEL=PXB
+ export NCCL_ALGO=Ring
+ NCCL_ALGO=Ring
+ export NCCL_ALGO=Ring
+ NCCL_ALGO=Ring
+ export NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_NCHANNELS_PER_PEER=16
+ NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_MIN_NCHANNELS=32
+ NCCL_MIN_NCHANNELS=32
+ NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_MIN_NCHANNELS=32
+ NCCL_MIN_NCHANNELS=32
+ export NCCL_MAX_NCHANNELS=32
+ NCCL_MAX_NCHANNELS=32
+ export NCCL_IB_TIMEOUT=22
+ NCCL_IB_TIMEOUT=22
+ export NCCL_MAX_NCHANNELS=32
+ NCCL_MAX_NCHANNELS=32
+ export NCCL_IB_TIMEOUT=22
+ NCCL_IB_TIMEOUT=22
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ export NCCL_NET_GDR_LEVEL=7
+ NCCL_NET_GDR_LEVEL=7
+ export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ export NCCL_NET_GDR_LEVEL=7
+ NCCL_NET_GDR_LEVEL=7
+ export NCCL_NET_GDR_READ=1
+ export NCCL_NET_GDR_READ=1
+ NCCL_NET_GDR_READ=1
+ export RCCL_SDMA_COPY_ENABLE=0
+ NCCL_NET_GDR_READ=1
+ export RCCL_SDMA_COPY_ENABLE=0
+ RCCL_SDMA_COPY_ENABLE=0
+ RCCL_SDMA_COPY_ENABLE=0
+ export GLOG_minloglevel=3
+ GLOG_minloglevel=3
+ export GLOG_minloglevel=3
+ GLOG_minloglevel=3
+ export ALLREDUCE_STREAM_WITH_COMPUTE=1
+ ALLREDUCE_STREAM_WITH_COMPUTE=1
+ export SENDRECV_STREAM_WITH_COMPUTE=1
+ export ALLREDUCE_STREAM_WITH_COMPUTE=1
+ ALLREDUCE_STREAM_WITH_COMPUTE=1
+ export SENDRECV_STREAM_WITH_COMPUTE=1
+ SENDRECV_STREAM_WITH_COMPUTE=1
+ SENDRECV_STREAM_WITH_COMPUTE=1
+ export cache_size_limit=64
+ cache_size_limit=64
+ SAVE_PATH=./tmp_7b
+ export cache_size_limit=64
+ cache_size_limit=64
+ SAVE_PATH=./tmp_7b
+ TENSORBOARD_LOGS_PATH=./tmp_7b
+ TENSORBOARD_LOGS_PATH=./tmp_7b
+ DATA_PATH=/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document
+ DATA_PATH=/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document
+ GPT_MODEL_ARGS=(--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights)
+ GPT_MODEL_ARGS=(--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights)
+ export NVTE_FLASH_ATTN=1
+ NVTE_FLASH_ATTN=1
+ export NVTE_FLASH_ATTN=1
+ NVTE_FLASH_ATTN=1
+ TRAINING_ARGS=(--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass)
+ TRAINING_ARGS=(--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass)
+ MODEL_PARALLEL_ARGS=(--sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)
+ MODEL_PARALLEL_ARGS=(--sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)
+ DATA_ARGS=(--data-path $DATA_PATH --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model)
+ DATA_ARGS=(--data-path $DATA_PATH --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model)
+ EVAL_AND_LOGGING_ARGS=(--log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH)
+ EVAL_AND_LOGGING_ARGS=(--log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH)
+ PROFILE_ARGS=(--profile --profile-step-start 4 --profile-step-end 5 --use-pytorch-profiler --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data)
+ PROFILE_ARGS=(--profile --profile-step-start 4 --profile-step-end 5 --use-pytorch-profiler --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data)
+ RANK=4
+ LOCAL_RANK=4
+ RANK=6
+ LOCAL_RANK=6
+ WORLD_SIZE=8
+ WORLD_SIZE=8
+ DIST_URL=localhost
+ DIST_URL=localhost
+ DIST_PORT=34577
+ DIST_PORT=34577
+ DISTRIBUTED_ARGS=(--rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url tcp://${DIST_URL}:${DIST_PORT})
+ DISTRIBUTED_ARGS=(--rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url tcp://${DIST_URL}:${DIST_PORT})
+ APP='python -u pretrain_gpt.py         --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights         --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass         --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2         --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model         --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b         --rank 6 --world-size 8 --local-rank 6 --dist-url tcp://localhost:34577 '
+ APP='python -u pretrain_gpt.py         --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights         --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass         --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2         --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model         --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b         --rank 4 --world-size 8 --local-rank 4 --dist-url tcp://localhost:34577 '
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ case ${LOCAL_RANK} in
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ case ${LOCAL_RANK} in
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ numactl --cpunodebind=6 --membind=6 python -u pretrain_gpt.py --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b --rank 6 --world-size 8 --local-rank 6 --dist-url tcp://localhost:34577
+ numactl --cpunodebind=4 --membind=4 python -u pretrain_gpt.py --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b --rank 4 --world-size 8 --local-rank 4 --dist-url tcp://localhost:34577
+ export HSA_FORCE_FINE_GRAIN_PCIE=1
+ HSA_FORCE_FINE_GRAIN_PCIE=1
+ export OMP_NUM_THREADS=1
+ OMP_NUM_THREADS=1
+ export NCCL_P2P_LEVEL=PXB
+ NCCL_P2P_LEVEL=PXB
+ export NCCL_ALGO=Ring
+ NCCL_ALGO=Ring
+ export NCCL_NCHANNELS_PER_PEER=16
+ NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_MIN_NCHANNELS=32
+ NCCL_MIN_NCHANNELS=32
+ export NCCL_MAX_NCHANNELS=32
+ NCCL_MAX_NCHANNELS=32
+ export NCCL_IB_TIMEOUT=22
+ NCCL_IB_TIMEOUT=22
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ export NCCL_NET_GDR_LEVEL=7
+ NCCL_NET_GDR_LEVEL=7
+ export NCCL_NET_GDR_READ=1
+ NCCL_NET_GDR_READ=1
+ export RCCL_SDMA_COPY_ENABLE=0
+ RCCL_SDMA_COPY_ENABLE=0
+ export GLOG_minloglevel=3
+ GLOG_minloglevel=3
+ export ALLREDUCE_STREAM_WITH_COMPUTE=1
+ ALLREDUCE_STREAM_WITH_COMPUTE=1
+ export SENDRECV_STREAM_WITH_COMPUTE=1
+ SENDRECV_STREAM_WITH_COMPUTE=1
+ export cache_size_limit=64
+ cache_size_limit=64
+ SAVE_PATH=./tmp_7b
+ TENSORBOARD_LOGS_PATH=./tmp_7b
+ DATA_PATH=/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document
+ GPT_MODEL_ARGS=(--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights)
+ export NVTE_FLASH_ATTN=1
+ NVTE_FLASH_ATTN=1
+ TRAINING_ARGS=(--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass)
+ MODEL_PARALLEL_ARGS=(--sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)
+ DATA_ARGS=(--data-path $DATA_PATH --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model)
+ EVAL_AND_LOGGING_ARGS=(--log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH)
+ PROFILE_ARGS=(--profile --profile-step-start 4 --profile-step-end 5 --use-pytorch-profiler --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data)
+ RANK=2
+ LOCAL_RANK=2
+ WORLD_SIZE=8
+ export HSA_FORCE_FINE_GRAIN_PCIE=1
+ HSA_FORCE_FINE_GRAIN_PCIE=1
+ DIST_URL=localhost
+ DIST_PORT=34577
+ export OMP_NUM_THREADS=1
+ OMP_NUM_THREADS=1
+ DISTRIBUTED_ARGS=(--rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url tcp://${DIST_URL}:${DIST_PORT})
+ export NCCL_P2P_LEVEL=PXB
+ NCCL_P2P_LEVEL=PXB
+ export NCCL_ALGO=Ring
+ NCCL_ALGO=Ring
+ export NCCL_NCHANNELS_PER_PEER=16
+ NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_MIN_NCHANNELS=32
+ NCCL_MIN_NCHANNELS=32
+ export NCCL_MAX_NCHANNELS=32
+ NCCL_MAX_NCHANNELS=32
+ export NCCL_IB_TIMEOUT=22
+ NCCL_IB_TIMEOUT=22
+ APP='python -u pretrain_gpt.py         --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights         --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass         --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2         --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model         --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b         --rank 2 --world-size 8 --local-rank 2 --dist-url tcp://localhost:34577 '
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ export NCCL_NET_GDR_LEVEL=7
+ NCCL_NET_GDR_LEVEL=7
+ export NCCL_NET_GDR_READ=1
+ NCCL_NET_GDR_READ=1
+ export RCCL_SDMA_COPY_ENABLE=0
+ RCCL_SDMA_COPY_ENABLE=0
+ export GLOG_minloglevel=3
+ GLOG_minloglevel=3
+ case ${LOCAL_RANK} in
+ export ALLREDUCE_STREAM_WITH_COMPUTE=1
+ ALLREDUCE_STREAM_WITH_COMPUTE=1
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ export SENDRECV_STREAM_WITH_COMPUTE=1
+ SENDRECV_STREAM_WITH_COMPUTE=1
+ export cache_size_limit=64
+ cache_size_limit=64
+ SAVE_PATH=./tmp_7b
+ TENSORBOARD_LOGS_PATH=./tmp_7b
+ DATA_PATH=/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document
+ numactl --cpunodebind=2 --membind=2 python -u pretrain_gpt.py --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b --rank 2 --world-size 8 --local-rank 2 --dist-url tcp://localhost:34577
+ GPT_MODEL_ARGS=(--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights)
+ export NVTE_FLASH_ATTN=1
+ NVTE_FLASH_ATTN=1
+ TRAINING_ARGS=(--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass)
+ MODEL_PARALLEL_ARGS=(--sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)
+ DATA_ARGS=(--data-path $DATA_PATH --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model)
+ EVAL_AND_LOGGING_ARGS=(--log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH)
+ PROFILE_ARGS=(--profile --profile-step-start 4 --profile-step-end 5 --use-pytorch-profiler --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data)
+ RANK=5
+ LOCAL_RANK=5
+ WORLD_SIZE=8
+ DIST_URL=localhost
+ DIST_PORT=34577
+ DISTRIBUTED_ARGS=(--rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url tcp://${DIST_URL}:${DIST_PORT})
+ APP='python -u pretrain_gpt.py         --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights         --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass         --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2         --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model         --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b         --rank 5 --world-size 8 --local-rank 5 --dist-url tcp://localhost:34577 '
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ case ${LOCAL_RANK} in
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ numactl --cpunodebind=5 --membind=5 python -u pretrain_gpt.py --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b --rank 5 --world-size 8 --local-rank 5 --dist-url tcp://localhost:34577
+ export HSA_FORCE_FINE_GRAIN_PCIE=1
+ HSA_FORCE_FINE_GRAIN_PCIE=1
+ export OMP_NUM_THREADS=1
+ OMP_NUM_THREADS=1
+ export NCCL_P2P_LEVEL=PXB
+ NCCL_P2P_LEVEL=PXB
+ export NCCL_ALGO=Ring
+ NCCL_ALGO=Ring
+ export NCCL_NCHANNELS_PER_PEER=16
+ NCCL_NCHANNELS_PER_PEER=16
+ export NCCL_MIN_NCHANNELS=32
+ NCCL_MIN_NCHANNELS=32
+ export NCCL_MAX_NCHANNELS=32
+ NCCL_MAX_NCHANNELS=32
+ export NCCL_IB_TIMEOUT=22
+ NCCL_IB_TIMEOUT=22
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
+ export NCCL_NET_GDR_LEVEL=7
+ NCCL_NET_GDR_LEVEL=7
+ export NCCL_NET_GDR_READ=1
+ NCCL_NET_GDR_READ=1
+ export RCCL_SDMA_COPY_ENABLE=0
+ RCCL_SDMA_COPY_ENABLE=0
+ export GLOG_minloglevel=3
+ GLOG_minloglevel=3
+ export ALLREDUCE_STREAM_WITH_COMPUTE=1
+ ALLREDUCE_STREAM_WITH_COMPUTE=1
+ export SENDRECV_STREAM_WITH_COMPUTE=1
+ SENDRECV_STREAM_WITH_COMPUTE=1
+ export cache_size_limit=64
+ cache_size_limit=64
+ SAVE_PATH=./tmp_7b
+ TENSORBOARD_LOGS_PATH=./tmp_7b
+ DATA_PATH=/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document
+ GPT_MODEL_ARGS=(--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights)
+ export NVTE_FLASH_ATTN=1
+ NVTE_FLASH_ATTN=1
+ TRAINING_ARGS=(--transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass)
+ MODEL_PARALLEL_ARGS=(--sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2)
+ DATA_ARGS=(--data-path $DATA_PATH --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model)
+ EVAL_AND_LOGGING_ARGS=(--log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH)
+ PROFILE_ARGS=(--profile --profile-step-start 4 --profile-step-end 5 --use-pytorch-profiler --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data)
+ RANK=7
+ LOCAL_RANK=7
+ WORLD_SIZE=8
+ DIST_URL=localhost
+ DIST_PORT=34577
+ DISTRIBUTED_ARGS=(--rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url tcp://${DIST_URL}:${DIST_PORT})
+ APP='python -u pretrain_gpt.py         --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights         --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass         --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2         --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model         --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b         --rank 7 --world-size 8 --local-rank 7 --dist-url tcp://localhost:34577 '
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ case ${LOCAL_RANK} in
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ numactl --cpunodebind=7 --membind=7 python -u pretrain_gpt.py --num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 --normalization RMSNorm --position-embedding-type rope --untie-embeddings-and-output-weights --transformer-impl local --use-legacy-models --micro-batch-size 1 --global-batch-size 64 --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --init-method-std 0.006 --clip-grad 1.0 --bf16 --use-distributed-optimizer --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 --swiglu --lr 3.0e-5 --lr-decay-style cosine --min-lr 3.0e-6 --lr-warmup-iters 1 --ckpt-format torch --ddp-average-in-collective --overlap-grad-reduce --use-flash-attn-cutlass --sequence-parallel --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 --data-path /models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document --seq-length 4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer --tokenizer-model /models1/Llama-2-7b-chat-hf/tokenizer.model --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 --eval-iters 3 --tensorboard-dir ./tmp_7b --rank 7 --world-size 8 --local-rank 7 --dist-url tcp://localhost:34577
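All eight numactl launches above follow the same pattern: every rank keeps all eight GPUs visible, pins its CPU and memory to the NUMA node whose index matches LOCAL_RANK, exports the same NCCL/RCCL tuning, and runs pretrain_gpt.py with only --rank/--local-rank differing. A minimal launcher sketch that would produce a trace of this shape is given below; the single-script loop structure is an assumption (not taken from the log), and the argument arrays are assumed to be defined exactly as in the trace.

#!/bin/bash
# Hedged reconstruction of the per-rank launcher implied by the trace above;
# the outer loop is an assumption, not taken from the log.
set -x

export HSA_FORCE_FINE_GRAIN_PCIE=1 OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB NCCL_ALGO=Ring NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 NCCL_MAX_NCHANNELS=32 NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7 NCCL_NET_GDR_READ=1 RCCL_SDMA_COPY_ENABLE=0 GLOG_minloglevel=3
export NVTE_FLASH_ATTN=1

WORLD_SIZE=8
DIST_URL=localhost
DIST_PORT=34577

# GPT_MODEL_ARGS, TRAINING_ARGS, MODEL_PARALLEL_ARGS, DATA_ARGS and
# EVAL_AND_LOGGING_ARGS are assumed to be the bash arrays defined in the trace.
for RANK in $(seq 0 $((WORLD_SIZE - 1))); do
  LOCAL_RANK=${RANK}
  # All GPUs stay visible; numactl binds CPU and memory to the NUMA node
  # with the same index as the local rank, as in the trace.
  HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  numactl --cpunodebind=${LOCAL_RANK} --membind=${LOCAL_RANK} \
    python -u pretrain_gpt.py \
      "${GPT_MODEL_ARGS[@]}" "${TRAINING_ARGS[@]}" "${MODEL_PARALLEL_ARGS[@]}" \
      "${DATA_ARGS[@]}" "${EVAL_AND_LOGGING_ARGS[@]}" \
      --rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} \
      --dist-url tcp://${DIST_URL}:${DIST_PORT} &
done
wait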
/megatron-lm/megatron/training/arguments.py:601: UserWarning: Disabling sequence parallelism because tensor model parallelism is disabled
  warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
[WARNING  | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
/megatron-lm/megatron/training/arguments.py:601: UserWarning: Disabling sequence parallelism because tensor model parallelism is disabled
  warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
[WARNING  | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
/megatron-lm/megatron/training/arguments.py:601: UserWarning: Disabling sequence parallelism because tensor model parallelism is disabled
  warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
[WARNING  | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
/megatron-lm/megatron/training/arguments.py:601: UserWarning: Disabling sequence parallelism because tensor model parallelism is disabled
  warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
/megatron-lm/megatron/training/arguments.py:601: UserWarning: Disabling sequence parallelism because tensor model parallelism is disabled
  warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
/megatron-lm/megatron/training/arguments.py:601: UserWarning: Disabling sequence parallelism because tensor model parallelism is disabled
  warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
[WARNING  | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
/megatron-lm/megatron/training/arguments.py:601: UserWarning: Disabling sequence parallelism because tensor model parallelism is disabled
  warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
using world size: 8, data-parallel size: 4, context-parallel size: 1, hierarchical context-parallel sizes: None, tensor-model-parallel size: 1, encoder-tensor-model-parallel size: 0, pipeline-model-parallel size: 2, encoder-pipeline-model-parallel size: 0
WARNING: overriding default arguments for tokenizer_type:GPT2BPETokenizer with tokenizer_type:Llama2Tokenizer
WARNING: Setting args.overlap_p2p_comm and args.align_param_gather to False since non-interleaved schedule does not support overlapping p2p communication and aligned param AG
accumulate and all-reduce gradients in fp32 for bfloat16 data type.
using torch.bfloat16 for parameters ...
/megatron-lm/megatron/training/arguments.py:601: UserWarning: Disabling sequence parallelism because tensor model parallelism is disabled
  warnings.warn("Disabling sequence parallelism because tensor model parallelism is disabled")
------------------------ arguments ------------------------
  accumulate_allreduce_grads_in_fp32 .............. True
  adam_beta1 ...................................... 0.9
  adam_beta2 ...................................... 0.95
  adam_eps ........................................ 1e-08
  add_bias_linear ................................. False
  add_position_embedding .......................... True
  add_qkv_bias .................................... False
  adlr_autoresume ................................. False
  adlr_autoresume_interval ........................ 1000
  align_grad_reduce ............................... True
  align_param_gather .............................. False
  app_tag_run_name ................................ None
  app_tag_run_version ............................. 0.0.0
  apply_layernorm_1p .............................. False
  apply_query_key_layer_scaling ................... False
  apply_residual_connection_post_layernorm ........ False
  apply_rope_fusion ............................... True
  async_save ...................................... None
  async_tensor_model_parallel_allreduce ........... True
  attention_backend ............................... AttnBackend.auto
  attention_dropout ............................... 0.0
  attention_softmax_in_fp32 ....................... False
  auto_detect_ckpt_format ......................... False
  barrier_with_L1_time ............................ True
  bert_binary_head ................................ True
  bert_embedder_type .............................. megatron
  bert_load ....................................... None
  bf16 ............................................ True
  bias_dropout_fusion ............................. True
  bias_gelu_fusion ................................ False
  bias_swiglu_fusion .............................. True
  biencoder_projection_dim ........................ 0
  biencoder_shared_query_context_model ............ False
  block_data_path ................................. None
  calculate_per_token_loss ........................ False
  check_for_nan_in_loss_and_grad .................. True
  check_for_spiky_loss ............................ False
  check_weight_hash_across_dp_replicas_interval ... None
  ckpt_assume_constant_structure .................. False
  ckpt_convert_format ............................. None
  ckpt_convert_save ............................... None
  ckpt_convert_update_legacy_dist_opt_format ...... False
  ckpt_format ..................................... torch
  ckpt_fully_parallel_load ........................ False
  ckpt_fully_parallel_save ........................ True
  ckpt_fully_parallel_save_deprecated ............. False
  ckpt_step ....................................... None
  classes_fraction ................................ 1.0
  clip_grad ....................................... 1.0
  clone_scatter_output_in_embedding ............... True
  config_logger_dir ............................... 
  consumed_train_samples .......................... 0
  consumed_valid_samples .......................... 0
  context_parallel_size ........................... 1
  cp_comm_type .................................... ['p2p']
  create_attention_mask_in_dataloader ............. True
  cross_entropy_loss_fusion ....................... False
  data_args_path .................................. None
  data_cache_path ................................. None
  data_parallel_random_init ....................... False
  data_parallel_size .............................. 4
  data_path ....................................... ['/models/datasets/openwebtext/openwebtext-llama-7b/openwebtext-llama-7b_text_document']
  data_per_class_fraction ......................... 1.0
  data_sharding ................................... True
  dataloader_type ................................. single
  ddp_average_in_collective ....................... True
  ddp_bucket_size ................................. None
  decoder_first_pipeline_num_layers ............... None
  decoder_last_pipeline_num_layers ................ None
  decoder_num_layers .............................. None
  decoder_seq_length .............................. None
  decoupled_lr .................................... None
  decoupled_min_lr ................................ None
  decrease_batch_size_if_needed ................... False
  defer_embedding_wgrad_compute ................... False
  deprecated_use_mcore_models ..................... False
  deterministic_mode .............................. False
  dino_bottleneck_size ............................ 256
  dino_freeze_last_layer .......................... 1
  dino_head_hidden_size ........................... 2048
  dino_local_crops_number ......................... 10
  dino_local_img_size ............................. 96
  dino_norm_last_layer ............................ False
  dino_teacher_temp ............................... 0.07
  dino_warmup_teacher_temp ........................ 0.04
  dino_warmup_teacher_temp_epochs ................. 30
  disable_straggler_on_startup .................... False
  dist_ckpt_format_deprecated ..................... None
  dist_ckpt_strictness ............................ assume_ok_unexpected
  dist_url ........................................ tcp://localhost:34577
  distribute_saved_activations .................... False
  distributed_backend ............................. nccl
  distributed_timeout_minutes ..................... 10
  embedding_path .................................. None
  empty_unused_memory_level ....................... 0
  enable_ft_package ............................... False
  enable_one_logger ............................... True
  encoder_num_layers .............................. 32
  encoder_pipeline_model_parallel_size ............ 0
  encoder_seq_length .............................. 4096
  encoder_tensor_model_parallel_size .............. 0
  end_weight_decay ................................ 0.1
  eod_mask_loss ................................... False
  error_injection_rate ............................ 0
  error_injection_type ............................ transient_error
  eval_interval ................................... 1000
  eval_iters ...................................... 3
  evidence_data_path .............................. None
  exit_duration_in_mins ........................... None
  exit_interval ................................... None
  exit_on_missing_checkpoint ...................... False
  exit_signal_handler ............................. False
  exp_avg_dtype ................................... torch.float32
  exp_avg_sq_dtype ................................ torch.float32
  expert_model_parallel_size ...................... 1
  expert_tensor_parallel_size ..................... 1
  ffn_hidden_size ................................. 11008
  finetune ........................................ False
  flash_decode .................................... False
  fp16 ............................................ False
  fp16_lm_cross_entropy ........................... False
  fp32_residual_connection ........................ False
  fp8 ............................................. None
  fp8_amax_compute_algo ........................... most_recent
  fp8_amax_history_len ............................ 1
  fp8_interval .................................... 1
  fp8_margin ...................................... 0
  fp8_param_gather ................................ False
  fp8_wgrad ....................................... True
  global_batch_size ............................... 64
  gradient_accumulation_fusion .................... True
  group_query_attention ........................... False
  head_lr_mult .................................... 1.0
  hidden_dropout .................................. 0.0
  hidden_size ..................................... 4096
  hierarchical_context_parallel_sizes ............. None
  hybrid_attention_ratio .......................... 0.0
  hybrid_mlp_ratio ................................ 0.0
  hybrid_override_pattern ......................... None
  hysteresis ...................................... 2
  ict_head_size ................................... None
  ict_load ........................................ None
  img_h ........................................... 224
  img_w ........................................... 224
  indexer_batch_size .............................. 128
  indexer_log_interval ............................ 1000
  inference_batch_times_seqlen_threshold .......... -1
  inference_max_seq_length ........................ 2560
  init_method_std ................................. 0.006
  init_method_xavier_uniform ...................... False
  initial_loss_scale .............................. 4294967296
  iter_per_epoch .................................. 1250
  kv_channels ..................................... 128
  kv_lora_rank .................................... 32
  lazy_mpu_init ................................... None
  load ............................................ None
  local_rank ...................................... 0
  log_interval .................................... 1
  log_loss_scale_to_tensorboard ................... True
  log_memory_to_tensorboard ....................... False
  log_num_zeros_in_grad ........................... False
  log_params_norm ................................. False
  log_progress .................................... False
  log_straggler ................................... False
  log_throughput .................................. True
  log_timers_to_tensorboard ....................... False
  log_validation_ppl_to_tensorboard ............... False
  log_world_size_to_tensorboard ................... False
  logging_level ................................... None
  loss_scale ...................................... None
  loss_scale_window ............................... 1000
  lr .............................................. 3e-05
  lr_decay_iters .................................. None
  lr_decay_samples ................................ None
  lr_decay_style .................................. cosine
  lr_warmup_fraction .............................. None
  lr_warmup_init .................................. 0.0
  lr_warmup_iters ................................. 1
  lr_warmup_samples ............................... 0
  lr_wsd_decay_iters .............................. None
  lr_wsd_decay_samples ............................ None
  lr_wsd_decay_style .............................. exponential
  main_grads_dtype ................................ torch.float32
  main_params_dtype ............................... torch.float32
  make_vocab_size_divisible_by .................... 128
  manual_gc ....................................... False
  manual_gc_eval .................................. True
  manual_gc_interval .............................. 0
  mask_factor ..................................... 1.0
  mask_prob ....................................... 0.15
  mask_type ....................................... random
  masked_softmax_fusion ........................... True
  max_position_embeddings ......................... 4096
  max_tokens_to_oom ............................... 12000
  memory_snapshot_path ............................ snapshot.pickle
  merge_file ...................................... None
  micro_batch_size ................................ 1
  microbatch_group_size_per_vp_stage .............. None
  min_loss_scale .................................. 1.0
  min_lr .......................................... 3e-06
  mmap_bin_files .................................. True
  mock_data ....................................... False
  moe_aux_loss_coeff .............................. 0.0
  moe_expert_capacity_factor ...................... None
  moe_extended_tp ................................. False
  moe_ffn_hidden_size ............................. 11008
  moe_grouped_gemm ................................ False
  moe_input_jitter_eps ............................ None
  moe_layer_freq .................................. 1
  moe_layer_recompute ............................. False
  moe_pad_expert_input_to_capacity ................ False
  moe_per_layer_logging ........................... False
  moe_router_load_balancing_type .................. aux_loss
  moe_router_pre_softmax .......................... False
  moe_router_topk ................................. 2
  moe_router_topk_limited_devices ................. None
  moe_router_topk_scaling_factor .................. None
  moe_shared_expert_intermediate_size ............. None
  moe_shared_expert_overlap ....................... False
  moe_token_dispatcher_type ....................... allgather
  moe_token_drop_policy ........................... probs
  moe_use_legacy_grouped_gemm ..................... False
  moe_use_upcycling ............................... False
  moe_z_loss_coeff ................................ None
  multi_latent_attention .......................... False
  nccl_communicator_config_path ................... None
  no_load_optim ................................... None
  no_load_rng ..................................... None
  no_persist_layer_norm ........................... False
  no_save_optim ................................... None
  no_save_rng ..................................... None
  non_persistent_ckpt_type ........................ None
  non_persistent_global_ckpt_dir .................. None
  non_persistent_local_ckpt_algo .................. fully_parallel
  non_persistent_local_ckpt_dir ................... None
  non_persistent_save_interval .................... None
  norm_epsilon .................................... 1e-05
  normalization ................................... RMSNorm
  num_attention_heads ............................. 32
  num_channels .................................... 3
  num_classes ..................................... 1000
  num_dataset_builder_threads ..................... 1
  num_distributed_optimizer_instances ............. 1
  num_experts ..................................... None
  num_layers ...................................... 32
  num_layers_per_virtual_pipeline_stage ........... None
  num_query_groups ................................ 1
  num_workers ..................................... 2
  one_logger_async ................................ False
  one_logger_project .............................. megatron-lm
  one_logger_run_name ............................. None
  onnx_safe ....................................... None
  openai_gelu ..................................... False
  optimizer ....................................... adam
  output_bert_embeddings .......................... False
  overlap_grad_reduce ............................. True
  overlap_p2p_comm ................................ False
  overlap_p2p_comm_warmup_flush ................... False
  overlap_param_gather ............................ False
  overlap_param_gather_with_optimizer_step ........ False
  override_opt_param_scheduler .................... False
  params_dtype .................................... torch.bfloat16
  patch_dim ....................................... 16
  per_split_data_args_path ........................ None
  perform_initialization .......................... True
  pipeline_model_parallel_size .................... 2
  pipeline_model_parallel_split_rank .............. None
  position_embedding_type ......................... rope
  pretrained_checkpoint ........................... None
  profile ......................................... False
  profile_dir ..................................... ./
  profile_ranks ................................... [0]
  profile_step_end ................................ 12
  profile_step_start .............................. 10
  q_lora_rank ..................................... None
  qk_head_dim ..................................... 128
  qk_layernorm .................................... False
  qk_pos_emb_head_dim ............................. 64
  query_in_block_prob ............................. 0.1
  rampup_batch_size ............................... None
  rank ............................................ 0
  recompute_granularity ........................... None
  recompute_method ................................ None
  recompute_num_layers ............................ None
  record_memory_history ........................... False
  renormalize_blend_weights ....................... False
  rerun_mode ...................................... disabled
  reset_attention_mask ............................ False
  reset_position_ids .............................. False
  retriever_report_topk_accuracies ................ []
  retriever_score_scaling ......................... False
  retriever_seq_length ............................ 256
  retro_add_retriever ............................. False
  retro_attention_gate ............................ 1
  retro_cyclic_train_iters ........................ None
  retro_encoder_attention_dropout ................. 0.1
  retro_encoder_hidden_dropout .................... 0.1
  retro_encoder_layers ............................ 2
  retro_num_neighbors ............................. 2
  retro_num_retrieved_chunks ...................... 2
  retro_project_dir ............................... None
  retro_verify_neighbor_count ..................... True
  rotary_base ..................................... 10000
  rotary_interleaved .............................. False
  rotary_percent .................................. 1.0
  rotary_scaling_factor ........................... 1.0
  rotary_seq_len_interpolation_factor ............. None
  s3_cache_path ................................... None
  sample_rate ..................................... 1.0
  save ............................................ None
  save_interval ................................... 1000
  scatter_gather_tensors_in_pipeline .............. True
  seed ............................................ 1234
  seq_length ...................................... 4096
  sequence_parallel ............................... False
  sgd_momentum .................................... 0.9
  short_seq_prob .................................. 0.1
  skip_train ...................................... False
  skipped_train_samples ........................... 0
  spec ............................................ None
  split ........................................... 949,50,1
  squared_relu .................................... False
  standalone_embedding_stage ...................... False
  start_weight_decay .............................. 0.1
  straggler_ctrlr_port ............................ 65535
  straggler_minmax_count .......................... 1
  swiglu .......................................... True
  swin_backbone_type .............................. tiny
  tensor_model_parallel_size ...................... 1
  tensorboard_dir ................................. ./tmp_7b
  tensorboard_log_interval ........................ 1
  tensorboard_queue_size .......................... 1000
  test_data_path .................................. None
  test_mode ....................................... False
  tiktoken_num_special_tokens ..................... 1000
  tiktoken_pattern ................................ None
  tiktoken_special_tokens ......................... None
  timing_log_level ................................ 0
  timing_log_option ............................... minmax
  titles_data_path ................................ None
  tokenizer_model ................................. /models1/Llama-2-7b-chat-hf/tokenizer.model
  tokenizer_type .................................. Llama2Tokenizer
  tp_comm_bootstrap_backend ....................... nccl
  tp_comm_bulk_dgrad .............................. True
  tp_comm_bulk_wgrad .............................. True
  tp_comm_overlap ................................. False
  tp_comm_overlap_ag .............................. True
  tp_comm_overlap_cfg ............................. None
  tp_comm_overlap_rs .............................. True
  tp_comm_overlap_rs_dgrad ........................ False
  tp_comm_split_ag ................................ True
  tp_comm_split_rs ................................ True
  train_data_path ................................. None
  train_iters ..................................... 50
  train_samples ................................... None
  train_sync_interval ............................. None
  transformer_impl ................................ local
  transformer_pipeline_model_parallel_size ........ 2
  untie_embeddings_and_output_weights ............. True
  use_checkpoint_args ............................. False
  use_checkpoint_opt_param_scheduler .............. False
  use_cpu_initialization .......................... None
  use_dist_ckpt ................................... False
  use_dist_ckpt_deprecated ........................ False
  use_distributed_optimizer ....................... True
  use_flash_attn .................................. True
  use_flash_attn_cutlass .......................... True
  use_flash_attn_torch ............................ False
  use_flash_attn_triton ........................... False
  use_hip_profiler ................................ False
  use_legacy_models ............................... True
  use_mp_args_from_checkpoint_args ................ False
  use_one_sent_docs ............................... False
  use_precision_aware_optimizer ................... False
  use_pytorch_profiler ............................ False
  use_ring_exchange_p2p ........................... False
  use_rope_scaling ................................ False
  use_rotary_position_embeddings .................. False
  use_tokenizer_model_from_checkpoint_args ........ True
  use_torch_fsdp2 ................................. False
  use_tp_pp_dp_mapping ............................ False
  v_head_dim ...................................... 128
  valid_data_path ................................. None
  variable_seq_lengths ............................ False
  virtual_pipeline_model_parallel_size ............ None
  vision_backbone_type ............................ vit
  vision_pretraining .............................. False
  vision_pretraining_type ......................... classify
  vocab_extra_ids ................................. 0
  vocab_file ...................................... None
  vocab_size ...................................... None
  wandb_exp_name .................................. 
  wandb_project ................................... 
  wandb_save_dir .................................. 
  weight_decay .................................... 0.1
  weight_decay_incr_style ......................... constant
  wgrad_deferral_limit ............................ 0
  world_size ...................................... 8
  yaml_cfg ........................................ None
-------------------- end of arguments ---------------------
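The parallel layout and batch schedule follow from the arguments above: with world_size 8, tensor_model_parallel_size 1, pipeline_model_parallel_size 2 and context_parallel_size 1, the data-parallel size works out to 8 / (1 * 2 * 1) = 4, and a global batch of 64 with micro_batch_size 1 across 4 data-parallel replicas implies 16 gradient-accumulation micro-steps per optimizer step. A minimal sanity-check sketch in plain Python (values copied from the dump; the formulas are the usual Megatron-LM bookkeeping, stated here as an assumption rather than taken from this run):

# Derive the implied parallel layout from the argument dump above.
world_size = 8
tensor_model_parallel_size = 1
pipeline_model_parallel_size = 2
context_parallel_size = 1

data_parallel_size = world_size // (
    tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
)
assert data_parallel_size == 4          # matches data_parallel_size in the dump

global_batch_size, micro_batch_size = 64, 1
grad_accum_steps = global_batch_size // (micro_batch_size * data_parallel_size)
print(grad_accum_steps)                 # 16 micro-batches per optimizer step

seq_length = 4096
print(global_batch_size * seq_length)   # 262,144 tokens consumed per iteration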
> building Llama2Tokenizer tokenizer ...
[WARNING  | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
[WARNING  | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
 > padded vocab (size: 32000) with 0 dummy tokens (new size: 32000)
[WARNING  | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
> initializing torch distributed ...
2025-10-30 15:48:22.868274: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/tensorboard/compat/__init__.py", line 42, in tf
    from tensorboard.compat import notf  # noqa: F401
ImportError: cannot import name 'notf' from 'tensorboard.compat' (/usr/local/lib/python3.10/dist-packages/tensorboard/compat/__init__.py)

During handling of the above exception, another exception occurred:

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
> setting tensorboard ...
WARNING: one_logger package is required to enable e2e metrics tracking. please go to https://confluence.nvidia.com/display/MLWFO/Package+Repositories for details to install it
[WARNING  | megatron.core.rerun_state_machine]: RerunStateMachine initialized in mode disabled
> initialized tensor model parallel with size 1
> initialized pipeline model parallel with size 2
> setting random seeds to 1234 ...
> compiling dataset index builder ...
make: Entering directory '/megatron-lm/megatron/core/datasets'
make: Nothing to be done for 'default'.
make: Leaving directory '/megatron-lm/megatron/core/datasets'
>>> done with dataset index builder. Compilation time: 0.032 seconds
> compiling and loading fused kernels ...
>>> done with compiling and loading fused kernels. Compilation time: 1.511 seconds
time to initialize megatron (seconds): 5.644
[after megatron is initialized] datetime: 2025-10-30 15:48:27 
building GPT model ...
 > number of parameters on (tensor, pipeline) model parallel rank (0, 1): 3369209856
GPTModel(
  (language_model): TransformerLanguageModel(
    (embedding): Embedding(
      (word_embeddings): VocabParallelEmbedding()
      (embedding_dropout): Dropout(p=0.0, inplace=False)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): ParallelTransformer(
      (layers): ModuleList(
        (0-15): 16 x ParallelTransformerLayer(
          (input_norm): RMSNorm()
          (self_attention): ParallelAttention(
            (query_key_value): ColumnParallelLinear()
            (core_attention): CoreAttention(
              (scale_mask_softmax): FusedScaleMaskSoftmax()
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (core_attention_flash): FlashSelfAttention()
            (dense): RowParallelLinear()
          )
          (post_attention_norm): RMSNorm()
          (mlp): ParallelMLP(
            (dense_h_to_4h): ColumnParallelLinear()
            (dense_4h_to_h): RowParallelLinear()
          )
        )
      )
    )
  )
)
 > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 3369205760
[after model, optimizer, and learning rate scheduler are built] datetime: 2025-10-30 15:48:27 
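The per-rank parameter counts printed above can be reproduced from the hyperparameters in the argument dump (hidden_size 4096, ffn_hidden_size 11008 with SwiGLU, 32 layers split into 16 per pipeline stage, vocab size 32000, untied embeddings). A rough recount, assuming the legacy Megatron layer layout shown in the module dump (fused QKV, SwiGLU gate and up projections fused into dense_h_to_4h, RMSNorm weights, no biases):

# Recount the logged per-stage parameter numbers from the model config.
h, ffn, vocab, layers_per_stage = 4096, 11008, 32000, 16

qkv      = h * 3 * h        # query_key_value (full MHA, no GQA)
attn_out = h * h            # dense
mlp_in   = h * 2 * ffn      # dense_h_to_4h (SwiGLU doubles the width)
mlp_out  = ffn * h          # dense_4h_to_h
norms    = 2 * h            # input_norm + post_attention_norm

per_layer = qkv + attn_out + mlp_in + mlp_out + norms   # 202,383,360
embedding = vocab * h                                   # 131,072,000

stage0 = layers_per_stage * per_layer + embedding       # word embeddings on stage 0
stage1 = layers_per_stage * per_layer + embedding + h   # untied output layer + final norm

print(stage0)                        # 3,369,205,760 -> rank (0, 0)
print(stage1)                        # 3,369,209,856 -> rank (0, 1)
print(32 * per_layer / 1e9)          # ~6.48B parameters in transformer layers
print((32 * per_layer + 2 * embedding) / 1e9)   # ~6.74B parameters in total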
> building train, validation, and test datasets ...
 > datasets target sizes (minimum size):
    train:      3200
    validation: 192
    test:       192
> building train, validation, and test datasets for GPT ...
> finished creating GPT datasets ...
[after dataloaders are built] datetime: 2025-10-30 15:48:27 
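The dataset target sizes above are the batch schedule multiplied out, as a quick check:

train_samples = 50 * 64    # train_iters * global_batch_size = 3200
eval_samples  = 3 * 64     # eval_iters  * global_batch_size = 192 (validation and test)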
done with setup ...
training ...
(min, max) time across ranks (ms):
    model-and-optimizer-setup ......................: (199.88, 204.83)
    train/valid/test-data-iterators-setup ..........: (360.81, 415.69)
[before the start of training step] datetime: 2025-10-30 15:48:27 
Number of parameters in transformer layers in billions:  6.48
Number of parameters in embedding layers in billions: 0.26
Total number of parameters in billions: 6.74
Number of parameters in most loaded shard in billions: 3.3693
Number of parameters in other shards in billions: 3.2383
Theoretical memory footprints: weight and optimizer=28919.29 MB
 [2025-10-30 15:49:58] iteration        1/      50 | consumed samples:           64 | elapsed time per iteration (ms): 90286.8 | throughput per GPU (TFLOP/s/GPU): 16.7 | learning rate: 3.000000E-05 | global batch size:    64 | lm loss: 1.045589E+01 | loss scale: 1.0 | grad norm: 270.234 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
[Rank 4] (after 1 iterations) memory (MB) | allocated: 28987.10498046875 | max allocated: 32813.6396484375 | reserved: 36168.0 | max reserved: 36168.0
[Rank 0] (after 1 iterations) memory (MB) | allocated: 28920.36181640625 | max allocated: 39419.125 | reserved: 40814.0 | max reserved: 40814.0
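The theoretical footprint of 28919.29 MB works out to roughly 9 bytes per parameter on the most loaded shard (3.3693B parameters). A plausible breakdown, assuming bf16 weights, fp32 main gradients, and fp32 Adam state sharded over the 4 data-parallel ranks by the distributed optimizer (this mirrors the usual distributed-optimizer accounting; the exact byte split is an assumption here, not read from this log):

# Approximate the reported theoretical weight-and-optimizer memory.
params_most_loaded_shard = 3.3693e9     # from the log, most loaded shard
data_parallel_size = 4

bytes_per_param = 2 + 4 + 12 / data_parallel_size   # bf16 weight + fp32 grad + sharded Adam = 9.0
footprint_mb = params_most_loaded_shard * bytes_per_param / 2**20
print(round(footprint_mb, 2))           # ~28919 MB vs. the logged 28919.29 MB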
Could not open /var/log/hylog/.
 [2025-10-30 15:50:38] iteration        2/      50 | consumed samples:          128 | elapsed time per iteration (ms): 40295.7 | throughput per GPU (TFLOP/s/GPU): 37.5 | learning rate: 2.997226E-05 | global batch size:    64 | lm loss: 1.046932E+01 | loss scale: 1.0 | grad norm: 242.090 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:51:18] iteration        3/      50 | consumed samples:          192 | elapsed time per iteration (ms): 39819.8 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 2.988916E-05 | global batch size:    64 | lm loss: 8.424404E+00 | loss scale: 1.0 | grad norm: 502.275 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:51:57] iteration        4/      50 | consumed samples:          256 | elapsed time per iteration (ms): 39600.0 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.975105E-05 | global batch size:    64 | lm loss: 1.290415E+01 | loss scale: 1.0 | grad norm: 174.010 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:52:37] iteration        5/      50 | consumed samples:          320 | elapsed time per iteration (ms): 39826.9 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 2.955848E-05 | global batch size:    64 | lm loss: 9.753544E+00 | loss scale: 1.0 | grad norm: 47.956 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:53:17] iteration        6/      50 | consumed samples:          384 | elapsed time per iteration (ms): 39826.8 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 2.931225E-05 | global batch size:    64 | lm loss: 9.120786E+00 | loss scale: 1.0 | grad norm: 165.685 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:53:57] iteration        7/      50 | consumed samples:          448 | elapsed time per iteration (ms): 39536.7 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 2.901338E-05 | global batch size:    64 | lm loss: 8.215652E+00 | loss scale: 1.0 | grad norm: 40.534 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:54:37] iteration        8/      50 | consumed samples:          512 | elapsed time per iteration (ms): 40029.6 | throughput per GPU (TFLOP/s/GPU): 37.7 | learning rate: 2.866308E-05 | global batch size:    64 | lm loss: 7.065186E+00 | loss scale: 1.0 | grad norm: 10.479 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:55:16] iteration        9/      50 | consumed samples:          576 | elapsed time per iteration (ms): 39653.0 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.826280E-05 | global batch size:    64 | lm loss: 7.098128E+00 | loss scale: 1.0 | grad norm: 8.814 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:55:56] iteration       10/      50 | consumed samples:          640 | elapsed time per iteration (ms): 39674.7 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.781419E-05 | global batch size:    64 | lm loss: 6.366463E+00 | loss scale: 1.0 | grad norm: 6.373 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:56:36] iteration       11/      50 | consumed samples:          704 | elapsed time per iteration (ms): 40139.3 | throughput per GPU (TFLOP/s/GPU): 37.6 | learning rate: 2.731908E-05 | global batch size:    64 | lm loss: 6.430417E+00 | loss scale: 1.0 | grad norm: 6.818 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:57:16] iteration       12/      50 | consumed samples:          768 | elapsed time per iteration (ms): 39490.8 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 2.677952E-05 | global batch size:    64 | lm loss: 6.327631E+00 | loss scale: 1.0 | grad norm: 3.020 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:57:55] iteration       13/      50 | consumed samples:          832 | elapsed time per iteration (ms): 39506.9 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 2.619772E-05 | global batch size:    64 | lm loss: 6.092177E+00 | loss scale: 1.0 | grad norm: 2.616 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:58:35] iteration       14/      50 | consumed samples:          896 | elapsed time per iteration (ms): 39718.0 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 2.557606E-05 | global batch size:    64 | lm loss: 6.129852E+00 | loss scale: 1.0 | grad norm: 4.508 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:59:14] iteration       15/      50 | consumed samples:          960 | elapsed time per iteration (ms): 39658.4 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.491711E-05 | global batch size:    64 | lm loss: 6.379290E+00 | loss scale: 1.0 | grad norm: 15.828 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 15:59:54] iteration       16/      50 | consumed samples:         1024 | elapsed time per iteration (ms): 39884.8 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 2.422357E-05 | global batch size:    64 | lm loss: 6.207567E+00 | loss scale: 1.0 | grad norm: 3.418 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:00:34] iteration       17/      50 | consumed samples:         1088 | elapsed time per iteration (ms): 39599.3 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 2.349830E-05 | global batch size:    64 | lm loss: 6.430919E+00 | loss scale: 1.0 | grad norm: 18.031 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:01:14] iteration       18/      50 | consumed samples:         1152 | elapsed time per iteration (ms): 39926.3 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 2.274427E-05 | global batch size:    64 | lm loss: 6.162337E+00 | loss scale: 1.0 | grad norm: 14.185 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:01:54] iteration       19/      50 | consumed samples:         1216 | elapsed time per iteration (ms): 39747.6 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 2.196458E-05 | global batch size:    64 | lm loss: 5.844732E+00 | loss scale: 1.0 | grad norm: 3.080 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:02:34] iteration       20/      50 | consumed samples:         1280 | elapsed time per iteration (ms): 39922.5 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 2.116243E-05 | global batch size:    64 | lm loss: 5.706470E+00 | loss scale: 1.0 | grad norm: 7.213 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:03:13] iteration       21/      50 | consumed samples:         1344 | elapsed time per iteration (ms): 39735.6 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 2.034112E-05 | global batch size:    64 | lm loss: 5.828917E+00 | loss scale: 1.0 | grad norm: 5.298 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:03:53] iteration       22/      50 | consumed samples:         1408 | elapsed time per iteration (ms): 39615.3 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 1.950403E-05 | global batch size:    64 | lm loss: 5.981213E+00 | loss scale: 1.0 | grad norm: 3.407 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:04:38] iteration       23/      50 | consumed samples:         1472 | elapsed time per iteration (ms): 44924.4 | throughput per GPU (TFLOP/s/GPU): 33.6 | learning rate: 1.865460E-05 | global batch size:    64 | lm loss: 5.460212E+00 | loss scale: 1.0 | grad norm: 4.123 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:05:44] iteration       24/      50 | consumed samples:         1536 | elapsed time per iteration (ms): 65925.0 | throughput per GPU (TFLOP/s/GPU): 22.9 | learning rate: 1.779631E-05 | global batch size:    64 | lm loss: 5.226260E+00 | loss scale: 1.0 | grad norm: 5.918 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:06:24] iteration       25/      50 | consumed samples:         1600 | elapsed time per iteration (ms): 40182.0 | throughput per GPU (TFLOP/s/GPU): 37.6 | learning rate: 1.693270E-05 | global batch size:    64 | lm loss: 5.305700E+00 | loss scale: 1.0 | grad norm: 1.561 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:07:04] iteration       26/      50 | consumed samples:         1664 | elapsed time per iteration (ms): 39762.6 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 1.606730E-05 | global batch size:    64 | lm loss: 5.153278E+00 | loss scale: 1.0 | grad norm: 2.196 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:07:44] iteration       27/      50 | consumed samples:         1728 | elapsed time per iteration (ms): 39921.2 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 1.520369E-05 | global batch size:    64 | lm loss: 5.105300E+00 | loss scale: 1.0 | grad norm: 1.382 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:08:23] iteration       28/      50 | consumed samples:         1792 | elapsed time per iteration (ms): 39815.2 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 1.434540E-05 | global batch size:    64 | lm loss: 4.925309E+00 | loss scale: 1.0 | grad norm: 1.777 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:09:03] iteration       29/      50 | consumed samples:         1856 | elapsed time per iteration (ms): 39752.5 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 1.349597E-05 | global batch size:    64 | lm loss: 5.181439E+00 | loss scale: 1.0 | grad norm: 1.845 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:09:43] iteration       30/      50 | consumed samples:         1920 | elapsed time per iteration (ms): 39643.3 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 1.265888E-05 | global batch size:    64 | lm loss: 5.208538E+00 | loss scale: 1.0 | grad norm: 2.059 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:10:23] iteration       31/      50 | consumed samples:         1984 | elapsed time per iteration (ms): 39988.1 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 1.183757E-05 | global batch size:    64 | lm loss: 4.890507E+00 | loss scale: 1.0 | grad norm: 1.327 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:11:03] iteration       32/      50 | consumed samples:         2048 | elapsed time per iteration (ms): 40034.3 | throughput per GPU (TFLOP/s/GPU): 37.7 | learning rate: 1.103542E-05 | global batch size:    64 | lm loss: 4.993505E+00 | loss scale: 1.0 | grad norm: 1.328 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:11:43] iteration       33/      50 | consumed samples:         2112 | elapsed time per iteration (ms): 39777.8 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 1.025573E-05 | global batch size:    64 | lm loss: 5.216469E+00 | loss scale: 1.0 | grad norm: 1.112 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:12:22] iteration       34/      50 | consumed samples:         2176 | elapsed time per iteration (ms): 39744.7 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 9.501700E-06 | global batch size:    64 | lm loss: 5.064697E+00 | loss scale: 1.0 | grad norm: 1.134 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:13:02] iteration       35/      50 | consumed samples:         2240 | elapsed time per iteration (ms): 39795.0 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 8.776425E-06 | global batch size:    64 | lm loss: 4.957899E+00 | loss scale: 1.0 | grad norm: 0.992 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:13:42] iteration       36/      50 | consumed samples:         2304 | elapsed time per iteration (ms): 39734.5 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 8.082888E-06 | global batch size:    64 | lm loss: 5.244042E+00 | loss scale: 1.0 | grad norm: 1.144 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:14:22] iteration       37/      50 | consumed samples:         2368 | elapsed time per iteration (ms): 39786.1 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 7.423938E-06 | global batch size:    64 | lm loss: 5.032987E+00 | loss scale: 1.0 | grad norm: 0.960 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:15:01] iteration       38/      50 | consumed samples:         2432 | elapsed time per iteration (ms): 39439.9 | throughput per GPU (TFLOP/s/GPU): 38.3 | learning rate: 6.802284E-06 | global batch size:    64 | lm loss: 4.990102E+00 | loss scale: 1.0 | grad norm: 0.900 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:15:41] iteration       39/      50 | consumed samples:         2496 | elapsed time per iteration (ms): 39764.0 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 6.220479E-06 | global batch size:    64 | lm loss: 4.856393E+00 | loss scale: 1.0 | grad norm: 1.125 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:16:20] iteration       40/      50 | consumed samples:         2560 | elapsed time per iteration (ms): 39466.0 | throughput per GPU (TFLOP/s/GPU): 38.3 | learning rate: 5.680916E-06 | global batch size:    64 | lm loss: 5.073430E+00 | loss scale: 1.0 | grad norm: 1.028 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:17:00] iteration       41/      50 | consumed samples:         2624 | elapsed time per iteration (ms): 39639.2 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 5.185811E-06 | global batch size:    64 | lm loss: 5.006877E+00 | loss scale: 1.0 | grad norm: 0.856 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:17:40] iteration       42/      50 | consumed samples:         2688 | elapsed time per iteration (ms): 39736.3 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 4.737197E-06 | global batch size:    64 | lm loss: 4.772885E+00 | loss scale: 1.0 | grad norm: 0.983 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:18:19] iteration       43/      50 | consumed samples:         2752 | elapsed time per iteration (ms): 39481.7 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 4.336920E-06 | global batch size:    64 | lm loss: 4.907492E+00 | loss scale: 1.0 | grad norm: 0.823 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:18:59] iteration       44/      50 | consumed samples:         2816 | elapsed time per iteration (ms): 39949.9 | throughput per GPU (TFLOP/s/GPU): 37.8 | learning rate: 3.986624E-06 | global batch size:    64 | lm loss: 4.758832E+00 | loss scale: 1.0 | grad norm: 1.009 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:19:39] iteration       45/      50 | consumed samples:         2880 | elapsed time per iteration (ms): 39817.1 | throughput per GPU (TFLOP/s/GPU): 37.9 | learning rate: 3.687747E-06 | global batch size:    64 | lm loss: 4.631381E+00 | loss scale: 1.0 | grad norm: 0.807 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:20:19] iteration       46/      50 | consumed samples:         2944 | elapsed time per iteration (ms): 39566.5 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 3.441519E-06 | global batch size:    64 | lm loss: 4.772638E+00 | loss scale: 1.0 | grad norm: 1.379 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:20:58] iteration       47/      50 | consumed samples:         3008 | elapsed time per iteration (ms): 39582.4 | throughput per GPU (TFLOP/s/GPU): 38.2 | learning rate: 3.248951E-06 | global batch size:    64 | lm loss: 4.898998E+00 | loss scale: 1.0 | grad norm: 0.859 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:21:38] iteration       48/      50 | consumed samples:         3072 | elapsed time per iteration (ms): 39651.1 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 3.110835E-06 | global batch size:    64 | lm loss: 5.095502E+00 | loss scale: 1.0 | grad norm: 0.887 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:22:17] iteration       49/      50 | consumed samples:         3136 | elapsed time per iteration (ms): 39759.3 | throughput per GPU (TFLOP/s/GPU): 38.0 | learning rate: 3.027737E-06 | global batch size:    64 | lm loss: 4.849247E+00 | loss scale: 1.0 | grad norm: 0.706 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
 [2025-10-30 16:22:57] iteration       50/      50 | consumed samples:         3200 | elapsed time per iteration (ms): 39629.9 | throughput per GPU (TFLOP/s/GPU): 38.1 | learning rate: 3.000000E-06 | global batch size:    64 | lm loss: 5.294223E+00 | loss scale: 1.0 | grad norm: 0.789 | num zeros: 0 | number of skipped iterations:   0 | number of nan iterations:   0 |
[after training is done] datetime: 2025-10-30 16:22:57 
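Two quick cross-checks against the iteration log above. First, the learning-rate column follows standard cosine annealing from lr 3e-5 down to min_lr 3e-6 over train_iters 50 with a single warmup iteration (lr_decay_iters defaults to train_iters). Second, the ~38 TFLOP/s/GPU throughput is consistent with a back-of-the-envelope FLOP count using the common 6 * N * T estimate for the dense GEMMs plus the attention-score GEMMs; Megatron's own accounting may differ in the details, so treat this as an approximation:

import math

# 1. Cosine learning-rate schedule implied by the arguments.
def lr_at(it, lr_max=3e-5, lr_min=3e-6, warmup=1, decay_iters=50):
    if it <= warmup:
        return lr_max * it / warmup
    ratio = (it - warmup) / (decay_iters - warmup)
    return lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(math.pi * ratio))

print(lr_at(2))     # ~2.9972e-05 (log: 2.997226E-05)
print(lr_at(25))    # ~1.6933e-05 (log: 1.693270E-05)
print(lr_at(50))    #  3.0000e-06 (log: 3.000000E-06)

# 2. Rough throughput estimate for iterations settling around 39.7 s each.
n_params  = 6.74e9
tokens    = 64 * 4096                      # global batch x sequence length
dense     = 6 * n_params * tokens          # ~1.06e16 FLOPs (fwd + bwd GEMMs)
attention = 12 * 32 * 64 * 4096**2 * 4096  # ~1.7e15 FLOPs (score GEMMs, fwd + bwd)

secs_per_iter, num_gpus = 39.7, 8
print((dense + attention) / (secs_per_iter * num_gpus) / 1e12)   # ~38.7 TFLOP/s/GPU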
[WARNING  | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled
Evaluating on 192 samples
Evaluating iter 1/3
Evaluating iter 2/3
Evaluating iter 3/3
(min, max) time across ranks (ms):
    evaluate .......................................: (16322.27, 16446.91)
----------------------------------------------------------------------------------------------------------------
[WARNING  | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled
[WARNING  | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled
 validation loss at iteration 50 on validation set | lm loss value: 4.873671E+00 | lm loss PPL: 1.308002E+02 | 
----------------------------------------------------------------------------------------------------------------
[WARNING  | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled
Evaluating on 192 samples
Evaluating iter 1/3
Evaluating iter 2/3
Evaluating iter 3/3
(min, max) time across ranks (ms):
    evaluate .......................................: (5993.56, 6118.05)
----------------------------------------------------------------------------------------------------------
 validation loss at iteration 50 on test set | lm loss value: 5.035580E+00 | lm loss PPL: 1.537888E+02 | 
----------------------------------------------------------------------------------------------------------
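The reported perplexities are simply exp(lm loss), which is easy to confirm:

import math
print(math.exp(4.873671))   # ~130.80 (validation: lm loss PPL 1.308002E+02)
print(math.exp(5.035580))   # ~153.79 (test:       lm loss PPL 1.537888E+02)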
[WARNING  | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled
[WARNING  | megatron.core.rerun_state_machine]: Setting RerunStateMachine mode disabled
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1030 16:23:21.049338 936590 ProcessGroupNCCL.cpp:1197] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1030 16:23:21.139221 936594 ProcessGroupNCCL.cpp:1197] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1030 16:23:21.723956 936591 ProcessGroupNCCL.cpp:1197] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1030 16:23:21.764384 936596 ProcessGroupNCCL.cpp:1197] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1030 16:23:21.794720 936589 ProcessGroupNCCL.cpp:1197] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1030 16:23:22.382212 936597 ProcessGroupNCCL.cpp:1197] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1030 16:23:22.453176 936593 ProcessGroupNCCL.cpp:1197] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1030 16:23:52.474910 936599 ProcessGroupNCCL.cpp:1197] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())