Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
688448db
Commit
688448db
authored
Mar 14, 2025
by
silencealiang
Browse files
更新代码
parent
a02a5490
Pipeline
#2503
passed with stage
Changes
823
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
26 additions
and
70 deletions
+26
-70
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
...p2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
...p2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
...tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
+2
-2
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev.json
...orch_dist_local_spec_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_lts.json
...orch_dist_local_spec_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml
...ume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml
+2
-2
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_dev.json
.../bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_dev.json
+1
-1
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_lts.json
.../bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml
.../bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml
+2
-2
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/golden_values_dev.json
...pp4_vp2_resume_torch_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/golden_values_lts.json
...pp4_vp2_resume_torch_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml
..._tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml
+2
-2
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json
...bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json
+1
-1
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json
...bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml
...ases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml
+2
-2
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_dev.json
...tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_lts.json
...tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml
...t_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml
+2
-2
tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json
...ightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json
...ightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json
+1
-1
No files found.
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.49574
,
"5"
:
10.48424
,
"10"
:
10.49936
,
"15"
:
10.46628
,
"20"
:
10.44794
,
"25"
:
10.34964
,
"30"
:
10.17263
,
"35"
:
10.04261
,
"40"
:
9.90783
,
"45"
:
9.75774
,
"50"
:
9.67693
,
"55"
:
9.55372
,
"60"
:
9.4546
,
"65"
:
9.42161
,
"70"
:
9.3011
,
"75"
:
9.32209
,
"80"
:
9.26181
,
"85"
:
9.2967
,
"90"
:
9.23338
,
"95"
:
9.2382
,
"100"
:
9.10601
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2182.0
,
"5"
:
2584.0
,
"10"
:
2205.0
,
"15"
:
2539.0
,
"20"
:
2089.0
,
"25"
:
2604.0
,
"30"
:
2913.0
,
"35"
:
2967.0
,
"40"
:
2378.0
,
"45"
:
3923.0
,
"50"
:
3599.0
,
"55"
:
3628.0
,
"60"
:
2617.0
,
"65"
:
3408.0
,
"70"
:
3944.0
,
"75"
:
4932.0
,
"80"
:
3598.0
,
"85"
:
4221.0
,
"90"
:
4643.0
,
"95"
:
4427.0
,
"100"
:
3170.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1754654208.0
,
"5"
:
1754654208.0
,
"10"
:
1754654208.0
,
"15"
:
1754654208.0
,
"20"
:
1754654208.0
,
"25"
:
1754654208.0
,
"30"
:
1754654208.0
,
"35"
:
1754654208.0
,
"40"
:
1754654208.0
,
"45"
:
1754654208.0
,
"50"
:
1754654208.0
,
"55"
:
1754654208.0
,
"60"
:
1754654208.0
,
"65"
:
1754654208.0
,
"70"
:
1754654208.0
,
"75"
:
1754654208.0
,
"80"
:
1754654208.0
,
"85"
:
1754654208.0
,
"90"
:
1754654208.0
,
"95"
:
1754654208.0
,
"100"
:
1754654208.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2313432064.0
,
"5"
:
3055894528.0
,
"10"
:
3055894528.0
,
"15"
:
3055894528.0
,
"20"
:
3055894528.0
,
"25"
:
3055894528.0
,
"30"
:
3055894528.0
,
"35"
:
3055894528.0
,
"40"
:
3055894528.0
,
"45"
:
3055894528.0
,
"50"
:
3055894528.0
,
"55"
:
3055894528.0
,
"60"
:
3055894528.0
,
"65"
:
3055894528.0
,
"70"
:
3055894528.0
,
"75"
:
3055894528.0
,
"80"
:
3055894528.0
,
"85"
:
3055894528.0
,
"90"
:
3055894528.0
,
"95"
:
3055894528.0
,
"100"
:
3055894528.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.24712
,
"5"
:
1.27471
,
"10"
:
1.26014
,
"15"
:
1.26739
,
"20"
:
1.25462
,
"25"
:
1.25062
,
"30"
:
1.25515
,
"35"
:
1.25462
,
"40"
:
1.257
,
"45"
:
1.25361
,
"50"
:
1.25724
,
"55"
:
1.25002
,
"60"
:
1.25409
,
"65"
:
1.24828
,
"70"
:
1.44976
,
"75"
:
1.24651
,
"80"
:
1.45548
,
"85"
:
1.2481
,
"90"
:
1.25739
,
"95"
:
1.26824
,
"100"
:
1.25641
}}}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.49574
,
"5"
:
10.48398
,
"10"
:
10.49943
,
"15"
:
10.4663
,
"20"
:
10.44775
,
"25"
:
10.34964
,
"30"
:
10.1728
,
"35"
:
10.04262
,
"40"
:
9.90767
,
"45"
:
9.75792
,
"50"
:
9.67684
,
"55"
:
9.55378
,
"60"
:
9.45458
,
"65"
:
9.42133
,
"70"
:
9.30109
,
"75"
:
9.32203
,
"80"
:
9.26184
,
"85"
:
9.29667
,
"90"
:
9.23332
,
"95"
:
9.23793
,
"100"
:
9.10611
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2182.0
,
"5"
:
2568.0
,
"10"
:
2108.0
,
"15"
:
2533.0
,
"20"
:
2151.0
,
"25"
:
2601.0
,
"30"
:
2801.0
,
"35"
:
3107.0
,
"40"
:
2294.0
,
"45"
:
3909.0
,
"50"
:
3482.0
,
"55"
:
3606.0
,
"60"
:
2653.0
,
"65"
:
3341.0
,
"70"
:
3849.0
,
"75"
:
5090.0
,
"80"
:
3613.0
,
"85"
:
4194.0
,
"90"
:
4618.0
,
"95"
:
4439.0
,
"100"
:
3224.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1754654208.0
,
"5"
:
1754654208.0
,
"10"
:
1754654208.0
,
"15"
:
1755702784.0
,
"20"
:
1754654208.0
,
"25"
:
1754654208.0
,
"30"
:
1754654208.0
,
"35"
:
1754654208.0
,
"40"
:
1754654208.0
,
"45"
:
1754654208.0
,
"50"
:
1754654208.0
,
"55"
:
1754654208.0
,
"60"
:
1754654208.0
,
"65"
:
1754654208.0
,
"70"
:
1754654208.0
,
"75"
:
1754654208.0
,
"80"
:
1754654208.0
,
"85"
:
1754654208.0
,
"90"
:
1754654208.0
,
"95"
:
1754654208.0
,
"100"
:
1754654208.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2300849152.0
,
"5"
:
3043311616.0
,
"10"
:
3043311616.0
,
"15"
:
3043311616.0
,
"20"
:
3043311616.0
,
"25"
:
3043311616.0
,
"30"
:
3043311616.0
,
"35"
:
3043311616.0
,
"40"
:
3043311616.0
,
"45"
:
3043311616.0
,
"50"
:
3043311616.0
,
"55"
:
3043311616.0
,
"60"
:
3043311616.0
,
"65"
:
3043311616.0
,
"70"
:
3043311616.0
,
"75"
:
3043311616.0
,
"80"
:
3043311616.0
,
"85"
:
3043311616.0
,
"90"
:
3043311616.0
,
"95"
:
3043311616.0
,
"100"
:
3043311616.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.69855
,
"5"
:
1.14588
,
"10"
:
1.142
,
"15"
:
1.13872
,
"20"
:
1.13346
,
"25"
:
1.13589
,
"30"
:
1.13376
,
"35"
:
1.13181
,
"40"
:
1.13363
,
"45"
:
1.1355
,
"50"
:
1.13479
,
"55"
:
1.16002
,
"60"
:
1.15257
,
"65"
:
1.1392
,
"70"
:
1.32661
,
"75"
:
1.1411
,
"80"
:
1.14105
,
"85"
:
1.15914
,
"90"
:
1.14305
,
"95"
:
1.14054
,
"100"
:
1.29661
}}}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
990000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-bert_00_text_sentence
--vocab-file
:
${DATA_PATH}/vocab.txt
--split
:
949,50,1
...
...
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.49566
,
"5"
:
10.48412
,
"10"
:
10.49946
,
"15"
:
10.46625
,
"20"
:
10.44783
,
"25"
:
10.34967
,
"30"
:
10.17283
,
"35"
:
10.04281
,
"40"
:
9.90782
,
"45"
:
9.75786
,
"50"
:
9.67692
,
"55"
:
9.55379
,
"60"
:
9.45457
,
"65"
:
9.42149
,
"70"
:
9.30109
,
"75"
:
9.32221
,
"80"
:
9.26179
,
"85"
:
9.29668
,
"90"
:
9.23347
,
"95"
:
9.23813
,
"100"
:
9.10619
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2183.0
,
"5"
:
2683.0
,
"10"
:
2206.0
,
"15"
:
2493.0
,
"20"
:
2165.0
,
"25"
:
2528.0
,
"30"
:
2774.0
,
"35"
:
3054.0
,
"40"
:
2250.0
,
"45"
:
3947.0
,
"50"
:
3608.0
,
"55"
:
3626.0
,
"60"
:
2776.0
,
"65"
:
3410.0
,
"70"
:
3977.0
,
"75"
:
4842.0
,
"80"
:
3634.0
,
"85"
:
4149.0
,
"90"
:
4712.0
,
"95"
:
4379.0
,
"100"
:
3097.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1767237120.0
,
"5"
:
1767237120.0
,
"10"
:
1767237120.0
,
"15"
:
1767237120.0
,
"20"
:
1767237120.0
,
"25"
:
1767237120.0
,
"30"
:
1767237120.0
,
"35"
:
1767237120.0
,
"40"
:
1767237120.0
,
"45"
:
1767237120.0
,
"50"
:
1767237120.0
,
"55"
:
1767237120.0
,
"60"
:
1767237120.0
,
"65"
:
1767237120.0
,
"70"
:
1767237120.0
,
"75"
:
1767237120.0
,
"80"
:
1767237120.0
,
"85"
:
1767237120.0
,
"90"
:
1767237120.0
,
"95"
:
1767237120.0
,
"100"
:
1767237120.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2313432064.0
,
"5"
:
3055894528.0
,
"10"
:
3055894528.0
,
"15"
:
3055894528.0
,
"20"
:
3055894528.0
,
"25"
:
3055894528.0
,
"30"
:
3055894528.0
,
"35"
:
3055894528.0
,
"40"
:
3055894528.0
,
"45"
:
3055894528.0
,
"50"
:
3055894528.0
,
"55"
:
3055894528.0
,
"60"
:
3055894528.0
,
"65"
:
3055894528.0
,
"70"
:
3055894528.0
,
"75"
:
3055894528.0
,
"80"
:
3055894528.0
,
"85"
:
3055894528.0
,
"90"
:
3055894528.0
,
"95"
:
3055894528.0
,
"100"
:
3055894528.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
11.79142
,
"5"
:
1.14146
,
"10"
:
1.17812
,
"15"
:
1.14757
,
"20"
:
1.13488
,
"25"
:
1.13555
,
"30"
:
1.16806
,
"35"
:
1.13623
,
"40"
:
1.13913
,
"45"
:
1.14262
,
"50"
:
1.13979
,
"55"
:
1.13753
,
"60"
:
1.13567
,
"65"
:
1.14117
,
"70"
:
1.13534
,
"75"
:
1.13522
,
"80"
:
1.13276
,
"85"
:
1.13285
,
"90"
:
1.13482
,
"95"
:
1.13479
,
"100"
:
1.13076
}}}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.49566
,
"5"
:
10.48418
,
"10"
:
10.49947
,
"15"
:
10.46646
,
"20"
:
10.44777
,
"25"
:
10.34987
,
"30"
:
10.17278
,
"35"
:
10.04282
,
"40"
:
9.90771
,
"45"
:
9.75789
,
"50"
:
9.67683
,
"55"
:
9.55376
,
"60"
:
9.45455
,
"65"
:
9.42139
,
"70"
:
9.30101
,
"75"
:
9.32207
,
"80"
:
9.26182
,
"85"
:
9.29681
,
"90"
:
9.23351
,
"95"
:
9.2381
,
"100"
:
9.10611
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2183.0
,
"5"
:
2533.0
,
"10"
:
2162.0
,
"15"
:
2548.0
,
"20"
:
2180.0
,
"25"
:
2557.0
,
"30"
:
2908.0
,
"35"
:
2999.0
,
"40"
:
2252.0
,
"45"
:
3808.0
,
"50"
:
3622.0
,
"55"
:
3598.0
,
"60"
:
2567.0
,
"65"
:
3371.0
,
"70"
:
4001.0
,
"75"
:
5046.0
,
"80"
:
3461.0
,
"85"
:
4137.0
,
"90"
:
4512.0
,
"95"
:
4417.0
,
"100"
:
3152.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1767237120.0
,
"5"
:
1767237120.0
,
"10"
:
1767237120.0
,
"15"
:
1767237120.0
,
"20"
:
1767237120.0
,
"25"
:
1767237120.0
,
"30"
:
1768285696.0
,
"35"
:
1767237120.0
,
"40"
:
1767237120.0
,
"45"
:
1767237120.0
,
"50"
:
1767237120.0
,
"55"
:
1767237120.0
,
"60"
:
1768285696.0
,
"65"
:
1767237120.0
,
"70"
:
1767237120.0
,
"75"
:
1767237120.0
,
"80"
:
1767237120.0
,
"85"
:
1767237120.0
,
"90"
:
1767237120.0
,
"95"
:
1767237120.0
,
"100"
:
1767237120.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2313432064.0
,
"5"
:
3055894528.0
,
"10"
:
3055894528.0
,
"15"
:
3055894528.0
,
"20"
:
3055894528.0
,
"25"
:
3055894528.0
,
"30"
:
3055894528.0
,
"35"
:
3055894528.0
,
"40"
:
3055894528.0
,
"45"
:
3055894528.0
,
"50"
:
3055894528.0
,
"55"
:
3055894528.0
,
"60"
:
3055894528.0
,
"65"
:
3055894528.0
,
"70"
:
3055894528.0
,
"75"
:
3055894528.0
,
"80"
:
3055894528.0
,
"85"
:
3055894528.0
,
"90"
:
3055894528.0
,
"95"
:
3055894528.0
,
"100"
:
3055894528.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
12.34148
,
"5"
:
1.17345
,
"10"
:
1.15021
,
"15"
:
1.17577
,
"20"
:
1.18238
,
"25"
:
1.42203
,
"30"
:
1.18928
,
"35"
:
1.18467
,
"40"
:
1.17861
,
"45"
:
1.18052
,
"50"
:
1.18213
,
"55"
:
1.19019
,
"60"
:
1.18562
,
"65"
:
1.1842
,
"70"
:
1.17896
,
"75"
:
1.17997
,
"80"
:
1.18574
,
"85"
:
1.18887
,
"90"
:
1.18285
,
"95"
:
1.18023
,
"100"
:
1.18199
}}}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
990000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-bert_00_text_sentence
--vocab-file
:
${DATA_PATH}/vocab.txt
--split
:
949,50,1
...
...
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.54308
,
10.53881
,
10.55633
,
10.53805
,
10.525
89
,
10.49568
,
10.45958
,
10.32846
,
10.17264
,
9.96952
]},
"num-zero
s"
:
{
"start_step"
:
0
,
"end_step"
:
34
,
"step_interval"
:
5
,
"values"
:
[
22584
.0
,
20
590.0
,
27442.0
,
22852.0
,
22567
.0
,
20
740.0
,
23315.0
]
},
"iteration
_
tim
ing_avg"
:
0.7692817647058824
}
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.54308
,
"5"
:
10.54015
,
"10"
:
10.54067
,
"15"
:
10.56164
,
"20"
:
10.54299
,
"25"
:
10.5
3
25
3
,
"30"
:
10.45969
,
"35"
:
10.31933
,
"40"
:
10.18146
,
"45"
:
10.03915
,
"50"
:
9.91421
}},
"mem-allocated-byte
s"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1416466432.0
,
"5"
:
1416466432.0
,
"10"
:
1416466432.0
,
"15"
:
1416466432
.0
,
"
20
"
:
2277237760.0
,
"25"
:
2277237760.0
,
"30"
:
2277237760.0
,
"35"
:
2277237760.0
,
"40"
:
2277237760.0
,
"45"
:
2277237760.0
,
"50"
:
2277237760.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4155522048.0
,
"5"
:
4155523072.0
,
"10"
:
4155523072.0
,
"15"
:
4155523072
.0
,
"
20
"
:
5016294400.0
,
"25"
:
5016294400.0
,
"30"
:
5016294400.0
,
"35"
:
5016294400.0
,
"40"
:
5016294400.0
,
"45"
:
5016294400.0
,
"50"
:
5016294400.0
}
},
"iteration
-
tim
e"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
14.03874
,
"5"
:
0.9798
,
"10"
:
0.94533
,
"15"
:
0.84098
,
"20"
:
0.837
,
"25"
:
0.85187
,
"30"
:
0.85092
,
"35"
:
0.81519
,
"40"
:
0.79898
,
"45"
:
0.80833
,
"50"
:
1.05286
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
20512.0
,
"25"
:
29674.0
,
"30"
:
21582.0
,
"35"
:
23934.0
,
"40"
:
23635.0
,
"45"
:
32392.0
,
"50"
:
31688.0
}}
}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.54308
,
10.53881
,
10.55633
,
10.53805
,
10.525
89
,
10.49568
,
10.45958
,
10.32846
,
10.17264
,
9.96952
]},
"num-zero
s"
:
{
"start_step"
:
0
,
"end_step"
:
34
,
"step_interval"
:
5
,
"values"
:
[
22584
.0
,
20
590.0
,
27442.0
,
22852.0
,
22567
.0
,
20
740.0
,
23315.0
]
},
"iteration
_
tim
ing_avg"
:
0.7692817647058824
}
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.54308
,
"5"
:
10.54015
,
"10"
:
10.54067
,
"15"
:
10.56164
,
"20"
:
10.54299
,
"25"
:
10.5
3
25
3
,
"30"
:
10.45969
,
"35"
:
10.31933
,
"40"
:
10.18146
,
"45"
:
10.03915
,
"50"
:
9.91421
}},
"mem-allocated-byte
s"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1416466432.0
,
"5"
:
1416466432.0
,
"10"
:
1416466432.0
,
"15"
:
1416466432
.0
,
"
20
"
:
2277237760.0
,
"25"
:
2277237760.0
,
"30"
:
2277237760.0
,
"35"
:
2277237760.0
,
"40"
:
2277237760.0
,
"45"
:
2277237760.0
,
"50"
:
2277237760.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4155522048.0
,
"5"
:
4155523072.0
,
"10"
:
4155523072.0
,
"15"
:
4155523072
.0
,
"
20
"
:
5016294400.0
,
"25"
:
5016294400.0
,
"30"
:
5016294400.0
,
"35"
:
5016294400.0
,
"40"
:
5016294400.0
,
"45"
:
5016294400.0
,
"50"
:
5016294400.0
}
},
"iteration
-
tim
e"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
14.89307
,
"5"
:
0.78162
,
"10"
:
0.83533
,
"15"
:
0.81477
,
"20"
:
0.89929
,
"25"
:
1.00162
,
"30"
:
0.78191
,
"35"
:
0.79314
,
"40"
:
1.12991
,
"45"
:
0.97013
,
"50"
:
0.80459
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
20512.0
,
"25"
:
29674.0
,
"30"
:
21582.0
,
"35"
:
23934.0
,
"40"
:
23635.0
,
"45"
:
32392.0
,
"50"
:
31688.0
}}
}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
990000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-bert_00_text_sentence
--vocab-file
:
${DATA_PATH}/vocab.txt
--split
:
949,50,1
...
...
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.54308
,
"5"
:
10.54015
,
"10"
:
10.54067
,
"15"
:
10.56164
,
"20"
:
10.54299
,
"25"
:
10.53253
,
"30"
:
10.45969
,
"35"
:
10.31933
,
"40"
:
10.18146
,
"45"
:
10.03915
,
"50"
:
9.91421
,
"55"
:
9.75787
,
"60"
:
9.62542
,
"65"
:
9.56458
,
"70"
:
9.44843
,
"75"
:
9.43593
,
"80"
:
9.35302
,
"85"
:
9.39268
,
"90"
:
9.29853
,
"95"
:
9.29715
,
"100"
:
9.17013
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1416466432.0
,
"5"
:
1416466432.0
,
"10"
:
1416466432.0
,
"15"
:
1416466432.0
,
"20"
:
2277237760.0
,
"25"
:
2277237760.0
,
"30"
:
2277237760.0
,
"35"
:
2277237760.0
,
"40"
:
2277237760.0
,
"45"
:
2277237760.0
,
"50"
:
2277237760.0
,
"55"
:
2277237760.0
,
"60"
:
2277237760.0
,
"65"
:
2277237760.0
,
"70"
:
2277237760.0
,
"75"
:
2277237760.0
,
"80"
:
2277237760.0
,
"85"
:
2277237760.0
,
"90"
:
2277237760.0
,
"95"
:
2277237760.0
,
"100"
:
2277237760.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4155522048.0
,
"5"
:
4155523072.0
,
"10"
:
4155523072.0
,
"15"
:
4155523072.0
,
"20"
:
5016294400.0
,
"25"
:
5016294400.0
,
"30"
:
5016294400.0
,
"35"
:
5016294400.0
,
"40"
:
5016294400.0
,
"45"
:
5016294400.0
,
"50"
:
5016294400.0
,
"55"
:
5016294400.0
,
"60"
:
5016294400.0
,
"65"
:
5016294400.0
,
"70"
:
5016294400.0
,
"75"
:
5016294400.0
,
"80"
:
5016294400.0
,
"85"
:
5016294400.0
,
"90"
:
5016294400.0
,
"95"
:
5016294400.0
,
"100"
:
5016294400.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
14.16083
,
"5"
:
0.87236
,
"10"
:
0.96745
,
"15"
:
0.76125
,
"20"
:
0.80903
,
"25"
:
0.79936
,
"30"
:
0.91232
,
"35"
:
0.78252
,
"40"
:
0.7981
,
"45"
:
0.7982
,
"50"
:
1.16802
,
"55"
:
1.0754
,
"60"
:
0.81262
,
"65"
:
0.78986
,
"70"
:
1.17774
,
"75"
:
0.90398
,
"80"
:
0.89328
,
"85"
:
0.8043
,
"90"
:
0.8754
,
"95"
:
0.90921
,
"100"
:
0.82266
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
20512.0
,
"25"
:
29674.0
,
"30"
:
21582.0
,
"35"
:
23934.0
,
"40"
:
23635.0
,
"45"
:
32392.0
,
"50"
:
31688.0
,
"55"
:
30923.0
,
"60"
:
24642.0
,
"65"
:
26839.0
,
"70"
:
31192.0
,
"75"
:
40009.0
,
"80"
:
29301.0
,
"85"
:
31592.0
,
"90"
:
33685.0
,
"95"
:
33411.0
,
"100"
:
22706.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.54308
,
"5"
:
10.54015
,
"10"
:
10.54067
,
"15"
:
10.56164
,
"20"
:
10.54299
,
"25"
:
10.53253
,
"30"
:
10.45969
,
"35"
:
10.31933
,
"40"
:
10.18146
,
"45"
:
10.03915
,
"50"
:
9.91421
,
"55"
:
9.75787
,
"60"
:
9.62542
,
"65"
:
9.56458
,
"70"
:
9.44843
,
"75"
:
9.43593
,
"80"
:
9.35302
,
"85"
:
9.39268
,
"90"
:
9.29853
,
"95"
:
9.29715
,
"100"
:
9.17013
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1416466432.0
,
"5"
:
1416466432.0
,
"10"
:
1416466432.0
,
"15"
:
1416466432.0
,
"20"
:
2277237760.0
,
"25"
:
2277237760.0
,
"30"
:
2277237760.0
,
"35"
:
2277237760.0
,
"40"
:
2277237760.0
,
"45"
:
2277237760.0
,
"50"
:
2277237760.0
,
"55"
:
2277237760.0
,
"60"
:
2277237760.0
,
"65"
:
2277237760.0
,
"70"
:
2277237760.0
,
"75"
:
2277237760.0
,
"80"
:
2277237760.0
,
"85"
:
2277237760.0
,
"90"
:
2277237760.0
,
"95"
:
2277237760.0
,
"100"
:
2277237760.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4155522048.0
,
"5"
:
4155523072.0
,
"10"
:
4155523072.0
,
"15"
:
4155523072.0
,
"20"
:
5016294400.0
,
"25"
:
5016294400.0
,
"30"
:
5016294400.0
,
"35"
:
5016294400.0
,
"40"
:
5016294400.0
,
"45"
:
5016294400.0
,
"50"
:
5016294400.0
,
"55"
:
5016294400.0
,
"60"
:
5016294400.0
,
"65"
:
5016294400.0
,
"70"
:
5016294400.0
,
"75"
:
5016294400.0
,
"80"
:
5016294400.0
,
"85"
:
5016294400.0
,
"90"
:
5016294400.0
,
"95"
:
5016294400.0
,
"100"
:
5016294400.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
18.37849
,
"5"
:
0.83211
,
"10"
:
0.87143
,
"15"
:
0.80812
,
"20"
:
0.92064
,
"25"
:
1.00154
,
"30"
:
0.80774
,
"35"
:
0.78379
,
"40"
:
1.05451
,
"45"
:
0.79216
,
"50"
:
0.81105
,
"55"
:
1.02367
,
"60"
:
0.86175
,
"65"
:
0.80509
,
"70"
:
1.11382
,
"75"
:
0.77236
,
"80"
:
0.81252
,
"85"
:
0.95294
,
"90"
:
0.80408
,
"95"
:
0.76715
,
"100"
:
1.02921
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
20512.0
,
"25"
:
29674.0
,
"30"
:
21582.0
,
"35"
:
23934.0
,
"40"
:
23635.0
,
"45"
:
32392.0
,
"50"
:
31688.0
,
"55"
:
30923.0
,
"60"
:
24642.0
,
"65"
:
26839.0
,
"70"
:
31192.0
,
"75"
:
40009.0
,
"80"
:
29301.0
,
"85"
:
31592.0
,
"90"
:
33685.0
,
"95"
:
33411.0
,
"100"
:
22706.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
990000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-bert_00_text_sentence
--vocab-file
:
${DATA_PATH}/vocab.txt
--split
:
949,50,1
...
...
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.43755
,
10.43
58
7
,
10.44704
,
10.44395
,
10.44965
,
10.44295
,
10.32757
,
10.23341
,
10.09049
,
9.93294
]},
"num-zero
s"
:
{
"start_step"
:
0
,
"end_step"
:
34
,
"step_interval"
:
5
,
"values"
:
[
27979.0
,
20991.0
,
29735.0
,
24779.0
,
26808.0
,
33075.0
,
24387
.0
]
},
"iteration
_timing_avg"
:
0.7523635294117648
}
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.43755
,
"5"
:
10.43739
,
"10"
:
10.45
58
2
,
"15"
:
10.45606
,
"20"
:
10.44388
,
"25"
:
10.42748
,
"30"
:
10.39565
,
"35"
:
10.24752
,
"40"
:
10.11101
,
"45"
:
9.99773
,
"50"
:
9.88142
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1210690048.0
,
"5"
:
1210690048.0
,
"10"
:
1210690048.0
,
"15"
:
1210690048.0
,
"20"
:
1952102912.0
,
"25"
:
1952102912.0
,
"30"
:
1952102912.0
,
"35"
:
1952102912.0
,
"40"
:
1952102912.0
,
"45"
:
1952102912.0
,
"50"
:
1952102912.0
}},
"mem-max-allocated-byte
s"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2127591936.0
,
"5"
:
2127592960.0
,
"10"
:
2127592960.0
,
"15"
:
2127592960.0
,
"20"
:
2867957248.0
,
"25"
:
2867957248.0
,
"30"
:
2867957248.0
,
"35"
:
2867957248.0
,
"40"
:
2867957248.0
,
"45"
:
2867957248.0
,
"50"
:
2867957248
.0
}
},
"iteration
-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
12.58907
,
"5"
:
1.10935
,
"10"
:
1.09468
,
"15"
:
1.08912
,
"20"
:
1.39243
,
"25"
:
1.1296
,
"30"
:
1.11603
,
"35"
:
1.34495
,
"40"
:
1.39742
,
"45"
:
1.11931
,
"50"
:
1.12017
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
22585.0
,
"25"
:
33970.0
,
"30"
:
23056.0
,
"35"
:
26873.0
,
"40"
:
22716.0
,
"45"
:
35165.0
,
"50"
:
31348.0
}}
}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.43755
,
10.43
58
7
,
10.44704
,
10.44395
,
10.44965
,
10.44295
,
10.32757
,
10.23341
,
10.09049
,
9.93294
]},
"num-zero
s"
:
{
"start_step"
:
0
,
"end_step"
:
34
,
"step_interval"
:
5
,
"values"
:
[
27979.0
,
20991.0
,
29735.0
,
24779.0
,
26808.0
,
33075.0
,
24387
.0
]
},
"iteration
_
tim
ing_avg"
:
0.7523635294117648
}
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.43755
,
"5"
:
10.43739
,
"10"
:
10.45
58
2
,
"15"
:
10.45606
,
"20"
:
10.44388
,
"25"
:
10.42748
,
"30"
:
10.39565
,
"35"
:
10.24752
,
"40"
:
10.11101
,
"45"
:
9.99773
,
"50"
:
9.88142
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1210690048.0
,
"5"
:
1210690048.0
,
"10"
:
1210690048.0
,
"15"
:
1210690048.0
,
"20"
:
1952102912.0
,
"25"
:
1952102912.0
,
"30"
:
1952102912.0
,
"35"
:
1952102912.0
,
"40"
:
1952102912.0
,
"45"
:
1952102912.0
,
"50"
:
1952102912.0
}},
"mem-max-allocated-byte
s"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2127591936.0
,
"5"
:
2127592960.0
,
"10"
:
2127592960.0
,
"15"
:
2127592960.0
,
"20"
:
2867957248.0
,
"25"
:
2867957248.0
,
"30"
:
2867957248.0
,
"35"
:
2867957248.0
,
"40"
:
2867957248.0
,
"45"
:
2867957248.0
,
"50"
:
2867957248
.0
}
},
"iteration
-
tim
e"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
11.34618
,
"5"
:
1.16528
,
"10"
:
1.16219
,
"15"
:
1.16386
,
"20"
:
1.18006
,
"25"
:
1.37981
,
"30"
:
1.17956
,
"35"
:
1.39195
,
"40"
:
1.18406
,
"45"
:
1.19213
,
"50"
:
1.18956
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
22585.0
,
"25"
:
33970.0
,
"30"
:
23056.0
,
"35"
:
26873.0
,
"40"
:
22716.0
,
"45"
:
35165.0
,
"50"
:
31348.0
}}
}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
990000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-bert_00_text_sentence
--vocab-file
:
${DATA_PATH}/vocab.txt
--split
:
949,50,1
...
...
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.43755
,
"5"
:
10.43739
,
"10"
:
10.45582
,
"15"
:
10.45606
,
"20"
:
10.44388
,
"25"
:
10.42748
,
"30"
:
10.39565
,
"35"
:
10.24752
,
"40"
:
10.11101
,
"45"
:
9.99773
,
"50"
:
9.88142
,
"55"
:
9.73343
,
"60"
:
9.62126
,
"65"
:
9.55968
,
"70"
:
9.44177
,
"75"
:
9.43855
,
"80"
:
9.35357
,
"85"
:
9.38316
,
"90"
:
9.30523
,
"95"
:
9.30959
,
"100"
:
9.17509
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1210690048.0
,
"5"
:
1210690048.0
,
"10"
:
1210690048.0
,
"15"
:
1210690048.0
,
"20"
:
1952102912.0
,
"25"
:
1952102912.0
,
"30"
:
1952102912.0
,
"35"
:
1952102912.0
,
"40"
:
1952102912.0
,
"45"
:
1952102912.0
,
"50"
:
1952102912.0
,
"55"
:
1952102912.0
,
"60"
:
1952102912.0
,
"65"
:
1952102912.0
,
"70"
:
1952102912.0
,
"75"
:
1952102912.0
,
"80"
:
1952102912.0
,
"85"
:
1952102912.0
,
"90"
:
1952102912.0
,
"95"
:
1952102912.0
,
"100"
:
1952102912.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2127591936.0
,
"5"
:
2127592960.0
,
"10"
:
2127592960.0
,
"15"
:
2127592960.0
,
"20"
:
2867957248.0
,
"25"
:
2867957248.0
,
"30"
:
2867957248.0
,
"35"
:
2867957248.0
,
"40"
:
2867957248.0
,
"45"
:
2867957248.0
,
"50"
:
2867957248.0
,
"55"
:
2867957248.0
,
"60"
:
2867957248.0
,
"65"
:
2867957248.0
,
"70"
:
2867957248.0
,
"75"
:
2867957248.0
,
"80"
:
2867957248.0
,
"85"
:
2867957248.0
,
"90"
:
2867957248.0
,
"95"
:
2867957248.0
,
"100"
:
2867957248.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.13637
,
"5"
:
1.33125
,
"10"
:
1.3265
,
"15"
:
1.31217
,
"20"
:
1.64322
,
"25"
:
1.33727
,
"30"
:
1.34028
,
"35"
:
1.55181
,
"40"
:
1.6152
,
"45"
:
1.37118
,
"50"
:
1.37854
,
"55"
:
1.29942
,
"60"
:
1.29229
,
"65"
:
1.30075
,
"70"
:
1.29686
,
"75"
:
1.36267
,
"80"
:
1.3054
,
"85"
:
1.31603
,
"90"
:
1.28771
,
"95"
:
1.29886
,
"100"
:
1.29338
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
22585.0
,
"25"
:
33970.0
,
"30"
:
23056.0
,
"35"
:
26873.0
,
"40"
:
22716.0
,
"45"
:
35165.0
,
"50"
:
31348.0
,
"55"
:
32824.0
,
"60"
:
23375.0
,
"65"
:
26746.0
,
"70"
:
30011.0
,
"75"
:
39617.0
,
"80"
:
31497.0
,
"85"
:
31636.0
,
"90"
:
32832.0
,
"95"
:
38873.0
,
"100"
:
24755.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.43755
,
"5"
:
10.43739
,
"10"
:
10.45582
,
"15"
:
10.45606
,
"20"
:
10.44388
,
"25"
:
10.42748
,
"30"
:
10.39565
,
"35"
:
10.24752
,
"40"
:
10.11101
,
"45"
:
9.99773
,
"50"
:
9.88142
,
"55"
:
9.73343
,
"60"
:
9.62126
,
"65"
:
9.55968
,
"70"
:
9.44177
,
"75"
:
9.43855
,
"80"
:
9.35357
,
"85"
:
9.38316
,
"90"
:
9.30523
,
"95"
:
9.30959
,
"100"
:
9.17509
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1210690048.0
,
"5"
:
1210690048.0
,
"10"
:
1210690048.0
,
"15"
:
1210690048.0
,
"20"
:
1952102912.0
,
"25"
:
1952102912.0
,
"30"
:
1952102912.0
,
"35"
:
1952102912.0
,
"40"
:
1952102912.0
,
"45"
:
1952102912.0
,
"50"
:
1952102912.0
,
"55"
:
1952102912.0
,
"60"
:
1952102912.0
,
"65"
:
1952102912.0
,
"70"
:
1952102912.0
,
"75"
:
1952102912.0
,
"80"
:
1952102912.0
,
"85"
:
1952102912.0
,
"90"
:
1952102912.0
,
"95"
:
1952102912.0
,
"100"
:
1952102912.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2127591936.0
,
"5"
:
2127592960.0
,
"10"
:
2127592960.0
,
"15"
:
2127592960.0
,
"20"
:
2867957248.0
,
"25"
:
2867957248.0
,
"30"
:
2867957248.0
,
"35"
:
2867957248.0
,
"40"
:
2867957248.0
,
"45"
:
2867957248.0
,
"50"
:
2867957248.0
,
"55"
:
2867957248.0
,
"60"
:
2867957248.0
,
"65"
:
2867957248.0
,
"70"
:
2867957248.0
,
"75"
:
2867957248.0
,
"80"
:
2867957248.0
,
"85"
:
2867957248.0
,
"90"
:
2867957248.0
,
"95"
:
2867957248.0
,
"100"
:
2867957248.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
12.4846
,
"5"
:
1.29736
,
"10"
:
1.172
,
"15"
:
1.15505
,
"20"
:
1.16055
,
"25"
:
1.56732
,
"30"
:
1.18656
,
"35"
:
1.37506
,
"40"
:
1.17417
,
"45"
:
1.16819
,
"50"
:
1.17821
,
"55"
:
1.16442
,
"60"
:
1.19327
,
"65"
:
1.1497
,
"70"
:
1.17034
,
"75"
:
1.15327
,
"80"
:
1.1535
,
"85"
:
1.16731
,
"90"
:
1.15301
,
"95"
:
1.1516
,
"100"
:
1.16053
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
22585.0
,
"25"
:
33970.0
,
"30"
:
23056.0
,
"35"
:
26873.0
,
"40"
:
22716.0
,
"45"
:
35165.0
,
"50"
:
31348.0
,
"55"
:
32824.0
,
"60"
:
23375.0
,
"65"
:
26746.0
,
"70"
:
30011.0
,
"75"
:
39617.0
,
"80"
:
31497.0
,
"85"
:
31636.0
,
"90"
:
32832.0
,
"95"
:
38873.0
,
"100"
:
24755.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
990000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-bert_00_text_sentence
--vocab-file
:
${DATA_PATH}/vocab.txt
--split
:
949,50,1
...
...
tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.49411
,
10.4825
,
10.49242
,
10.47802
,
10.46608
,
10.35193
,
10.17693
,
10.07728
,
9.88753
,
9.68034
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1931.0
,
2555.0
,
2017.0
,
2135.0
,
2440.0
,
2464.0
,
3070.0
,
3006.0
,
2932.0
,
2303.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.94975
,
0.67196
,
0.67378
,
0.66862
,
0.69618
,
0.66936
,
0.67757
,
0.67189
,
0.67519
,
0.67762
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.49405
,
"5"
:
10.49924
,
"10"
:
10.49606
,
"15"
:
10.48729
,
"20"
:
10.4657
,
"25"
:
10.39493
,
"30"
:
10.21023
,
"35"
:
10.0733
,
"40"
:
9.93987
,
"45"
:
9.75668
,
"50"
:
9.69018
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2018.0
,
"5"
:
2803.0
,
"10"
:
2242.0
,
"15"
:
2551.0
,
"20"
:
2294.0
,
"25"
:
2736.0
,
"30"
:
2631.0
,
"35"
:
2878.0
,
"40"
:
1867.0
,
"45"
:
4062.0
,
"50"
:
3040.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3375511040.0
,
"5"
:
3375511040.0
,
"10"
:
3375511040.0
,
"15"
:
3375511040.0
,
"20"
:
3375511040.0
,
"25"
:
3375511040.0
,
"30"
:
3375511040.0
,
"35"
:
3375511040.0
,
"40"
:
3375511040.0
,
"45"
:
3375511040.0
,
"50"
:
3375511040.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4165166080.0
,
"5"
:
5630557184.0
,
"10"
:
5630557184.0
,
"15"
:
5630557184.0
,
"20"
:
5630557184.0
,
"25"
:
5630557184.0
,
"30"
:
5630557184.0
,
"35"
:
5630557184.0
,
"40"
:
5630557184.0
,
"45"
:
5630557184.0
,
"50"
:
5630557184.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
6.38574
,
"5"
:
0.7037
,
"10"
:
0.74678
,
"15"
:
0.67056
,
"20"
:
0.66842
,
"25"
:
0.93898
,
"30"
:
0.68891
,
"35"
:
0.87958
,
"40"
:
0.66027
,
"45"
:
0.66606
,
"50"
:
0.6644
}}}
\ No newline at end of file
tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.49405
,
10.48276
,
10.49249
,
10.47813
,
10.46623
,
10.35183
,
10.17697
,
10.07728
,
9.8875
,
9.68029
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
2018.0
,
2636.0
,
2067.0
,
2225.0
,
2555.0
,
2554.0
,
2969.0
,
2935.0
,
2967.0
,
2287.0
]},
"iteration_timing_avg"
:
0.5847132352941178
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.49405
,
"5"
:
10.49933
,
"10"
:
10.49631
,
"15"
:
10.4873
,
"20"
:
10.46572
,
"25"
:
10.39496
,
"30"
:
10.2104
,
"35"
:
10.07333
,
"40"
:
9.94011
,
"45"
:
9.75651
,
"50"
:
9.69025
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2018.0
,
"5"
:
2740.0
,
"10"
:
2260.0
,
"15"
:
2649.0
,
"20"
:
2205.0
,
"25"
:
2675.0
,
"30"
:
2687.0
,
"35"
:
2930.0
,
"40"
:
1853.0
,
"45"
:
4016.0
,
"50"
:
2978.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3375511040.0
,
"5"
:
3375511040.0
,
"10"
:
3375511040.0
,
"15"
:
3375511040.0
,
"20"
:
3375511040.0
,
"25"
:
3375511040.0
,
"30"
:
3375511040.0
,
"35"
:
3375511040.0
,
"40"
:
3375511040.0
,
"45"
:
3375511040.0
,
"50"
:
3375511040.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4153629696.0
,
"5"
:
5620069376.0
,
"10"
:
5620069376.0
,
"15"
:
5620069376.0
,
"20"
:
5620069376.0
,
"25"
:
5620069376.0
,
"30"
:
5620069376.0
,
"35"
:
5620069376.0
,
"40"
:
5620069376.0
,
"45"
:
5620069376.0
,
"50"
:
5620069376.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.76357
,
"5"
:
0.63754
,
"10"
:
0.61134
,
"15"
:
0.62429
,
"20"
:
0.60864
,
"25"
:
0.8008
,
"30"
:
0.61228
,
"35"
:
0.84121
,
"40"
:
0.6217
,
"45"
:
0.62022
,
"50"
:
0.61774
}}}
\ No newline at end of file
Prev
1
…
10
11
12
13
14
15
16
17
18
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment