Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
688448db
Commit
688448db
authored
Mar 14, 2025
by
silencealiang
Browse files
更新代码
parent
a02a5490
Pipeline
#2503
passed with stage
Changes
823
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
89 additions
and
285 deletions
+89
-285
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json
...mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml
...3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json
..._overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json
..._overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml
...mizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json
..._reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json
..._reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml
..._grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml
+58
-56
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev.json
...er_overlap_optimizer_dgx_a100_1N8G/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts.json
...er_overlap_optimizer_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml
..._gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev.json
...p_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json
...p_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml
...verlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev.json
...e_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts.json
...e_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml
...resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev.json
...ulate_per_token_loss_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts.json
...ulate_per_token_loss_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml
..._calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml
+3
-2
No files found.
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.82005
,
10.87449
,
10.87798
,
10.79509
,
10.68164
,
10.59517
,
10.10046
,
10.21236
,
10.13863
,
9.80877
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1559.0
,
1719.0
,
1856.0
,
1791.0
,
1900.0
,
1709.0
,
1627.0
,
1831.0
,
2272.0
,
2312.0
]},
"iteration_timing_avg"
:
0.12502588235294115
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82005
,
"5"
:
10.85284
,
"10"
:
10.78455
,
"15"
:
10.79229
,
"20"
:
10.69211
,
"25"
:
10.52412
,
"30"
:
10.34552
,
"35"
:
10.26242
,
"40"
:
10.07239
,
"45"
:
9.811
,
"50"
:
9.88415
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1559.0
,
"5"
:
1840.0
,
"10"
:
1380.0
,
"15"
:
1848.0
,
"20"
:
1601.0
,
"25"
:
1635.0
,
"30"
:
1908.0
,
"35"
:
1925.0
,
"40"
:
2126.0
,
"45"
:
2086.0
,
"50"
:
2298.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
733859840.0
,
"5"
:
733859840.0
,
"10"
:
733859840.0
,
"15"
:
733859840.0
,
"20"
:
733859840.0
,
"25"
:
733859840.0
,
"30"
:
733859840.0
,
"35"
:
733859840.0
,
"40"
:
733859840.0
,
"45"
:
733859840.0
,
"50"
:
733859840.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3838895104.0
,
"5"
:
4122703872.0
,
"10"
:
4122703872.0
,
"15"
:
4122703872.0
,
"20"
:
4122703872.0
,
"25"
:
4122703872.0
,
"30"
:
4122703872.0
,
"35"
:
4122703872.0
,
"40"
:
4122703872.0
,
"45"
:
4122703872.0
,
"50"
:
4122703872.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
17.71283
,
"5"
:
0.16448
,
"10"
:
0.16446
,
"15"
:
0.16389
,
"20"
:
0.16438
,
"25"
:
0.15866
,
"30"
:
0.15768
,
"35"
:
0.15941
,
"40"
:
0.15987
,
"45"
:
0.16075
,
"50"
:
0.16301
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -48,4 +48,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.82005
,
10.87447
,
10.87799
,
10.79507
,
10.68165
,
10.59511
,
10.10047
,
10.2124
,
10.13861
,
9.80876
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1562.0
,
1738.0
,
1852.0
,
1802.0
,
1917.0
,
1765.0
,
1570.0
,
1949.0
,
2251.0
,
2270.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
14.96968
,
0.16347
,
0.16403
,
0.16317
,
0.162
,
0.16129
,
0.16268
,
0.16156
,
0.16212
,
0.16407
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82005
,
"5"
:
10.85286
,
"10"
:
10.7845
,
"15"
:
10.79231
,
"20"
:
10.6921
,
"25"
:
10.52408
,
"30"
:
10.34555
,
"35"
:
10.26239
,
"40"
:
10.07241
,
"45"
:
9.81101
,
"50"
:
9.88416
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1562.0
,
"5"
:
1861.0
,
"10"
:
1339.0
,
"15"
:
1948.0
,
"20"
:
1698.0
,
"25"
:
1687.0
,
"30"
:
1930.0
,
"35"
:
1927.0
,
"40"
:
2061.0
,
"45"
:
2060.0
,
"50"
:
2330.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
522976256.0
,
"5"
:
522976256.0
,
"10"
:
522976256.0
,
"15"
:
522976256.0
,
"20"
:
522976256.0
,
"25"
:
522976256.0
,
"30"
:
522976256.0
,
"35"
:
522976256.0
,
"40"
:
522976256.0
,
"45"
:
522976256.0
,
"50"
:
522976256.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3768845312.0
,
"5"
:
3912737792.0
,
"10"
:
3912737792.0
,
"15"
:
3912737792.0
,
"20"
:
3912737792.0
,
"25"
:
3912737792.0
,
"30"
:
3912737792.0
,
"35"
:
3912737792.0
,
"40"
:
3912737792.0
,
"45"
:
3912737792.0
,
"50"
:
3912737792.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
19.16738
,
"5"
:
0.16325
,
"10"
:
0.16427
,
"15"
:
0.16183
,
"20"
:
0.16039
,
"25"
:
0.16182
,
"30"
:
0.16047
,
"35"
:
0.16389
,
"40"
:
0.15815
,
"45"
:
0.15745
,
"50"
:
0.15915
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.82005
,
10.87449
,
10.87798
,
10.79511
,
10.68164
,
10.59513
,
10.10043
,
10.21239
,
10.13865
,
9.80879
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1559.0
,
1719.0
,
1856.0
,
1735.0
,
1873.0
,
1765.0
,
1535.0
,
1910.0
,
2278.0
,
2247.0
]},
"iteration_timing_avg"
:
0.12168999999999999
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82005
,
"5"
:
10.85284
,
"10"
:
10.78455
,
"15"
:
10.7923
,
"20"
:
10.69211
,
"25"
:
10.52414
,
"30"
:
10.34555
,
"35"
:
10.2624
,
"40"
:
10.07237
,
"45"
:
9.81103
,
"50"
:
9.88417
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1559.0
,
"5"
:
1840.0
,
"10"
:
1380.0
,
"15"
:
1850.0
,
"20"
:
1668.0
,
"25"
:
1607.0
,
"30"
:
1945.0
,
"35"
:
1860.0
,
"40"
:
2022.0
,
"45"
:
2042.0
,
"50"
:
2292.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
523003904.0
,
"5"
:
523003904.0
,
"10"
:
523003904.0
,
"15"
:
523003904.0
,
"20"
:
523003904.0
,
"25"
:
523003904.0
,
"30"
:
523003904.0
,
"35"
:
523003904.0
,
"40"
:
523003904.0
,
"45"
:
523003904.0
,
"50"
:
523003904.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3768872960.0
,
"5"
:
3912765440.0
,
"10"
:
3912765440.0
,
"15"
:
3912765440.0
,
"20"
:
3912765440.0
,
"25"
:
3912765440.0
,
"30"
:
3912765440.0
,
"35"
:
3912765440.0
,
"40"
:
3912765440.0
,
"45"
:
3912765440.0
,
"50"
:
3912765440.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
18.09905
,
"5"
:
0.16241
,
"10"
:
0.16341
,
"15"
:
0.15828
,
"20"
:
0.15929
,
"25"
:
0.15899
,
"30"
:
0.16171
,
"35"
:
0.15966
,
"40"
:
0.15804
,
"45"
:
0.15972
,
"50"
:
0.15901
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -50,4 +50,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.82005
,
10.87447
,
10.87799
,
10.79507
,
10.68165
,
10.59511
,
10.10047
,
10.2124
,
10.13861
,
9.80876
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1562.0
,
1738.0
,
1852.0
,
1802.0
,
1917.0
,
1765.0
,
1570.0
,
1949.0
,
2251.0
,
2270.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
17.23575
,
0.17553
,
0.34737
,
0.17165
,
0.32526
,
0.17081
,
0.32706
,
0.17037
,
0.3321
,
0.16992
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82005
,
"5"
:
10.85286
,
"10"
:
10.7845
,
"15"
:
10.79231
,
"20"
:
10.6921
,
"25"
:
10.52408
,
"30"
:
10.34555
,
"35"
:
10.26239
,
"40"
:
10.07241
,
"45"
:
9.81101
,
"50"
:
9.88416
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1562.0
,
"5"
:
1861.0
,
"10"
:
1339.0
,
"15"
:
1948.0
,
"20"
:
1698.0
,
"25"
:
1687.0
,
"30"
:
1930.0
,
"35"
:
1927.0
,
"40"
:
2061.0
,
"45"
:
2060.0
,
"50"
:
2330.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
522976256.0
,
"5"
:
522976256.0
,
"10"
:
522976256.0
,
"15"
:
522976256.0
,
"20"
:
522976256.0
,
"25"
:
522976256.0
,
"30"
:
522976256.0
,
"35"
:
522976256.0
,
"40"
:
522976256.0
,
"45"
:
522976256.0
,
"50"
:
522976256.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3768845312.0
,
"5"
:
3912737792.0
,
"10"
:
3912737792.0
,
"15"
:
3912737792.0
,
"20"
:
3912737792.0
,
"25"
:
3912737792.0
,
"30"
:
3912737792.0
,
"35"
:
3912737792.0
,
"40"
:
3912737792.0
,
"45"
:
3912737792.0
,
"50"
:
3912737792.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
18.07048
,
"5"
:
0.17182
,
"10"
:
0.16227
,
"15"
:
0.16202
,
"20"
:
0.16214
,
"25"
:
0.16227
,
"30"
:
0.16231
,
"35"
:
0.16221
,
"40"
:
0.16257
,
"45"
:
0.16117
,
"50"
:
0.16119
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.82005
,
10.87449
,
10.87798
,
10.79511
,
10.68164
,
10.59513
,
10.10043
,
10.21239
,
10.13865
,
9.80879
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1559.0
,
1719.0
,
1856.0
,
1735.0
,
1873.0
,
1765.0
,
1535.0
,
1910.0
,
2278.0
,
2247.0
]},
"iteration_timing_avg"
:
0.12873676470588236
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82005
,
"5"
:
10.85284
,
"10"
:
10.78455
,
"15"
:
10.7923
,
"20"
:
10.69211
,
"25"
:
10.52414
,
"30"
:
10.34555
,
"35"
:
10.2624
,
"40"
:
10.07237
,
"45"
:
9.81103
,
"50"
:
9.88417
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1559.0
,
"5"
:
1840.0
,
"10"
:
1380.0
,
"15"
:
1850.0
,
"20"
:
1668.0
,
"25"
:
1607.0
,
"30"
:
1945.0
,
"35"
:
1860.0
,
"40"
:
2022.0
,
"45"
:
2042.0
,
"50"
:
2292.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
523003904.0
,
"5"
:
523003904.0
,
"10"
:
523003904.0
,
"15"
:
523003904.0
,
"20"
:
523003904.0
,
"25"
:
523003904.0
,
"30"
:
523003904.0
,
"35"
:
523003904.0
,
"40"
:
523003904.0
,
"45"
:
523003904.0
,
"50"
:
523003904.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3768872960.0
,
"5"
:
3912765440.0
,
"10"
:
3912765440.0
,
"15"
:
3912765440.0
,
"20"
:
3912765440.0
,
"25"
:
3912765440.0
,
"30"
:
3912765440.0
,
"35"
:
3912765440.0
,
"40"
:
3912765440.0
,
"45"
:
3912765440.0
,
"50"
:
3912765440.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
15.80767
,
"5"
:
0.16165
,
"10"
:
0.16385
,
"15"
:
0.16229
,
"20"
:
0.16237
,
"25"
:
0.1618
,
"30"
:
0.1643
,
"35"
:
0.16116
,
"40"
:
0.16294
,
"45"
:
0.16266
,
"50"
:
0.16228
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -44,6 +44,7 @@ MODEL_ARGS:
--overlap-grad-reduce
:
true
--overlap-param-gather
:
true
--check-weight-hash-across-dp-replicas-interval
:
10
--disable-gloo-process-groups
:
true
--ckpt-fully-parallel-load
:
true
--deterministic-mode
:
true
--no-gradient-accumulation-fusion
:
true
...
...
@@ -53,4 +54,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.82005
,
10.87447
,
10.87799
,
10.79508
,
10.68163
,
10.59514
,
10.10047
,
10.21237
,
10.13864
,
9.80877
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1562.0
,
1738.0
,
1852.0
,
1796.0
,
1869.0
,
1788.0
,
1517.0
,
1941.0
,
2226.0
,
2214.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
17.43169
,
0.16677
,
0.33581
,
0.16498
,
0.33103
,
0.16418
,
0.33146
,
0.16539
,
0.33075
,
0.1651
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82005
,
"5"
:
10.85286
,
"10"
:
10.7845
,
"15"
:
10.79231
,
"20"
:
10.69208
,
"25"
:
10.52411
,
"30"
:
10.34557
,
"35"
:
10.2624
,
"40"
:
10.07239
,
"45"
:
9.811
,
"50"
:
9.8842
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1562.0
,
"5"
:
1861.0
,
"10"
:
1339.0
,
"15"
:
1964.0
,
"20"
:
1696.0
,
"25"
:
1558.0
,
"30"
:
1887.0
,
"35"
:
1887.0
,
"40"
:
2113.0
,
"45"
:
2114.0
,
"50"
:
2342.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
522977280.0
,
"5"
:
522977280.0
,
"10"
:
522977280.0
,
"15"
:
522977280.0
,
"20"
:
522977280.0
,
"25"
:
522977280.0
,
"30"
:
522977280.0
,
"35"
:
522977280.0
,
"40"
:
522977280.0
,
"45"
:
522977280.0
,
"50"
:
522977280.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3768846336.0
,
"5"
:
3912738816.0
,
"10"
:
3912738816.0
,
"15"
:
3912738816.0
,
"20"
:
3912738816.0
,
"25"
:
3912738816.0
,
"30"
:
3912738816.0
,
"35"
:
3912738816.0
,
"40"
:
3912738816.0
,
"45"
:
3912738816.0
,
"50"
:
3912738816.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
19.10362
,
"5"
:
0.16434
,
"10"
:
0.1658
,
"15"
:
0.16354
,
"20"
:
0.16555
,
"25"
:
0.16274
,
"30"
:
0.16422
,
"35"
:
0.16143
,
"40"
:
0.16856
,
"45"
:
0.16893
,
"50"
:
0.16867
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.82005
,
10.87449
,
10.87799
,
10.79508
,
10.68166
,
10.59514
,
10.10042
,
10.21238
,
10.13865
,
9.80879
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1559.0
,
1719.0
,
1857.0
,
1746.0
,
1883.0
,
1738.0
,
1475.0
,
1851.0
,
2303.0
,
2258.0
]},
"iteration_timing_avg"
:
0.12873676470588236
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82005
,
"5"
:
10.85284
,
"10"
:
10.78451
,
"15"
:
10.79227
,
"20"
:
10.69215
,
"25"
:
10.52412
,
"30"
:
10.34553
,
"35"
:
10.26239
,
"40"
:
10.07239
,
"45"
:
9.81101
,
"50"
:
9.8842
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1559.0
,
"5"
:
1840.0
,
"10"
:
1336.0
,
"15"
:
1910.0
,
"20"
:
1640.0
,
"25"
:
1694.0
,
"30"
:
1894.0
,
"35"
:
1955.0
,
"40"
:
2147.0
,
"45"
:
2157.0
,
"50"
:
2389.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
523004928.0
,
"5"
:
523004928.0
,
"10"
:
523004928.0
,
"15"
:
523004928.0
,
"20"
:
523004928.0
,
"25"
:
523004928.0
,
"30"
:
523004928.0
,
"35"
:
523004928.0
,
"40"
:
523004928.0
,
"45"
:
523004928.0
,
"50"
:
523004928.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3768873984.0
,
"5"
:
3912766464.0
,
"10"
:
3912766464.0
,
"15"
:
3912766464.0
,
"20"
:
3912766464.0
,
"25"
:
3912766464.0
,
"30"
:
3912766464.0
,
"35"
:
3912766464.0
,
"40"
:
3912766464.0
,
"45"
:
3912766464.0
,
"50"
:
3912766464.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
21.58641
,
"5"
:
0.16306
,
"10"
:
0.16416
,
"15"
:
0.16288
,
"20"
:
0.16323
,
"25"
:
0.1694
,
"30"
:
0.16231
,
"35"
:
0.16648
,
"40"
:
0.16317
,
"45"
:
0.16593
,
"50"
:
0.16425
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -54,4 +54,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.9359
,
10.93551
,
10.9424
,
10.88073
,
10.75652
,
10.66333
,
10.16716
,
10.27244
,
10.19575
,
9.86005
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
22727668.0
,
23021008.0
,
22501280.0
,
22830020.0
,
22739656.0
,
22548262.0
,
22955680.0
,
22589964.0
,
22660156.0
,
22884572.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
16.12696
,
0.16574
,
0.16735
,
0.16507
,
0.1657
,
0.16626
,
0.16614
,
0.16517
,
0.16625
,
0.16568
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.9359
,
"5"
:
10.9322
,
"10"
:
10.91082
,
"15"
:
10.85725
,
"20"
:
10.7709
,
"25"
:
10.60557
,
"30"
:
10.40545
,
"35"
:
10.31363
,
"40"
:
10.12334
,
"45"
:
9.87564
,
"50"
:
9.94453
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
22727668.0
,
"5"
:
22715306.0
,
"10"
:
22919026.0
,
"15"
:
22821242.0
,
"20"
:
22693800.0
,
"25"
:
22819536.0
,
"30"
:
22631092.0
,
"35"
:
22787886.0
,
"40"
:
22658198.0
,
"45"
:
22674644.0
,
"50"
:
22904428.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
521936896.0
,
"5"
:
521936896.0
,
"10"
:
521936896.0
,
"15"
:
521936896.0
,
"20"
:
521936896.0
,
"25"
:
521936896.0
,
"30"
:
521936896.0
,
"35"
:
521936896.0
,
"40"
:
521936896.0
,
"45"
:
521936896.0
,
"50"
:
521936896.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3769915392.0
,
"5"
:
3914746880.0
,
"10"
:
3914746880.0
,
"15"
:
3914746880.0
,
"20"
:
3914746880.0
,
"25"
:
3914746880.0
,
"30"
:
3914746880.0
,
"35"
:
3914746880.0
,
"40"
:
3914746880.0
,
"45"
:
3914746880.0
,
"50"
:
3914746880.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
14.90437
,
"5"
:
0.15853
,
"10"
:
0.15748
,
"15"
:
0.15817
,
"20"
:
0.15827
,
"25"
:
0.1568
,
"30"
:
0.1606
,
"35"
:
0.16038
,
"40"
:
0.15929
,
"45"
:
0.16015
,
"50"
:
0.17077
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.9359
,
10.93547
,
10.94238
,
10.88073
,
10.75653
,
10.66332
,
10.1672
,
10.27241
,
10.19577
,
9.86006
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
22727686.0
,
23020980.0
,
22501260.0
,
22830024.0
,
22739772.0
,
22548148.0
,
22955712.0
,
22589816.0
,
22660000.0
,
22884332.0
]},
"iteration_timing_avg"
:
0.12799705882352944
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.9359
,
"5"
:
10.93225
,
"10"
:
10.91081
,
"15"
:
10.85723
,
"20"
:
10.77091
,
"25"
:
10.60558
,
"30"
:
10.40544
,
"35"
:
10.31364
,
"40"
:
10.12333
,
"45"
:
9.8756
,
"50"
:
9.94451
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
22727686.0
,
"5"
:
22715312.0
,
"10"
:
22919004.0
,
"15"
:
22821282.0
,
"20"
:
22693812.0
,
"25"
:
22819580.0
,
"30"
:
22631132.0
,
"35"
:
22787906.0
,
"40"
:
22658304.0
,
"45"
:
22674764.0
,
"50"
:
22904438.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
523016192.0
,
"5"
:
523016192.0
,
"10"
:
523016192.0
,
"15"
:
523016192.0
,
"20"
:
523016192.0
,
"25"
:
523016192.0
,
"30"
:
523016192.0
,
"35"
:
523016192.0
,
"40"
:
523016192.0
,
"45"
:
523016192.0
,
"50"
:
523016192.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3769943040.0
,
"5"
:
3914774528.0
,
"10"
:
3914774528.0
,
"15"
:
3914774528.0
,
"20"
:
3914774528.0
,
"25"
:
3914774528.0
,
"30"
:
3914774528.0
,
"35"
:
3914774528.0
,
"40"
:
3914774528.0
,
"45"
:
3914774528.0
,
"50"
:
3914774528.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
19.24942
,
"5"
:
0.158
,
"10"
:
0.15909
,
"15"
:
0.15799
,
"20"
:
0.15892
,
"25"
:
0.15911
,
"30"
:
0.15833
,
"35"
:
0.15767
,
"40"
:
0.15693
,
"45"
:
0.16146
,
"50"
:
0.15756
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -51,4 +51,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.81873
,
"5"
:
10.85264
,
"10"
:
10.78415
,
"15"
:
10.7931
,
"20"
:
10.6921
,
"25"
:
10.52359
,
"30"
:
10.34496
,
"35"
:
10.25889
,
"40"
:
10.07079
,
"45"
:
9.80318
,
"50"
:
9.87688
,
"55"
:
9.85528
,
"60"
:
9.46661
,
"65"
:
8.91692
,
"70"
:
9.69269
,
"75"
:
9.37788
,
"80"
:
9.36796
,
"85"
:
9.576
,
"90"
:
9.77252
,
"95"
:
9.46897
,
"100"
:
9.34559
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1541.0
,
"5"
:
1835.0
,
"10"
:
1425.0
,
"15"
:
1935.0
,
"20"
:
1728.0
,
"25"
:
1634.0
,
"30"
:
1899.0
,
"35"
:
1945.0
,
"40"
:
2144.0
,
"45"
:
2092.0
,
"50"
:
2322.0
,
"55"
:
2333.0
,
"60"
:
2386.0
,
"65"
:
2636.0
,
"70"
:
3071.0
,
"75"
:
2522.0
,
"80"
:
3165.0
,
"85"
:
3334.0
,
"90"
:
2941.0
,
"95"
:
3321.0
,
"100"
:
3378.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
731763200.0
,
"5"
:
731763200.0
,
"10"
:
731763200.0
,
"15"
:
731763200.0
,
"20"
:
731763200.0
,
"25"
:
731763200.0
,
"30"
:
731763200.0
,
"35"
:
731763200.0
,
"40"
:
731763200.0
,
"45"
:
731763200.0
,
"50"
:
731763200.0
,
"55"
:
731763200.0
,
"60"
:
731763200.0
,
"65"
:
731763200.0
,
"70"
:
731763200.0
,
"75"
:
731763200.0
,
"80"
:
731763200.0
,
"85"
:
731763200.0
,
"90"
:
731763200.0
,
"95"
:
731763200.0
,
"100"
:
731763200.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2368927744.0
,
"5"
:
2649590784.0
,
"10"
:
2649590784.0
,
"15"
:
2649590784.0
,
"20"
:
2649590784.0
,
"25"
:
2649590784.0
,
"30"
:
2649590784.0
,
"35"
:
2649590784.0
,
"40"
:
2649590784.0
,
"45"
:
2649590784.0
,
"50"
:
2649590784.0
,
"55"
:
2649590784.0
,
"60"
:
2649590784.0
,
"65"
:
2649590784.0
,
"70"
:
2649590784.0
,
"75"
:
2649590784.0
,
"80"
:
2649590784.0
,
"85"
:
2649590784.0
,
"90"
:
2649590784.0
,
"95"
:
2649590784.0
,
"100"
:
2649590784.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
14.75021
,
"5"
:
0.15575
,
"10"
:
0.15818
,
"15"
:
0.15592
,
"20"
:
0.15584
,
"25"
:
0.15211
,
"30"
:
0.15253
,
"35"
:
0.15336
,
"40"
:
0.15465
,
"45"
:
0.1517
,
"50"
:
0.16501
,
"55"
:
0.16299
,
"60"
:
0.1657
,
"65"
:
0.16693
,
"70"
:
0.15946
,
"75"
:
0.15155
,
"80"
:
0.15175
,
"85"
:
0.15073
,
"90"
:
0.14954
,
"95"
:
0.14899
,
"100"
:
0.14722
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82005
,
"5"
:
10.85285
,
"10"
:
10.78449
,
"15"
:
10.79226
,
"20"
:
10.69196
,
"25"
:
10.52317
,
"30"
:
10.34507
,
"35"
:
10.25889
,
"40"
:
10.07027
,
"45"
:
9.80301
,
"50"
:
9.87673
,
"55"
:
9.85527
,
"60"
:
9.46636
,
"65"
:
8.9166
,
"70"
:
9.69277
,
"75"
:
9.37814
,
"80"
:
9.368
,
"85"
:
9.57597
,
"90"
:
9.77245
,
"95"
:
9.46913
,
"100"
:
9.34575
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1559.0
,
"5"
:
1915.0
,
"10"
:
1361.0
,
"15"
:
1831.0
,
"20"
:
1695.0
,
"25"
:
1596.0
,
"30"
:
1821.0
,
"35"
:
1872.0
,
"40"
:
2121.0
,
"45"
:
2090.0
,
"50"
:
2395.0
,
"55"
:
2324.0
,
"60"
:
2357.0
,
"65"
:
2606.0
,
"70"
:
3130.0
,
"75"
:
2556.0
,
"80"
:
3224.0
,
"85"
:
3412.0
,
"90"
:
2988.0
,
"95"
:
3347.0
,
"100"
:
3383.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
733859840.0
,
"5"
:
733859840.0
,
"10"
:
733859840.0
,
"15"
:
733859840.0
,
"20"
:
733859840.0
,
"25"
:
733859840.0
,
"30"
:
733859840.0
,
"35"
:
733859840.0
,
"40"
:
733859840.0
,
"45"
:
733859840.0
,
"50"
:
733859840.0
,
"55"
:
733859840.0
,
"60"
:
733859840.0
,
"65"
:
733859840.0
,
"70"
:
733859840.0
,
"75"
:
733859840.0
,
"80"
:
733859840.0
,
"85"
:
733859840.0
,
"90"
:
733859840.0
,
"95"
:
733859840.0
,
"100"
:
733859840.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3838895104.0
,
"5"
:
4122703872.0
,
"10"
:
4122703872.0
,
"15"
:
4122703872.0
,
"20"
:
4122703872.0
,
"25"
:
4122703872.0
,
"30"
:
4122703872.0
,
"35"
:
4122703872.0
,
"40"
:
4122703872.0
,
"45"
:
4122703872.0
,
"50"
:
4122703872.0
,
"55"
:
4122703872.0
,
"60"
:
4122703872.0
,
"65"
:
4122703872.0
,
"70"
:
4122703872.0
,
"75"
:
4122703872.0
,
"80"
:
4122703872.0
,
"85"
:
4122703872.0
,
"90"
:
4122703872.0
,
"95"
:
4122703872.0
,
"100"
:
4122703872.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
18.81818
,
"5"
:
0.15605
,
"10"
:
0.15612
,
"15"
:
0.15668
,
"20"
:
0.15734
,
"25"
:
0.15595
,
"30"
:
0.15634
,
"35"
:
0.15597
,
"40"
:
0.15654
,
"45"
:
0.15538
,
"50"
:
0.15456
,
"55"
:
0.15493
,
"60"
:
0.15593
,
"65"
:
0.15527
,
"70"
:
0.15564
,
"75"
:
0.15555
,
"80"
:
0.15422
,
"85"
:
0.1551
,
"90"
:
0.1533
,
"95"
:
0.15475
,
"100"
:
0.15459
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -48,4 +48,5 @@ MODEL_ARGS:
--ckpt-format
:
torch
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.81873
,
"5"
:
10.85262
,
"10"
:
10.78413
,
"15"
:
10.79311
,
"20"
:
10.69219
,
"25"
:
10.52454
,
"30"
:
10.34542
,
"35"
:
10.26245
,
"40"
:
10.07286
,
"45"
:
9.8112
,
"50"
:
9.88428
,
"55"
:
9.86376
,
"60"
:
9.47981
,
"65"
:
8.93093
,
"70"
:
9.71205
,
"75"
:
9.4002
,
"80"
:
9.39074
,
"85"
:
9.60143
,
"90"
:
9.8051
,
"95"
:
9.5081
,
"100"
:
9.39221
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1541.0
,
"5"
:
1912.0
,
"10"
:
1317.0
,
"15"
:
1921.0
,
"20"
:
1595.0
,
"25"
:
1666.0
,
"30"
:
1933.0
,
"35"
:
1920.0
,
"40"
:
2094.0
,
"45"
:
2101.0
,
"50"
:
2362.0
,
"55"
:
2269.0
,
"60"
:
2379.0
,
"65"
:
2624.0
,
"70"
:
3128.0
,
"75"
:
2551.0
,
"80"
:
3192.0
,
"85"
:
3503.0
,
"90"
:
2966.0
,
"95"
:
3326.0
,
"100"
:
3383.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
731763200.0
,
"5"
:
731763200.0
,
"10"
:
731763200.0
,
"15"
:
731763200.0
,
"20"
:
731763200.0
,
"25"
:
731763200.0
,
"30"
:
731763200.0
,
"35"
:
731763200.0
,
"40"
:
731763200.0
,
"45"
:
731763200.0
,
"50"
:
731763200.0
,
"55"
:
731763200.0
,
"60"
:
731763200.0
,
"65"
:
731763200.0
,
"70"
:
731763200.0
,
"75"
:
731763200.0
,
"80"
:
731763200.0
,
"85"
:
731763200.0
,
"90"
:
731763200.0
,
"95"
:
731763200.0
,
"100"
:
731763200.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2368927744.0
,
"5"
:
2649590784.0
,
"10"
:
2649590784.0
,
"15"
:
2649590784.0
,
"20"
:
2649590784.0
,
"25"
:
2649590784.0
,
"30"
:
2649590784.0
,
"35"
:
2649590784.0
,
"40"
:
2649590784.0
,
"45"
:
2649590784.0
,
"50"
:
2649590784.0
,
"55"
:
2649590784.0
,
"60"
:
2649590784.0
,
"65"
:
2649590784.0
,
"70"
:
2649590784.0
,
"75"
:
2649590784.0
,
"80"
:
2649590784.0
,
"85"
:
2649590784.0
,
"90"
:
2649590784.0
,
"95"
:
2649590784.0
,
"100"
:
2649590784.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
16.11545
,
"5"
:
0.1692
,
"10"
:
0.16514
,
"15"
:
0.16462
,
"20"
:
0.15963
,
"25"
:
0.16538
,
"30"
:
0.1555
,
"35"
:
0.15427
,
"40"
:
0.15505
,
"45"
:
0.15352
,
"50"
:
0.15187
,
"55"
:
0.15496
,
"60"
:
0.15039
,
"65"
:
0.15056
,
"70"
:
0.14765
,
"75"
:
0.15137
,
"80"
:
0.15663
,
"85"
:
0.16052
,
"90"
:
0.15557
,
"95"
:
0.15598
,
"100"
:
0.15273
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82005
,
"5"
:
10.85284
,
"10"
:
10.78455
,
"15"
:
10.79229
,
"20"
:
10.69211
,
"25"
:
10.52412
,
"30"
:
10.34552
,
"35"
:
10.26242
,
"40"
:
10.07239
,
"45"
:
9.811
,
"50"
:
9.88415
,
"55"
:
9.86374
,
"60"
:
9.47965
,
"65"
:
8.93065
,
"70"
:
9.71216
,
"75"
:
9.40049
,
"80"
:
9.39075
,
"85"
:
9.6014
,
"90"
:
9.80503
,
"95"
:
9.50817
,
"100"
:
9.39236
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1559.0
,
"5"
:
1840.0
,
"10"
:
1380.0
,
"15"
:
1848.0
,
"20"
:
1601.0
,
"25"
:
1635.0
,
"30"
:
1908.0
,
"35"
:
1925.0
,
"40"
:
2126.0
,
"45"
:
2086.0
,
"50"
:
2298.0
,
"55"
:
2284.0
,
"60"
:
2337.0
,
"65"
:
2636.0
,
"70"
:
3136.0
,
"75"
:
2539.0
,
"80"
:
3253.0
,
"85"
:
3363.0
,
"90"
:
3004.0
,
"95"
:
3333.0
,
"100"
:
3447.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
733859840.0
,
"5"
:
733859840.0
,
"10"
:
733859840.0
,
"15"
:
733859840.0
,
"20"
:
733859840.0
,
"25"
:
733859840.0
,
"30"
:
733859840.0
,
"35"
:
733859840.0
,
"40"
:
733859840.0
,
"45"
:
733859840.0
,
"50"
:
733859840.0
,
"55"
:
733859840.0
,
"60"
:
733859840.0
,
"65"
:
733859840.0
,
"70"
:
733859840.0
,
"75"
:
733859840.0
,
"80"
:
733859840.0
,
"85"
:
733859840.0
,
"90"
:
733859840.0
,
"95"
:
733859840.0
,
"100"
:
733859840.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3838895104.0
,
"5"
:
4122703872.0
,
"10"
:
4122703872.0
,
"15"
:
4122703872.0
,
"20"
:
4122703872.0
,
"25"
:
4122703872.0
,
"30"
:
4122703872.0
,
"35"
:
4122703872.0
,
"40"
:
4122703872.0
,
"45"
:
4122703872.0
,
"50"
:
4122703872.0
,
"55"
:
4122703872.0
,
"60"
:
4122703872.0
,
"65"
:
4122703872.0
,
"70"
:
4122703872.0
,
"75"
:
4122703872.0
,
"80"
:
4122703872.0
,
"85"
:
4122703872.0
,
"90"
:
4122703872.0
,
"95"
:
4122703872.0
,
"100"
:
4122703872.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
22.44598
,
"5"
:
0.17072
,
"10"
:
0.16018
,
"15"
:
0.16147
,
"20"
:
0.15588
,
"25"
:
0.15643
,
"30"
:
0.15744
,
"35"
:
0.15702
,
"40"
:
0.15705
,
"45"
:
0.15718
,
"50"
:
0.15547
,
"55"
:
0.1569
,
"60"
:
0.1592
,
"65"
:
0.1591
,
"70"
:
0.15725
,
"75"
:
0.1566
,
"80"
:
0.15569
,
"85"
:
0.15565
,
"90"
:
0.15537
,
"95"
:
0.15899
,
"100"
:
0.15823
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
Prev
1
…
21
22
23
24
25
26
27
28
29
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment