Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
688448db
Commit
688448db
authored
Mar 14, 2025
by
silencealiang
Browse files
更新代码
parent
a02a5490
Pipeline
#2503
passed with stage
Changes
823
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
134 additions
and
116 deletions
+134
-116
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
...nterleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
...nterleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
...ngs_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
..._disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
..._disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
...1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
..._disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
..._disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
..._ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+54
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
..._disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
..._disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
..._dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
..._disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
..._disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
...stent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+54
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json
...st_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json
...st_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml
...ch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev.json
...me_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_lts.json
...me_torch_dist_swiglu_dgx_a100_1N8G/golden_values_lts.json
+1
-0
No files found.
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.84474
,
10.87688
,
10.90253
,
10.81872
,
10.67849
,
10.60076
,
10.06361
,
10.19267
,
10.11344
,
9.75987
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1769.0
,
2129.0
,
1987.0
,
1961.0
,
1961.0
,
1886.0
,
1655.0
,
2130.0
,
2315.0
,
2362.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
8.72642
,
0.16194
,
0.15926
,
0.15956
,
0.15972
,
0.1623
,
0.16029
,
0.15863
,
0.15947
,
0.15935
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84474
,
"5"
:
10.8642
,
"10"
:
10.82152
,
"15"
:
10.81201
,
"20"
:
10.71869
,
"25"
:
10.53034
,
"30"
:
10.33576
,
"35"
:
10.24082
,
"40"
:
10.05009
,
"45"
:
9.76761
,
"50"
:
9.85505
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1769.0
,
"5"
:
2061.0
,
"10"
:
1636.0
,
"15"
:
2011.0
,
"20"
:
1779.0
,
"25"
:
1875.0
,
"30"
:
2074.0
,
"35"
:
2069.0
,
"40"
:
2190.0
,
"45"
:
2153.0
,
"50"
:
2508.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
888098304.0
,
"5"
:
888098304.0
,
"10"
:
888098304.0
,
"15"
:
888098304.0
,
"20"
:
888098304.0
,
"25"
:
888098304.0
,
"30"
:
888098304.0
,
"35"
:
888098304.0
,
"40"
:
888098304.0
,
"45"
:
888098304.0
,
"50"
:
888098304.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3215778304.0
,
"5"
:
3575244288.0
,
"10"
:
3575244288.0
,
"15"
:
3575244288.0
,
"20"
:
3575244288.0
,
"25"
:
3575244288.0
,
"30"
:
3575244288.0
,
"35"
:
3575244288.0
,
"40"
:
3575244288.0
,
"45"
:
3575244288.0
,
"50"
:
3575244288.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
8.69754
,
"5"
:
0.16083
,
"10"
:
0.16079
,
"15"
:
0.16126
,
"20"
:
0.16129
,
"25"
:
0.16055
,
"30"
:
0.1609
,
"35"
:
0.16119
,
"40"
:
0.16222
,
"45"
:
0.16081
,
"50"
:
0.15983
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.84474
,
10.87687
,
10.90254
,
10.81872
,
10.67848
,
10.60075
,
10.06363
,
10.19268
,
10.11342
,
9.75986
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1776.0
,
2161.0
,
2052.0
,
1892.0
,
1971.0
,
1946.0
,
1701.0
,
1985.0
,
2295.0
,
2293.0
]},
"iteration_timing_avg"
:
0.11052176470588236
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84474
,
"5"
:
10.86418
,
"10"
:
10.82155
,
"15"
:
10.81195
,
"20"
:
10.71872
,
"25"
:
10.53036
,
"30"
:
10.3358
,
"35"
:
10.24082
,
"40"
:
10.05008
,
"45"
:
9.76762
,
"50"
:
9.85505
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1776.0
,
"5"
:
2128.0
,
"10"
:
1615.0
,
"15"
:
2021.0
,
"20"
:
1775.0
,
"25"
:
1916.0
,
"30"
:
2029.0
,
"35"
:
2107.0
,
"40"
:
2174.0
,
"45"
:
2110.0
,
"50"
:
2363.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
888098304.0
,
"5"
:
888098304.0
,
"10"
:
888098304.0
,
"15"
:
888098304.0
,
"20"
:
888098304.0
,
"25"
:
888098304.0
,
"30"
:
888098304.0
,
"35"
:
888098304.0
,
"40"
:
888098304.0
,
"45"
:
888098304.0
,
"50"
:
888098304.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3215778304.0
,
"5"
:
3575244288.0
,
"10"
:
3575244288.0
,
"15"
:
3575244288.0
,
"20"
:
3575244288.0
,
"25"
:
3575244288.0
,
"30"
:
3575244288.0
,
"35"
:
3575244288.0
,
"40"
:
3575244288.0
,
"45"
:
3575244288.0
,
"50"
:
3575244288.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.1728
,
"5"
:
0.15783
,
"10"
:
0.15696
,
"15"
:
0.15564
,
"20"
:
0.15887
,
"25"
:
0.15731
,
"30"
:
0.15635
,
"35"
:
0.1571
,
"40"
:
0.15637
,
"45"
:
0.15705
,
"50"
:
0.15413
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -50,4 +50,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.79205
,
10.86789
,
10.89149
,
10.78328
,
10.66126
,
10.58275
,
10.08467
,
10.19448
,
10.13785
,
9.81454
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1580.0
,
1778.0
,
1849.0
,
1841.0
,
1884.0
,
1679.0
,
1544.0
,
1953.0
,
2449.0
,
2335.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.79458
,
0.16744
,
0.16286
,
0.16276
,
0.16292
,
0.16346
,
0.16288
,
0.16273
,
0.16282
,
0.16245
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79205
,
"5"
:
10.84695
,
"10"
:
10.77106
,
"15"
:
10.79093
,
"20"
:
10.68042
,
"25"
:
10.50715
,
"30"
:
10.33325
,
"35"
:
10.25545
,
"40"
:
10.05544
,
"45"
:
9.80575
,
"50"
:
9.89082
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1580.0
,
"5"
:
1901.0
,
"10"
:
1346.0
,
"15"
:
1926.0
,
"20"
:
1643.0
,
"25"
:
1683.0
,
"30"
:
1867.0
,
"35"
:
2020.0
,
"40"
:
2252.0
,
"45"
:
2243.0
,
"50"
:
2459.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
716833792.0
,
"5"
:
716833792.0
,
"10"
:
716833792.0
,
"15"
:
716833792.0
,
"20"
:
716833792.0
,
"25"
:
716833792.0
,
"30"
:
716833792.0
,
"35"
:
716833792.0
,
"40"
:
716833792.0
,
"45"
:
716833792.0
,
"50"
:
716833792.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2399714304.0
,
"5"
:
2683412480.0
,
"10"
:
2683412480.0
,
"15"
:
2683412480.0
,
"20"
:
2683412480.0
,
"25"
:
2683412480.0
,
"30"
:
2683412480.0
,
"35"
:
2683412480.0
,
"40"
:
2683412480.0
,
"45"
:
2683412480.0
,
"50"
:
2683412480.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
8.70564
,
"5"
:
0.16109
,
"10"
:
0.15745
,
"15"
:
0.15861
,
"20"
:
0.15886
,
"25"
:
0.15817
,
"30"
:
0.15999
,
"35"
:
0.16113
,
"40"
:
0.15887
,
"45"
:
0.16006
,
"50"
:
0.1597
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.79205
,
10.86789
,
10.89149
,
10.78328
,
10.66126
,
10.58275
,
10.08467
,
10.19448
,
10.13785
,
9.81454
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1580.0
,
1778.0
,
1849.0
,
1841.0
,
1884.0
,
1679.0
,
1544.0
,
1953.0
,
2449.0
,
2335.0
]},
"iteration_timing_avg"
:
0.12243558823529416
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79205
,
"5"
:
10.84695
,
"10"
:
10.77106
,
"15"
:
10.79093
,
"20"
:
10.68042
,
"25"
:
10.50715
,
"30"
:
10.33325
,
"35"
:
10.25545
,
"40"
:
10.05544
,
"45"
:
9.80575
,
"50"
:
9.89082
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1580.0
,
"5"
:
1901.0
,
"10"
:
1346.0
,
"15"
:
1926.0
,
"20"
:
1643.0
,
"25"
:
1683.0
,
"30"
:
1867.0
,
"35"
:
2020.0
,
"40"
:
2252.0
,
"45"
:
2243.0
,
"50"
:
2459.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
714736640.0
,
"5"
:
714736640.0
,
"10"
:
714736640.0
,
"15"
:
714736640.0
,
"20"
:
714736640.0
,
"25"
:
714736640.0
,
"30"
:
714736640.0
,
"35"
:
714736640.0
,
"40"
:
714736640.0
,
"45"
:
714736640.0
,
"50"
:
714736640.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2399714304.0
,
"5"
:
2681315328.0
,
"10"
:
2681315328.0
,
"15"
:
2681315328.0
,
"20"
:
2681315328.0
,
"25"
:
2681315328.0
,
"30"
:
2681315328.0
,
"35"
:
2681315328.0
,
"40"
:
2681315328.0
,
"45"
:
2681315328.0
,
"50"
:
2681315328.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.48918
,
"5"
:
0.16426
,
"10"
:
0.16419
,
"15"
:
0.15777
,
"20"
:
0.15716
,
"25"
:
0.15773
,
"30"
:
0.15842
,
"35"
:
0.15959
,
"40"
:
0.15581
,
"45"
:
0.15603
,
"50"
:
0.15595
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79205
,
"5"
:
10.84695
,
"10"
:
10.77106
,
"15"
:
10.79093
,
"20"
:
10.68042
,
"25"
:
10.50715
,
"30"
:
10.33325
,
"35"
:
10.25545
,
"40"
:
10.05544
,
"45"
:
9.80575
,
"50"
:
9.89082
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1580.0
,
"5"
:
1901.0
,
"10"
:
1346.0
,
"15"
:
1926.0
,
"20"
:
1643.0
,
"25"
:
1683.0
,
"30"
:
1867.0
,
"35"
:
2020.0
,
"40"
:
2252.0
,
"45"
:
2243.0
,
"50"
:
2459.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
716833792.0
,
"5"
:
716833792.0
,
"10"
:
716833792.0
,
"15"
:
716833792.0
,
"20"
:
716833792.0
,
"25"
:
716833792.0
,
"30"
:
716833792.0
,
"35"
:
716833792.0
,
"40"
:
716833792.0
,
"45"
:
716833792.0
,
"50"
:
716833792.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2399714304.0
,
"5"
:
2683412480.0
,
"10"
:
2683412480.0
,
"15"
:
2683412480.0
,
"20"
:
2683412480.0
,
"25"
:
2683412480.0
,
"30"
:
2683412480.0
,
"35"
:
2683412480.0
,
"40"
:
2683412480.0
,
"45"
:
2683412480.0
,
"50"
:
2683412480.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
8.70564
,
"5"
:
0.16109
,
"10"
:
0.15745
,
"15"
:
0.15861
,
"20"
:
0.15886
,
"25"
:
0.15817
,
"30"
:
0.15999
,
"35"
:
0.16113
,
"40"
:
0.15887
,
"45"
:
0.16006
,
"50"
:
0.1597
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79205
,
"5"
:
10.84695
,
"10"
:
10.77106
,
"15"
:
10.79093
,
"20"
:
10.68042
,
"25"
:
10.50715
,
"30"
:
10.33325
,
"35"
:
10.25545
,
"40"
:
10.05544
,
"45"
:
9.80575
,
"50"
:
9.89082
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1580.0
,
"5"
:
1901.0
,
"10"
:
1346.0
,
"15"
:
1926.0
,
"20"
:
1643.0
,
"25"
:
1683.0
,
"30"
:
1867.0
,
"35"
:
2020.0
,
"40"
:
2252.0
,
"45"
:
2243.0
,
"50"
:
2459.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
714736640.0
,
"5"
:
714736640.0
,
"10"
:
714736640.0
,
"15"
:
714736640.0
,
"20"
:
714736640.0
,
"25"
:
714736640.0
,
"30"
:
714736640.0
,
"35"
:
714736640.0
,
"40"
:
714736640.0
,
"45"
:
714736640.0
,
"50"
:
714736640.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2399714304.0
,
"5"
:
2681315328.0
,
"10"
:
2681315328.0
,
"15"
:
2681315328.0
,
"20"
:
2681315328.0
,
"25"
:
2681315328.0
,
"30"
:
2681315328.0
,
"35"
:
2681315328.0
,
"40"
:
2681315328.0
,
"45"
:
2681315328.0
,
"50"
:
2681315328.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
12.48837
,
"5"
:
0.15716
,
"10"
:
0.1577
,
"15"
:
0.1575
,
"20"
:
0.15694
,
"25"
:
0.15689
,
"30"
:
0.16393
,
"35"
:
0.15702
,
"40"
:
0.15586
,
"45"
:
0.1552
,
"50"
:
0.15598
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
0 → 100644
View file @
688448db
ENV_VARS
:
CUDA_DEVICE_MAX_CONNECTIONS
:
1
NVTE_ALLOW_NONDETERMINISTIC_ALGO
:
0
NCCL_ALGO
:
Tree
CUBLAS_WORKSPACE_CONFIG
:
:4096:8
MODEL_ARGS
:
--num-layers
:
12
--hidden-size
:
512
--num-attention-heads
:
8
--log-params-norm
:
true
--log-num-zeros-in-grad
:
true
--log-validation-ppl-to-tensorboard
:
true
--log-timers-to-tensorboard
:
true
--tensorboard-dir
:
${TENSORBOARD_PATH}
--micro-batch-size
:
4
--global-batch-size
:
32
--seq-length
:
1024
--max-position-embeddings
:
1024
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_SAVE_PATH}
--load
:
${CHECKPOINT_LOAD_PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
--split
:
949,50,1
--distributed-backend
:
nccl
--lr
:
0.00015
--lr-decay-style
:
cosine
--min-lr
:
1.0e-5
--weight-decay
:
1e-2
--clip-grad
:
1.0
--lr-warmup-fraction
:
.01
--log-interval
:
1
--save-interval
:
10000
--eval-interval
:
1000
--eval-iters
:
10
--transformer-impl
:
transformer_engine
--tensor-model-parallel-size
:
1
--pipeline-model-parallel-size
:
4
--disable-bias-linear
:
true
--async-save
:
true
--use-persistent-ckpt-worker
:
true
--deterministic-mode
:
true
--no-gradient-accumulation-fusion
:
true
--attention-softmax-in-fp32
:
true
--use-mcore-models
:
true
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79196
,
"5"
:
10.84767
,
"10"
:
10.76997
,
"15"
:
10.79032
,
"20"
:
10.68032
,
"25"
:
10.5078
,
"30"
:
10.3335
,
"35"
:
10.25557
,
"40"
:
10.05566
,
"45"
:
9.80602
,
"50"
:
9.89125
,
"55"
:
9.87089
,
"60"
:
9.4846
,
"65"
:
8.94044
,
"70"
:
9.7223
,
"75"
:
9.40865
,
"80"
:
9.39753
,
"85"
:
9.60719
,
"90"
:
9.81041
,
"95"
:
9.51159
,
"100"
:
9.39705
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1605.0
,
"5"
:
1978.0
,
"10"
:
1442.0
,
"15"
:
1952.0
,
"20"
:
1667.0
,
"25"
:
1734.0
,
"30"
:
1952.0
,
"35"
:
2043.0
,
"40"
:
2231.0
,
"45"
:
2197.0
,
"50"
:
2405.0
,
"55"
:
2212.0
,
"60"
:
2367.0
,
"65"
:
2639.0
,
"70"
:
3196.0
,
"75"
:
2592.0
,
"80"
:
3222.0
,
"85"
:
3406.0
,
"90"
:
3002.0
,
"95"
:
3368.0
,
"100"
:
3152.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
716834304.0
,
"5"
:
716834304.0
,
"10"
:
716834304.0
,
"15"
:
716834304.0
,
"20"
:
716834304.0
,
"25"
:
716834304.0
,
"30"
:
716834304.0
,
"35"
:
716834304.0
,
"40"
:
716834304.0
,
"45"
:
716834304.0
,
"50"
:
716834304.0
,
"55"
:
716834304.0
,
"60"
:
716834304.0
,
"65"
:
716834304.0
,
"70"
:
716834304.0
,
"75"
:
716834304.0
,
"80"
:
716834304.0
,
"85"
:
716834304.0
,
"90"
:
716834304.0
,
"95"
:
716834304.0
,
"100"
:
716834304.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1910424576.0
,
"5"
:
2193074176.0
,
"10"
:
2193074176.0
,
"15"
:
2193074176.0
,
"20"
:
2193074176.0
,
"25"
:
2193074176.0
,
"30"
:
2193074176.0
,
"35"
:
2193074176.0
,
"40"
:
2193074176.0
,
"45"
:
2193074176.0
,
"50"
:
2193074176.0
,
"55"
:
2193074176.0
,
"60"
:
2193074176.0
,
"65"
:
2193074176.0
,
"70"
:
2193074176.0
,
"75"
:
2193074176.0
,
"80"
:
2193074176.0
,
"85"
:
2193074176.0
,
"90"
:
2193074176.0
,
"95"
:
2193074176.0
,
"100"
:
2193074176.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
8.46967
,
"5"
:
0.1483
,
"10"
:
0.14544
,
"15"
:
0.14588
,
"20"
:
0.14639
,
"25"
:
0.14549
,
"30"
:
0.14597
,
"35"
:
0.14608
,
"40"
:
0.14578
,
"45"
:
0.14542
,
"50"
:
0.14492
,
"55"
:
0.14474
,
"60"
:
0.14635
,
"65"
:
0.14621
,
"70"
:
0.14453
,
"75"
:
0.14374
,
"80"
:
0.14465
,
"85"
:
0.14456
,
"90"
:
0.14413
,
"95"
:
0.14445
,
"100"
:
0.14399
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79205
,
"5"
:
10.84695
,
"10"
:
10.77106
,
"15"
:
10.79093
,
"20"
:
10.68042
,
"25"
:
10.50715
,
"30"
:
10.33325
,
"35"
:
10.25545
,
"40"
:
10.05544
,
"45"
:
9.80575
,
"50"
:
9.89082
,
"55"
:
9.87063
,
"60"
:
9.48478
,
"65"
:
8.94022
,
"70"
:
9.72243
,
"75"
:
9.40907
,
"80"
:
9.3976
,
"85"
:
9.60746
,
"90"
:
9.81041
,
"95"
:
9.5116
,
"100"
:
9.39722
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1580.0
,
"5"
:
1901.0
,
"10"
:
1346.0
,
"15"
:
1926.0
,
"20"
:
1643.0
,
"25"
:
1683.0
,
"30"
:
1867.0
,
"35"
:
2020.0
,
"40"
:
2252.0
,
"45"
:
2243.0
,
"50"
:
2459.0
,
"55"
:
2291.0
,
"60"
:
2404.0
,
"65"
:
2474.0
,
"70"
:
3102.0
,
"75"
:
2603.0
,
"80"
:
3420.0
,
"85"
:
3388.0
,
"90"
:
2904.0
,
"95"
:
3333.0
,
"100"
:
3347.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
714736640.0
,
"5"
:
714736640.0
,
"10"
:
714736640.0
,
"15"
:
714736640.0
,
"20"
:
714736640.0
,
"25"
:
714736640.0
,
"30"
:
714736640.0
,
"35"
:
714736640.0
,
"40"
:
714736640.0
,
"45"
:
714736640.0
,
"50"
:
714736640.0
,
"55"
:
714736640.0
,
"60"
:
714736640.0
,
"65"
:
714736640.0
,
"70"
:
714736640.0
,
"75"
:
714736640.0
,
"80"
:
714736640.0
,
"85"
:
714736640.0
,
"90"
:
714736640.0
,
"95"
:
714736640.0
,
"100"
:
714736640.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2399714304.0
,
"5"
:
2681315328.0
,
"10"
:
2681315328.0
,
"15"
:
2681315328.0
,
"20"
:
2681315328.0
,
"25"
:
2681315328.0
,
"30"
:
2681315328.0
,
"35"
:
2681315328.0
,
"40"
:
2681315328.0
,
"45"
:
2681315328.0
,
"50"
:
2681315328.0
,
"55"
:
2681315328.0
,
"60"
:
2681315328.0
,
"65"
:
2681315328.0
,
"70"
:
2681315328.0
,
"75"
:
2681315328.0
,
"80"
:
2681315328.0
,
"85"
:
2681315328.0
,
"90"
:
2681315328.0
,
"95"
:
2681315328.0
,
"100"
:
2681315328.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
9.27792
,
"5"
:
0.15646
,
"10"
:
0.15784
,
"15"
:
0.15721
,
"20"
:
0.15673
,
"25"
:
0.15668
,
"30"
:
0.15634
,
"35"
:
0.1575
,
"40"
:
0.1572
,
"45"
:
0.15552
,
"50"
:
0.15469
,
"55"
:
0.16595
,
"60"
:
0.16703
,
"65"
:
0.16692
,
"70"
:
0.15969
,
"75"
:
0.15799
,
"80"
:
0.15892
,
"85"
:
0.15874
,
"90"
:
0.159
,
"95"
:
0.16041
,
"100"
:
0.15753
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79196
,
"5"
:
10.84767
,
"10"
:
10.76997
,
"15"
:
10.79032
,
"20"
:
10.68032
,
"25"
:
10.5078
,
"30"
:
10.3335
,
"35"
:
10.25557
,
"40"
:
10.05566
,
"45"
:
9.80602
,
"50"
:
9.89125
,
"55"
:
9.87089
,
"60"
:
9.4846
,
"65"
:
8.94044
,
"70"
:
9.7223
,
"75"
:
9.40865
,
"80"
:
9.39753
,
"85"
:
9.60719
,
"90"
:
9.81041
,
"95"
:
9.51159
,
"100"
:
9.39705
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1605.0
,
"5"
:
1978.0
,
"10"
:
1442.0
,
"15"
:
1952.0
,
"20"
:
1667.0
,
"25"
:
1734.0
,
"30"
:
1952.0
,
"35"
:
2043.0
,
"40"
:
2231.0
,
"45"
:
2197.0
,
"50"
:
2405.0
,
"55"
:
2212.0
,
"60"
:
2367.0
,
"65"
:
2639.0
,
"70"
:
3196.0
,
"75"
:
2592.0
,
"80"
:
3222.0
,
"85"
:
3406.0
,
"90"
:
3002.0
,
"95"
:
3368.0
,
"100"
:
3152.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
716834304.0
,
"5"
:
716834304.0
,
"10"
:
716834304.0
,
"15"
:
716834304.0
,
"20"
:
716834304.0
,
"25"
:
716834304.0
,
"30"
:
716834304.0
,
"35"
:
716834304.0
,
"40"
:
716834304.0
,
"45"
:
716834304.0
,
"50"
:
716834304.0
,
"55"
:
716834304.0
,
"60"
:
716834304.0
,
"65"
:
716834304.0
,
"70"
:
716834304.0
,
"75"
:
716834304.0
,
"80"
:
716834304.0
,
"85"
:
716834304.0
,
"90"
:
716834304.0
,
"95"
:
716834304.0
,
"100"
:
716834304.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1910424576.0
,
"5"
:
2193074176.0
,
"10"
:
2193074176.0
,
"15"
:
2193074176.0
,
"20"
:
2193074176.0
,
"25"
:
2193074176.0
,
"30"
:
2193074176.0
,
"35"
:
2193074176.0
,
"40"
:
2193074176.0
,
"45"
:
2193074176.0
,
"50"
:
2193074176.0
,
"55"
:
2193074176.0
,
"60"
:
2193074176.0
,
"65"
:
2193074176.0
,
"70"
:
2193074176.0
,
"75"
:
2193074176.0
,
"80"
:
2193074176.0
,
"85"
:
2193074176.0
,
"90"
:
2193074176.0
,
"95"
:
2193074176.0
,
"100"
:
2193074176.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
8.46967
,
"5"
:
0.1483
,
"10"
:
0.14544
,
"15"
:
0.14588
,
"20"
:
0.14639
,
"25"
:
0.14549
,
"30"
:
0.14597
,
"35"
:
0.14608
,
"40"
:
0.14578
,
"45"
:
0.14542
,
"50"
:
0.14492
,
"55"
:
0.14474
,
"60"
:
0.14635
,
"65"
:
0.14621
,
"70"
:
0.14453
,
"75"
:
0.14374
,
"80"
:
0.14465
,
"85"
:
0.14456
,
"90"
:
0.14413
,
"95"
:
0.14445
,
"100"
:
0.14399
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79205
,
"5"
:
10.84695
,
"10"
:
10.77106
,
"15"
:
10.79093
,
"20"
:
10.68042
,
"25"
:
10.50715
,
"30"
:
10.33325
,
"35"
:
10.25545
,
"40"
:
10.05544
,
"45"
:
9.80575
,
"50"
:
9.89082
,
"55"
:
9.87063
,
"60"
:
9.48478
,
"65"
:
8.94022
,
"70"
:
9.72243
,
"75"
:
9.40907
,
"80"
:
9.3976
,
"85"
:
9.60746
,
"90"
:
9.81041
,
"95"
:
9.5116
,
"100"
:
9.39722
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1580.0
,
"5"
:
1901.0
,
"10"
:
1346.0
,
"15"
:
1926.0
,
"20"
:
1643.0
,
"25"
:
1683.0
,
"30"
:
1867.0
,
"35"
:
2020.0
,
"40"
:
2252.0
,
"45"
:
2243.0
,
"50"
:
2459.0
,
"55"
:
2291.0
,
"60"
:
2404.0
,
"65"
:
2474.0
,
"70"
:
3102.0
,
"75"
:
2603.0
,
"80"
:
3420.0
,
"85"
:
3388.0
,
"90"
:
2904.0
,
"95"
:
3333.0
,
"100"
:
3347.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
714736640.0
,
"5"
:
714736640.0
,
"10"
:
714736640.0
,
"15"
:
714736640.0
,
"20"
:
714736640.0
,
"25"
:
714736640.0
,
"30"
:
714736640.0
,
"35"
:
714736640.0
,
"40"
:
714736640.0
,
"45"
:
714736640.0
,
"50"
:
714736640.0
,
"55"
:
714736640.0
,
"60"
:
714736640.0
,
"65"
:
714736640.0
,
"70"
:
714736640.0
,
"75"
:
714736640.0
,
"80"
:
714736640.0
,
"85"
:
714736640.0
,
"90"
:
714736640.0
,
"95"
:
714736640.0
,
"100"
:
714736640.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2399714304.0
,
"5"
:
2681315328.0
,
"10"
:
2681315328.0
,
"15"
:
2681315328.0
,
"20"
:
2681315328.0
,
"25"
:
2681315328.0
,
"30"
:
2681315328.0
,
"35"
:
2681315328.0
,
"40"
:
2681315328.0
,
"45"
:
2681315328.0
,
"50"
:
2681315328.0
,
"55"
:
2681315328.0
,
"60"
:
2681315328.0
,
"65"
:
2681315328.0
,
"70"
:
2681315328.0
,
"75"
:
2681315328.0
,
"80"
:
2681315328.0
,
"85"
:
2681315328.0
,
"90"
:
2681315328.0
,
"95"
:
2681315328.0
,
"100"
:
2681315328.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
11.50484
,
"5"
:
0.15616
,
"10"
:
0.15661
,
"15"
:
0.15542
,
"20"
:
0.15597
,
"25"
:
0.15623
,
"30"
:
0.15732
,
"35"
:
0.15649
,
"40"
:
0.15774
,
"45"
:
0.15673
,
"50"
:
0.15646
,
"55"
:
0.1599
,
"60"
:
0.16087
,
"65"
:
0.16049
,
"70"
:
0.15987
,
"75"
:
0.15957
,
"80"
:
0.16064
,
"85"
:
0.16045
,
"90"
:
0.15984
,
"95"
:
0.15992
,
"100"
:
0.15958
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
0 → 100644
View file @
688448db
ENV_VARS
:
CUDA_DEVICE_MAX_CONNECTIONS
:
1
NVTE_ALLOW_NONDETERMINISTIC_ALGO
:
0
NCCL_ALGO
:
Tree
CUBLAS_WORKSPACE_CONFIG
:
:4096:8
MODEL_ARGS
:
--num-layers
:
12
--hidden-size
:
512
--num-attention-heads
:
8
--log-params-norm
:
true
--log-num-zeros-in-grad
:
true
--log-validation-ppl-to-tensorboard
:
true
--log-timers-to-tensorboard
:
true
--tensorboard-dir
:
${TENSORBOARD_PATH}
--micro-batch-size
:
4
--global-batch-size
:
32
--seq-length
:
1024
--max-position-embeddings
:
1024
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_SAVE_PATH}
--load
:
${CHECKPOINT_LOAD_PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
--split
:
949,50,1
--distributed-backend
:
nccl
--lr
:
0.00015
--lr-decay-style
:
cosine
--min-lr
:
1.0e-5
--weight-decay
:
1e-2
--clip-grad
:
1.0
--lr-warmup-fraction
:
.01
--log-interval
:
1
--save-interval
:
50
--eval-interval
:
1000
--eval-iters
:
10
--transformer-impl
:
transformer_engine
--tensor-model-parallel-size
:
1
--pipeline-model-parallel-size
:
4
--disable-bias-linear
:
true
--async-save
:
true
--use-persistent-ckpt-worker
:
true
--deterministic-mode
:
true
--no-gradient-accumulation-fusion
:
true
--attention-softmax-in-fp32
:
true
--use-checkpoint-opt_param-scheduler
:
true
--use-mcore-models
:
true
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79196
,
"5"
:
10.84662
,
"10"
:
10.76844
,
"15"
:
10.78913
,
"20"
:
10.67859
,
"25"
:
10.50479
,
"30"
:
10.33089
,
"35"
:
10.25263
,
"40"
:
10.05242
,
"45"
:
9.80271
,
"50"
:
9.8884
,
"55"
:
9.86828
,
"60"
:
9.48223
,
"65"
:
8.93813
,
"70"
:
9.72081
,
"75"
:
9.40746
,
"80"
:
9.39636
,
"85"
:
9.60619
,
"90"
:
9.80953
,
"95"
:
9.51078
,
"100"
:
9.39612
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1613.0
,
"5"
:
1926.0
,
"10"
:
1432.0
,
"15"
:
1941.0
,
"20"
:
1592.0
,
"25"
:
1650.0
,
"30"
:
1891.0
,
"35"
:
1963.0
,
"40"
:
2255.0
,
"45"
:
2132.0
,
"50"
:
2411.0
,
"55"
:
2240.0
,
"60"
:
2443.0
,
"65"
:
2672.0
,
"70"
:
3168.0
,
"75"
:
2545.0
,
"80"
:
3353.0
,
"85"
:
3257.0
,
"90"
:
3171.0
,
"95"
:
3247.0
,
"100"
:
3375.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
717083136.0
,
"5"
:
717083136.0
,
"10"
:
717083136.0
,
"15"
:
717083136.0
,
"20"
:
717083136.0
,
"25"
:
717083136.0
,
"30"
:
717083136.0
,
"35"
:
717083136.0
,
"40"
:
717083136.0
,
"45"
:
717083136.0
,
"50"
:
717083136.0
,
"55"
:
717083136.0
,
"60"
:
717083136.0
,
"65"
:
717083136.0
,
"70"
:
717083136.0
,
"75"
:
717083136.0
,
"80"
:
717083136.0
,
"85"
:
717083136.0
,
"90"
:
717083136.0
,
"95"
:
717083136.0
,
"100"
:
717083136.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1910562816.0
,
"5"
:
2193323008.0
,
"10"
:
2193323008.0
,
"15"
:
2193323008.0
,
"20"
:
2193323008.0
,
"25"
:
2193323008.0
,
"30"
:
2193323008.0
,
"35"
:
2193323008.0
,
"40"
:
2193323008.0
,
"45"
:
2193323008.0
,
"50"
:
2193323008.0
,
"55"
:
2193323008.0
,
"60"
:
2193323008.0
,
"65"
:
2193323008.0
,
"70"
:
2193323008.0
,
"75"
:
2193323008.0
,
"80"
:
2193323008.0
,
"85"
:
2193323008.0
,
"90"
:
2193323008.0
,
"95"
:
2193323008.0
,
"100"
:
2193323008.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.72057
,
"5"
:
0.15784
,
"10"
:
0.15852
,
"15"
:
0.1575
,
"20"
:
0.15713
,
"25"
:
0.15769
,
"30"
:
0.15681
,
"35"
:
0.15447
,
"40"
:
0.15299
,
"45"
:
0.15347
,
"50"
:
0.15277
,
"55"
:
0.15216
,
"60"
:
0.15166
,
"65"
:
0.1519
,
"70"
:
0.15205
,
"75"
:
0.15222
,
"80"
:
0.15253
,
"85"
:
0.15199
,
"90"
:
0.15133
,
"95"
:
0.15154
,
"100"
:
0.15192
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.79208
,
"5"
:
10.8459
,
"10"
:
10.76945
,
"15"
:
10.78965
,
"20"
:
10.67868
,
"25"
:
10.50409
,
"30"
:
10.33064
,
"35"
:
10.25257
,
"40"
:
10.0522
,
"45"
:
9.80243
,
"50"
:
9.88792
,
"55"
:
9.86799
,
"60"
:
9.48248
,
"65"
:
8.93796
,
"70"
:
9.72094
,
"75"
:
9.40786
,
"80"
:
9.39646
,
"85"
:
9.60638
,
"90"
:
9.8096
,
"95"
:
9.51078
,
"100"
:
9.39625
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1633.0
,
"5"
:
1952.0
,
"10"
:
1432.0
,
"15"
:
1852.0
,
"20"
:
1592.0
,
"25"
:
1743.0
,
"30"
:
1953.0
,
"35"
:
1986.0
,
"40"
:
2180.0
,
"45"
:
2177.0
,
"50"
:
2468.0
,
"55"
:
2268.0
,
"60"
:
2427.0
,
"65"
:
2640.0
,
"70"
:
3158.0
,
"75"
:
2618.0
,
"80"
:
3274.0
,
"85"
:
3266.0
,
"90"
:
3078.0
,
"95"
:
3342.0
,
"100"
:
3345.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
714985472.0
,
"5"
:
714985472.0
,
"10"
:
714985472.0
,
"15"
:
714985472.0
,
"20"
:
714985472.0
,
"25"
:
714985472.0
,
"30"
:
714985472.0
,
"35"
:
714985472.0
,
"40"
:
714985472.0
,
"45"
:
714985472.0
,
"50"
:
714985472.0
,
"55"
:
714985472.0
,
"60"
:
714985472.0
,
"65"
:
714985472.0
,
"70"
:
714985472.0
,
"75"
:
714985472.0
,
"80"
:
714985472.0
,
"85"
:
714985472.0
,
"90"
:
714985472.0
,
"95"
:
714985472.0
,
"100"
:
714985472.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2399852544.0
,
"5"
:
2681564160.0
,
"10"
:
2681564160.0
,
"15"
:
2681564160.0
,
"20"
:
2681564160.0
,
"25"
:
2681564160.0
,
"30"
:
2681564160.0
,
"35"
:
2681564160.0
,
"40"
:
2681564160.0
,
"45"
:
2681564160.0
,
"50"
:
2681564160.0
,
"55"
:
2681564160.0
,
"60"
:
2681564160.0
,
"65"
:
2681564160.0
,
"70"
:
2681564160.0
,
"75"
:
2681564160.0
,
"80"
:
2681564160.0
,
"85"
:
2681564160.0
,
"90"
:
2681564160.0
,
"95"
:
2681564160.0
,
"100"
:
2681564160.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
15.13387
,
"5"
:
0.16765
,
"10"
:
0.16782
,
"15"
:
0.16572
,
"20"
:
0.16589
,
"25"
:
0.16624
,
"30"
:
0.16596
,
"35"
:
0.16694
,
"40"
:
0.16658
,
"45"
:
0.1656
,
"50"
:
0.16593
,
"55"
:
0.16847
,
"60"
:
0.16671
,
"65"
:
0.16618
,
"70"
:
0.16477
,
"75"
:
0.1663
,
"80"
:
0.16601
,
"85"
:
0.16704
,
"90"
:
0.16563
,
"95"
:
0.16515
,
"100"
:
0.16582
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.74036
,
"5"
:
10.79133
,
"10"
:
10.71217
,
"15"
:
10.75916
,
"20"
:
10.68909
,
"25"
:
10.5421
,
"30"
:
10.45456
,
"35"
:
10.38155
,
"40"
:
10.24241
,
"45"
:
9.9827
,
"50"
:
10.06896
,
"55"
:
9.98885
,
"60"
:
9.66601
,
"65"
:
9.07115
,
"70"
:
9.81824
,
"75"
:
9.55308
,
"80"
:
9.51136
,
"85"
:
9.70682
,
"90"
:
9.87981
,
"95"
:
9.60074
,
"100"
:
9.49208
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2496.0
,
"5"
:
2768.0
,
"10"
:
2420.0
,
"15"
:
2572.0
,
"20"
:
2580.0
,
"25"
:
2521.0
,
"30"
:
2632.0
,
"35"
:
2626.0
,
"40"
:
2628.0
,
"45"
:
2362.0
,
"50"
:
2543.0
,
"55"
:
2498.0
,
"60"
:
2239.0
,
"65"
:
2652.0
,
"70"
:
3100.0
,
"75"
:
2597.0
,
"80"
:
3019.0
,
"85"
:
3171.0
,
"90"
:
3464.0
,
"95"
:
3134.0
,
"100"
:
2555.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
717157888.0
,
"5"
:
717157888.0
,
"10"
:
717157888.0
,
"15"
:
717157888.0
,
"20"
:
717157888.0
,
"25"
:
717157888.0
,
"30"
:
717157888.0
,
"35"
:
717157888.0
,
"40"
:
717157888.0
,
"45"
:
717157888.0
,
"50"
:
717157888.0
,
"55"
:
717157888.0
,
"60"
:
717157888.0
,
"65"
:
717157888.0
,
"70"
:
717157888.0
,
"75"
:
717157888.0
,
"80"
:
717157888.0
,
"85"
:
717157888.0
,
"90"
:
717157888.0
,
"95"
:
717157888.0
,
"100"
:
717157888.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1910556672.0
,
"5"
:
2194053120.0
,
"10"
:
2194053120.0
,
"15"
:
2194053120.0
,
"20"
:
2194053120.0
,
"25"
:
2194053120.0
,
"30"
:
2194053120.0
,
"35"
:
2194053120.0
,
"40"
:
2194053120.0
,
"45"
:
2194053120.0
,
"50"
:
2194053120.0
,
"55"
:
2194053120.0
,
"60"
:
2194053120.0
,
"65"
:
2194053120.0
,
"70"
:
2194053120.0
,
"75"
:
2194053120.0
,
"80"
:
2194053120.0
,
"85"
:
2194053120.0
,
"90"
:
2194053120.0
,
"95"
:
2194053120.0
,
"100"
:
2194053120.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
14.70264
,
"5"
:
0.15996
,
"10"
:
0.15856
,
"15"
:
0.15761
,
"20"
:
0.15818
,
"25"
:
0.15823
,
"30"
:
0.15624
,
"35"
:
0.1572
,
"40"
:
0.15555
,
"45"
:
0.15747
,
"50"
:
0.15543
,
"55"
:
0.15768
,
"60"
:
0.15761
,
"65"
:
0.1577
,
"70"
:
0.41222
,
"75"
:
0.15706
,
"80"
:
0.15755
,
"85"
:
0.15717
,
"90"
:
0.15749
,
"95"
:
0.15708
,
"100"
:
0.15789
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.74049
,
"5"
:
10.79201
,
"10"
:
10.71088
,
"15"
:
10.76031
,
"20"
:
10.68908
,
"25"
:
10.54336
,
"30"
:
10.45425
,
"35"
:
10.38323
,
"40"
:
10.24297
,
"45"
:
9.98344
,
"50"
:
10.06864
,
"55"
:
9.9892
,
"60"
:
9.66702
,
"65"
:
9.07244
,
"70"
:
9.81879
,
"75"
:
9.55278
,
"80"
:
9.51061
,
"85"
:
9.70753
,
"90"
:
9.87996
,
"95"
:
9.60069
,
"100"
:
9.49261
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2527.0
,
"5"
:
2875.0
,
"10"
:
2475.0
,
"15"
:
2508.0
,
"20"
:
2634.0
,
"25"
:
2391.0
,
"30"
:
2505.0
,
"35"
:
2580.0
,
"40"
:
2568.0
,
"45"
:
2375.0
,
"50"
:
2618.0
,
"55"
:
2379.0
,
"60"
:
2183.0
,
"65"
:
2639.0
,
"70"
:
3090.0
,
"75"
:
2496.0
,
"80"
:
3076.0
,
"85"
:
3189.0
,
"90"
:
3454.0
,
"95"
:
3150.0
,
"100"
:
2593.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
715322368.0
,
"5"
:
715322368.0
,
"10"
:
715322368.0
,
"15"
:
715322368.0
,
"20"
:
715322368.0
,
"25"
:
715322368.0
,
"30"
:
715322368.0
,
"35"
:
715322368.0
,
"40"
:
715322368.0
,
"45"
:
715322368.0
,
"50"
:
715322368.0
,
"55"
:
715322368.0
,
"60"
:
715322368.0
,
"65"
:
715322368.0
,
"70"
:
715322368.0
,
"75"
:
715322368.0
,
"80"
:
715322368.0
,
"85"
:
715322368.0
,
"90"
:
715322368.0
,
"95"
:
715322368.0
,
"100"
:
715322368.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2402991104.0
,
"5"
:
2683341824.0
,
"10"
:
2683341824.0
,
"15"
:
2683341824.0
,
"20"
:
2683341824.0
,
"25"
:
2683341824.0
,
"30"
:
2683341824.0
,
"35"
:
2683341824.0
,
"40"
:
2683341824.0
,
"45"
:
2683341824.0
,
"50"
:
2683341824.0
,
"55"
:
2683341824.0
,
"60"
:
2683341824.0
,
"65"
:
2683341824.0
,
"70"
:
2683341824.0
,
"75"
:
2683341824.0
,
"80"
:
2683341824.0
,
"85"
:
2683341824.0
,
"90"
:
2683341824.0
,
"95"
:
2683341824.0
,
"100"
:
2683341824.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
17.64292
,
"5"
:
0.17363
,
"10"
:
0.17156
,
"15"
:
0.17206
,
"20"
:
0.1701
,
"25"
:
0.17207
,
"30"
:
0.16951
,
"35"
:
0.17005
,
"40"
:
0.17036
,
"45"
:
0.17005
,
"50"
:
0.16935
,
"55"
:
0.16909
,
"60"
:
0.16956
,
"65"
:
0.16911
,
"70"
:
0.16772
,
"75"
:
0.16805
,
"80"
:
0.16819
,
"85"
:
0.16813
,
"90"
:
0.30023
,
"95"
:
0.16879
,
"100"
:
0.16784
}}}
\ No newline at end of file
Prev
1
…
19
20
21
22
23
24
25
26
27
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment