Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
688448db
Commit
688448db
authored
Mar 14, 2025
by
silencealiang
Browse files
更新代码
parent
a02a5490
Pipeline
#2503
passed with stage
Changes
823
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
34 additions
and
72 deletions
+34
-72
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json
...iform_full_recompute_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml
...p1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
...p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
...p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
..._a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
+3
-3
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
...p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
...p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
..._a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json
...allel_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json
...allel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml
...s2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
...dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
...dist_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml
...orch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
...nterleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
...nterleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
...ngs_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
..._pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
..._pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml
...e_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml
+3
-2
No files found.
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.8401
,
10.87262
,
10.85023
,
10.79645
,
10.68149
,
10.60617
,
10.1277
,
10.22183
,
10.13794
,
9.8231
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1670.0
,
1901.0
,
1923.0
,
1922.0
,
2020.0
,
1815.0
,
1713.0
,
1963.0
,
2266.0
,
2324.0
]},
"iteration_timing_avg"
:
0.09164500000000002
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8401
,
"5"
:
10.84032
,
"10"
:
10.81341
,
"15"
:
10.80278
,
"20"
:
10.70496
,
"25"
:
10.53846
,
"30"
:
10.35517
,
"35"
:
10.27147
,
"40"
:
10.08045
,
"45"
:
9.82292
,
"50"
:
9.90114
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1670.0
,
"5"
:
1970.0
,
"10"
:
1436.0
,
"15"
:
1918.0
,
"20"
:
1786.0
,
"25"
:
1610.0
,
"30"
:
2039.0
,
"35"
:
2001.0
,
"40"
:
2321.0
,
"45"
:
2205.0
,
"50"
:
2365.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1230390272.0
,
"5"
:
1230390272.0
,
"10"
:
1230390272.0
,
"15"
:
1230390272.0
,
"20"
:
1230390272.0
,
"25"
:
1230390272.0
,
"30"
:
1230390272.0
,
"35"
:
1230390272.0
,
"40"
:
1230390272.0
,
"45"
:
1230390272.0
,
"50"
:
1230390272.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1984492032.0
,
"5"
:
2531972608.0
,
"10"
:
2531972608.0
,
"15"
:
2531972608.0
,
"20"
:
2531972608.0
,
"25"
:
2531972608.0
,
"30"
:
2531972608.0
,
"35"
:
2531972608.0
,
"40"
:
2531972608.0
,
"45"
:
2531972608.0
,
"50"
:
2531972608.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
8.0418
,
"5"
:
0.12998
,
"10"
:
0.12656
,
"15"
:
0.12621
,
"20"
:
0.13103
,
"25"
:
0.12628
,
"30"
:
0.12409
,
"35"
:
0.12632
,
"40"
:
0.13313
,
"45"
:
0.12545
,
"50"
:
0.12421
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -50,4 +50,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.82974
,
10.85934
,
10.88536
,
10.78981
,
10.64534
,
10.56415
,
9.99534
,
10.13972
,
10.06259
,
9.71481
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
261.0
,
256.0
,
258.0
,
250.0
,
243.0
,
265.0
,
254.0
,
299.0
,
299.0
,
294.0
]
},
"iteration
_
tim
ing_avg"
:
0.3993126470588235
}
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82974
,
"5"
:
10.84387
,
"10"
:
10.79336
,
"15"
:
10.77992
,
"20"
:
10.67707
,
"25"
:
10.48581
,
"30"
:
10.28464
,
"35"
:
10.18863
,
"40"
:
9.99275
,
"45"
:
9.72154
,
"50"
:
9.82122
}
},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
214.0
,
"5"
:
270.0
,
"10"
:
224.0
,
"15"
:
235.0
,
"20"
:
242.0
,
"25"
:
260.0
,
"30"
:
280.0
,
"35"
:
300.0
,
"40"
:
334.0
,
"45"
:
324.0
,
"50"
:
298.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
829378048.0
,
"5"
:
829378048.0
,
"10"
:
829378048.0
,
"15"
:
829378048.0
,
"20"
:
829378048.0
,
"25"
:
829378048.0
,
"30"
:
829378048.0
,
"35"
:
829378048.0
,
"40"
:
829378048.0
,
"45"
:
829378048.0
,
"50"
:
829378048.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
891564544.0
,
"5"
:
1248933376.0
,
"10"
:
1250505728.0
,
"15"
:
1250505728.0
,
"20"
:
1250505728.0
,
"25"
:
1250505728.0
,
"30"
:
1250505728.0
,
"35"
:
1250505728.0
,
"40"
:
1250505728.0
,
"45"
:
1250505728.0
,
"50"
:
1250505728.0
}
},
"iteration
-
tim
e"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
20.58657
,
"5"
:
0.44565
,
"10"
:
0.45716
,
"15"
:
0.50953
,
"20"
:
0.44872
,
"25"
:
0.44791
,
"30"
:
0.44871
,
"35"
:
0.44188
,
"40"
:
0.44233
,
"45"
:
0.44161
,
"50"
:
0.44069
}}
}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.85803
,
10.88122
,
10.858
32
,
10.80987
,
10.66115
,
10.55375
,
10.01843
,
10
.14234
,
10.05958
,
9.71149
]},
"num-zero
s"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
244.0
,
231.0
,
243.0
,
257.0
,
247.0
,
267.0
,
256.0
,
299.0
,
318.0
,
325
.0
]
},
"iteration
_
tim
ing_avg"
:
0.3993126470588235
}
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82975
,
"5"
:
10.8439
,
"10"
:
10.79337
,
"15"
:
10.77994
,
"20"
:
10.67712
,
"25"
:
10.
4
858
4
,
"30"
:
10.28468
,
"35"
:
10.18859
,
"40"
:
9.99279
,
"45"
:
9.72153
,
"50"
:
9.82127
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
226.0
,
"5"
:
275.0
,
"10"
:
181.0
,
"15"
:
253.0
,
"20"
:
248.0
,
"25"
:
207.0
,
"30"
:
265.0
,
"35"
:
281.0
,
"40"
:
315.0
,
"45"
:
282.0
,
"50"
:
336.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
831212544.0
,
"5"
:
831212544.0
,
"
10
"
:
831212544.0
,
"15"
:
831212544.0
,
"20"
:
831212544.0
,
"25"
:
831212544.0
,
"30"
:
831212544.0
,
"35"
:
831212544.0
,
"40"
:
831212544.0
,
"45"
:
831212544.0
,
"50"
:
831212544.0
}},
"mem-max-allocated-byte
s"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
89158
24
6
4.0
,
"5"
:
1250786304.0
,
"10"
:
1250786304.0
,
"15"
:
1250786304.0
,
"20"
:
1250786304.0
,
"25"
:
1251832320.0
,
"30"
:
1251832320.0
,
"35"
:
1251832320.0
,
"40"
:
1251832320.0
,
"45"
:
1251832320.0
,
"50"
:
1251832320
.0
}
},
"iteration
-
tim
e"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.63617
,
"5"
:
0.42436
,
"10"
:
0.41552
,
"15"
:
0.4158
,
"20"
:
0.41223
,
"25"
:
0.40643
,
"30"
:
0.40417
,
"35"
:
0.40442
,
"40"
:
0.40546
,
"45"
:
0.40627
,
"50"
:
0.40596
}}
}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -17,8 +17,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -50,5 +50,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
flash
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82974
,
"5"
:
10.84387
,
"10"
:
10.79336
,
"15"
:
10.77992
,
"20"
:
10.67707
,
"25"
:
10.48581
,
"30"
:
10.28464
,
"35"
:
10.18863
,
"40"
:
9.99275
,
"45"
:
9.72154
,
"50"
:
9.82122
,
"55"
:
9.79605
,
"60"
:
9.41615
,
"65"
:
8.85917
,
"70"
:
9.67001
,
"75"
:
9.3564
,
"80"
:
9.34748
,
"85"
:
9.55946
,
"90"
:
9.77362
,
"95"
:
9.47863
,
"100"
:
9.35146
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
214.0
,
"5"
:
270.0
,
"10"
:
224.0
,
"15"
:
235.0
,
"20"
:
242.0
,
"25"
:
260.0
,
"30"
:
280.0
,
"35"
:
300.0
,
"40"
:
334.0
,
"45"
:
324.0
,
"50"
:
298.0
,
"55"
:
390.0
,
"60"
:
342.0
,
"65"
:
394.0
,
"70"
:
411.0
,
"75"
:
319.0
,
"80"
:
414.0
,
"85"
:
441.0
,
"90"
:
381.0
,
"95"
:
398.0
,
"100"
:
431.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
829378048.0
,
"5"
:
829378048.0
,
"10"
:
829378048.0
,
"15"
:
829378048.0
,
"20"
:
829378048.0
,
"25"
:
829378048.0
,
"30"
:
829378048.0
,
"35"
:
829378048.0
,
"40"
:
829378048.0
,
"45"
:
829378048.0
,
"50"
:
829378048.0
,
"55"
:
829378048.0
,
"60"
:
829378048.0
,
"65"
:
829378048.0
,
"70"
:
829378048.0
,
"75"
:
829378048.0
,
"80"
:
829378048.0
,
"85"
:
829378048.0
,
"90"
:
829378048.0
,
"95"
:
829378048.0
,
"100"
:
829378048.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
892610560.0
,
"5"
:
1248933376.0
,
"10"
:
1248933376.0
,
"15"
:
1248933376.0
,
"20"
:
1248933376.0
,
"25"
:
1248933376.0
,
"30"
:
1248933376.0
,
"35"
:
1249456128.0
,
"40"
:
1249456128.0
,
"45"
:
1249456128.0
,
"50"
:
1249980928.0
,
"55"
:
1249980928.0
,
"60"
:
1249980928.0
,
"65"
:
1249980928.0
,
"70"
:
1249980928.0
,
"75"
:
1250504192.0
,
"80"
:
1250504192.0
,
"85"
:
1250504192.0
,
"90"
:
1250505728.0
,
"95"
:
1250505728.0
,
"100"
:
1250505728.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
24.66296
,
"5"
:
0.45069
,
"10"
:
0.44192
,
"15"
:
0.44436
,
"20"
:
0.442
,
"25"
:
0.44288
,
"30"
:
0.44618
,
"35"
:
0.44139
,
"40"
:
0.44072
,
"45"
:
0.44429
,
"50"
:
0.43893
,
"55"
:
0.43569
,
"60"
:
0.43551
,
"65"
:
0.43912
,
"70"
:
0.44568
,
"75"
:
0.44023
,
"80"
:
0.43745
,
"85"
:
0.43617
,
"90"
:
0.43925
,
"95"
:
0.43653
,
"100"
:
0.43561
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82975
,
"5"
:
10.8439
,
"10"
:
10.79337
,
"15"
:
10.77994
,
"20"
:
10.67712
,
"25"
:
10.48584
,
"30"
:
10.28468
,
"35"
:
10.18859
,
"40"
:
9.99279
,
"45"
:
9.72153
,
"50"
:
9.82127
,
"55"
:
9.79611
,
"60"
:
9.41616
,
"65"
:
8.85917
,
"70"
:
9.67001
,
"75"
:
9.35641
,
"80"
:
9.34751
,
"85"
:
9.55947
,
"90"
:
9.77366
,
"95"
:
9.47865
,
"100"
:
9.35145
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
226.0
,
"5"
:
275.0
,
"10"
:
181.0
,
"15"
:
253.0
,
"20"
:
248.0
,
"25"
:
207.0
,
"30"
:
265.0
,
"35"
:
281.0
,
"40"
:
315.0
,
"45"
:
282.0
,
"50"
:
336.0
,
"55"
:
373.0
,
"60"
:
343.0
,
"65"
:
389.0
,
"70"
:
436.0
,
"75"
:
337.0
,
"80"
:
395.0
,
"85"
:
419.0
,
"90"
:
412.0
,
"95"
:
405.0
,
"100"
:
394.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
831212544.0
,
"5"
:
831212544.0
,
"10"
:
831212544.0
,
"15"
:
831212544.0
,
"20"
:
831212544.0
,
"25"
:
831212544.0
,
"30"
:
831212544.0
,
"35"
:
831212544.0
,
"40"
:
831212544.0
,
"45"
:
831212544.0
,
"50"
:
831212544.0
,
"55"
:
831212544.0
,
"60"
:
831212544.0
,
"65"
:
831212544.0
,
"70"
:
831212544.0
,
"75"
:
831212544.0
,
"80"
:
831212544.0
,
"85"
:
831212544.0
,
"90"
:
831212544.0
,
"95"
:
831212544.0
,
"100"
:
831212544.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
891582464.0
,
"5"
:
1250786304.0
,
"10"
:
1250786304.0
,
"15"
:
1250786304.0
,
"20"
:
1250786304.0
,
"25"
:
1250786304.0
,
"30"
:
1250786304.0
,
"35"
:
1250786304.0
,
"40"
:
1251834880.0
,
"45"
:
1251834880.0
,
"50"
:
1251834880.0
,
"55"
:
1251834880.0
,
"60"
:
1251834880.0
,
"65"
:
1251834880.0
,
"70"
:
1251834880.0
,
"75"
:
1251834880.0
,
"80"
:
1251834880.0
,
"85"
:
1251834880.0
,
"90"
:
1251834880.0
,
"95"
:
1251834880.0
,
"100"
:
1251834880.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
14.7102
,
"5"
:
0.46307
,
"10"
:
0.41777
,
"15"
:
0.41661
,
"20"
:
0.41769
,
"25"
:
0.42698
,
"30"
:
0.41765
,
"35"
:
0.42804
,
"40"
:
0.42081
,
"45"
:
0.42234
,
"50"
:
0.41276
,
"55"
:
0.43287
,
"60"
:
0.43055
,
"65"
:
0.43352
,
"70"
:
0.42189
,
"75"
:
0.42153
,
"80"
:
0.41723
,
"85"
:
0.40522
,
"90"
:
0.40231
,
"95"
:
0.4016
,
"100"
:
0.40172
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -17,8 +17,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -51,4 +51,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
flash
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82949
,
"5"
:
10.84768
,
"10"
:
10.79952
,
"15"
:
10.83278
,
"20"
:
10.75815
,
"25"
:
10.59944
,
"30"
:
10.44255
,
"35"
:
10.35518
,
"40"
:
10.17871
,
"45"
:
9.93731
,
"50"
:
9.99597
,
"55"
:
9.96506
,
"60"
:
9.59206
,
"65"
:
9.01654
,
"70"
:
9.78255
,
"75"
:
9.48023
,
"80"
:
9.4506
,
"85"
:
9.65781
,
"90"
:
9.84565
,
"95"
:
9.54832
,
"100"
:
9.43863
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
30437.0
,
"5"
:
35925.0
,
"10"
:
29186.0
,
"15"
:
34264.0
,
"20"
:
32053.0
,
"25"
:
30879.0
,
"30"
:
33163.0
,
"35"
:
34561.0
,
"40"
:
35765.0
,
"45"
:
35584.0
,
"50"
:
39786.0
,
"55"
:
37204.0
,
"60"
:
40266.0
,
"65"
:
41421.0
,
"70"
:
45637.0
,
"75"
:
40348.0
,
"80"
:
46876.0
,
"85"
:
49638.0
,
"90"
:
49468.0
,
"95"
:
47017.0
,
"100"
:
45528.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
936543232.0
,
"5"
:
936543744.0
,
"10"
:
936542720.0
,
"15"
:
936543232.0
,
"20"
:
936544768.0
,
"25"
:
936543232.0
,
"30"
:
936543232.0
,
"35"
:
936541184.0
,
"40"
:
936542720.0
,
"45"
:
936543232.0
,
"50"
:
936544256.0
,
"55"
:
936546816.0
,
"60"
:
936547328.0
,
"65"
:
936556032.0
,
"70"
:
936546816.0
,
"75"
:
936544256.0
,
"80"
:
936556544.0
,
"85"
:
936553984.0
,
"90"
:
936546304.0
,
"95"
:
936548352.0
,
"100"
:
936551936.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2451792384.0
,
"5"
:
2720901120.0
,
"10"
:
2720901120.0
,
"15"
:
2720901120.0
,
"20"
:
2720901120.0
,
"25"
:
2720901120.0
,
"30"
:
2720901120.0
,
"35"
:
2721362432.0
,
"40"
:
2721362432.0
,
"45"
:
2721362432.0
,
"50"
:
2724393984.0
,
"55"
:
2724393984.0
,
"60"
:
2728018432.0
,
"65"
:
2738826240.0
,
"70"
:
2738826240.0
,
"75"
:
2740684288.0
,
"80"
:
2740684288.0
,
"85"
:
2740684288.0
,
"90"
:
2741338624.0
,
"95"
:
2741338624.0
,
"100"
:
2741338624.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
17.42632
,
"5"
:
0.24404
,
"10"
:
0.242
,
"15"
:
0.23944
,
"20"
:
0.23931
,
"25"
:
0.23806
,
"30"
:
0.23357
,
"35"
:
0.23421
,
"40"
:
0.23628
,
"45"
:
0.23522
,
"50"
:
0.23575
,
"55"
:
0.24699
,
"60"
:
0.24808
,
"65"
:
0.25066
,
"70"
:
0.23754
,
"75"
:
0.23814
,
"80"
:
0.23925
,
"85"
:
0.23699
,
"90"
:
0.23541
,
"95"
:
0.23763
,
"100"
:
0.23866
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82949
,
"5"
:
10.84751
,
"10"
:
10.79994
,
"15"
:
10.83348
,
"20"
:
10.75739
,
"25"
:
10.59863
,
"30"
:
10.44207
,
"35"
:
10.35534
,
"40"
:
10.17846
,
"45"
:
9.93775
,
"50"
:
9.99583
,
"55"
:
9.96526
,
"60"
:
9.59209
,
"65"
:
9.01675
,
"70"
:
9.78268
,
"75"
:
9.4802
,
"80"
:
9.45051
,
"85"
:
9.65787
,
"90"
:
9.84587
,
"95"
:
9.54779
,
"100"
:
9.43905
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
30304.0
,
"5"
:
35542.0
,
"10"
:
29062.0
,
"15"
:
34559.0
,
"20"
:
31981.0
,
"25"
:
30845.0
,
"30"
:
32894.0
,
"35"
:
34952.0
,
"40"
:
36358.0
,
"45"
:
35638.0
,
"50"
:
40119.0
,
"55"
:
36895.0
,
"60"
:
39710.0
,
"65"
:
41463.0
,
"70"
:
45566.0
,
"75"
:
40307.0
,
"80"
:
46882.0
,
"85"
:
50049.0
,
"90"
:
49238.0
,
"95"
:
47300.0
,
"100"
:
45898.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
936567296.0
,
"5"
:
936566784.0
,
"10"
:
936566784.0
,
"15"
:
936567808.0
,
"20"
:
936568832.0
,
"25"
:
936565760.0
,
"30"
:
936568320.0
,
"35"
:
936564736.0
,
"40"
:
936566784.0
,
"45"
:
936566784.0
,
"50"
:
936567808.0
,
"55"
:
936570880.0
,
"60"
:
936570880.0
,
"65"
:
936580608.0
,
"70"
:
936571392.0
,
"75"
:
936568320.0
,
"80"
:
936580608.0
,
"85"
:
936578560.0
,
"90"
:
936569856.0
,
"95"
:
936572416.0
,
"100"
:
936576512.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
24.80877
,
"5"
:
0.2156
,
"10"
:
0.23039
,
"15"
:
0.21152
,
"20"
:
0.21327
,
"25"
:
0.2116
,
"30"
:
0.20846
,
"35"
:
0.2099
,
"40"
:
0.20891
,
"45"
:
0.20828
,
"50"
:
0.20799
,
"55"
:
0.20851
,
"60"
:
0.20961
,
"65"
:
0.21172
,
"70"
:
0.20966
,
"75"
:
0.20994
,
"80"
:
0.21009
,
"85"
:
0.20683
,
"90"
:
0.20599
,
"95"
:
0.20814
,
"100"
:
0.20924
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -17,8 +17,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -52,4 +52,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84764
,
"5"
:
10.86567
,
"10"
:
10.82469
,
"15"
:
10.81348
,
"20"
:
10.72058
,
"25"
:
10.53162
,
"30"
:
10.33683
,
"35"
:
10.24089
,
"40"
:
10.05113
,
"45"
:
9.76815
,
"50"
:
9.85503
,
"55"
:
9.82458
,
"60"
:
9.44286
,
"65"
:
8.89124
,
"70"
:
9.67905
,
"75"
:
9.36822
,
"80"
:
9.35789
,
"85"
:
9.56054
,
"90"
:
9.77055
,
"95"
:
9.48111
,
"100"
:
9.34966
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1736.0
,
"5"
:
1989.0
,
"10"
:
1643.0
,
"15"
:
1984.0
,
"20"
:
1713.0
,
"25"
:
1775.0
,
"30"
:
2005.0
,
"35"
:
2093.0
,
"40"
:
2238.0
,
"45"
:
2229.0
,
"50"
:
2348.0
,
"55"
:
2407.0
,
"60"
:
2545.0
,
"65"
:
2732.0
,
"70"
:
3041.0
,
"75"
:
2930.0
,
"80"
:
3261.0
,
"85"
:
3370.0
,
"90"
:
3188.0
,
"95"
:
3193.0
,
"100"
:
3397.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
894390272.0
,
"5"
:
894390272.0
,
"10"
:
894390272.0
,
"15"
:
894390272.0
,
"20"
:
894390272.0
,
"25"
:
894390272.0
,
"30"
:
894390272.0
,
"35"
:
894390272.0
,
"40"
:
894390272.0
,
"45"
:
894390272.0
,
"50"
:
894390272.0
,
"55"
:
894390272.0
,
"60"
:
894390272.0
,
"65"
:
894390272.0
,
"70"
:
894390272.0
,
"75"
:
894390272.0
,
"80"
:
894390272.0
,
"85"
:
894390272.0
,
"90"
:
894390272.0
,
"95"
:
894390272.0
,
"100"
:
894390272.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2233004032.0
,
"5"
:
2597712896.0
,
"10"
:
2597712896.0
,
"15"
:
2597712896.0
,
"20"
:
2597712896.0
,
"25"
:
2597712896.0
,
"30"
:
2597712896.0
,
"35"
:
2597712896.0
,
"40"
:
2597712896.0
,
"45"
:
2597712896.0
,
"50"
:
2597712896.0
,
"55"
:
2597712896.0
,
"60"
:
2597712896.0
,
"65"
:
2597712896.0
,
"70"
:
2597712896.0
,
"75"
:
2597712896.0
,
"80"
:
2597712896.0
,
"85"
:
2597712896.0
,
"90"
:
2597712896.0
,
"95"
:
2597712896.0
,
"100"
:
2597712896.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
9.04286
,
"5"
:
0.12525
,
"10"
:
0.12905
,
"15"
:
0.12687
,
"20"
:
0.12848
,
"25"
:
0.12854
,
"30"
:
0.12621
,
"35"
:
0.1283
,
"40"
:
0.12782
,
"45"
:
0.12535
,
"50"
:
0.12584
,
"55"
:
0.12504
,
"60"
:
0.1249
,
"65"
:
0.36941
,
"70"
:
0.12553
,
"75"
:
0.12455
,
"80"
:
0.12658
,
"85"
:
0.12479
,
"90"
:
0.12521
,
"95"
:
0.12546
,
"100"
:
0.1255
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8468
,
"5"
:
10.8657
,
"10"
:
10.82411
,
"15"
:
10.8128
,
"20"
:
10.72008
,
"25"
:
10.53151
,
"30"
:
10.33655
,
"35"
:
10.24133
,
"40"
:
10.05096
,
"45"
:
9.76804
,
"50"
:
9.85531
,
"55"
:
9.82458
,
"60"
:
9.4433
,
"65"
:
8.89103
,
"70"
:
9.67922
,
"75"
:
9.36864
,
"80"
:
9.35829
,
"85"
:
9.56053
,
"90"
:
9.77063
,
"95"
:
9.48104
,
"100"
:
9.34984
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1707.0
,
"5"
:
2121.0
,
"10"
:
1606.0
,
"15"
:
1959.0
,
"20"
:
1756.0
,
"25"
:
1848.0
,
"30"
:
2091.0
,
"35"
:
2089.0
,
"40"
:
2156.0
,
"45"
:
2137.0
,
"50"
:
2317.0
,
"55"
:
2485.0
,
"60"
:
2487.0
,
"65"
:
2748.0
,
"70"
:
3067.0
,
"75"
:
2801.0
,
"80"
:
3131.0
,
"85"
:
3343.0
,
"90"
:
3084.0
,
"95"
:
3062.0
,
"100"
:
3270.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
888098304.0
,
"5"
:
888098304.0
,
"10"
:
888098304.0
,
"15"
:
888098304.0
,
"20"
:
888098304.0
,
"25"
:
888098304.0
,
"30"
:
888098304.0
,
"35"
:
888098304.0
,
"40"
:
888098304.0
,
"45"
:
888098304.0
,
"50"
:
888098304.0
,
"55"
:
888098304.0
,
"60"
:
888098304.0
,
"65"
:
888098304.0
,
"70"
:
888098304.0
,
"75"
:
888098304.0
,
"80"
:
888098304.0
,
"85"
:
888098304.0
,
"90"
:
888098304.0
,
"95"
:
888098304.0
,
"100"
:
888098304.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3212632576.0
,
"5"
:
3572098560.0
,
"10"
:
3572098560.0
,
"15"
:
3572098560.0
,
"20"
:
3572098560.0
,
"25"
:
3572098560.0
,
"30"
:
3572098560.0
,
"35"
:
3572098560.0
,
"40"
:
3572098560.0
,
"45"
:
3572098560.0
,
"50"
:
3572098560.0
,
"55"
:
3572098560.0
,
"60"
:
3572098560.0
,
"65"
:
3572098560.0
,
"70"
:
3572098560.0
,
"75"
:
3572098560.0
,
"80"
:
3572098560.0
,
"85"
:
3572098560.0
,
"90"
:
3572098560.0
,
"95"
:
3572098560.0
,
"100"
:
3572098560.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.16354
,
"5"
:
0.14461
,
"10"
:
0.14503
,
"15"
:
0.14287
,
"20"
:
0.14648
,
"25"
:
0.14267
,
"30"
:
0.14304
,
"35"
:
0.14471
,
"40"
:
0.14334
,
"45"
:
0.14299
,
"50"
:
0.14181
,
"55"
:
0.14263
,
"60"
:
0.14235
,
"65"
:
0.14203
,
"70"
:
0.14227
,
"75"
:
0.14188
,
"80"
:
0.14258
,
"85"
:
0.14302
,
"90"
:
0.14176
,
"95"
:
0.14354
,
"100"
:
0.14267
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84554
,
"5"
:
10.86415
,
"10"
:
10.82215
,
"15"
:
10.81274
,
"20"
:
10.71915
,
"25"
:
10.53056
,
"30"
:
10.33604
,
"35"
:
10.24047
,
"40"
:
10.05025
,
"45"
:
9.76775
,
"50"
:
9.85479
,
"55"
:
9.82458
,
"60"
:
9.44264
,
"65"
:
8.89112
,
"70"
:
9.6789
,
"75"
:
9.36801
,
"80"
:
9.3576
,
"85"
:
9.56029
,
"90"
:
9.77049
,
"95"
:
9.48101
,
"100"
:
9.34984
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1700.0
,
"5"
:
2064.0
,
"10"
:
1561.0
,
"15"
:
1975.0
,
"20"
:
1696.0
,
"25"
:
1796.0
,
"30"
:
2014.0
,
"35"
:
2041.0
,
"40"
:
2189.0
,
"45"
:
2150.0
,
"50"
:
2403.0
,
"55"
:
2453.0
,
"60"
:
2540.0
,
"65"
:
2707.0
,
"70"
:
3080.0
,
"75"
:
2725.0
,
"80"
:
3156.0
,
"85"
:
3362.0
,
"90"
:
3032.0
,
"95"
:
3108.0
,
"100"
:
3352.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
890195968.0
,
"5"
:
890195968.0
,
"10"
:
890195968.0
,
"15"
:
890195968.0
,
"20"
:
890195968.0
,
"25"
:
890195968.0
,
"30"
:
890195968.0
,
"35"
:
890195968.0
,
"40"
:
890195968.0
,
"45"
:
890195968.0
,
"50"
:
890195968.0
,
"55"
:
890195968.0
,
"60"
:
890195968.0
,
"65"
:
890195968.0
,
"70"
:
890195968.0
,
"75"
:
890195968.0
,
"80"
:
890195968.0
,
"85"
:
890195968.0
,
"90"
:
890195968.0
,
"95"
:
890195968.0
,
"100"
:
890195968.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2236149760.0
,
"5"
:
2596664320.0
,
"10"
:
2596664320.0
,
"15"
:
2596664320.0
,
"20"
:
2596664320.0
,
"25"
:
2596664320.0
,
"30"
:
2596664320.0
,
"35"
:
2596664320.0
,
"40"
:
2596664320.0
,
"45"
:
2596664320.0
,
"50"
:
2596664320.0
,
"55"
:
2596664320.0
,
"60"
:
2596664320.0
,
"65"
:
2596664320.0
,
"70"
:
2596664320.0
,
"75"
:
2596664320.0
,
"80"
:
2596664320.0
,
"85"
:
2596664320.0
,
"90"
:
2596664320.0
,
"95"
:
2596664320.0
,
"100"
:
2596664320.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
8.95666
,
"5"
:
0.15388
,
"10"
:
0.15258
,
"15"
:
0.15019
,
"20"
:
0.14968
,
"25"
:
0.14923
,
"30"
:
0.14924
,
"35"
:
0.14855
,
"40"
:
0.14992
,
"45"
:
0.14894
,
"50"
:
0.14897
,
"55"
:
0.15057
,
"60"
:
0.14854
,
"65"
:
0.14894
,
"70"
:
0.15078
,
"75"
:
0.14842
,
"80"
:
0.1482
,
"85"
:
0.14764
,
"90"
:
0.14679
,
"95"
:
0.14761
,
"100"
:
0.1488
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84474
,
"5"
:
10.86418
,
"10"
:
10.82155
,
"15"
:
10.81195
,
"20"
:
10.71872
,
"25"
:
10.53036
,
"30"
:
10.3358
,
"35"
:
10.24082
,
"40"
:
10.05008
,
"45"
:
9.76762
,
"50"
:
9.85505
,
"55"
:
9.82465
,
"60"
:
9.44305
,
"65"
:
8.89104
,
"70"
:
9.67902
,
"75"
:
9.36836
,
"80"
:
9.35799
,
"85"
:
9.56032
,
"90"
:
9.77055
,
"95"
:
9.48101
,
"100"
:
9.34997
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1776.0
,
"5"
:
2128.0
,
"10"
:
1615.0
,
"15"
:
2021.0
,
"20"
:
1775.0
,
"25"
:
1916.0
,
"30"
:
2029.0
,
"35"
:
2107.0
,
"40"
:
2174.0
,
"45"
:
2110.0
,
"50"
:
2363.0
,
"55"
:
2460.0
,
"60"
:
2462.0
,
"65"
:
2724.0
,
"70"
:
2952.0
,
"75"
:
2823.0
,
"80"
:
3222.0
,
"85"
:
3314.0
,
"90"
:
3087.0
,
"95"
:
3146.0
,
"100"
:
3331.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
888098304.0
,
"5"
:
888098304.0
,
"10"
:
888098304.0
,
"15"
:
888098304.0
,
"20"
:
888098304.0
,
"25"
:
888098304.0
,
"30"
:
888098304.0
,
"35"
:
888098304.0
,
"40"
:
888098304.0
,
"45"
:
888098304.0
,
"50"
:
888098304.0
,
"55"
:
888098304.0
,
"60"
:
888098304.0
,
"65"
:
888098304.0
,
"70"
:
888098304.0
,
"75"
:
888098304.0
,
"80"
:
888098304.0
,
"85"
:
888098304.0
,
"90"
:
888098304.0
,
"95"
:
888098304.0
,
"100"
:
888098304.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3215778304.0
,
"5"
:
3575244288.0
,
"10"
:
3575244288.0
,
"15"
:
3575244288.0
,
"20"
:
3575244288.0
,
"25"
:
3575244288.0
,
"30"
:
3575244288.0
,
"35"
:
3575244288.0
,
"40"
:
3575244288.0
,
"45"
:
3575244288.0
,
"50"
:
3575244288.0
,
"55"
:
3575244288.0
,
"60"
:
3575244288.0
,
"65"
:
3575244288.0
,
"70"
:
3575244288.0
,
"75"
:
3575244288.0
,
"80"
:
3575244288.0
,
"85"
:
3575244288.0
,
"90"
:
3575244288.0
,
"95"
:
3575244288.0
,
"100"
:
3575244288.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.33569
,
"5"
:
0.16136
,
"10"
:
0.15782
,
"15"
:
0.15802
,
"20"
:
0.15824
,
"25"
:
0.16808
,
"30"
:
0.16851
,
"35"
:
0.1675
,
"40"
:
0.16865
,
"45"
:
0.16815
,
"50"
:
0.16766
,
"55"
:
0.1655
,
"60"
:
0.16617
,
"65"
:
0.16519
,
"70"
:
0.16575
,
"75"
:
0.16497
,
"80"
:
0.16524
,
"85"
:
0.16595
,
"90"
:
0.16421
,
"95"
:
0.16539
,
"100"
:
0.16546
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -50,4 +50,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.8468
,
10.87769
,
10.90302
,
10.82026
,
10.67979
,
10.60157
,
10.06449
,
10.19316
,
10.11411
,
9.76007
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1692.0
,
2044.0
,
2005.0
,
2007.0
,
1945.0
,
1868.0
,
1701.0
,
2085.0
,
2389.0
,
2377.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.20538
,
0.14353
,
0.14213
,
0.14213
,
0.14068
,
0.14104
,
0.14078
,
0.14149
,
0.14065
,
0.14118
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8468
,
"5"
:
10.86571
,
"10"
:
10.82412
,
"15"
:
10.8128
,
"20"
:
10.7201
,
"25"
:
10.53149
,
"30"
:
10.33653
,
"35"
:
10.24134
,
"40"
:
10.05092
,
"45"
:
9.76805
,
"50"
:
9.85531
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1692.0
,
"5"
:
2135.0
,
"10"
:
1681.0
,
"15"
:
2053.0
,
"20"
:
1708.0
,
"25"
:
1835.0
,
"30"
:
2038.0
,
"35"
:
2087.0
,
"40"
:
2276.0
,
"45"
:
2125.0
,
"50"
:
2363.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
888098304.0
,
"5"
:
888098304.0
,
"10"
:
888098304.0
,
"15"
:
888098304.0
,
"20"
:
888098304.0
,
"25"
:
888098304.0
,
"30"
:
888098304.0
,
"35"
:
888098304.0
,
"40"
:
888098304.0
,
"45"
:
888098304.0
,
"50"
:
888098304.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3212632576.0
,
"5"
:
3572098560.0
,
"10"
:
3572098560.0
,
"15"
:
3572098560.0
,
"20"
:
3572098560.0
,
"25"
:
3572098560.0
,
"30"
:
3572098560.0
,
"35"
:
3572098560.0
,
"40"
:
3572098560.0
,
"45"
:
3572098560.0
,
"50"
:
3572098560.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
9.88958
,
"5"
:
0.14651
,
"10"
:
0.14518
,
"15"
:
0.14433
,
"20"
:
0.14484
,
"25"
:
0.14428
,
"30"
:
0.14459
,
"35"
:
0.1448
,
"40"
:
0.14541
,
"45"
:
0.14409
,
"50"
:
0.14459
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.8468
,
10.87772
,
10.90302
,
10.82024
,
10.67979
,
10.60157
,
10.06448
,
10.19311
,
10.1141
,
9.76008
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1707.0
,
2086.0
,
2030.0
,
2000.0
,
1910.0
,
1894.0
,
1744.0
,
2071.0
,
2344.0
,
2377.0
]},
"iteration_timing_avg"
:
0.11051617647058823
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8468
,
"5"
:
10.8657
,
"10"
:
10.82411
,
"15"
:
10.8128
,
"20"
:
10.72008
,
"25"
:
10.53151
,
"30"
:
10.33655
,
"35"
:
10.24133
,
"40"
:
10.05096
,
"45"
:
9.76804
,
"50"
:
9.85531
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1707.0
,
"5"
:
2121.0
,
"10"
:
1606.0
,
"15"
:
1959.0
,
"20"
:
1756.0
,
"25"
:
1848.0
,
"30"
:
2091.0
,
"35"
:
2089.0
,
"40"
:
2156.0
,
"45"
:
2137.0
,
"50"
:
2317.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
888098304.0
,
"5"
:
888098304.0
,
"10"
:
888098304.0
,
"15"
:
888098304.0
,
"20"
:
888098304.0
,
"25"
:
888098304.0
,
"30"
:
888098304.0
,
"35"
:
888098304.0
,
"40"
:
888098304.0
,
"45"
:
888098304.0
,
"50"
:
888098304.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3212632576.0
,
"5"
:
3572098560.0
,
"10"
:
3572098560.0
,
"15"
:
3572098560.0
,
"20"
:
3572098560.0
,
"25"
:
3572098560.0
,
"30"
:
3572098560.0
,
"35"
:
3572098560.0
,
"40"
:
3572098560.0
,
"45"
:
3572098560.0
,
"50"
:
3572098560.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.69368
,
"5"
:
0.1479
,
"10"
:
0.14574
,
"15"
:
0.14499
,
"20"
:
0.14659
,
"25"
:
0.14524
,
"30"
:
0.14507
,
"35"
:
0.14609
,
"40"
:
0.1467
,
"45"
:
0.14341
,
"50"
:
0.14274
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
Prev
1
…
18
19
20
21
22
23
24
25
26
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment