Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
688448db
Commit
688448db
authored
Mar 14, 2025
by
silencealiang
Browse files
更新代码
parent
a02a5490
Pipeline
#2503
passed with stage
Changes
823
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
508 additions
and
12 deletions
+508
-12
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml
...ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json
...edding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json
...edding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml
...r_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
...p2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
...p2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
...tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev.json
...n_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json
...n_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml
...ention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
...st_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
...st_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
...ch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev.json
...ist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts.json
...ist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml
...rch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
...z3_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
+142
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
...z3_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
+142
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
...2_zp_z3_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
+54
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev.json
...2parallel_top2router_dgx_a100_1N8G/golden_values_dev.json
+142
-0
No files found.
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -48,4 +48,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92655
,
"5"
:
10.92715
,
"10"
:
10.90788
,
"15"
:
10.88296
,
"20"
:
10.77598
,
"25"
:
10.59263
,
"30"
:
10.39177
,
"35"
:
10.297
,
"40"
:
10.09664
,
"45"
:
9.84468
,
"50"
:
9.90938
,
"55"
:
9.87767
,
"60"
:
9.4912
,
"65"
:
8.94239
,
"70"
:
9.72271
,
"75"
:
9.41883
,
"80"
:
9.40054
,
"85"
:
9.61183
,
"90"
:
9.81021
,
"95"
:
9.51721
,
"100"
:
9.40125
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
61.0
,
"5"
:
67.0
,
"10"
:
45.0
,
"15"
:
63.0
,
"20"
:
62.0
,
"25"
:
59.0
,
"30"
:
62.0
,
"35"
:
73.0
,
"40"
:
68.0
,
"45"
:
80.0
,
"50"
:
96.0
,
"55"
:
51.0
,
"60"
:
83.0
,
"65"
:
93.0
,
"70"
:
91.0
,
"75"
:
76.0
,
"80"
:
78.0
,
"85"
:
78.0
,
"90"
:
88.0
,
"95"
:
82.0
,
"100"
:
90.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
487096832.0
,
"5"
:
487096832.0
,
"10"
:
487096832.0
,
"15"
:
487096832.0
,
"20"
:
487096832.0
,
"25"
:
487096832.0
,
"30"
:
487096832.0
,
"35"
:
487096832.0
,
"40"
:
487096832.0
,
"45"
:
487096832.0
,
"50"
:
487096832.0
,
"55"
:
487096832.0
,
"60"
:
487096832.0
,
"65"
:
487096832.0
,
"70"
:
487096832.0
,
"75"
:
487096832.0
,
"80"
:
487096832.0
,
"85"
:
487096832.0
,
"90"
:
487096832.0
,
"95"
:
487096832.0
,
"100"
:
487096832.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1668052480.0
,
"5"
:
1848125952.0
,
"10"
:
1848125952.0
,
"15"
:
1848125952.0
,
"20"
:
1848125952.0
,
"25"
:
1848125952.0
,
"30"
:
1848125952.0
,
"35"
:
1848125952.0
,
"40"
:
1848125952.0
,
"45"
:
1848125952.0
,
"50"
:
1848125952.0
,
"55"
:
1848125952.0
,
"60"
:
1848125952.0
,
"65"
:
1848125952.0
,
"70"
:
1848125952.0
,
"75"
:
1848125952.0
,
"80"
:
1848125952.0
,
"85"
:
1848125952.0
,
"90"
:
1848125952.0
,
"95"
:
1848125952.0
,
"100"
:
1848125952.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8138
,
"5"
:
0.19926
,
"10"
:
0.19439
,
"15"
:
0.19389
,
"20"
:
0.19552
,
"25"
:
0.19186
,
"30"
:
0.19341
,
"35"
:
0.19268
,
"40"
:
0.19289
,
"45"
:
0.19218
,
"50"
:
0.19214
,
"55"
:
0.19236
,
"60"
:
0.19561
,
"65"
:
0.19299
,
"70"
:
0.19296
,
"75"
:
0.19308
,
"80"
:
0.19336
,
"85"
:
0.19452
,
"90"
:
0.19164
,
"95"
:
0.19304
,
"100"
:
0.19217
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92705
,
"5"
:
10.92799
,
"10"
:
10.90789
,
"15"
:
10.88313
,
"20"
:
10.77626
,
"25"
:
10.59138
,
"30"
:
10.39195
,
"35"
:
10.29687
,
"40"
:
10.0964
,
"45"
:
9.84466
,
"50"
:
9.90919
,
"55"
:
9.87765
,
"60"
:
9.49125
,
"65"
:
8.94236
,
"70"
:
9.72262
,
"75"
:
9.4191
,
"80"
:
9.40075
,
"85"
:
9.61211
,
"90"
:
9.81017
,
"95"
:
9.51717
,
"100"
:
9.40147
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
68.0
,
"5"
:
64.0
,
"10"
:
61.0
,
"15"
:
58.0
,
"20"
:
64.0
,
"25"
:
58.0
,
"30"
:
85.0
,
"35"
:
66.0
,
"40"
:
85.0
,
"45"
:
82.0
,
"50"
:
68.0
,
"55"
:
84.0
,
"60"
:
71.0
,
"65"
:
85.0
,
"70"
:
92.0
,
"75"
:
62.0
,
"80"
:
87.0
,
"85"
:
74.0
,
"90"
:
71.0
,
"95"
:
79.0
,
"100"
:
72.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
487096320.0
,
"5"
:
487096320.0
,
"10"
:
487096320.0
,
"15"
:
487096320.0
,
"20"
:
487096320.0
,
"25"
:
487096320.0
,
"30"
:
487096320.0
,
"35"
:
487096320.0
,
"40"
:
487096320.0
,
"45"
:
487096320.0
,
"50"
:
487096320.0
,
"55"
:
487096320.0
,
"60"
:
487096320.0
,
"65"
:
487096320.0
,
"70"
:
487096320.0
,
"75"
:
487096320.0
,
"80"
:
487096320.0
,
"85"
:
487096320.0
,
"90"
:
487096320.0
,
"95"
:
487096320.0
,
"100"
:
487096320.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2158389248.0
,
"5"
:
2338462720.0
,
"10"
:
2338462720.0
,
"15"
:
2338462720.0
,
"20"
:
2338462720.0
,
"25"
:
2338462720.0
,
"30"
:
2338462720.0
,
"35"
:
2338462720.0
,
"40"
:
2338462720.0
,
"45"
:
2338462720.0
,
"50"
:
2338462720.0
,
"55"
:
2338462720.0
,
"60"
:
2338462720.0
,
"65"
:
2338462720.0
,
"70"
:
2338462720.0
,
"75"
:
2338462720.0
,
"80"
:
2338462720.0
,
"85"
:
2338462720.0
,
"90"
:
2338462720.0
,
"95"
:
2338462720.0
,
"100"
:
2338462720.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.88233
,
"5"
:
0.22608
,
"10"
:
0.21553
,
"15"
:
0.21336
,
"20"
:
0.21247
,
"25"
:
0.21243
,
"30"
:
0.23729
,
"35"
:
0.2257
,
"40"
:
0.21253
,
"45"
:
0.21718
,
"50"
:
0.21345
,
"55"
:
0.21376
,
"60"
:
0.21327
,
"65"
:
0.21242
,
"70"
:
0.21194
,
"75"
:
0.21274
,
"80"
:
0.21252
,
"85"
:
0.21061
,
"90"
:
0.21024
,
"95"
:
0.21239
,
"100"
:
0.21117
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -48,4 +48,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92655
,
"5"
:
10.9272
,
"10"
:
10.90786
,
"15"
:
10.88292
,
"20"
:
10.77603
,
"25"
:
10.59271
,
"30"
:
10.39175
,
"35"
:
10.297
,
"40"
:
10.09664
,
"45"
:
9.84468
,
"50"
:
9.9094
,
"55"
:
9.87765
,
"60"
:
9.49117
,
"65"
:
8.94241
,
"70"
:
9.72269
,
"75"
:
9.41888
,
"80"
:
9.40055
,
"85"
:
9.61184
,
"90"
:
9.81022
,
"95"
:
9.51724
,
"100"
:
9.4013
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1637.0
,
"5"
:
1988.0
,
"10"
:
1422.0
,
"15"
:
1936.0
,
"20"
:
1566.0
,
"25"
:
1705.0
,
"30"
:
1974.0
,
"35"
:
2043.0
,
"40"
:
2249.0
,
"45"
:
2145.0
,
"50"
:
2454.0
,
"55"
:
2388.0
,
"60"
:
2479.0
,
"65"
:
2674.0
,
"70"
:
3241.0
,
"75"
:
2687.0
,
"80"
:
3465.0
,
"85"
:
3382.0
,
"90"
:
3023.0
,
"95"
:
3415.0
,
"100"
:
3347.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
487096832.0
,
"5"
:
487096832.0
,
"10"
:
487096832.0
,
"15"
:
487096832.0
,
"20"
:
487096832.0
,
"25"
:
487096832.0
,
"30"
:
487096832.0
,
"35"
:
487096832.0
,
"40"
:
487096832.0
,
"45"
:
487096832.0
,
"50"
:
487096832.0
,
"55"
:
487096832.0
,
"60"
:
487096832.0
,
"65"
:
487096832.0
,
"70"
:
487096832.0
,
"75"
:
487096832.0
,
"80"
:
487096832.0
,
"85"
:
487096832.0
,
"90"
:
487096832.0
,
"95"
:
487096832.0
,
"100"
:
487096832.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1229747712.0
,
"5"
:
1409821184.0
,
"10"
:
1409821184.0
,
"15"
:
1409821184.0
,
"20"
:
1409821184.0
,
"25"
:
1409821184.0
,
"30"
:
1409821184.0
,
"35"
:
1409821184.0
,
"40"
:
1409821184.0
,
"45"
:
1409821184.0
,
"50"
:
1409821184.0
,
"55"
:
1409821184.0
,
"60"
:
1409821184.0
,
"65"
:
1409821184.0
,
"70"
:
1409821184.0
,
"75"
:
1409821184.0
,
"80"
:
1409821184.0
,
"85"
:
1409821184.0
,
"90"
:
1409821184.0
,
"95"
:
1409821184.0
,
"100"
:
1409821184.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.17732
,
"5"
:
0.20296
,
"10"
:
0.20325
,
"15"
:
0.20174
,
"20"
:
0.20216
,
"25"
:
0.20151
,
"30"
:
0.20223
,
"35"
:
0.20172
,
"40"
:
0.20152
,
"45"
:
0.20108
,
"50"
:
0.20046
,
"55"
:
0.1934
,
"60"
:
0.19326
,
"65"
:
0.19362
,
"70"
:
0.19278
,
"75"
:
0.19295
,
"80"
:
0.19307
,
"85"
:
0.19325
,
"90"
:
0.19304
,
"95"
:
0.19317
,
"100"
:
0.19328
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92705
,
"5"
:
10.92795
,
"10"
:
10.90786
,
"15"
:
10.88314
,
"20"
:
10.77629
,
"25"
:
10.5914
,
"30"
:
10.39194
,
"35"
:
10.29685
,
"40"
:
10.09639
,
"45"
:
9.84463
,
"50"
:
9.90918
,
"55"
:
9.87766
,
"60"
:
9.49126
,
"65"
:
8.94236
,
"70"
:
9.72266
,
"75"
:
9.41909
,
"80"
:
9.40076
,
"85"
:
9.61209
,
"90"
:
9.81018
,
"95"
:
9.51718
,
"100"
:
9.40151
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1627.0
,
"5"
:
2010.0
,
"10"
:
1368.0
,
"15"
:
1897.0
,
"20"
:
1626.0
,
"25"
:
1769.0
,
"30"
:
1899.0
,
"35"
:
1988.0
,
"40"
:
2199.0
,
"45"
:
2158.0
,
"50"
:
2494.0
,
"55"
:
2485.0
,
"60"
:
2351.0
,
"65"
:
2777.0
,
"70"
:
3197.0
,
"75"
:
2615.0
,
"80"
:
3395.0
,
"85"
:
3340.0
,
"90"
:
3060.0
,
"95"
:
3408.0
,
"100"
:
3242.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
487096320.0
,
"5"
:
487096320.0
,
"10"
:
487096320.0
,
"15"
:
487096320.0
,
"20"
:
487096320.0
,
"25"
:
487096320.0
,
"30"
:
487096320.0
,
"35"
:
487096320.0
,
"40"
:
487096320.0
,
"45"
:
487096320.0
,
"50"
:
487096320.0
,
"55"
:
487096320.0
,
"60"
:
487096320.0
,
"65"
:
487096320.0
,
"70"
:
487096320.0
,
"75"
:
487096320.0
,
"80"
:
487096320.0
,
"85"
:
487096320.0
,
"90"
:
487096320.0
,
"95"
:
487096320.0
,
"100"
:
487096320.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1720084480.0
,
"5"
:
1900157952.0
,
"10"
:
1900157952.0
,
"15"
:
1900157952.0
,
"20"
:
1900157952.0
,
"25"
:
1900157952.0
,
"30"
:
1900157952.0
,
"35"
:
1900157952.0
,
"40"
:
1900157952.0
,
"45"
:
1900157952.0
,
"50"
:
1900157952.0
,
"55"
:
1900157952.0
,
"60"
:
1900157952.0
,
"65"
:
1900157952.0
,
"70"
:
1900157952.0
,
"75"
:
1900157952.0
,
"80"
:
1900157952.0
,
"85"
:
1900157952.0
,
"90"
:
1900157952.0
,
"95"
:
1900157952.0
,
"100"
:
1900157952.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.97156
,
"5"
:
0.2168
,
"10"
:
0.21367
,
"15"
:
0.22327
,
"20"
:
0.20978
,
"25"
:
0.20953
,
"30"
:
0.21033
,
"35"
:
0.20882
,
"40"
:
0.21062
,
"45"
:
0.20902
,
"50"
:
0.20932
,
"55"
:
0.21153
,
"60"
:
0.20966
,
"65"
:
0.20901
,
"70"
:
0.20892
,
"75"
:
0.21183
,
"80"
:
0.21189
,
"85"
:
0.21367
,
"90"
:
0.21386
,
"95"
:
0.21529
,
"100"
:
0.21247
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -47,4 +47,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92655
,
"5"
:
10.9272
,
"10"
:
10.90786
,
"15"
:
10.88292
,
"20"
:
10.77603
,
"25"
:
10.59271
,
"30"
:
10.39175
,
"35"
:
10.297
,
"40"
:
10.09664
,
"45"
:
9.84468
,
"50"
:
9.9094
,
"55"
:
9.87765
,
"60"
:
9.49117
,
"65"
:
8.94241
,
"70"
:
9.72269
,
"75"
:
9.41888
,
"80"
:
9.40055
,
"85"
:
9.61184
,
"90"
:
9.81022
,
"95"
:
9.51724
,
"100"
:
9.4013
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1637.0
,
"5"
:
1988.0
,
"10"
:
1422.0
,
"15"
:
1936.0
,
"20"
:
1566.0
,
"25"
:
1705.0
,
"30"
:
1974.0
,
"35"
:
2043.0
,
"40"
:
2249.0
,
"45"
:
2145.0
,
"50"
:
2454.0
,
"55"
:
2388.0
,
"60"
:
2479.0
,
"65"
:
2674.0
,
"70"
:
3241.0
,
"75"
:
2687.0
,
"80"
:
3465.0
,
"85"
:
3382.0
,
"90"
:
3023.0
,
"95"
:
3415.0
,
"100"
:
3347.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
487096832.0
,
"5"
:
487096832.0
,
"10"
:
487096832.0
,
"15"
:
487096832.0
,
"20"
:
487096832.0
,
"25"
:
487096832.0
,
"30"
:
487096832.0
,
"35"
:
487096832.0
,
"40"
:
487096832.0
,
"45"
:
487096832.0
,
"50"
:
487096832.0
,
"55"
:
487096832.0
,
"60"
:
487096832.0
,
"65"
:
487096832.0
,
"70"
:
487096832.0
,
"75"
:
487096832.0
,
"80"
:
487096832.0
,
"85"
:
487096832.0
,
"90"
:
487096832.0
,
"95"
:
487096832.0
,
"100"
:
487096832.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1225553408.0
,
"5"
:
1405626880.0
,
"10"
:
1405626880.0
,
"15"
:
1405626880.0
,
"20"
:
1405626880.0
,
"25"
:
1405626880.0
,
"30"
:
1405626880.0
,
"35"
:
1405626880.0
,
"40"
:
1405626880.0
,
"45"
:
1405626880.0
,
"50"
:
1405626880.0
,
"55"
:
1405626880.0
,
"60"
:
1405626880.0
,
"65"
:
1405626880.0
,
"70"
:
1405626880.0
,
"75"
:
1405626880.0
,
"80"
:
1405626880.0
,
"85"
:
1405626880.0
,
"90"
:
1405626880.0
,
"95"
:
1405626880.0
,
"100"
:
1405626880.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
12.43309
,
"5"
:
0.19435
,
"10"
:
0.19438
,
"15"
:
0.19481
,
"20"
:
0.19447
,
"25"
:
0.19273
,
"30"
:
0.19383
,
"35"
:
0.19374
,
"40"
:
0.19351
,
"45"
:
0.19317
,
"50"
:
0.19324
,
"55"
:
0.19031
,
"60"
:
0.19029
,
"65"
:
0.1911
,
"70"
:
0.19168
,
"75"
:
0.19169
,
"80"
:
0.1923
,
"85"
:
0.19181
,
"90"
:
0.19164
,
"95"
:
0.19197
,
"100"
:
0.19113
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92705
,
"5"
:
10.92795
,
"10"
:
10.90786
,
"15"
:
10.88314
,
"20"
:
10.77629
,
"25"
:
10.5914
,
"30"
:
10.39194
,
"35"
:
10.29685
,
"40"
:
10.09639
,
"45"
:
9.84463
,
"50"
:
9.90918
,
"55"
:
9.87766
,
"60"
:
9.49126
,
"65"
:
8.94236
,
"70"
:
9.72266
,
"75"
:
9.41909
,
"80"
:
9.40076
,
"85"
:
9.61209
,
"90"
:
9.81018
,
"95"
:
9.51718
,
"100"
:
9.40151
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1627.0
,
"5"
:
2010.0
,
"10"
:
1368.0
,
"15"
:
1897.0
,
"20"
:
1626.0
,
"25"
:
1769.0
,
"30"
:
1899.0
,
"35"
:
1988.0
,
"40"
:
2199.0
,
"45"
:
2158.0
,
"50"
:
2494.0
,
"55"
:
2485.0
,
"60"
:
2351.0
,
"65"
:
2777.0
,
"70"
:
3197.0
,
"75"
:
2615.0
,
"80"
:
3395.0
,
"85"
:
3340.0
,
"90"
:
3060.0
,
"95"
:
3408.0
,
"100"
:
3242.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
487096320.0
,
"5"
:
487096320.0
,
"10"
:
487096320.0
,
"15"
:
487096320.0
,
"20"
:
487096320.0
,
"25"
:
487096320.0
,
"30"
:
487096320.0
,
"35"
:
487096320.0
,
"40"
:
487096320.0
,
"45"
:
487096320.0
,
"50"
:
487096320.0
,
"55"
:
487096320.0
,
"60"
:
487096320.0
,
"65"
:
487096320.0
,
"70"
:
487096320.0
,
"75"
:
487096320.0
,
"80"
:
487096320.0
,
"85"
:
487096320.0
,
"90"
:
487096320.0
,
"95"
:
487096320.0
,
"100"
:
487096320.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1715890176.0
,
"5"
:
1895963648.0
,
"10"
:
1895963648.0
,
"15"
:
1895963648.0
,
"20"
:
1895963648.0
,
"25"
:
1895963648.0
,
"30"
:
1895963648.0
,
"35"
:
1895963648.0
,
"40"
:
1895963648.0
,
"45"
:
1895963648.0
,
"50"
:
1895963648.0
,
"55"
:
1895963648.0
,
"60"
:
1895963648.0
,
"65"
:
1895963648.0
,
"70"
:
1895963648.0
,
"75"
:
1895963648.0
,
"80"
:
1895963648.0
,
"85"
:
1895963648.0
,
"90"
:
1895963648.0
,
"95"
:
1895963648.0
,
"100"
:
1895963648.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.12901
,
"5"
:
0.21538
,
"10"
:
0.21548
,
"15"
:
0.2128
,
"20"
:
0.21291
,
"25"
:
0.21127
,
"30"
:
0.21513
,
"35"
:
0.21158
,
"40"
:
0.213
,
"45"
:
0.21093
,
"50"
:
0.2091
,
"55"
:
0.20696
,
"60"
:
0.21221
,
"65"
:
0.20519
,
"70"
:
0.2076
,
"75"
:
0.20862
,
"80"
:
0.20653
,
"85"
:
0.20713
,
"90"
:
0.20604
,
"95"
:
0.21111
,
"100"
:
0.20922
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -48,4 +48,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92655
,
"5"
:
10.9272
,
"10"
:
10.90786
,
"15"
:
10.88292
,
"20"
:
10.77603
,
"25"
:
10.59271
,
"30"
:
10.39175
,
"35"
:
10.297
,
"40"
:
10.09664
,
"45"
:
9.84468
,
"50"
:
9.9094
,
"55"
:
9.87765
,
"60"
:
9.49117
,
"65"
:
8.94241
,
"70"
:
9.72269
,
"75"
:
9.41888
,
"80"
:
9.40055
,
"85"
:
9.61184
,
"90"
:
9.81022
,
"95"
:
9.51724
,
"100"
:
9.4013
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1637.0
,
"5"
:
1988.0
,
"10"
:
1422.0
,
"15"
:
1936.0
,
"20"
:
1566.0
,
"25"
:
1705.0
,
"30"
:
1974.0
,
"35"
:
2043.0
,
"40"
:
2249.0
,
"45"
:
2145.0
,
"50"
:
2454.0
,
"55"
:
2388.0
,
"60"
:
2479.0
,
"65"
:
2674.0
,
"70"
:
3241.0
,
"75"
:
2687.0
,
"80"
:
3465.0
,
"85"
:
3382.0
,
"90"
:
3023.0
,
"95"
:
3415.0
,
"100"
:
3347.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
490242560.0
,
"5"
:
490242560.0
,
"10"
:
490242560.0
,
"15"
:
490242560.0
,
"20"
:
490242560.0
,
"25"
:
490242560.0
,
"30"
:
490242560.0
,
"35"
:
490242560.0
,
"40"
:
490242560.0
,
"45"
:
490242560.0
,
"50"
:
490242560.0
,
"55"
:
490242560.0
,
"60"
:
490242560.0
,
"65"
:
490242560.0
,
"70"
:
490242560.0
,
"75"
:
490242560.0
,
"80"
:
490242560.0
,
"85"
:
490242560.0
,
"90"
:
490242560.0
,
"95"
:
490242560.0
,
"100"
:
490242560.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1228699136.0
,
"5"
:
1414015488.0
,
"10"
:
1414015488.0
,
"15"
:
1414015488.0
,
"20"
:
1414015488.0
,
"25"
:
1414015488.0
,
"30"
:
1414015488.0
,
"35"
:
1414015488.0
,
"40"
:
1414015488.0
,
"45"
:
1414015488.0
,
"50"
:
1414015488.0
,
"55"
:
1414015488.0
,
"60"
:
1414015488.0
,
"65"
:
1414015488.0
,
"70"
:
1414015488.0
,
"75"
:
1414015488.0
,
"80"
:
1414015488.0
,
"85"
:
1414015488.0
,
"90"
:
1414015488.0
,
"95"
:
1414015488.0
,
"100"
:
1414015488.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
11.55848
,
"5"
:
0.19452
,
"10"
:
0.1941
,
"15"
:
0.19706
,
"20"
:
0.19456
,
"25"
:
0.19225
,
"30"
:
0.19466
,
"35"
:
0.19187
,
"40"
:
0.19248
,
"45"
:
0.1906
,
"50"
:
0.19117
,
"55"
:
0.20393
,
"60"
:
0.20447
,
"65"
:
0.20474
,
"70"
:
0.20347
,
"75"
:
0.20347
,
"80"
:
0.20417
,
"85"
:
0.2045
,
"90"
:
0.20333
,
"95"
:
0.20388
,
"100"
:
0.20321
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92705
,
"5"
:
10.92795
,
"10"
:
10.90786
,
"15"
:
10.88314
,
"20"
:
10.77629
,
"25"
:
10.5914
,
"30"
:
10.39194
,
"35"
:
10.29685
,
"40"
:
10.09639
,
"45"
:
9.84463
,
"50"
:
9.90918
,
"55"
:
9.87766
,
"60"
:
9.49126
,
"65"
:
8.94236
,
"70"
:
9.72266
,
"75"
:
9.41909
,
"80"
:
9.40076
,
"85"
:
9.61209
,
"90"
:
9.81018
,
"95"
:
9.51718
,
"100"
:
9.40151
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1627.0
,
"5"
:
2010.0
,
"10"
:
1368.0
,
"15"
:
1897.0
,
"20"
:
1626.0
,
"25"
:
1769.0
,
"30"
:
1899.0
,
"35"
:
1988.0
,
"40"
:
2199.0
,
"45"
:
2158.0
,
"50"
:
2494.0
,
"55"
:
2485.0
,
"60"
:
2351.0
,
"65"
:
2777.0
,
"70"
:
3197.0
,
"75"
:
2615.0
,
"80"
:
3395.0
,
"85"
:
3340.0
,
"90"
:
3060.0
,
"95"
:
3408.0
,
"100"
:
3242.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
487096320.0
,
"5"
:
487096320.0
,
"10"
:
487096320.0
,
"15"
:
487096320.0
,
"20"
:
487096320.0
,
"25"
:
487096320.0
,
"30"
:
487096320.0
,
"35"
:
487096320.0
,
"40"
:
487096320.0
,
"45"
:
487096320.0
,
"50"
:
487096320.0
,
"55"
:
487096320.0
,
"60"
:
487096320.0
,
"65"
:
487096320.0
,
"70"
:
487096320.0
,
"75"
:
487096320.0
,
"80"
:
487096320.0
,
"85"
:
487096320.0
,
"90"
:
487096320.0
,
"95"
:
487096320.0
,
"100"
:
487096320.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1720084480.0
,
"5"
:
1900157952.0
,
"10"
:
1900157952.0
,
"15"
:
1900157952.0
,
"20"
:
1900157952.0
,
"25"
:
1900157952.0
,
"30"
:
1900157952.0
,
"35"
:
1900157952.0
,
"40"
:
1900157952.0
,
"45"
:
1900157952.0
,
"50"
:
1900157952.0
,
"55"
:
1900157952.0
,
"60"
:
1900157952.0
,
"65"
:
1900157952.0
,
"70"
:
1900157952.0
,
"75"
:
1900157952.0
,
"80"
:
1900157952.0
,
"85"
:
1900157952.0
,
"90"
:
1900157952.0
,
"95"
:
1900157952.0
,
"100"
:
1900157952.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
11.22421
,
"5"
:
0.2135
,
"10"
:
0.21228
,
"15"
:
0.21124
,
"20"
:
0.21112
,
"25"
:
0.21341
,
"30"
:
0.21004
,
"35"
:
0.21039
,
"40"
:
0.21245
,
"45"
:
0.21157
,
"50"
:
0.21206
,
"55"
:
0.21309
,
"60"
:
0.21493
,
"65"
:
0.2203
,
"70"
:
0.21919
,
"75"
:
0.2139
,
"80"
:
0.21624
,
"85"
:
0.21803
,
"90"
:
0.21757
,
"95"
:
0.21527
,
"100"
:
0.21237
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -48,4 +48,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92655
,
"5"
:
10.92718
,
"10"
:
10.90795
,
"15"
:
10.88296
,
"20"
:
10.77593
,
"25"
:
10.59272
,
"30"
:
10.39174
,
"35"
:
10.29697
,
"40"
:
10.09661
,
"45"
:
9.84472
,
"50"
:
9.90947
,
"55"
:
9.87772
,
"60"
:
9.49122
,
"65"
:
8.94261
,
"70"
:
9.72277
,
"75"
:
9.41891
,
"80"
:
9.40056
,
"85"
:
9.61186
,
"90"
:
9.81027
,
"95"
:
9.51723
,
"100"
:
9.40137
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1611.0
,
"5"
:
1973.0
,
"10"
:
1470.0
,
"15"
:
1891.0
,
"20"
:
1584.0
,
"25"
:
1645.0
,
"30"
:
1962.0
,
"35"
:
1981.0
,
"40"
:
2112.0
,
"45"
:
2100.0
,
"50"
:
2531.0
,
"55"
:
2378.0
,
"60"
:
2386.0
,
"65"
:
2711.0
,
"70"
:
3230.0
,
"75"
:
2725.0
,
"80"
:
3457.0
,
"85"
:
3332.0
,
"90"
:
3085.0
,
"95"
:
3461.0
,
"100"
:
3332.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
438469120.0
,
"5"
:
438469120.0
,
"10"
:
438469120.0
,
"15"
:
438469120.0
,
"20"
:
438469120.0
,
"25"
:
438469120.0
,
"30"
:
438469120.0
,
"35"
:
438469120.0
,
"40"
:
438469120.0
,
"45"
:
438469120.0
,
"50"
:
438469120.0
,
"55"
:
438469120.0
,
"60"
:
438469120.0
,
"65"
:
438469120.0
,
"70"
:
438469120.0
,
"75"
:
438469120.0
,
"80"
:
438469120.0
,
"85"
:
438469120.0
,
"90"
:
438469120.0
,
"95"
:
438469120.0
,
"100"
:
438469120.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1179678208.0
,
"5"
:
1361717760.0
,
"10"
:
1361717760.0
,
"15"
:
1361717760.0
,
"20"
:
1361717760.0
,
"25"
:
1361717760.0
,
"30"
:
1361717760.0
,
"35"
:
1361717760.0
,
"40"
:
1361717760.0
,
"45"
:
1361717760.0
,
"50"
:
1361717760.0
,
"55"
:
1361717760.0
,
"60"
:
1361717760.0
,
"65"
:
1361717760.0
,
"70"
:
1361717760.0
,
"75"
:
1361717760.0
,
"80"
:
1361717760.0
,
"85"
:
1361717760.0
,
"90"
:
1361717760.0
,
"95"
:
1361717760.0
,
"100"
:
1361717760.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
11.67908
,
"5"
:
0.18507
,
"10"
:
0.18222
,
"15"
:
0.18185
,
"20"
:
0.181
,
"25"
:
0.18035
,
"30"
:
0.18093
,
"35"
:
0.18016
,
"40"
:
0.17965
,
"45"
:
0.17953
,
"50"
:
0.17971
,
"55"
:
0.17583
,
"60"
:
0.1751
,
"65"
:
0.17527
,
"70"
:
0.17444
,
"75"
:
0.17517
,
"80"
:
0.17438
,
"85"
:
0.17443
,
"90"
:
0.17435
,
"95"
:
0.17419
,
"100"
:
0.17558
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.92655
,
"5"
:
10.92717
,
"10"
:
10.90792
,
"15"
:
10.88291
,
"20"
:
10.77595
,
"25"
:
10.59266
,
"30"
:
10.39176
,
"35"
:
10.29699
,
"40"
:
10.09666
,
"45"
:
9.84474
,
"50"
:
9.90944
,
"55"
:
9.87774
,
"60"
:
9.49116
,
"65"
:
8.94259
,
"70"
:
9.72275
,
"75"
:
9.4189
,
"80"
:
9.40056
,
"85"
:
9.61183
,
"90"
:
9.81023
,
"95"
:
9.51721
,
"100"
:
9.4013
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1587.0
,
"5"
:
1991.0
,
"10"
:
1408.0
,
"15"
:
1899.0
,
"20"
:
1647.0
,
"25"
:
1674.0
,
"30"
:
1912.0
,
"35"
:
1972.0
,
"40"
:
2247.0
,
"45"
:
2075.0
,
"50"
:
2469.0
,
"55"
:
2421.0
,
"60"
:
2487.0
,
"65"
:
2765.0
,
"70"
:
3291.0
,
"75"
:
2709.0
,
"80"
:
3493.0
,
"85"
:
3365.0
,
"90"
:
3095.0
,
"95"
:
3435.0
,
"100"
:
3327.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
435847168.0
,
"5"
:
435847168.0
,
"10"
:
435847168.0
,
"15"
:
435847168.0
,
"20"
:
435847168.0
,
"25"
:
436895744.0
,
"30"
:
435847168.0
,
"35"
:
435847168.0
,
"40"
:
435847168.0
,
"45"
:
435847168.0
,
"50"
:
435847168.0
,
"55"
:
435847168.0
,
"60"
:
435847168.0
,
"65"
:
435847168.0
,
"70"
:
435847168.0
,
"75"
:
435847168.0
,
"80"
:
435847168.0
,
"85"
:
435847168.0
,
"90"
:
435847168.0
,
"95"
:
435847168.0
,
"100"
:
435847168.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1179682816.0
,
"5"
:
1359626240.0
,
"10"
:
1359626240.0
,
"15"
:
1359626240.0
,
"20"
:
1359626240.0
,
"25"
:
1359626240.0
,
"30"
:
1359626240.0
,
"35"
:
1359626240.0
,
"40"
:
1359626240.0
,
"45"
:
1359626240.0
,
"50"
:
1359626240.0
,
"55"
:
1359626240.0
,
"60"
:
1359626240.0
,
"65"
:
1359626240.0
,
"70"
:
1359626240.0
,
"75"
:
1359626240.0
,
"80"
:
1359626240.0
,
"85"
:
1359626240.0
,
"90"
:
1359626240.0
,
"95"
:
1359626240.0
,
"100"
:
1359626240.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.04316
,
"5"
:
0.1807
,
"10"
:
0.17867
,
"15"
:
0.17689
,
"20"
:
0.17644
,
"25"
:
0.17764
,
"30"
:
0.17742
,
"35"
:
0.1794
,
"40"
:
0.17805
,
"45"
:
0.17812
,
"50"
:
0.18362
,
"55"
:
0.17265
,
"60"
:
0.17303
,
"65"
:
0.17109
,
"70"
:
0.17167
,
"75"
:
0.17216
,
"80"
:
0.17147
,
"85"
:
0.17705
,
"90"
:
0.17916
,
"95"
:
0.17291
,
"100"
:
0.17146
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -17,8 +17,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -44,4 +44,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.88789
,
"5"
:
10.90966
,
"10"
:
10.87793
,
"15"
:
10.86382
,
"20"
:
10.75082
,
"25"
:
10.5988
,
"30"
:
10.40099
,
"35"
:
10.30785
,
"40"
:
10.10955
,
"45"
:
9.85867
,
"50"
:
9.92084
,
"55"
:
9.88535
,
"60"
:
9.50758
,
"65"
:
8.95821
,
"70"
:
9.72738
,
"75"
:
9.42579
,
"80"
:
9.40535
,
"85"
:
9.61537
,
"90"
:
9.81263
,
"95"
:
9.52135
,
"100"
:
9.40103
}
},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1742.0
,
"5"
:
2115.0
,
"10"
:
1468.0
,
"15"
:
1877.0
,
"20"
:
1665.0
,
"25"
:
1643.0
,
"30"
:
1900.0
,
"35"
:
2086.0
,
"40"
:
2185.0
,
"45"
:
2254.0
,
"50"
:
2496.0
,
"55"
:
2418.0
,
"60"
:
2489.0
,
"65"
:
2697.0
,
"70"
:
3267.0
,
"75"
:
2631.0
,
"80"
:
3442.0
,
"85"
:
3440.0
,
"90"
:
3075.0
,
"95"
:
3348.0
,
"100"
:
3389.0
}
},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
246437376.0
,
"5"
:
246437376.0
,
"10"
:
246437376.0
,
"15"
:
246437376.0
,
"20"
:
246437376.0
,
"25"
:
246437376.0
,
"30"
:
246437376.0
,
"35"
:
246437376.0
,
"40"
:
246437376.0
,
"45"
:
246437376.0
,
"50"
:
246437376.0
,
"55"
:
246437376.0
,
"60"
:
246437376.0
,
"65"
:
246437376.0
,
"70"
:
246437376.0
,
"75"
:
246437376.0
,
"80"
:
246437376.0
,
"85"
:
246437376.0
,
"90"
:
246437376.0
,
"95"
:
246437376.0
,
"100"
:
246437376.0
}
},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1570924032.0
,
"5"
:
1634534400.0
,
"10"
:
1634534400.0
,
"15"
:
1634534400.0
,
"20"
:
1634534400.0
,
"25"
:
1634534400.0
,
"30"
:
1634593280.0
,
"35"
:
1634593280.0
,
"40"
:
1634593280.0
,
"45"
:
1634593280.0
,
"50"
:
1634593280.0
,
"55"
:
1634593280.0
,
"60"
:
1634593280.0
,
"65"
:
1634593280.0
,
"70"
:
1634593280.0
,
"75"
:
1634593280.0
,
"80"
:
1634593280.0
,
"85"
:
1634593280.0
,
"90"
:
1634593280.0
,
"95"
:
1634593280.0
,
"100"
:
1634593280.0
}
},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
6.22721
,
"5"
:
0.27333
,
"10"
:
0.27017
,
"15"
:
0.26846
,
"20"
:
0.26818
,
"25"
:
0.26614
,
"30"
:
0.26524
,
"35"
:
0.30697
,
"40"
:
0.2925
,
"45"
:
0.26534
,
"50"
:
0.26504
,
"55"
:
0.26684
,
"60"
:
0.26501
,
"65"
:
0.26543
,
"70"
:
0.26612
,
"75"
:
0.26476
,
"80"
:
0.26501
,
"85"
:
0.26505
,
"90"
:
0.26596
,
"95"
:
0.26599
,
"100"
:
0.2641
}
}
}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.88789
,
"5"
:
10.90966
,
"10"
:
10.87793
,
"15"
:
10.86382
,
"20"
:
10.75082
,
"25"
:
10.5988
,
"30"
:
10.40099
,
"35"
:
10.30785
,
"40"
:
10.10955
,
"45"
:
9.85867
,
"50"
:
9.92084
,
"55"
:
9.88535
,
"60"
:
9.50758
,
"65"
:
8.95821
,
"70"
:
9.72738
,
"75"
:
9.42579
,
"80"
:
9.40535
,
"85"
:
9.61537
,
"90"
:
9.81263
,
"95"
:
9.52135
,
"100"
:
9.40103
}
},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1742.0
,
"5"
:
2115.0
,
"10"
:
1468.0
,
"15"
:
1877.0
,
"20"
:
1665.0
,
"25"
:
1643.0
,
"30"
:
1900.0
,
"35"
:
2086.0
,
"40"
:
2185.0
,
"45"
:
2254.0
,
"50"
:
2496.0
,
"55"
:
2418.0
,
"60"
:
2489.0
,
"65"
:
2697.0
,
"70"
:
3267.0
,
"75"
:
2631.0
,
"80"
:
3442.0
,
"85"
:
3440.0
,
"90"
:
3075.0
,
"95"
:
3348.0
,
"100"
:
3389.0
}
},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
246437376.0
,
"5"
:
246437376.0
,
"10"
:
246437376.0
,
"15"
:
246437376.0
,
"20"
:
246437376.0
,
"25"
:
246437376.0
,
"30"
:
246437376.0
,
"35"
:
246437376.0
,
"40"
:
246437376.0
,
"45"
:
246437376.0
,
"50"
:
246437376.0
,
"55"
:
246437376.0
,
"60"
:
246437376.0
,
"65"
:
246437376.0
,
"70"
:
246437376.0
,
"75"
:
246437376.0
,
"80"
:
246437376.0
,
"85"
:
246437376.0
,
"90"
:
246437376.0
,
"95"
:
246437376.0
,
"100"
:
246437376.0
}
},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1570924032.0
,
"5"
:
1634534400.0
,
"10"
:
1634534400.0
,
"15"
:
1634534400.0
,
"20"
:
1634534400.0
,
"25"
:
1634534400.0
,
"30"
:
1634593280.0
,
"35"
:
1634593280.0
,
"40"
:
1634593280.0
,
"45"
:
1634593280.0
,
"50"
:
1634593280.0
,
"55"
:
1634593280.0
,
"60"
:
1634593280.0
,
"65"
:
1634593280.0
,
"70"
:
1634593280.0
,
"75"
:
1634593280.0
,
"80"
:
1634593280.0
,
"85"
:
1634593280.0
,
"90"
:
1634593280.0
,
"95"
:
1634593280.0
,
"100"
:
1634593280.0
}
},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
6.22721
,
"5"
:
0.27333
,
"10"
:
0.27017
,
"15"
:
0.26846
,
"20"
:
0.26818
,
"25"
:
0.26614
,
"30"
:
0.26524
,
"35"
:
0.30697
,
"40"
:
0.2925
,
"45"
:
0.26534
,
"50"
:
0.26504
,
"55"
:
0.26684
,
"60"
:
0.26501
,
"65"
:
0.26543
,
"70"
:
0.26612
,
"75"
:
0.26476
,
"80"
:
0.26501
,
"85"
:
0.26505
,
"90"
:
0.26596
,
"95"
:
0.26599
,
"100"
:
0.2641
}
}
}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
0 → 100644
View file @
688448db
ENV_VARS
:
CUDA_DEVICE_MAX_CONNECTIONS
:
1
NVTE_ALLOW_NONDETERMINISTIC_ALGO
:
0
NCCL_ALGO
:
Tree
CUBLAS_WORKSPACE_CONFIG
:
:4096:8
MODEL_ARGS
:
--num-layers
:
12
--hidden-size
:
512
--num-attention-heads
:
8
--log-params-norm
:
true
--log-num-zeros-in-grad
:
true
--log-validation-ppl-to-tensorboard
:
true
--log-timers-to-tensorboard
:
true
--log-memory-to-tensorboard
:
true
--tensorboard-dir
:
${TENSORBOARD_PATH}
--micro-batch-size
:
4
--global-batch-size
:
32
--seq-length
:
1024
--max-position-embeddings
:
1024
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_SAVE_PATH}
--load
:
${CHECKPOINT_LOAD_PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
--split
:
949,50,1
--distributed-backend
:
nccl
--lr
:
0.00015
--lr-decay-style
:
cosine
--min-lr
:
1.0e-5
--weight-decay
:
1e-2
--clip-grad
:
1.0
--lr-warmup-fraction
:
.01
--log-interval
:
1
--save-interval
:
50
--eval-interval
:
1000
--eval-iters
:
10
--transformer-impl
:
transformer_engine
--tensor-model-parallel-size
:
2
--use-custom-fsdp
:
true
--calculate-per-token-loss
:
true
--data-parallel-sharding-strategy
:
optim_grads_params
--use-distributed-optimizer
:
true
--deterministic-mode
:
true
--no-gradient-accumulation-fusion
:
true
--attention-softmax-in-fp32
:
true
--use-checkpoint-opt_param-scheduler
:
true
--use-mcore-models
:
true
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.81978
,
"5"
:
10.85277
,
"10"
:
10.79054
,
"15"
:
10.81259
,
"20"
:
10.71561
,
"25"
:
10.52391
,
"30"
:
10.33354
,
"35"
:
10.22869
,
"40"
:
10.04307
,
"45"
:
9.77101
,
"50"
:
9.86315
,
"55"
:
9.82489
,
"60"
:
9.45369
,
"65"
:
8.89336
,
"70"
:
9.69013
,
"75"
:
9.38429
,
"80"
:
9.37031
,
"85"
:
9.58022
,
"90"
:
9.78525
,
"95"
:
9.49638
,
"100"
:
9.36739
}
},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
27138.0
,
"5"
:
32036.0
,
"10"
:
26255.0
,
"15"
:
31309.0
,
"20"
:
28869.0
,
"25"
:
28605.0
,
"30"
:
30817.0
,
"35"
:
32882.0
,
"40"
:
35373.0
,
"45"
:
35484.0
,
"50"
:
2136527.0
,
"55"
:
2135084.0
,
"60"
:
2137981.0
,
"65"
:
2138995.0
,
"70"
:
2142528.0
,
"75"
:
2215276.0
,
"80"
:
2144227.0
,
"85"
:
2146040.0
,
"90"
:
2146440.0
,
"95"
:
2144187.0
,
"100"
:
2144354.0
}
},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
668320768.0
,
"5"
:
668306944.0
,
"10"
:
668313600.0
,
"15"
:
668326912.0
,
"20"
:
668314112.0
,
"25"
:
668332544.0
,
"30"
:
668326912.0
,
"35"
:
668337664.0
,
"40"
:
668306432.0
,
"45"
:
668297728.0
,
"50"
:
668282880.0
,
"55"
:
668265984.0
,
"60"
:
668249088.0
,
"65"
:
668242944.0
,
"70"
:
668224512.0
,
"75"
:
668213248.0
,
"80"
:
668222464.0
,
"85"
:
668234752.0
,
"90"
:
668237312.0
,
"95"
:
668223488.0
,
"100"
:
668209664.0
}
},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2355231744.0
,
"5"
:
2605464064.0
,
"10"
:
2605464064.0
,
"15"
:
2605464064.0
,
"20"
:
2605464064.0
,
"25"
:
2615321600.0
,
"30"
:
2615321600.0
,
"35"
:
2618603520.0
,
"40"
:
2618603520.0
,
"45"
:
2618603520.0
,
"50"
:
2618603520.0
,
"55"
:
2618603520.0
,
"60"
:
2618603520.0
,
"65"
:
2618603520.0
,
"70"
:
2618603520.0
,
"75"
:
2618603520.0
,
"80"
:
2618603520.0
,
"85"
:
2618603520.0
,
"90"
:
2618603520.0
,
"95"
:
2618603520.0
,
"100"
:
2618603520.0
}
},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
6.84429
,
"5"
:
0.49894
,
"10"
:
0.4932
,
"15"
:
0.48106
,
"20"
:
0.48362
,
"25"
:
0.48615
,
"30"
:
0.49038
,
"35"
:
0.49011
,
"40"
:
0.50012
,
"45"
:
0.49982
,
"50"
:
0.49286
,
"55"
:
0.92115
,
"60"
:
0.49142
,
"65"
:
0.49128
,
"70"
:
0.49444
,
"75"
:
0.49725
,
"80"
:
0.4978
,
"85"
:
0.49747
,
"90"
:
0.497
,
"95"
:
0.49687
,
"100"
:
0.49788
}
}
}
\ No newline at end of file
Prev
1
…
27
28
29
30
31
32
33
34
35
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment