Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
688448db
Commit
688448db
authored
Mar 14, 2025
by
silencealiang
Browse files
更新代码
parent
a02a5490
Pipeline
#2503
passed with stage
Changes
823
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
34 additions
and
277 deletions
+34
-277
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts.json
...e_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml
..._mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml
...e_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json
...ightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json
...ightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml
...45m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev.json
...00_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts.json
...00_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml
...gx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev.json
...8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts.json
...8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml
...00_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev.json
...dist_optimizer_overlap_grad_reduce/golden_values_dev.json
+1
-50
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts.json
...dist_optimizer_overlap_grad_reduce/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml
..._pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_dev.json
...0_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_dev.json
+1
-50
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_lts.json
...0_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml
...x_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json
...345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json
...345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json
+1
-1
No files found.
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.82445
,
10.86393
,
10.85733
,
10.80809
,
10.70951
,
10.63738
,
10.16425
,
10.28201
,
10.19003
,
9.88697
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
12678.0
,
16220.0
,
16626.0
,
16055.0
,
13829.0
,
14904.0
,
12931.0
,
15765.0
,
16771.0
,
17621.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
25.19848
,
0.70611
,
0.70356
,
0.70548
,
0.70285
,
0.70488
,
0.70589
,
0.70459
,
0.70261
,
0.71213
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.82445
,
"5"
:
10.83944
,
"10"
:
10.7889
,
"15"
:
10.82831
,
"20"
:
10.72949
,
"25"
:
10.57667
,
"30"
:
10.40631
,
"35"
:
10.3135
,
"40"
:
10.13964
,
"45"
:
9.90704
,
"50"
:
9.96951
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
12856.0
,
"5"
:
15993.0
,
"10"
:
12573.0
,
"15"
:
14651.0
,
"20"
:
13663.0
,
"25"
:
13137.0
,
"30"
:
14643.0
,
"35"
:
15376.0
,
"40"
:
16684.0
,
"45"
:
16099.0
,
"50"
:
18966.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
626182656.0
,
"5"
:
626185728.0
,
"10"
:
626182656.0
,
"15"
:
626185216.0
,
"20"
:
626186240.0
,
"25"
:
626183168.0
,
"30"
:
626183680.0
,
"35"
:
626184704.0
,
"40"
:
626185728.0
,
"45"
:
626475008.0
,
"50"
:
626184704.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1819317248.0
,
"5"
:
2050089472.0
,
"10"
:
2050089472.0
,
"15"
:
2050322944.0
,
"20"
:
2050322944.0
,
"25"
:
2050322944.0
,
"30"
:
2050322944.0
,
"35"
:
2050341376.0
,
"40"
:
2050341376.0
,
"45"
:
2050341376.0
,
"50"
:
2050341376.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
18.15563
,
"5"
:
0.44565
,
"10"
:
0.43891
,
"15"
:
0.44112
,
"20"
:
0.44197
,
"25"
:
0.44184
,
"30"
:
0.43708
,
"35"
:
0.43675
,
"40"
:
0.43865
,
"45"
:
0.44326
,
"50"
:
0.44012
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -51,4 +51,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -51,4 +51,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.86122
,
10.88647
,
10.87773
,
10.83111
,
10.7165
,
10.60619
,
10.13147
,
10.22767
,
10.15929
,
9.83482
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1694.0
,
2148.0
,
2169.0
,
2103.0
,
1991.0
,
1900.0
,
1707.0
,
2189.0
,
2557.0
,
2606.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
9.61991
,
0.29135
,
0.28852
,
0.28971
,
0.29221
,
0.28994
,
0.28976
,
0.28887
,
0.28975
,
0.2869
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86122
,
"5"
:
10.88248
,
"10"
:
10.83515
,
"15"
:
10.82747
,
"20"
:
10.72762
,
"25"
:
10.55769
,
"30"
:
10.37915
,
"35"
:
10.28345
,
"40"
:
10.08809
,
"45"
:
9.82642
,
"50"
:
9.91341
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1694.0
,
"5"
:
2127.0
,
"10"
:
1548.0
,
"15"
:
1997.0
,
"20"
:
1846.0
,
"25"
:
1802.0
,
"30"
:
2112.0
,
"35"
:
2172.0
,
"40"
:
2560.0
,
"45"
:
2397.0
,
"50"
:
2761.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
382956544.0
,
"5"
:
382956544.0
,
"10"
:
382956544.0
,
"15"
:
382956544.0
,
"20"
:
382956544.0
,
"25"
:
382956544.0
,
"30"
:
382956544.0
,
"35"
:
382956544.0
,
"40"
:
382956544.0
,
"45"
:
382956544.0
,
"50"
:
382956544.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1496754688.0
,
"5"
:
1628741632.0
,
"10"
:
1628741632.0
,
"15"
:
1628741632.0
,
"20"
:
1628741632.0
,
"25"
:
1628741632.0
,
"30"
:
1628741632.0
,
"35"
:
1628741632.0
,
"40"
:
1628741632.0
,
"45"
:
1628741632.0
,
"50"
:
1628741632.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4.5526
,
"5"
:
0.28707
,
"10"
:
0.28966
,
"15"
:
0.28958
,
"20"
:
0.28862
,
"25"
:
0.28956
,
"30"
:
0.28644
,
"35"
:
0.28887
,
"40"
:
0.28562
,
"45"
:
0.28552
,
"50"
:
0.28692
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.86122
,
10.88647
,
10.87773
,
10.83111
,
10.7165
,
10.60623
,
10.13146
,
10.2277
,
10.15933
,
9.8348
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1694.0
,
2148.0
,
2169.0
,
2103.0
,
1991.0
,
1869.0
,
1760.0
,
2214.0
,
2529.0
,
2587.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
11.72537
,
0.29824
,
0.29549
,
0.29574
,
0.29514
,
0.29533
,
0.29415
,
0.30722
,
0.29731
,
0.29867
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86122
,
"5"
:
10.88248
,
"10"
:
10.83515
,
"15"
:
10.82747
,
"20"
:
10.72762
,
"25"
:
10.55769
,
"30"
:
10.37919
,
"35"
:
10.28344
,
"40"
:
10.08807
,
"45"
:
9.82644
,
"50"
:
9.9134
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1694.0
,
"5"
:
2127.0
,
"10"
:
1548.0
,
"15"
:
1997.0
,
"20"
:
1846.0
,
"25"
:
1700.0
,
"30"
:
2165.0
,
"35"
:
2194.0
,
"40"
:
2540.0
,
"45"
:
2414.0
,
"50"
:
2586.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
382956544.0
,
"5"
:
382956544.0
,
"10"
:
382956544.0
,
"15"
:
382956544.0
,
"20"
:
382956544.0
,
"25"
:
382956544.0
,
"30"
:
382956544.0
,
"35"
:
382956544.0
,
"40"
:
382956544.0
,
"45"
:
382956544.0
,
"50"
:
382956544.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1497803776.0
,
"5"
:
1628741632.0
,
"10"
:
1628741632.0
,
"15"
:
1628741632.0
,
"20"
:
1628741632.0
,
"25"
:
1628741632.0
,
"30"
:
1628741632.0
,
"35"
:
1628741632.0
,
"40"
:
1628741632.0
,
"45"
:
1628741632.0
,
"50"
:
1628741632.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
8.04015
,
"5"
:
0.30609
,
"10"
:
0.30611
,
"15"
:
0.30476
,
"20"
:
0.30451
,
"25"
:
0.3037
,
"30"
:
0.30473
,
"35"
:
0.30527
,
"40"
:
0.30608
,
"45"
:
0.30141
,
"50"
:
0.30553
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -46,4 +46,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86104
,
"5"
:
10.88371
,
"10"
:
10.84263
,
"15"
:
10.87936
,
"20"
:
10.87404
,
"25"
:
10.82866
,
"30"
:
10.77191
,
"35"
:
10.67622
,
"40"
:
10.517
,
"45"
:
10.28436
,
"50"
:
10.27862
,
"55"
:
10.20113
,
"60"
:
9.83306
,
"65"
:
9.26979
,
"70"
:
9.92663
,
"75"
:
9.61385
,
"80"
:
9.56419
,
"85"
:
9.74319
,
"90"
:
9.92148
,
"95"
:
9.6163
,
"100"
:
9.5087
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
,
"55"
:
416513536.0
,
"60"
:
416513536.0
,
"65"
:
416513536.0
,
"70"
:
416513536.0
,
"75"
:
416513536.0
,
"80"
:
416513536.0
,
"85"
:
416513536.0
,
"90"
:
416513536.0
,
"95"
:
416513536.0
,
"100"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1465367552.0
,
"5"
:
1465367552.0
,
"10"
:
1465368576.0
,
"15"
:
1465368576.0
,
"20"
:
1597092352.0
,
"25"
:
1597092352.0
,
"30"
:
1597092352.0
,
"35"
:
1597092352.0
,
"40"
:
1597092352.0
,
"45"
:
1597092352.0
,
"50"
:
1597092352.0
,
"55"
:
1597092352.0
,
"60"
:
1597092352.0
,
"65"
:
1597092352.0
,
"70"
:
1597092352.0
,
"75"
:
1597092352.0
,
"80"
:
1597092352.0
,
"85"
:
1597092352.0
,
"90"
:
1597092352.0
,
"95"
:
1597092352.0
,
"100"
:
1597092352.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3.88381
,
"5"
:
0.28491
,
"10"
:
0.28089
,
"15"
:
0.28096
,
"20"
:
0.2941
,
"25"
:
0.29217
,
"30"
:
0.29189
,
"35"
:
0.29014
,
"40"
:
0.29008
,
"45"
:
0.28992
,
"50"
:
0.29002
,
"55"
:
0.29062
,
"60"
:
0.29185
,
"65"
:
0.28998
,
"70"
:
0.28985
,
"75"
:
0.29115
,
"80"
:
0.29089
,
"85"
:
0.29148
,
"90"
:
0.2908
,
"95"
:
0.29004
,
"100"
:
0.29109
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1620.0
,
"25"
:
2028.0
,
"30"
:
2272.0
,
"35"
:
1848.0
,
"40"
:
1954.0
,
"45"
:
2388.0
,
"50"
:
2548.0
,
"55"
:
2269.0
,
"60"
:
2744.0
,
"65"
:
2295.0
,
"70"
:
3777.0
,
"75"
:
3002.0
,
"80"
:
3528.0
,
"85"
:
3660.0
,
"90"
:
3705.0
,
"95"
:
4147.0
,
"100"
:
3569.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86104
,
"5"
:
10.88371
,
"10"
:
10.84263
,
"15"
:
10.87936
,
"20"
:
10.87404
,
"25"
:
10.82866
,
"30"
:
10.77191
,
"35"
:
10.67622
,
"40"
:
10.517
,
"45"
:
10.28436
,
"50"
:
10.27862
,
"55"
:
10.20112
,
"60"
:
9.83306
,
"65"
:
9.26979
,
"70"
:
9.92662
,
"75"
:
9.61385
,
"80"
:
9.56419
,
"85"
:
9.74319
,
"90"
:
9.92149
,
"95"
:
9.6163
,
"100"
:
9.5087
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
,
"55"
:
416513536.0
,
"60"
:
416513536.0
,
"65"
:
416513536.0
,
"70"
:
416513536.0
,
"75"
:
416513536.0
,
"80"
:
416513536.0
,
"85"
:
416513536.0
,
"90"
:
416513536.0
,
"95"
:
416513536.0
,
"100"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1465368064.0
,
"5"
:
1465368064.0
,
"10"
:
1465368064.0
,
"15"
:
1465368064.0
,
"20"
:
1596303360.0
,
"25"
:
1596303360.0
,
"30"
:
1596303360.0
,
"35"
:
1596303360.0
,
"40"
:
1596303360.0
,
"45"
:
1596303360.0
,
"50"
:
1596303360.0
,
"55"
:
1596303360.0
,
"60"
:
1596303360.0
,
"65"
:
1596303360.0
,
"70"
:
1596303360.0
,
"75"
:
1596304896.0
,
"80"
:
1596305408.0
,
"85"
:
1596305408.0
,
"90"
:
1596305408.0
,
"95"
:
1596305408.0
,
"100"
:
1596305920.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
11.25031
,
"5"
:
0.29691
,
"10"
:
0.29639
,
"15"
:
0.29502
,
"20"
:
0.3291
,
"25"
:
0.30842
,
"30"
:
0.31824
,
"35"
:
0.31378
,
"40"
:
0.31056
,
"45"
:
0.30902
,
"50"
:
0.30807
,
"55"
:
0.30895
,
"60"
:
0.31556
,
"65"
:
0.308
,
"70"
:
0.31154
,
"75"
:
0.30756
,
"80"
:
0.314
,
"85"
:
0.3103
,
"90"
:
0.3142
,
"95"
:
0.30701
,
"100"
:
0.30658
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1620.0
,
"25"
:
2028.0
,
"30"
:
2272.0
,
"35"
:
1848.0
,
"40"
:
1954.0
,
"45"
:
2388.0
,
"50"
:
2605.0
,
"55"
:
2341.0
,
"60"
:
2883.0
,
"65"
:
2307.0
,
"70"
:
3652.0
,
"75"
:
2877.0
,
"80"
:
3537.0
,
"85"
:
3698.0
,
"90"
:
3545.0
,
"95"
:
4040.0
,
"100"
:
3472.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -46,4 +46,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86104
,
"5"
:
10.88371
,
"10"
:
10.84263
,
"15"
:
10.87936
,
"20"
:
10.87404
,
"25"
:
10.82866
,
"30"
:
10.77191
,
"35"
:
10.67622
,
"40"
:
10.517
,
"45"
:
10.28436
,
"50"
:
10.27862
,
"55"
:
10.20113
,
"60"
:
9.83306
,
"65"
:
9.26979
,
"70"
:
9.92663
,
"75"
:
9.61385
,
"80"
:
9.56419
,
"85"
:
9.74319
,
"90"
:
9.92148
,
"95"
:
9.6163
,
"100"
:
9.5087
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
,
"55"
:
416513536.0
,
"60"
:
416513536.0
,
"65"
:
416513536.0
,
"70"
:
416513536.0
,
"75"
:
416513536.0
,
"80"
:
416513536.0
,
"85"
:
416513536.0
,
"90"
:
416513536.0
,
"95"
:
416513536.0
,
"100"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1465368064.0
,
"5"
:
1465368064.0
,
"10"
:
1465368576.0
,
"15"
:
1465368576.0
,
"20"
:
1597092352.0
,
"25"
:
1597092352.0
,
"30"
:
1597092352.0
,
"35"
:
1597092352.0
,
"40"
:
1597092352.0
,
"45"
:
1597092352.0
,
"50"
:
1597092352.0
,
"55"
:
1597092352.0
,
"60"
:
1597092352.0
,
"65"
:
1597092352.0
,
"70"
:
1597092352.0
,
"75"
:
1597092352.0
,
"80"
:
1597092352.0
,
"85"
:
1597092352.0
,
"90"
:
1597092352.0
,
"95"
:
1597092352.0
,
"100"
:
1597092352.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4.45696
,
"5"
:
0.28792
,
"10"
:
0.28811
,
"15"
:
0.28636
,
"20"
:
0.30153
,
"25"
:
0.29748
,
"30"
:
0.29505
,
"35"
:
0.29452
,
"40"
:
0.29464
,
"45"
:
0.29589
,
"50"
:
0.29474
,
"55"
:
0.29138
,
"60"
:
0.29052
,
"65"
:
0.28928
,
"70"
:
0.29165
,
"75"
:
0.29065
,
"80"
:
0.29154
,
"85"
:
0.29123
,
"90"
:
0.29106
,
"95"
:
0.29151
,
"100"
:
0.29157
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1620.0
,
"25"
:
2028.0
,
"30"
:
2272.0
,
"35"
:
1848.0
,
"40"
:
1954.0
,
"45"
:
2388.0
,
"50"
:
2548.0
,
"55"
:
2269.0
,
"60"
:
2744.0
,
"65"
:
2295.0
,
"70"
:
3777.0
,
"75"
:
3002.0
,
"80"
:
3528.0
,
"85"
:
3660.0
,
"90"
:
3705.0
,
"95"
:
4147.0
,
"100"
:
3569.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86104
,
"5"
:
10.88371
,
"10"
:
10.84263
,
"15"
:
10.87936
,
"20"
:
10.87404
,
"25"
:
10.82866
,
"30"
:
10.77191
,
"35"
:
10.67622
,
"40"
:
10.517
,
"45"
:
10.28436
,
"50"
:
10.27862
,
"55"
:
10.20112
,
"60"
:
9.83306
,
"65"
:
9.26979
,
"70"
:
9.92662
,
"75"
:
9.61385
,
"80"
:
9.56419
,
"85"
:
9.74319
,
"90"
:
9.92149
,
"95"
:
9.6163
,
"100"
:
9.5087
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
,
"55"
:
416513536.0
,
"60"
:
416513536.0
,
"65"
:
416513536.0
,
"70"
:
416513536.0
,
"75"
:
416513536.0
,
"80"
:
416513536.0
,
"85"
:
416513536.0
,
"90"
:
416513536.0
,
"95"
:
416513536.0
,
"100"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1463270912.0
,
"5"
:
1465368576.0
,
"10"
:
1465368576.0
,
"15"
:
1465368576.0
,
"20"
:
1597092352.0
,
"25"
:
1597092352.0
,
"30"
:
1597092352.0
,
"35"
:
1597092352.0
,
"40"
:
1597092352.0
,
"45"
:
1597092352.0
,
"50"
:
1597092352.0
,
"55"
:
1597092352.0
,
"60"
:
1597092352.0
,
"65"
:
1597092352.0
,
"70"
:
1597092352.0
,
"75"
:
1597092352.0
,
"80"
:
1597092352.0
,
"85"
:
1597092352.0
,
"90"
:
1597092352.0
,
"95"
:
1597092352.0
,
"100"
:
1597092352.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
7.29813
,
"5"
:
0.29344
,
"10"
:
0.29293
,
"15"
:
0.29093
,
"20"
:
0.30728
,
"25"
:
0.31074
,
"30"
:
0.30695
,
"35"
:
0.30576
,
"40"
:
0.30871
,
"45"
:
0.31067
,
"50"
:
0.30611
,
"55"
:
0.3052
,
"60"
:
0.30899
,
"65"
:
0.30587
,
"70"
:
0.30945
,
"75"
:
0.30233
,
"80"
:
0.30465
,
"85"
:
0.30549
,
"90"
:
0.30363
,
"95"
:
0.30609
,
"100"
:
0.3023
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1620.0
,
"25"
:
2028.0
,
"30"
:
2272.0
,
"35"
:
1848.0
,
"40"
:
1954.0
,
"45"
:
2388.0
,
"50"
:
2605.0
,
"55"
:
2341.0
,
"60"
:
2883.0
,
"65"
:
2307.0
,
"70"
:
3652.0
,
"75"
:
2877.0
,
"80"
:
3537.0
,
"85"
:
3698.0
,
"90"
:
3545.0
,
"95"
:
4040.0
,
"100"
:
3472.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -47,4 +47,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.87346
,
10.89625
,
10.88939
,
10.88681
,
10.8893
,
10.84863
,
10.6962
,
10.63919
,
10.53931
,
10.31119
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
4.95266
,
0.07818
,
0.07961
,
0.07716
,
0.08368
,
0.08327
,
0.08409
,
0.08371
,
0.08372
,
0.08387
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
32
,
"step_interval"
:
5
,
"values"
:
[
1300.0
,
1287.0
,
1565.0
,
1441.0
,
1419.0
,
1295.0
,
1177.0
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.87346
,
"5"
:
10.86718
,
"10"
:
10.85561
,
"15"
:
10.88831
,
"20"
:
10.87704
,
"25"
:
10.84986
,
"30"
:
10.76439
,
"35"
:
10.68583
,
"40"
:
10.52311
,
"45"
:
10.32331
,
"50"
:
10.29634
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
454770688.0
,
"5"
:
454770688.0
,
"10"
:
454770688.0
,
"15"
:
454770688.0
,
"20"
:
518880768.0
,
"25"
:
518880768.0
,
"30"
:
518880768.0
,
"35"
:
518880768.0
,
"40"
:
518880768.0
,
"45"
:
518880768.0
,
"50"
:
518880768.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4511150592.0
,
"5"
:
4544705536.0
,
"10"
:
4544705536.0
,
"15"
:
4544705536.0
,
"20"
:
4607767040.0
,
"25"
:
4607767040.0
,
"30"
:
4607767040.0
,
"35"
:
4607767040.0
,
"40"
:
4607767040.0
,
"45"
:
4607767040.0
,
"50"
:
4607767040.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
5.60068
,
"5"
:
0.07688
,
"10"
:
0.07554
,
"15"
:
0.07566
,
"20"
:
0.33206
,
"25"
:
0.08271
,
"30"
:
0.08222
,
"35"
:
0.08267
,
"40"
:
0.08317
,
"45"
:
0.08236
,
"50"
:
0.08327
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1221.0
,
"25"
:
1129.0
,
"30"
:
1441.0
,
"35"
:
1322.0
,
"40"
:
1381.0
,
"45"
:
1282.0
,
"50"
:
1426.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.87346
,
10.89625
,
10.88939
,
10.88681
,
10.8893
,
10.84864
,
10.6962
,
10.63918
,
10.5393
,
10.31119
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
32
,
"step_interval"
:
5
,
"values"
:
[
1298.0
,
1352.0
,
1590.0
,
1403.0
,
1435.0
,
1266.0
,
1195.0
]},
"iteration_timing_avg"
:
0.07655911764705883
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.87346
,
"5"
:
10.86718
,
"10"
:
10.85561
,
"15"
:
10.88831
,
"20"
:
10.87704
,
"25"
:
10.84986
,
"30"
:
10.7644
,
"35"
:
10.68583
,
"40"
:
10.5231
,
"45"
:
10.32331
,
"50"
:
10.29634
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
454770688.0
,
"5"
:
454770688.0
,
"10"
:
454770688.0
,
"15"
:
454770688.0
,
"20"
:
518880768.0
,
"25"
:
518880768.0
,
"30"
:
518880768.0
,
"35"
:
518880768.0
,
"40"
:
518880768.0
,
"45"
:
518880768.0
,
"50"
:
518880768.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4511150592.0
,
"5"
:
4544705536.0
,
"10"
:
4544705536.0
,
"15"
:
4544705536.0
,
"20"
:
4607767040.0
,
"25"
:
4607767040.0
,
"30"
:
4607767040.0
,
"35"
:
4607767040.0
,
"40"
:
4607767040.0
,
"45"
:
4607767040.0
,
"50"
:
4607767040.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.97955
,
"5"
:
0.09865
,
"10"
:
0.09755
,
"15"
:
0.08417
,
"20"
:
0.09136
,
"25"
:
0.09055
,
"30"
:
0.09084
,
"35"
:
0.09134
,
"40"
:
0.09058
,
"45"
:
0.09138
,
"50"
:
0.09003
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1279.0
,
"25"
:
1219.0
,
"30"
:
1421.0
,
"35"
:
1249.0
,
"40"
:
1452.0
,
"45"
:
1336.0
,
"50"
:
1455.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -48,4 +48,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.87346
,
10.89625
,
10.88939
,
10.88681
,
10.88931
,
10.84864
,
10.6962
,
10.63918
,
10.5393
,
10.31119
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
5.32064
,
0.08204
,
0.08233
,
0.08176
,
0.09748
,
0.0966
,
0.09648
,
0.09617
,
0.09604
,
0.09646
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
32
,
"step_interval"
:
5
,
"values"
:
[
1112.0
,
1124.0
,
1229.0
,
1665.0
,
1269.0
,
1219.0
,
1572.0
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.87346
,
"5"
:
10.86718
,
"10"
:
10.85561
,
"15"
:
10.88831
,
"20"
:
10.87704
,
"25"
:
10.84986
,
"30"
:
10.7644
,
"35"
:
10.68582
,
"40"
:
10.5231
,
"45"
:
10.32331
,
"50"
:
10.29634
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
911795200.0
,
"5"
:
911795200.0
,
"10"
:
911795200.0
,
"15"
:
911795200.0
,
"20"
:
1426769408.0
,
"25"
:
1426769408.0
,
"30"
:
1426769408.0
,
"35"
:
1426769408.0
,
"40"
:
1426769408.0
,
"45"
:
1426769408.0
,
"50"
:
1426769408.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4738548736.0
,
"5"
:
4772103680.0
,
"10"
:
4772103680.0
,
"15"
:
4772103680.0
,
"20"
:
5286291456.0
,
"25"
:
5286291456.0
,
"30"
:
5286291456.0
,
"35"
:
5286291456.0
,
"40"
:
5286291456.0
,
"45"
:
5286291456.0
,
"50"
:
5286291456.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3.51674
,
"5"
:
0.08141
,
"10"
:
0.08052
,
"15"
:
0.07992
,
"20"
:
0.09632
,
"25"
:
0.09637
,
"30"
:
0.09667
,
"35"
:
0.09681
,
"40"
:
0.09734
,
"45"
:
0.09652
,
"50"
:
0.09765
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1974.0
,
"25"
:
1113.0
,
"30"
:
994.0
,
"35"
:
1045.0
,
"40"
:
1324.0
,
"45"
:
1573.0
,
"50"
:
1267.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.87346
,
10.89625
,
10.88939
,
10.88681
,
10.88931
,
10.84864
,
10.6962
,
10.63918
,
10.53931
,
10.31119
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
32
,
"step_interval"
:
5
,
"values"
:
[
1131.0
,
1173.0
,
1218.0
,
1783.0
,
1278.0
,
1244.0
,
1555.0
]},
"iteration_timing_avg"
:
0.07975499999999999
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.87346
,
"5"
:
10.86718
,
"10"
:
10.85561
,
"15"
:
10.88831
,
"20"
:
10.87703
,
"25"
:
10.84986
,
"30"
:
10.76439
,
"35"
:
10.68583
,
"40"
:
10.5231
,
"45"
:
10.32331
,
"50"
:
10.29634
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
911795200.0
,
"5"
:
911795200.0
,
"10"
:
911795200.0
,
"15"
:
911795200.0
,
"20"
:
1426769408.0
,
"25"
:
1426769408.0
,
"30"
:
1426769408.0
,
"35"
:
1426769408.0
,
"40"
:
1426769408.0
,
"45"
:
1426769408.0
,
"50"
:
1426769408.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4738548736.0
,
"5"
:
4772103680.0
,
"10"
:
4772103680.0
,
"15"
:
4772103680.0
,
"20"
:
5286291456.0
,
"25"
:
5286291456.0
,
"30"
:
5286291456.0
,
"35"
:
5286291456.0
,
"40"
:
5286291456.0
,
"45"
:
5286291456.0
,
"50"
:
5286291456.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
7.38932
,
"5"
:
0.08356
,
"10"
:
0.08398
,
"15"
:
0.09924
,
"20"
:
0.09907
,
"25"
:
0.09964
,
"30"
:
0.09945
,
"35"
:
0.10076
,
"40"
:
0.09872
,
"45"
:
0.09961
,
"50"
:
0.09911
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
2075.0
,
"25"
:
1126.0
,
"30"
:
1049.0
,
"35"
:
1033.0
,
"40"
:
1364.0
,
"45"
:
1599.0
,
"50"
:
1249.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -47,4 +47,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.84009
,
10.89314
,
10.908
,
10.87524
,
10.86367
,
10.83848
,
10.64647
,
10.62126
,
10.53743
,
10.24831
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
33
,
"step_interval"
:
5
,
"values"
:
[
2044.0
,
2242.0
,
2368.0
,
2598.0
,
2188.0
,
1850.0
,
2436.0
]},
"iteration_timing_avg"
:
0.10581941176470588
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84009
,
"5"
:
10.87275
,
"10"
:
10.8333
,
"15"
:
10.87115
,
"20"
:
10.85956
,
"25"
:
10.8165
,
"30"
:
10.7379
,
"35"
:
10.66607
,
"40"
:
10.50091
,
"45"
:
10.26832
,
"50"
:
10.25759
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
685659136.0
,
"5"
:
685659136.0
,
"10"
:
685659136.0
,
"15"
:
685659136.0
,
"20"
:
1043027456.0
,
"25"
:
1043027456.0
,
"30"
:
1043027456.0
,
"35"
:
1043027456.0
,
"40"
:
1043027456.0
,
"45"
:
1043027456.0
,
"50"
:
1043027456.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3187304960.0
,
"5"
:
3187305472.0
,
"10"
:
3187305472.0
,
"15"
:
3187305472.0
,
"20"
:
3544935936.0
,
"25"
:
3544935936.0
,
"30"
:
3544935936.0
,
"35"
:
3544935936.0
,
"40"
:
3544935936.0
,
"45"
:
3544935936.0
,
"50"
:
3544935936.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4.83724
,
"5"
:
0.1196
,
"10"
:
0.11844
,
"15"
:
0.11713
,
"20"
:
0.12863
,
"25"
:
0.12877
,
"30"
:
0.13001
,
"35"
:
0.12746
,
"40"
:
0.127
,
"45"
:
0.12743
,
"50"
:
0.12672
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
2206.0
,
"25"
:
1990.0
,
"30"
:
2857.0
,
"35"
:
2070.0
,
"40"
:
2038.0
,
"45"
:
2212.0
,
"50"
:
2256.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.84009
,
10.89314
,
10.908
,
10.87524
,
10.86367
,
10.83848
,
10.64647
,
10.62126
,
10.53743
,
10.24831
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
33
,
"step_interval"
:
5
,
"values"
:
[
2044.0
,
2242.0
,
2368.0
,
2598.0
,
2188.0
,
1850.0
,
2436.0
]},
"iteration_timing_avg"
:
0.10581941176470588
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84009
,
"5"
:
10.87275
,
"10"
:
10.8333
,
"15"
:
10.87115
,
"20"
:
10.85956
,
"25"
:
10.8165
,
"30"
:
10.7379
,
"35"
:
10.66607
,
"40"
:
10.50091
,
"45"
:
10.26832
,
"50"
:
10.25759
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
685659136.0
,
"5"
:
685659136.0
,
"10"
:
685659136.0
,
"15"
:
685659136.0
,
"20"
:
1043027456.0
,
"25"
:
1043027456.0
,
"30"
:
1043027456.0
,
"35"
:
1043027456.0
,
"40"
:
1043027456.0
,
"45"
:
1043027456.0
,
"50"
:
1043027456.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3187304960.0
,
"5"
:
3187305472.0
,
"10"
:
3187305472.0
,
"15"
:
3187305472.0
,
"20"
:
3544935936.0
,
"25"
:
3544935936.0
,
"30"
:
3544935936.0
,
"35"
:
3544935936.0
,
"40"
:
3544935936.0
,
"45"
:
3544935936.0
,
"50"
:
3544935936.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.4701
,
"5"
:
0.1208
,
"10"
:
0.1223
,
"15"
:
0.11887
,
"20"
:
0.12942
,
"25"
:
0.12991
,
"30"
:
0.12979
,
"35"
:
0.12982
,
"40"
:
0.12913
,
"45"
:
0.12942
,
"50"
:
0.1287
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
2206.0
,
"25"
:
1990.0
,
"30"
:
2857.0
,
"35"
:
2070.0
,
"40"
:
2038.0
,
"45"
:
2212.0
,
"50"
:
2256.0
}}}
\ No newline at end of file
Prev
1
…
14
15
16
17
18
19
20
21
22
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment