Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
688448db
Commit
688448db
authored
Mar 14, 2025
by
silencealiang
Browse files
更新代码
parent
a02a5490
Pipeline
#2503
passed with stage
Changes
823
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
42 additions
and
31 deletions
+42
-31
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json
...1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml
...a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json
...2_resume_torch_overlap_grad_reduce/golden_values_dev.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json
...2_resume_torch_overlap_grad_reduce/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml
...p2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json
...345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json
...345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml
...gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json
...0_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json
...0_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml
...x_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json
...dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json
...dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml
...htly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml
...re_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml
...x_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml
...N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml
...ekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml
...y_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml
...gx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml
+3
-2
No files found.
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
[
10.80264
,
10.85778
,
10.86259
,
10.83903
,
10.82934
,
10.81016
,
10.60251
,
10.61471
,
10.54092
,
10.27186
,
10.24338
,
10.02058
,
10.03017
,
9.99471
,
9.84885
,
9.34867
,
9.67263
,
9.2457
,
9.53365
,
9.67548
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
84
,
"step_interval"
:
5
,
"values"
:
[
8571.0
,
7897.0
,
7748.0
,
9008.0
,
9165.0
,
8986.0
,
9155.0
,
7960.0
,
7684.0
,
9743.0
,
8727.0
,
9382.0
,
10992.0
,
11177.0
,
11270.0
,
13404.0
,
11533.0
]},
"iteration_timing_avg"
:
0.3735462686567164
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.76813
,
"5"
:
10.82426
,
"10"
:
10.7488
,
"15"
:
10.82093
,
"20"
:
10.79407
,
"25"
:
10.74528
,
"30"
:
10.68463
,
"35"
:
10.62109
,
"40"
:
10.47053
,
"45"
:
10.24915
,
"50"
:
10.27379
,
"55"
:
10.20448
,
"60"
:
9.84999
,
"65"
:
9.28499
,
"70"
:
9.94476
,
"75"
:
9.62753
,
"80"
:
9.57725
,
"85"
:
9.76823
,
"90"
:
9.93273
,
"95"
:
9.64547
,
"100"
:
9.53769
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
607448576.0
,
"5"
:
607448576.0
,
"10"
:
607448576.0
,
"15"
:
607448576.0
,
"20"
:
944340992.0
,
"25"
:
944037888.0
,
"30"
:
944954368.0
,
"35"
:
944078848.0
,
"40"
:
944078848.0
,
"45"
:
944078848.0
,
"50"
:
944992256.0
,
"55"
:
944078848.0
,
"60"
:
944078848.0
,
"65"
:
943674368.0
,
"70"
:
945127424.0
,
"75"
:
944078848.0
,
"80"
:
944322560.0
,
"85"
:
944078848.0
,
"90"
:
944078848.0
,
"95"
:
944078848.0
,
"100"
:
944993280.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1843249152.0
,
"5"
:
1843369472.0
,
"10"
:
1844654592.0
,
"15"
:
1844654592.0
,
"20"
:
2181567488.0
,
"25"
:
2181567488.0
,
"30"
:
2181567488.0
,
"35"
:
2181567488.0
,
"40"
:
2181567488.0
,
"45"
:
2181567488.0
,
"50"
:
2181567488.0
,
"55"
:
2181567488.0
,
"60"
:
2181567488.0
,
"65"
:
2181567488.0
,
"70"
:
2181567488.0
,
"75"
:
2181567488.0
,
"80"
:
2181567488.0
,
"85"
:
2181567488.0
,
"90"
:
2181567488.0
,
"95"
:
2181635584.0
,
"100"
:
2181635584.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
18.6534
,
"5"
:
0.52336
,
"10"
:
0.51659
,
"15"
:
0.52097
,
"20"
:
0.5413
,
"25"
:
0.56055
,
"30"
:
0.53271
,
"35"
:
0.54237
,
"40"
:
0.5352
,
"45"
:
0.53408
,
"50"
:
0.53304
,
"55"
:
0.53075
,
"60"
:
0.53399
,
"65"
:
0.53294
,
"70"
:
0.53179
,
"75"
:
0.69389
,
"80"
:
0.531
,
"85"
:
0.52842
,
"90"
:
0.53117
,
"95"
:
0.53133
,
"100"
:
0.72087
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
7024.0
,
"25"
:
7902.0
,
"30"
:
8336.0
,
"35"
:
7346.0
,
"40"
:
7522.0
,
"45"
:
8100.0
,
"50"
:
8998.0
,
"55"
:
8207.0
,
"60"
:
9031.0
,
"65"
:
7785.0
,
"70"
:
10580.0
,
"75"
:
9533.0
,
"80"
:
11195.0
,
"85"
:
11864.0
,
"90"
:
12414.0
,
"95"
:
13058.0
,
"100"
:
10097.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -51,4 +51,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
[
10.85929
,
10.89211
,
10.87639
,
10.86988
,
10.88179
,
10.83898
,
10.66589
,
10.62691
,
10.52461
,
10.25708
,
10.19741
,
9.9562
,
9.96369
,
9.91398
,
9.79604
,
9.2686
,
9.61975
,
9.19501
,
9.47332
,
9.62216
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
83
,
"step_interval"
:
5
,
"values"
:
[
2458.0
,
2527.0
,
2467.0
,
2148.0
,
2250.0
,
2467.0
,
2528.0
,
3656.0
,
3275.0
,
3203.0
,
3297.0
,
3364.0
,
3789.0
,
3277.0
,
3660.0
,
3733.0
,
4815.0
]},
"iteration_timing_avg"
:
0.1628459701492537
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.85929
,
"5"
:
10.87929
,
"10"
:
10.84772
,
"15"
:
10.86867
,
"20"
:
10.87317
,
"25"
:
10.83338
,
"30"
:
10.75624
,
"35"
:
10.66844
,
"40"
:
10.50171
,
"45"
:
10.28002
,
"50"
:
10.25621
,
"55"
:
10.18314
,
"60"
:
9.79897
,
"65"
:
9.24752
,
"70"
:
9.91362
,
"75"
:
9.58564
,
"80"
:
9.54312
,
"85"
:
9.72736
,
"90"
:
9.90472
,
"95"
:
9.6077
,
"100"
:
9.49935
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
380476416.0
,
"5"
:
380476416.0
,
"10"
:
380476416.0
,
"15"
:
380476416.0
,
"20"
:
560287232.0
,
"25"
:
560287232.0
,
"30"
:
560287232.0
,
"35"
:
561073664.0
,
"40"
:
560287232.0
,
"45"
:
561597952.0
,
"50"
:
561597952.0
,
"55"
:
561073664.0
,
"60"
:
561073664.0
,
"65"
:
561597952.0
,
"70"
:
560287232.0
,
"75"
:
560287232.0
,
"80"
:
560287232.0
,
"85"
:
560287232.0
,
"90"
:
560287232.0
,
"95"
:
561597952.0
,
"100"
:
560287232.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1702977024.0
,
"5"
:
1702977536.0
,
"10"
:
1702977536.0
,
"15"
:
1702977536.0
,
"20"
:
1884361216.0
,
"25"
:
1884361216.0
,
"30"
:
1884361216.0
,
"35"
:
1884361216.0
,
"40"
:
1884361216.0
,
"45"
:
1884361216.0
,
"50"
:
1884361216.0
,
"55"
:
1884361216.0
,
"60"
:
1884361216.0
,
"65"
:
1884361216.0
,
"70"
:
1884361216.0
,
"75"
:
1884361216.0
,
"80"
:
1884361216.0
,
"85"
:
1884361216.0
,
"90"
:
1884361216.0
,
"95"
:
1884361216.0
,
"100"
:
1884361216.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4.70051
,
"5"
:
0.18489
,
"10"
:
0.1885
,
"15"
:
0.18516
,
"20"
:
0.19623
,
"25"
:
0.19562
,
"30"
:
0.19558
,
"35"
:
0.19543
,
"40"
:
0.19414
,
"45"
:
0.19546
,
"50"
:
0.1943
,
"55"
:
0.19481
,
"60"
:
0.19412
,
"65"
:
0.19731
,
"70"
:
0.19502
,
"75"
:
0.1953
,
"80"
:
0.19592
,
"85"
:
0.19662
,
"90"
:
0.19524
,
"95"
:
0.19564
,
"100"
:
0.19497
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1808.0
,
"25"
:
2385.0
,
"30"
:
2591.0
,
"35"
:
1997.0
,
"40"
:
1959.0
,
"45"
:
2368.0
,
"50"
:
3073.0
,
"55"
:
2580.0
,
"60"
:
2853.0
,
"65"
:
2346.0
,
"70"
:
3572.0
,
"75"
:
2886.0
,
"80"
:
3459.0
,
"85"
:
4068.0
,
"90"
:
3747.0
,
"95"
:
4088.0
,
"100"
:
3436.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
[
10.85929
,
10.89211
,
10.87639
,
10.86988
,
10.88179
,
10.83898
,
10.66589
,
10.62691
,
10.52461
,
10.25708
,
10.19741
,
9.9562
,
9.96369
,
9.91398
,
9.79604
,
9.2686
,
9.61975
,
9.19501
,
9.47332
,
9.62216
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
83
,
"step_interval"
:
5
,
"values"
:
[
2458.0
,
2527.0
,
2467.0
,
2148.0
,
2250.0
,
2467.0
,
2528.0
,
3656.0
,
3275.0
,
3203.0
,
3297.0
,
3364.0
,
3789.0
,
3277.0
,
3660.0
,
3733.0
,
4815.0
]},
"iteration_timing_avg"
:
0.1628459701492537
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.85929
,
"5"
:
10.87929
,
"10"
:
10.84772
,
"15"
:
10.86867
,
"20"
:
10.87317
,
"25"
:
10.83338
,
"30"
:
10.75624
,
"35"
:
10.66844
,
"40"
:
10.50171
,
"45"
:
10.28002
,
"50"
:
10.25621
,
"55"
:
10.18314
,
"60"
:
9.79897
,
"65"
:
9.24752
,
"70"
:
9.91362
,
"75"
:
9.58564
,
"80"
:
9.54312
,
"85"
:
9.72736
,
"90"
:
9.90472
,
"95"
:
9.6077
,
"100"
:
9.49935
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
378903552.0
,
"5"
:
379952128.0
,
"10"
:
379952128.0
,
"15"
:
379952128.0
,
"20"
:
560549376.0
,
"25"
:
560549376.0
,
"30"
:
560549376.0
,
"35"
:
560549376.0
,
"40"
:
560549376.0
,
"45"
:
560549376.0
,
"50"
:
560549376.0
,
"55"
:
561073664.0
,
"60"
:
561073664.0
,
"65"
:
560549376.0
,
"70"
:
560549376.0
,
"75"
:
560549376.0
,
"80"
:
560549376.0
,
"85"
:
560549376.0
,
"90"
:
560549376.0
,
"95"
:
560549376.0
,
"100"
:
560549376.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1704025600.0
,
"5"
:
1704026112.0
,
"10"
:
1704026112.0
,
"15"
:
1704026112.0
,
"20"
:
1886196224.0
,
"25"
:
1886196224.0
,
"30"
:
1886196224.0
,
"35"
:
1886196224.0
,
"40"
:
1886196224.0
,
"45"
:
1886196224.0
,
"50"
:
1886196224.0
,
"55"
:
1886196224.0
,
"60"
:
1886196224.0
,
"65"
:
1886196224.0
,
"70"
:
1886196224.0
,
"75"
:
1886196224.0
,
"80"
:
1886196224.0
,
"85"
:
1886196224.0
,
"90"
:
1886196224.0
,
"95"
:
1886196224.0
,
"100"
:
1886196224.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
13.26991
,
"5"
:
0.1958
,
"10"
:
0.19444
,
"15"
:
0.194
,
"20"
:
0.20361
,
"25"
:
0.20332
,
"30"
:
0.20368
,
"35"
:
0.20417
,
"40"
:
0.20368
,
"45"
:
0.20398
,
"50"
:
0.2037
,
"55"
:
0.20453
,
"60"
:
0.20433
,
"65"
:
0.20387
,
"70"
:
0.20373
,
"75"
:
0.20399
,
"80"
:
0.20347
,
"85"
:
0.20432
,
"90"
:
0.2036
,
"95"
:
0.20374
,
"100"
:
0.20437
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1808.0
,
"25"
:
2385.0
,
"30"
:
2591.0
,
"35"
:
1997.0
,
"40"
:
1959.0
,
"45"
:
2368.0
,
"50"
:
3073.0
,
"55"
:
2580.0
,
"60"
:
2853.0
,
"65"
:
2346.0
,
"70"
:
3572.0
,
"75"
:
2886.0
,
"80"
:
3459.0
,
"85"
:
4068.0
,
"90"
:
3747.0
,
"95"
:
4088.0
,
"100"
:
3436.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -48,4 +48,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.86312
,
10.87712
,
10.87347
,
10.88278
,
10.89457
,
10.84427
,
10.69023
,
10.62687
,
10.53974
,
10.26525
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
33
,
"step_interval"
:
5
,
"values"
:
[
2244.0
,
2273.0
,
2447.0
,
2031.0
,
2134.0
,
2491.0
,
2380.0
]},
"iteration_timing_avg"
:
0.23144205882352942
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86312
,
"5"
:
10.86984
,
"10"
:
10.84273
,
"15"
:
10.88712
,
"20"
:
10.87623
,
"25"
:
10.83465
,
"30"
:
10.75356
,
"35"
:
10.67297
,
"40"
:
10.50224
,
"45"
:
10.28079
,
"50"
:
10.27239
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1464318464.0
,
"5"
:
1464319488.0
,
"10"
:
1464320000.0
,
"15"
:
1464320000.0
,
"20"
:
1597091840.0
,
"25"
:
1597091840.0
,
"30"
:
1597091840.0
,
"35"
:
1597091840.0
,
"40"
:
1597092352.0
,
"45"
:
1597092352.0
,
"50"
:
1597092352.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
5.08228
,
"5"
:
0.27074
,
"10"
:
0.26257
,
"15"
:
0.26176
,
"20"
:
0.27712
,
"25"
:
0.27706
,
"30"
:
0.27709
,
"35"
:
0.28021
,
"40"
:
0.28046
,
"45"
:
0.27903
,
"50"
:
0.27978
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1645.0
,
"25"
:
2124.0
,
"30"
:
2345.0
,
"35"
:
1780.0
,
"40"
:
1936.0
,
"45"
:
2289.0
,
"50"
:
2738.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.86312
,
10.87712
,
10.87347
,
10.88278
,
10.89457
,
10.84427
,
10.69023
,
10.62687
,
10.53974
,
10.26525
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
33
,
"step_interval"
:
5
,
"values"
:
[
2244.0
,
2273.0
,
2447.0
,
2031.0
,
2134.0
,
2491.0
,
2380.0
]},
"iteration_timing_avg"
:
0.23144205882352942
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86312
,
"5"
:
10.86984
,
"10"
:
10.84273
,
"15"
:
10.88712
,
"20"
:
10.87623
,
"25"
:
10.83465
,
"30"
:
10.75356
,
"35"
:
10.67297
,
"40"
:
10.50224
,
"45"
:
10.28079
,
"50"
:
10.27239
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1465368064.0
,
"5"
:
1465368064.0
,
"10"
:
1465368576.0
,
"15"
:
1465368576.0
,
"20"
:
1596304896.0
,
"25"
:
1596304896.0
,
"30"
:
1596304896.0
,
"35"
:
1596304896.0
,
"40"
:
1596304896.0
,
"45"
:
1596305408.0
,
"50"
:
1596305408.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
6.797
,
"5"
:
0.28708
,
"10"
:
0.286
,
"15"
:
0.28021
,
"20"
:
0.30007
,
"25"
:
0.29697
,
"30"
:
0.29501
,
"35"
:
0.29587
,
"40"
:
0.29259
,
"45"
:
0.2983
,
"50"
:
0.29365
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1645.0
,
"25"
:
2124.0
,
"30"
:
2345.0
,
"35"
:
1780.0
,
"40"
:
1936.0
,
"45"
:
2289.0
,
"50"
:
2738.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -46,4 +46,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.86312
,
10.87712
,
10.87347
,
10.88278
,
10.89457
,
10.84427
,
10.69023
,
10.62687
,
10.53974
,
10.26525
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
33
,
"step_interval"
:
5
,
"values"
:
[
2244.0
,
2273.0
,
2447.0
,
2031.0
,
2134.0
,
2491.0
,
2380.0
]},
"iteration_timing_avg"
:
0.23131970588235293
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86312
,
"5"
:
10.86984
,
"10"
:
10.84273
,
"15"
:
10.88712
,
"20"
:
10.87623
,
"25"
:
10.83465
,
"30"
:
10.75356
,
"35"
:
10.67297
,
"40"
:
10.50224
,
"45"
:
10.28079
,
"50"
:
10.27239
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1464318464.0
,
"5"
:
1464320000.0
,
"10"
:
1464320000.0
,
"15"
:
1464320000.0
,
"20"
:
1597089792.0
,
"25"
:
1597091328.0
,
"30"
:
1597092352.0
,
"35"
:
1597092352.0
,
"40"
:
1597092352.0
,
"45"
:
1597092352.0
,
"50"
:
1597092352.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4.11891
,
"5"
:
0.27161
,
"10"
:
0.26629
,
"15"
:
0.2637
,
"20"
:
0.2814
,
"25"
:
0.28361
,
"30"
:
0.28297
,
"35"
:
0.28276
,
"40"
:
0.28313
,
"45"
:
0.2873
,
"50"
:
0.28552
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1645.0
,
"25"
:
2124.0
,
"30"
:
2345.0
,
"35"
:
1780.0
,
"40"
:
1936.0
,
"45"
:
2289.0
,
"50"
:
2738.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.86312
,
10.87712
,
10.87347
,
10.88278
,
10.89457
,
10.84427
,
10.69023
,
10.62687
,
10.53974
,
10.26525
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
33
,
"step_interval"
:
5
,
"values"
:
[
2244.0
,
2273.0
,
2447.0
,
2031.0
,
2134.0
,
2491.0
,
2380.0
]},
"iteration_timing_avg"
:
0.23131970588235293
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86312
,
"5"
:
10.86984
,
"10"
:
10.84273
,
"15"
:
10.88712
,
"20"
:
10.87623
,
"25"
:
10.83465
,
"30"
:
10.75356
,
"35"
:
10.67297
,
"40"
:
10.50224
,
"45"
:
10.28079
,
"50"
:
10.27239
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1464319488.0
,
"5"
:
1464320000.0
,
"10"
:
1465368576.0
,
"15"
:
1465368576.0
,
"20"
:
1596305408.0
,
"25"
:
1596305408.0
,
"30"
:
1596305408.0
,
"35"
:
1596305408.0
,
"40"
:
1596305408.0
,
"45"
:
1596305408.0
,
"50"
:
1596305920.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
7.22206
,
"5"
:
0.28793
,
"10"
:
0.2833
,
"15"
:
0.28906
,
"20"
:
0.29969
,
"25"
:
0.30075
,
"30"
:
0.29561
,
"35"
:
0.30149
,
"40"
:
0.29547
,
"45"
:
0.30118
,
"50"
:
0.29352
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1645.0
,
"25"
:
2124.0
,
"30"
:
2345.0
,
"35"
:
1780.0
,
"40"
:
1936.0
,
"45"
:
2289.0
,
"50"
:
2738.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -47,4 +47,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
[
10.86312
,
10.87712
,
10.87347
,
10.88278
,
10.89457
,
10.84427
,
10.69023
,
10.62687
,
10.53974
,
10.26525
,
10.21403
,
9.9801
,
9.96977
,
9.93973
,
9.81158
,
9.28667
,
9.63194
,
9.19732
,
9.48341
,
9.62985
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
83
,
"step_interval"
:
5
,
"values"
:
[
2244.0
,
2273.0
,
2447.0
,
2031.0
,
2134.0
,
2491.0
,
2380.0
,
3451.0
,
3205.0
,
2940.0
,
3143.0
,
3310.0
,
3884.0
,
3232.0
,
3491.0
,
3751.0
,
5022.0
]},
"iteration_timing_avg"
:
0.22914074626865674
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86312
,
"5"
:
10.86984
,
"10"
:
10.84273
,
"15"
:
10.88712
,
"20"
:
10.87623
,
"25"
:
10.83465
,
"30"
:
10.75356
,
"35"
:
10.67297
,
"40"
:
10.50224
,
"45"
:
10.28079
,
"50"
:
10.27239
,
"55"
:
10.20076
,
"60"
:
9.84045
,
"65"
:
9.27781
,
"70"
:
9.92981
,
"75"
:
9.61573
,
"80"
:
9.56042
,
"85"
:
9.74259
,
"90"
:
9.91759
,
"95"
:
9.61376
,
"100"
:
9.50538
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
,
"55"
:
416513536.0
,
"60"
:
416513536.0
,
"65"
:
416513536.0
,
"70"
:
416513536.0
,
"75"
:
416513536.0
,
"80"
:
416513536.0
,
"85"
:
416513536.0
,
"90"
:
416513536.0
,
"95"
:
416513536.0
,
"100"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1464319488.0
,
"5"
:
1464319488.0
,
"10"
:
1464320000.0
,
"15"
:
1464320000.0
,
"20"
:
1594994688.0
,
"25"
:
1597091840.0
,
"30"
:
1597091840.0
,
"35"
:
1597091840.0
,
"40"
:
1597092352.0
,
"45"
:
1597092352.0
,
"50"
:
1597092352.0
,
"55"
:
1597092352.0
,
"60"
:
1597092352.0
,
"65"
:
1597092352.0
,
"70"
:
1597092352.0
,
"75"
:
1597092352.0
,
"80"
:
1597092352.0
,
"85"
:
1597092352.0
,
"90"
:
1597092352.0
,
"95"
:
1597092352.0
,
"100"
:
1597092352.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
3.9389
,
"5"
:
0.26761
,
"10"
:
0.26783
,
"15"
:
0.26387
,
"20"
:
0.27882
,
"25"
:
0.27734
,
"30"
:
0.2767
,
"35"
:
0.277
,
"40"
:
0.27635
,
"45"
:
0.27694
,
"50"
:
0.28016
,
"55"
:
0.27883
,
"60"
:
0.28002
,
"65"
:
0.27862
,
"70"
:
0.27887
,
"75"
:
0.27972
,
"80"
:
0.27714
,
"85"
:
0.27759
,
"90"
:
0.27766
,
"95"
:
0.27789
,
"100"
:
0.27817
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1645.0
,
"25"
:
2124.0
,
"30"
:
2345.0
,
"35"
:
1780.0
,
"40"
:
1936.0
,
"45"
:
2289.0
,
"50"
:
2738.0
,
"55"
:
2309.0
,
"60"
:
2740.0
,
"65"
:
2151.0
,
"70"
:
3646.0
,
"75"
:
2891.0
,
"80"
:
3546.0
,
"85"
:
3681.0
,
"90"
:
3861.0
,
"95"
:
4152.0
,
"100"
:
3405.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
[
10.86312
,
10.87712
,
10.87347
,
10.88278
,
10.89457
,
10.84427
,
10.69023
,
10.62687
,
10.53974
,
10.26525
,
10.21403
,
9.9801
,
9.96977
,
9.93973
,
9.81158
,
9.28667
,
9.63194
,
9.19732
,
9.48341
,
9.62985
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
83
,
"step_interval"
:
5
,
"values"
:
[
2244.0
,
2273.0
,
2447.0
,
2031.0
,
2134.0
,
2491.0
,
2380.0
,
3451.0
,
3205.0
,
2940.0
,
3143.0
,
3310.0
,
3884.0
,
3232.0
,
3491.0
,
3751.0
,
5022.0
]},
"iteration_timing_avg"
:
0.22914074626865674
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.86312
,
"5"
:
10.86984
,
"10"
:
10.84273
,
"15"
:
10.88712
,
"20"
:
10.87623
,
"25"
:
10.83465
,
"30"
:
10.75356
,
"35"
:
10.67297
,
"40"
:
10.50224
,
"45"
:
10.28079
,
"50"
:
10.27239
,
"55"
:
10.20076
,
"60"
:
9.84045
,
"65"
:
9.27781
,
"70"
:
9.92981
,
"75"
:
9.61573
,
"80"
:
9.56042
,
"85"
:
9.74259
,
"90"
:
9.91759
,
"95"
:
9.61376
,
"100"
:
9.50538
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
284527616.0
,
"5"
:
284527616.0
,
"10"
:
284527616.0
,
"15"
:
284527616.0
,
"20"
:
416513536.0
,
"25"
:
416513536.0
,
"30"
:
416513536.0
,
"35"
:
416513536.0
,
"40"
:
416513536.0
,
"45"
:
416513536.0
,
"50"
:
416513536.0
,
"55"
:
416513536.0
,
"60"
:
416513536.0
,
"65"
:
416513536.0
,
"70"
:
416513536.0
,
"75"
:
416513536.0
,
"80"
:
416513536.0
,
"85"
:
416513536.0
,
"90"
:
416513536.0
,
"95"
:
416513536.0
,
"100"
:
416513536.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1465368064.0
,
"5"
:
1465368064.0
,
"10"
:
1465368576.0
,
"15"
:
1465368576.0
,
"20"
:
1596305920.0
,
"25"
:
1596305920.0
,
"30"
:
1596305920.0
,
"35"
:
1596305920.0
,
"40"
:
1596305920.0
,
"45"
:
1596305920.0
,
"50"
:
1596305920.0
,
"55"
:
1596305920.0
,
"60"
:
1596305920.0
,
"65"
:
1596305920.0
,
"70"
:
1596305920.0
,
"75"
:
1596305920.0
,
"80"
:
1596305920.0
,
"85"
:
1596305920.0
,
"90"
:
1596305920.0
,
"95"
:
1596305920.0
,
"100"
:
1596305920.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
9.41683
,
"5"
:
0.29649
,
"10"
:
0.2936
,
"15"
:
0.29369
,
"20"
:
0.30302
,
"25"
:
0.29665
,
"30"
:
0.30347
,
"35"
:
0.29671
,
"40"
:
0.29818
,
"45"
:
0.29562
,
"50"
:
0.30562
,
"55"
:
0.29659
,
"60"
:
0.29349
,
"65"
:
0.29455
,
"70"
:
0.30009
,
"75"
:
0.29572
,
"80"
:
0.29482
,
"85"
:
0.29505
,
"90"
:
0.29548
,
"95"
:
0.29481
,
"100"
:
0.30221
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
"nan"
,
"15"
:
"nan"
,
"20"
:
1645.0
,
"25"
:
2124.0
,
"30"
:
2345.0
,
"35"
:
1780.0
,
"40"
:
1936.0
,
"45"
:
2289.0
,
"50"
:
2738.0
,
"55"
:
2309.0
,
"60"
:
2740.0
,
"65"
:
2151.0
,
"70"
:
3646.0
,
"75"
:
2891.0
,
"80"
:
3546.0
,
"85"
:
3681.0
,
"90"
:
3861.0
,
"95"
:
4152.0
,
"100"
:
3405.0
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -47,4 +47,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml
View file @
688448db
...
...
@@ -17,8 +17,8 @@ MODEL_ARGS:
--train-iters
:
2000
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -45,4 +45,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
2000
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -45,4 +45,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--fp16
:
true
--apply-query-key-layer-scaling
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
2000
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
2000
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
2000
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
2000
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -50,4 +50,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
Prev
1
…
16
17
18
19
20
21
22
23
24
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment