Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Megatron-LM
Commits
688448db
Commit
688448db
authored
Mar 14, 2025
by
silencealiang
Browse files
更新代码
parent
a02a5490
Pipeline
#2503
passed with stage
Changes
823
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
147 additions
and
119 deletions
+147
-119
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml
..._1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml
...y_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev.json
...dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/model_config.yaml
...b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/model_config.yaml
+59
-0
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev.json
...dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_lts.json
...dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/model_config.yaml
...b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/model_config.yaml
+58
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
...er_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
+1
-53
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
...er_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
+1
-1
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
...timizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json
..._dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json
..._dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml
...torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
...er_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
...er_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
...timizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json
...iform_full_recompute_dgx_a100_1N8G/golden_values_dev.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json
...iform_full_recompute_dgx_a100_1N8G/golden_values_lts.json
+1
-0
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml
...st_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml
+3
-2
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json
...iform_full_recompute_dgx_a100_1N8G/golden_values_dev.json
+1
-53
No files found.
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
2000
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -52,4 +52,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp/model_config.yaml
View file @
688448db
...
...
@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters
:
2000
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
12.57679
,
"5"
:
12.5818
,
"10"
:
12.47354
,
"15"
:
11.80609
,
"20"
:
11.49702
,
"25"
:
10.98467
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
521041248.0
,
"5"
:
520997440.0
,
"10"
:
521179808.0
,
"15"
:
521592416.0
,
"20"
:
521133664.0
,
"25"
:
523544832.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
24510808064.0
,
"5"
:
24510808064.0
,
"10"
:
24510808064.0
,
"15"
:
24510808064.0
,
"20"
:
24510808064.0
,
"25"
:
24510808064.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
52700401664.0
,
"5"
:
60489064448.0
,
"10"
:
60489064448.0
,
"15"
:
60489064448.0
,
"20"
:
60489064448.0
,
"25"
:
60489064448.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
2.87864
,
"15"
:
"nan"
,
"20"
:
2.89414
,
"25"
:
"nan"
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed/model_config.yaml
0 → 100644
View file @
688448db
ENV_VARS
:
CUDA_DEVICE_MAX_CONNECTIONS
:
1
NVTE_ALLOW_NONDETERMINISTIC_ALGO
:
0
NCCL_ALGO
:
Tree
CUBLAS_WORKSPACE_CONFIG
:
:4096:8
NVTE_FWD_LAYERNORM_SM_MARGIN
:
16
NVTE_BWD_LAYERNORM_SM_MARGIN
:
16
MODEL_ARGS
:
--num-layers
:
32
--hidden-size
:
4096
--num-attention-heads
:
32
--group-query-attention
:
true
--num-query-groups
:
8
--untie-embeddings-and-output-weights
:
true
--log-throughput
:
true
--log-params-norm
:
true
--log-num-zeros-in-grad
:
true
--log-validation-ppl-to-tensorboard
:
true
--log-timers-to-tensorboard
:
true
--tensorboard-dir
:
${TENSORBOARD_PATH}
--micro-batch-size
:
1
--global-batch-size
:
8
--seq-length
:
8192
--max-position-embeddings
:
8192
--train-iters
:
25
--timing-log-level
:
0
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_SAVE_PATH}
--load
:
${CHECKPOINT_LOAD_PATH}
--tokenizer-type
:
NullTokenizer
--vocab-size
:
131072
--mock-data
:
true
--split
:
949,50,1
--distributed-backend
:
nccl
--lr
:
0.00015
--lr-decay-style
:
cosine
--min-lr
:
1.0e-5
--weight-decay
:
1e-2
--clip-grad
:
1.0
--lr-warmup-fraction
:
.01
--log-interval
:
2
--save-interval
:
10000
--eval-interval
:
1000
--eval-iters
:
5
--transformer-impl
:
transformer_engine
--tensor-model-parallel-size
:
1
--pipeline-model-parallel-size
:
4
--num-layers-per-virtual-pipeline-stage
:
1
--use-distributed-optimizer
:
true
--overlap-grad-reduce
:
true
--overlap-param-gather
:
true
--deterministic-mode
:
true
--no-gradient-accumulation-fusion
:
true
--use-mcore-models
:
true
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
12.61262
,
"5"
:
12.60238
,
"10"
:
12.49879
,
"15"
:
11.82067
,
"20"
:
11.50566
,
"25"
:
10.99243
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
523040896.0
,
"5"
:
523012096.0
,
"10"
:
523190944.0
,
"15"
:
523625088.0
,
"20"
:
523224032.0
,
"25"
:
525635776.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
20634214400.0
,
"5"
:
20634214400.0
,
"10"
:
20634214400.0
,
"15"
:
20634214400.0
,
"20"
:
20634214400.0
,
"25"
:
20634214400.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
51333926912.0
,
"5"
:
58188226560.0
,
"10"
:
58188226560.0
,
"15"
:
58188226560.0
,
"20"
:
58188226560.0
,
"25"
:
58188226560.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
25
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
"nan"
,
"5"
:
"nan"
,
"10"
:
2.72059
,
"15"
:
"nan"
,
"20"
:
2.72404
,
"25"
:
"nan"
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/golden_values_lts.json
0 → 100644
View file @
688448db
{}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed/model_config.yaml
0 → 100644
View file @
688448db
ENV_VARS
:
CUDA_DEVICE_MAX_CONNECTIONS
:
1
NVTE_ALLOW_NONDETERMINISTIC_ALGO
:
0
NCCL_ALGO
:
Tree
CUBLAS_WORKSPACE_CONFIG
:
:4096:8
NVTE_FWD_LAYERNORM_SM_MARGIN
:
16
NVTE_BWD_LAYERNORM_SM_MARGIN
:
16
MODEL_ARGS
:
--num-layers
:
32
--hidden-size
:
4096
--num-attention-heads
:
32
--group-query-attention
:
true
--num-query-groups
:
8
--untie-embeddings-and-output-weights
:
true
--log-throughput
:
true
--log-params-norm
:
true
--log-num-zeros-in-grad
:
true
--log-validation-ppl-to-tensorboard
:
true
--log-timers-to-tensorboard
:
true
--tensorboard-dir
:
${TENSORBOARD_PATH}
--micro-batch-size
:
2
--global-batch-size
:
8
--seq-length
:
8192
--max-position-embeddings
:
8192
--train-iters
:
25
--timing-log-level
:
0
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_SAVE_PATH}
--load
:
${CHECKPOINT_LOAD_PATH}
--tokenizer-type
:
NullTokenizer
--vocab-size
:
131072
--mock-data
:
true
--split
:
949,50,1
--distributed-backend
:
nccl
--lr
:
0.00015
--lr-decay-style
:
cosine
--min-lr
:
1.0e-5
--weight-decay
:
1e-2
--clip-grad
:
1.0
--lr-warmup-fraction
:
.01
--log-interval
:
2
--save-interval
:
10000
--eval-interval
:
1000
--eval-iters
:
5
--transformer-impl
:
transformer_engine
--tensor-model-parallel-size
:
4
--pipeline-model-parallel-size
:
1
--use-distributed-optimizer
:
true
--overlap-grad-reduce
:
true
--overlap-param-gather
:
true
--deterministic-mode
:
true
--no-gradient-accumulation-fusion
:
true
--use-mcore-models
:
true
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.8401
,
10.87259
,
10.85024
,
10.79646
,
10.68156
,
10.60618
,
10.12768
,
10.22185
,
10.13788
,
9.82309
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1698.0
,
1855.0
,
1949.0
,
1968.0
,
1881.0
,
1783.0
,
1653.0
,
2037.0
,
2313.0
,
2300.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
5.37706
,
0.09618
,
0.09432
,
0.09666
,
0.09442
,
0.09619
,
0.09453
,
0.0975
,
0.09517
,
0.09727
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8401
,
"5"
:
10.84034
,
"10"
:
10.8134
,
"15"
:
10.80277
,
"20"
:
10.70494
,
"25"
:
10.53848
,
"30"
:
10.3552
,
"35"
:
10.27145
,
"40"
:
10.08048
,
"45"
:
9.82288
,
"50"
:
9.90119
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1698.0
,
"5"
:
1900.0
,
"10"
:
1421.0
,
"15"
:
1946.0
,
"20"
:
1765.0
,
"25"
:
1726.0
,
"30"
:
2022.0
,
"35"
:
1962.0
,
"40"
:
2274.0
,
"45"
:
2172.0
,
"50"
:
2369.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
552128000.0
,
"5"
:
552128000.0
,
"10"
:
552128000.0
,
"15"
:
552128000.0
,
"20"
:
552128000.0
,
"25"
:
552128000.0
,
"30"
:
552128000.0
,
"35"
:
552128000.0
,
"40"
:
552128000.0
,
"45"
:
552128000.0
,
"50"
:
552128000.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4576452608.0
,
"5"
:
4673069056.0
,
"10"
:
4673069056.0
,
"15"
:
4673069056.0
,
"20"
:
4673069056.0
,
"25"
:
4673069056.0
,
"30"
:
4673069056.0
,
"35"
:
4673069056.0
,
"40"
:
4673069056.0
,
"45"
:
4673069056.0
,
"50"
:
4673069056.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
5.82685
,
"5"
:
0.09636
,
"10"
:
0.09453
,
"15"
:
0.0951
,
"20"
:
0.09324
,
"25"
:
0.09311
,
"30"
:
0.09279
,
"35"
:
0.0934
,
"40"
:
0.09774
,
"45"
:
0.09122
,
"50"
:
0.08864
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.8401
,
10.87262
,
10.85025
,
10.79646
,
10.68152
,
10.60614
,
10.12765
,
10.22184
,
10.13787
,
9.82312
]},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1670.0
,
1901.0
,
1954.0
,
1932.0
,
1998.0
,
1768.0
,
1651.0
,
2063.0
,
2348.0
,
2324.0
]},
"iteration_timing_avg"
:
0.06904588235294119
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8401
,
"5"
:
10.84032
,
"10"
:
10.8134
,
"15"
:
10.80276
,
"20"
:
10.70493
,
"25"
:
10.53847
,
"30"
:
10.35518
,
"35"
:
10.27143
,
"40"
:
10.08046
,
"45"
:
9.82288
,
"50"
:
9.90114
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1670.0
,
"5"
:
1970.0
,
"10"
:
1397.0
,
"15"
:
1886.0
,
"20"
:
1785.0
,
"25"
:
1695.0
,
"30"
:
2086.0
,
"35"
:
1976.0
,
"40"
:
2349.0
,
"45"
:
2240.0
,
"50"
:
2338.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
552238592.0
,
"5"
:
552238592.0
,
"10"
:
552238592.0
,
"15"
:
552238592.0
,
"20"
:
552238592.0
,
"25"
:
552238592.0
,
"30"
:
552238592.0
,
"35"
:
552238592.0
,
"40"
:
552238592.0
,
"45"
:
552238592.0
,
"50"
:
552238592.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4576563200.0
,
"5"
:
4673179648.0
,
"10"
:
4673179648.0
,
"15"
:
4673179648.0
,
"20"
:
4673179648.0
,
"25"
:
4673179648.0
,
"30"
:
4673179648.0
,
"35"
:
4673179648.0
,
"40"
:
4673179648.0
,
"45"
:
4673179648.0
,
"50"
:
4673179648.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
7.79296
,
"5"
:
0.08936
,
"10"
:
0.08747
,
"15"
:
0.09067
,
"20"
:
0.08679
,
"25"
:
0.08868
,
"30"
:
0.08685
,
"35"
:
0.08887
,
"40"
:
0.08682
,
"45"
:
0.08792
,
"50"
:
0.08604
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
50
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--attention-backend
:
unfused
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
regular
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84023
,
"5"
:
10.84074
,
"10"
:
10.81392
,
"15"
:
10.80238
,
"20"
:
10.70474
,
"25"
:
10.53876
,
"30"
:
10.35537
,
"35"
:
10.2716
,
"40"
:
10.08036
,
"45"
:
9.8231
,
"50"
:
9.90117
,
"55"
:
9.86414
,
"60"
:
9.48062
,
"65"
:
8.93763
,
"70"
:
9.7102
,
"75"
:
9.40888
,
"80"
:
9.39066
,
"85"
:
9.59766
,
"90"
:
9.80366
,
"95"
:
9.50574
,
"100"
:
9.38807
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1659.0
,
"5"
:
1886.0
,
"10"
:
1388.0
,
"15"
:
1827.0
,
"20"
:
1686.0
,
"25"
:
1696.0
,
"30"
:
1877.0
,
"35"
:
1967.0
,
"40"
:
2300.0
,
"45"
:
2176.0
,
"50"
:
2249.0
,
"55"
:
2468.0
,
"60"
:
2471.0
,
"65"
:
2688.0
,
"70"
:
3271.0
,
"75"
:
2633.0
,
"80"
:
3351.0
,
"85"
:
3332.0
,
"90"
:
2984.0
,
"95"
:
3459.0
,
"100"
:
3555.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
552128512.0
,
"5"
:
552128512.0
,
"10"
:
552128512.0
,
"15"
:
552128512.0
,
"20"
:
552128512.0
,
"25"
:
552128512.0
,
"30"
:
552128512.0
,
"35"
:
552128512.0
,
"40"
:
552128512.0
,
"45"
:
552128512.0
,
"50"
:
552128512.0
,
"55"
:
552128512.0
,
"60"
:
552128512.0
,
"65"
:
552128512.0
,
"70"
:
552128512.0
,
"75"
:
552128512.0
,
"80"
:
552128512.0
,
"85"
:
552128512.0
,
"90"
:
552128512.0
,
"95"
:
552128512.0
,
"100"
:
552128512.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2615097856.0
,
"5"
:
2711714304.0
,
"10"
:
2711714304.0
,
"15"
:
2711714304.0
,
"20"
:
2711714304.0
,
"25"
:
2711714304.0
,
"30"
:
2711714304.0
,
"35"
:
2711714304.0
,
"40"
:
2711714304.0
,
"45"
:
2711714304.0
,
"50"
:
2711714304.0
,
"55"
:
2711714304.0
,
"60"
:
2711714304.0
,
"65"
:
2711714304.0
,
"70"
:
2711714304.0
,
"75"
:
2711714304.0
,
"80"
:
2711714304.0
,
"85"
:
2711714304.0
,
"90"
:
2711714304.0
,
"95"
:
2711714304.0
,
"100"
:
2711714304.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
5.51223
,
"5"
:
0.08691
,
"10"
:
0.085
,
"15"
:
0.0859
,
"20"
:
0.08404
,
"25"
:
0.08464
,
"30"
:
0.08355
,
"35"
:
0.08189
,
"40"
:
0.08107
,
"45"
:
0.08112
,
"50"
:
0.08147
,
"55"
:
0.08204
,
"60"
:
0.08108
,
"65"
:
0.08132
,
"70"
:
0.0801
,
"75"
:
0.0805
,
"80"
:
0.08087
,
"85"
:
0.08073
,
"90"
:
0.08118
,
"95"
:
0.0798
,
"100"
:
0.0816
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8401
,
"5"
:
10.84032
,
"10"
:
10.8134
,
"15"
:
10.80276
,
"20"
:
10.70493
,
"25"
:
10.53847
,
"30"
:
10.35518
,
"35"
:
10.27143
,
"40"
:
10.08046
,
"45"
:
9.82288
,
"50"
:
9.90114
,
"55"
:
9.86426
,
"60"
:
9.48028
,
"65"
:
8.93744
,
"70"
:
9.71023
,
"75"
:
9.40882
,
"80"
:
9.39078
,
"85"
:
9.59744
,
"90"
:
9.8039
,
"95"
:
9.50564
,
"100"
:
9.38814
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1670.0
,
"5"
:
1970.0
,
"10"
:
1397.0
,
"15"
:
1886.0
,
"20"
:
1785.0
,
"25"
:
1695.0
,
"30"
:
2086.0
,
"35"
:
1976.0
,
"40"
:
2349.0
,
"45"
:
2240.0
,
"50"
:
2338.0
,
"55"
:
2364.0
,
"60"
:
2474.0
,
"65"
:
2762.0
,
"70"
:
3207.0
,
"75"
:
2625.0
,
"80"
:
3502.0
,
"85"
:
3356.0
,
"90"
:
3142.0
,
"95"
:
3385.0
,
"100"
:
3449.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
552238592.0
,
"5"
:
552238592.0
,
"10"
:
552238592.0
,
"15"
:
552238592.0
,
"20"
:
552238592.0
,
"25"
:
552238592.0
,
"30"
:
552238592.0
,
"35"
:
552238592.0
,
"40"
:
552238592.0
,
"45"
:
552238592.0
,
"50"
:
552238592.0
,
"55"
:
552238592.0
,
"60"
:
552238592.0
,
"65"
:
552238592.0
,
"70"
:
552238592.0
,
"75"
:
552238592.0
,
"80"
:
552238592.0
,
"85"
:
552238592.0
,
"90"
:
552238592.0
,
"95"
:
552238592.0
,
"100"
:
552238592.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4576563200.0
,
"5"
:
4673179648.0
,
"10"
:
4673179648.0
,
"15"
:
4673179648.0
,
"20"
:
4673179648.0
,
"25"
:
4673179648.0
,
"30"
:
4673179648.0
,
"35"
:
4673179648.0
,
"40"
:
4673179648.0
,
"45"
:
4673179648.0
,
"50"
:
4673179648.0
,
"55"
:
4673179648.0
,
"60"
:
4673179648.0
,
"65"
:
4673179648.0
,
"70"
:
4673179648.0
,
"75"
:
4673179648.0
,
"80"
:
4673179648.0
,
"85"
:
4673179648.0
,
"90"
:
4673179648.0
,
"95"
:
4673179648.0
,
"100"
:
4673179648.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
9.8249
,
"5"
:
0.09011
,
"10"
:
0.09012
,
"15"
:
0.09032
,
"20"
:
0.08958
,
"25"
:
0.0911
,
"30"
:
0.0899
,
"35"
:
0.09078
,
"40"
:
0.08965
,
"45"
:
0.09255
,
"50"
:
0.0906
,
"55"
:
0.08977
,
"60"
:
0.0869
,
"65"
:
0.08684
,
"70"
:
0.08704
,
"75"
:
0.08628
,
"80"
:
0.08639
,
"85"
:
0.08662
,
"90"
:
0.08701
,
"95"
:
0.08613
,
"100"
:
0.0859
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -50,4 +50,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84023
,
"5"
:
10.84074
,
"10"
:
10.81392
,
"15"
:
10.80238
,
"20"
:
10.70474
,
"25"
:
10.53876
,
"30"
:
10.35537
,
"35"
:
10.2716
,
"40"
:
10.08036
,
"45"
:
9.8231
,
"50"
:
9.90117
,
"55"
:
9.86414
,
"60"
:
9.48062
,
"65"
:
8.93763
,
"70"
:
9.7102
,
"75"
:
9.40888
,
"80"
:
9.39066
,
"85"
:
9.59766
,
"90"
:
9.80366
,
"95"
:
9.50574
,
"100"
:
9.38807
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1659.0
,
"5"
:
1886.0
,
"10"
:
1388.0
,
"15"
:
1827.0
,
"20"
:
1686.0
,
"25"
:
1696.0
,
"30"
:
1877.0
,
"35"
:
1967.0
,
"40"
:
2300.0
,
"45"
:
2176.0
,
"50"
:
2249.0
,
"55"
:
2468.0
,
"60"
:
2471.0
,
"65"
:
2688.0
,
"70"
:
3271.0
,
"75"
:
2633.0
,
"80"
:
3351.0
,
"85"
:
3332.0
,
"90"
:
2984.0
,
"95"
:
3459.0
,
"100"
:
3555.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
552128512.0
,
"5"
:
552128512.0
,
"10"
:
552128512.0
,
"15"
:
552128512.0
,
"20"
:
552128512.0
,
"25"
:
552128512.0
,
"30"
:
552128512.0
,
"35"
:
552128512.0
,
"40"
:
552128512.0
,
"45"
:
552128512.0
,
"50"
:
552128512.0
,
"55"
:
552128512.0
,
"60"
:
552128512.0
,
"65"
:
552128512.0
,
"70"
:
552128512.0
,
"75"
:
552128512.0
,
"80"
:
552128512.0
,
"85"
:
552128512.0
,
"90"
:
552128512.0
,
"95"
:
552128512.0
,
"100"
:
552128512.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
2615097856.0
,
"5"
:
2711714304.0
,
"10"
:
2711714304.0
,
"15"
:
2711714304.0
,
"20"
:
2711714304.0
,
"25"
:
2711714304.0
,
"30"
:
2711714304.0
,
"35"
:
2711714304.0
,
"40"
:
2711714304.0
,
"45"
:
2711714304.0
,
"50"
:
2711714304.0
,
"55"
:
2711714304.0
,
"60"
:
2711714304.0
,
"65"
:
2711714304.0
,
"70"
:
2711714304.0
,
"75"
:
2711714304.0
,
"80"
:
2711714304.0
,
"85"
:
2711714304.0
,
"90"
:
2711714304.0
,
"95"
:
2711714304.0
,
"100"
:
2711714304.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
5.45269
,
"5"
:
0.08237
,
"10"
:
0.08305
,
"15"
:
0.08328
,
"20"
:
0.08344
,
"25"
:
0.08281
,
"30"
:
0.08195
,
"35"
:
0.08111
,
"40"
:
0.08016
,
"45"
:
0.07836
,
"50"
:
0.07936
,
"55"
:
0.07906
,
"60"
:
0.08023
,
"65"
:
0.07916
,
"70"
:
0.08026
,
"75"
:
0.07938
,
"80"
:
0.07948
,
"85"
:
0.07874
,
"90"
:
0.07885
,
"95"
:
0.0779
,
"100"
:
0.08116
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8401
,
"5"
:
10.84032
,
"10"
:
10.8134
,
"15"
:
10.80276
,
"20"
:
10.70493
,
"25"
:
10.53847
,
"30"
:
10.35518
,
"35"
:
10.27143
,
"40"
:
10.08046
,
"45"
:
9.82288
,
"50"
:
9.90114
,
"55"
:
9.86426
,
"60"
:
9.48028
,
"65"
:
8.93744
,
"70"
:
9.71023
,
"75"
:
9.40882
,
"80"
:
9.39078
,
"85"
:
9.59744
,
"90"
:
9.8039
,
"95"
:
9.50564
,
"100"
:
9.38814
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1670.0
,
"5"
:
1970.0
,
"10"
:
1397.0
,
"15"
:
1886.0
,
"20"
:
1785.0
,
"25"
:
1695.0
,
"30"
:
2086.0
,
"35"
:
1976.0
,
"40"
:
2349.0
,
"45"
:
2240.0
,
"50"
:
2338.0
,
"55"
:
2364.0
,
"60"
:
2474.0
,
"65"
:
2762.0
,
"70"
:
3207.0
,
"75"
:
2625.0
,
"80"
:
3502.0
,
"85"
:
3356.0
,
"90"
:
3142.0
,
"95"
:
3385.0
,
"100"
:
3449.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
552238592.0
,
"5"
:
552238592.0
,
"10"
:
552238592.0
,
"15"
:
552238592.0
,
"20"
:
552238592.0
,
"25"
:
552238592.0
,
"30"
:
552238592.0
,
"35"
:
552238592.0
,
"40"
:
552238592.0
,
"45"
:
552238592.0
,
"50"
:
552238592.0
,
"55"
:
552238592.0
,
"60"
:
552238592.0
,
"65"
:
552238592.0
,
"70"
:
552238592.0
,
"75"
:
552238592.0
,
"80"
:
552238592.0
,
"85"
:
552238592.0
,
"90"
:
552238592.0
,
"95"
:
552238592.0
,
"100"
:
552238592.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
4576563200.0
,
"5"
:
4673179648.0
,
"10"
:
4673179648.0
,
"15"
:
4673179648.0
,
"20"
:
4673179648.0
,
"25"
:
4673179648.0
,
"30"
:
4673179648.0
,
"35"
:
4673179648.0
,
"40"
:
4673179648.0
,
"45"
:
4673179648.0
,
"50"
:
4673179648.0
,
"55"
:
4673179648.0
,
"60"
:
4673179648.0
,
"65"
:
4673179648.0
,
"70"
:
4673179648.0
,
"75"
:
4673179648.0
,
"80"
:
4673179648.0
,
"85"
:
4673179648.0
,
"90"
:
4673179648.0
,
"95"
:
4673179648.0
,
"100"
:
4673179648.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.41235
,
"5"
:
0.08775
,
"10"
:
0.08849
,
"15"
:
0.08737
,
"20"
:
0.08713
,
"25"
:
0.08696
,
"30"
:
0.08757
,
"35"
:
0.08803
,
"40"
:
0.08782
,
"45"
:
0.08739
,
"50"
:
0.08653
,
"55"
:
0.08734
,
"60"
:
0.08891
,
"65"
:
0.1011
,
"70"
:
0.08925
,
"75"
:
0.08826
,
"80"
:
0.08863
,
"85"
:
0.08797
,
"90"
:
0.08896
,
"95"
:
0.08827
,
"100"
:
0.08947
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -49,4 +49,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.84023
,
"5"
:
10.84074
,
"10"
:
10.81392
,
"15"
:
10.80242
,
"20"
:
10.70474
,
"25"
:
10.53872
,
"30"
:
10.35534
,
"35"
:
10.27156
,
"40"
:
10.08035
,
"45"
:
9.82307
,
"50"
:
9.90117
,
"55"
:
9.86415
,
"60"
:
9.48061
,
"65"
:
8.9376
,
"70"
:
9.71013
,
"75"
:
9.40885
,
"80"
:
9.39066
,
"85"
:
9.59761
,
"90"
:
9.80368
,
"95"
:
9.50575
,
"100"
:
9.38809
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1659.0
,
"5"
:
1886.0
,
"10"
:
1413.0
,
"15"
:
1912.0
,
"20"
:
1710.0
,
"25"
:
1666.0
,
"30"
:
2033.0
,
"35"
:
2032.0
,
"40"
:
2271.0
,
"45"
:
2171.0
,
"50"
:
2321.0
,
"55"
:
2330.0
,
"60"
:
2399.0
,
"65"
:
2573.0
,
"70"
:
3346.0
,
"75"
:
2588.0
,
"80"
:
3342.0
,
"85"
:
3296.0
,
"90"
:
3157.0
,
"95"
:
3269.0
,
"100"
:
3445.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1232487936.0
,
"5"
:
1232487936.0
,
"10"
:
1232487936.0
,
"15"
:
1232487936.0
,
"20"
:
1232487936.0
,
"25"
:
1232487936.0
,
"30"
:
1232487936.0
,
"35"
:
1232487936.0
,
"40"
:
1232487936.0
,
"45"
:
1232487936.0
,
"50"
:
1232487936.0
,
"55"
:
1232487936.0
,
"60"
:
1232487936.0
,
"65"
:
1232487936.0
,
"70"
:
1232487936.0
,
"75"
:
1232487936.0
,
"80"
:
1232487936.0
,
"85"
:
1232487936.0
,
"90"
:
1232487936.0
,
"95"
:
1232487936.0
,
"100"
:
1232487936.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1984492544.0
,
"5"
:
2534070272.0
,
"10"
:
2534070272.0
,
"15"
:
2534070272.0
,
"20"
:
2534070272.0
,
"25"
:
2534070272.0
,
"30"
:
2534070272.0
,
"35"
:
2534070272.0
,
"40"
:
2534070272.0
,
"45"
:
2534070272.0
,
"50"
:
2534070272.0
,
"55"
:
2534070272.0
,
"60"
:
2534070272.0
,
"65"
:
2534070272.0
,
"70"
:
2534070272.0
,
"75"
:
2534070272.0
,
"80"
:
2534070272.0
,
"85"
:
2534070272.0
,
"90"
:
2534070272.0
,
"95"
:
2534070272.0
,
"100"
:
2534070272.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
6.57779
,
"5"
:
0.12157
,
"10"
:
0.11891
,
"15"
:
0.1176
,
"20"
:
0.11702
,
"25"
:
0.11688
,
"30"
:
0.11766
,
"35"
:
0.11769
,
"40"
:
0.11717
,
"45"
:
0.11722
,
"50"
:
0.11804
,
"55"
:
0.11618
,
"60"
:
0.11829
,
"65"
:
0.11649
,
"70"
:
0.11804
,
"75"
:
0.11577
,
"80"
:
0.11793
,
"85"
:
0.11663
,
"90"
:
0.1178
,
"95"
:
0.11648
,
"100"
:
0.11531
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json
0 → 100644
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8401
,
"5"
:
10.84032
,
"10"
:
10.81341
,
"15"
:
10.80278
,
"20"
:
10.70496
,
"25"
:
10.53846
,
"30"
:
10.35517
,
"35"
:
10.27147
,
"40"
:
10.08045
,
"45"
:
9.82292
,
"50"
:
9.90114
,
"55"
:
9.86422
,
"60"
:
9.48029
,
"65"
:
8.93749
,
"70"
:
9.71025
,
"75"
:
9.40879
,
"80"
:
9.39077
,
"85"
:
9.59743
,
"90"
:
9.80386
,
"95"
:
9.50565
,
"100"
:
9.38812
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1670.0
,
"5"
:
1970.0
,
"10"
:
1436.0
,
"15"
:
1918.0
,
"20"
:
1786.0
,
"25"
:
1610.0
,
"30"
:
2039.0
,
"35"
:
2001.0
,
"40"
:
2321.0
,
"45"
:
2205.0
,
"50"
:
2365.0
,
"55"
:
2489.0
,
"60"
:
2508.0
,
"65"
:
2719.0
,
"70"
:
3241.0
,
"75"
:
2643.0
,
"80"
:
3368.0
,
"85"
:
3336.0
,
"90"
:
2961.0
,
"95"
:
3533.0
,
"100"
:
3432.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1230390272.0
,
"5"
:
1230390272.0
,
"10"
:
1230390272.0
,
"15"
:
1230390272.0
,
"20"
:
1230390272.0
,
"25"
:
1230390272.0
,
"30"
:
1230390272.0
,
"35"
:
1230390272.0
,
"40"
:
1230390272.0
,
"45"
:
1230390272.0
,
"50"
:
1230390272.0
,
"55"
:
1230390272.0
,
"60"
:
1230390272.0
,
"65"
:
1230390272.0
,
"70"
:
1230390272.0
,
"75"
:
1230390272.0
,
"80"
:
1230390272.0
,
"85"
:
1230390272.0
,
"90"
:
1230390272.0
,
"95"
:
1230390272.0
,
"100"
:
1230390272.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1984492032.0
,
"5"
:
2531972608.0
,
"10"
:
2531972608.0
,
"15"
:
2531972608.0
,
"20"
:
2531972608.0
,
"25"
:
2531972608.0
,
"30"
:
2531972608.0
,
"35"
:
2531972608.0
,
"40"
:
2531972608.0
,
"45"
:
2531972608.0
,
"50"
:
2531972608.0
,
"55"
:
2531972608.0
,
"60"
:
2531972608.0
,
"65"
:
2531972608.0
,
"70"
:
2531972608.0
,
"75"
:
2531972608.0
,
"80"
:
2531972608.0
,
"85"
:
2531972608.0
,
"90"
:
2531972608.0
,
"95"
:
2531972608.0
,
"100"
:
2531972608.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
100
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
8.60398
,
"5"
:
0.12229
,
"10"
:
0.12251
,
"15"
:
0.12206
,
"20"
:
0.1226
,
"25"
:
0.12185
,
"30"
:
0.12287
,
"35"
:
0.12365
,
"40"
:
0.12186
,
"45"
:
0.12198
,
"50"
:
0.1223
,
"55"
:
0.12246
,
"60"
:
0.12181
,
"65"
:
0.12238
,
"70"
:
0.12276
,
"75"
:
0.12137
,
"80"
:
0.12307
,
"85"
:
0.1219
,
"90"
:
0.1217
,
"95"
:
0.12183
,
"100"
:
0.12252
}}}
\ No newline at end of file
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml
View file @
688448db
...
...
@@ -19,8 +19,8 @@ MODEL_ARGS:
--train-iters
:
100
--timing-log-level
:
2
--lr-decay-iters
:
320000
--save
:
${CHECKPOINT_PATH}
--load
:
${CHECKPOINT_PATH}
--save
:
${CHECKPOINT_
SAVE_
PATH}
--load
:
${CHECKPOINT_
LOAD_
PATH}
--data-path
:
${DATA_PATH}/my-gpt3_00_text_document
--vocab-file
:
${DATA_PATH}/bpe/vocab.json
--merge-file
:
${DATA_PATH}/bpe/merges.txt
...
...
@@ -50,4 +50,5 @@ MODEL_ARGS:
--ckpt-format
:
torch_dist
--data-cache-path
:
${DATA_CACHE_PATH}
--bf16
:
true
--log-memory-to-tensorboard
:
true
TEST_TYPE
:
ckpt-resume
tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_dev.json
View file @
688448db
{
"lm loss"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
10.8401
,
10.87259
,
10.85023
,
10.79646
,
10.68153
,
10.60619
,
10.12767
,
10.22185
,
10.13787
,
9.82307
]
},
"num-zeros"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
1698.0
,
1855.0
,
1896.0
,
1866.0
,
2032.0
,
1814.0
,
1664.0
,
1961.0
,
2306.0
,
2403.0
]
},
"iteration-time"
:
{
"start_step"
:
0
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
[
8.00253
,
0.13176
,
0.13026
,
0.13184
,
0.13023
,
0.13135
,
0.13014
,
0.13143
,
0.1305
,
0.13191
]
}
}
\ No newline at end of file
{
"lm loss"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
10.8401
,
"5"
:
10.84034
,
"10"
:
10.81341
,
"15"
:
10.80277
,
"20"
:
10.70495
,
"25"
:
10.53848
,
"30"
:
10.35523
,
"35"
:
10.27145
,
"40"
:
10.08043
,
"45"
:
9.82293
,
"50"
:
9.90114
}},
"num-zeros"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1698.0
,
"5"
:
1900.0
,
"10"
:
1454.0
,
"15"
:
1969.0
,
"20"
:
1774.0
,
"25"
:
1736.0
,
"30"
:
1970.0
,
"35"
:
1941.0
,
"40"
:
2237.0
,
"45"
:
2180.0
,
"50"
:
2328.0
}},
"mem-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1230390272.0
,
"5"
:
1230390272.0
,
"10"
:
1230390272.0
,
"15"
:
1230390272.0
,
"20"
:
1230390272.0
,
"25"
:
1230390272.0
,
"30"
:
1230390272.0
,
"35"
:
1230390272.0
,
"40"
:
1230390272.0
,
"45"
:
1230390272.0
,
"50"
:
1230390272.0
}},
"mem-max-allocated-bytes"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
1984492032.0
,
"5"
:
2531972608.0
,
"10"
:
2531972608.0
,
"15"
:
2531972608.0
,
"20"
:
2531972608.0
,
"25"
:
2531972608.0
,
"30"
:
2531972608.0
,
"35"
:
2531972608.0
,
"40"
:
2531972608.0
,
"45"
:
2531972608.0
,
"50"
:
2531972608.0
}},
"iteration-time"
:
{
"start_step"
:
1
,
"end_step"
:
50
,
"step_interval"
:
5
,
"values"
:
{
"1"
:
6.57733
,
"5"
:
0.12925
,
"10"
:
0.12965
,
"15"
:
0.12911
,
"20"
:
0.12836
,
"25"
:
0.12886
,
"30"
:
0.12957
,
"35"
:
0.12947
,
"40"
:
0.12911
,
"45"
:
0.12814
,
"50"
:
0.12753
}}}
\ No newline at end of file
Prev
1
…
17
18
19
20
21
22
23
24
25
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment