OpenDAS / Megatron-LM · Commits

Commit 3aca1415, authored Apr 29, 2024 by liangjing

Merge branch 'megatron-lm_dtk24.04' into 'main'

Megatron lm dtk24.04. See merge request !1

Parents: 0024a5c6, 1005e9d3
Pipeline #1806 passed
Changes: 204; Pipelines: 3
Showing 20 changed files with 205 additions and 36 deletions:

tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_sequence_parallel.json  +1 -0
tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_swiglu.json  +1 -0
tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_untie_embeddings_and_outputs.json  +1 -0
tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json  +1 -1
tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps_core_enabled.json  +1 -0
tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps_te_enabled.json  +1 -0
tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json  +1 -1
tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps_core_enabled.json  +1 -0
tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh  +3 -3
tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh  +3 -3
tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh  +5 -3
tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh  +5 -3
tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh  +5 -4
tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh  +36 -11
tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh  +5 -3
tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh  +11 -4
tests/unit_tests/models/__init__.py  +0 -0
tests/unit_tests/models/test_gpt_embedding.py  +50 -0
tests/unit_tests/models/test_gpt_model.py  +74 -0
tests/unit_tests/pipeline_parallel/__init__.py  +0 -0

tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_sequence_parallel.json (new file, 0 → 100644)
{"lm loss": {"start_step": 0, "end_step": 41, "step_interval": 5, "values": [10.79471, 10.86601, 10.89073, 10.78482, 10.6587, 10.58125, 10.0813, 10.19422, 10.13437]}, "num-zeros": {"start_step": 0, "end_step": 41, "step_interval": 5, "values": [1609.0, 1850.0, 1921.0, 1942.0, 1853.0, 1674.0, 1544.0, 1884.0, 2438.0]}, "iteration_timing_avg": 0.12650857142857144}
\ No newline at end of file
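
These groundtruth files all share one schema: each metric records a sampling window (start_step, end_step, step_interval) plus the values observed at steps start_step, start_step + step_interval, and so on. A minimal sketch of how a run could be checked against such a file follows; the actual_by_step mapping and the tolerance are illustrative assumptions, not the repository's actual test harness.

import json

TOLERANCE = 0.05  # illustrative threshold; the real harness may use a different rule

def check_lm_loss(groundtruth_path, actual_by_step):
    # actual_by_step: hypothetical dict of training step -> measured "lm loss",
    # e.g. parsed from the run's TensorBoard logs.
    with open(groundtruth_path) as f:
        expected = json.load(f)["lm loss"]
    steps = range(expected["start_step"], expected["end_step"], expected["step_interval"])
    for step, want in zip(steps, expected["values"]):
        got = actual_by_step[step]
        assert abs(got - want) <= TOLERANCE, f"step {step}: got {got}, expected {want}"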

tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_swiglu.json (new file, 0 → 100644)
{"lm loss": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [10.73442, 10.82095, 10.84047, 10.75831, 10.70386, 10.63718, 10.20959, 10.36611]}, "num-zeros": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [2625.0, 2815.0, 2837.0, 2870.0, 2755.0, 2617.0, 2345.0, 2529.0]}, "iteration_timing_avg": 0.1255659259259259}
\ No newline at end of file

tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_untie_embeddings_and_outputs.json (new file, 0 → 100644)
{"lm loss": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [10.89427, 10.9106, 10.917, 10.84465, 10.70825, 10.63519, 10.15543, 10.26206]}, "num-zeros": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [22727188.0, 23020756.0, 22501138.0, 22830610.0, 22739638.0, 22547160.0, 22955250.0, 22589434.0]}, "iteration_timing_avg": 0.12411037037037034}
\ No newline at end of file

tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json
-{"lm loss": {"start_step": 0, "end_step": 48, "step_interval": 5, "values": [10.85716, 10.88973, 10.879, 10.87014, 10.87978, 10.84463, 10.67266, 10.62932, 10.52767, 10.25362]}, "num-zeros": {"start_step": 0, "end_step": 31, "step_interval": 5, "values": [2450.0, 2396.0, 2523.0, 2242.0, 2225.0, 2478.0, 2536.0]}, "iteration_timing_avg": 0.11416968750000002}
+{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85543, 10.89355, 10.87608, 10.87365, 10.88042, 10.84182, 10.67177, 10.62854, 10.52511, 10.25229]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2470.0, 2444.0, 2570.0, 2192.0, 2241.0, 2574.0, 2476.0]}, "iteration_timing_avg": 0.14008088235294117}
\ No newline at end of file

tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps_core_enabled.json (new file, 0 → 100644)
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92215, 10.93714, 10.89742, 10.87588, 10.75165, 10.65713, 10.1606, 10.24967, 10.15339, 9.84198]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1655.0, 1837.0, 1968.0, 1854.0, 1811.0, 1810.0, 1593.0, 1997.0, 2315.0, 2343.0]}, "iteration_timing_avg": 0.13743323529411763}

tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps_te_enabled.json (new file, 0 → 100644)
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8559, 10.89255, 10.8665, 10.81693, 10.69856, 10.60955, 10.10845, 10.21443, 10.12855, 9.80126]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1693.0, 1878.0, 1977.0, 1871.0, 2022.0, 1716.0, 1646.0, 2006.0, 2280.0, 2365.0]}, "iteration_timing_avg": 0.12973323529411762}
\ No newline at end of file

tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86276, 10.88058, 10.87527, 10.88402, 10.89173, 10.84724, 10.6886, 10.62864, 10.53925, 10.26646]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2199.0, 2306.0, 2412.0, 2032.0, 2077.0, 2475.0, 2347.0]}, "iteration_timing_avg": 0.15481029411764707}
+{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85921, 10.8797, 10.87381, 10.88658, 10.88912, 10.84826, 10.68571, 10.62946, 10.54289, 10.26918]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2289.0, 2368.0, 2427.0, 2023.0, 2234.0, 2501.0, 2316.0]}, "iteration_timing_avg": 0.20419529411764706}
\ No newline at end of file

tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps_core_enabled.json (new file, 0 → 100644)
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86168, 10.88879, 10.87894, 10.8312, 10.71384, 10.61221, 10.13333, 10.23204, 10.16051, 9.83654]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1854.0, 2137.0, 2162.0, 2176.0, 2072.0, 1947.0, 1702.0, 2222.0, 2457.0, 2535.0]}, "iteration_timing_avg": 0.20128235294117644}

tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh
@@ -17,10 +17,10 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 # Runs the "345M" parameter model
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES"
 # Run for 100 iterations
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+torchrun $DISTRIBUTED_ARGS \
     pretrain_bert.py \
     --use-checkpoint-args \
     --use-checkpoint-opt_param-scheduler \
@@ -61,7 +61,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
 echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt
 # Resume from 50th iteration ckpt and continue to 100 iterations
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+torchrun $DISTRIBUTED_ARGS \
     pretrain_bert.py \
     --use-checkpoint-args \
     --use-checkpoint-opt_param-scheduler \
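
The launcher swap above (python -m torch.distributed.launch to torchrun) is what allows --node_rank, --master_addr and --master_port to be dropped from DISTRIBUTED_ARGS: for a single-node run torchrun supplies rendezvous defaults itself and exports RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT to every worker. A self-contained sketch of what a worker process sees, independent of Megatron (the nccl backend is an assumption for a GPU machine):

import os
import torch.distributed as dist

# Launched as: torchrun --nproc_per_node 8 --nnodes 1 worker.py
# torchrun populates the environment, so env:// needs no explicit addresses.
dist.init_process_group(backend="nccl", init_method="env://")
print(f"rank {os.environ['RANK']} of {os.environ['WORLD_SIZE']}, "
      f"local rank {os.environ['LOCAL_RANK']}")
dist.destroy_process_group()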

tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh
 #! /bin/bash
-set -o xtrace
+set -x
 DATA_PATH=$1
 CHECKPOINT_PATH=$2
@@ -19,9 +19,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 # Runs the "345M" parameter model
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES"
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+torchrun $DISTRIBUTED_ARGS \
     pretrain_bert.py \
     --num-layers 24 \
     --hidden-size 1024 \

tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh
 #!/bin/bash
 # Parameters
-#SBATCH --account=adlr
+#SBATCH --account=adlr_nlp_llmnext
-#SBATCH --job-name=adlr-ci:megatron-job
+#SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job
 #SBATCH --nodes=1
 #SBATCH --partition=luna
@@ -10,7 +10,9 @@ DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence
 CHECKPOINT_PATH=/workspace/checkpoints
 TENSORBOARD_DIR=/workspace/logs
-srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
+echo 'Running tests using $PYTORCH_IMAGE image'
+srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
 ls
 cd /workspace/megatron-lm
 ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES"
\ No newline at end of file

tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh
 #!/bin/bash
 # Parameters
-#SBATCH --account=adlr
+#SBATCH --account=adlr_nlp_llmnext
-#SBATCH --job-name=adlr-ci:megatron-job
+#SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job
 #SBATCH --nodes=1
 #SBATCH --partition=luna
@@ -10,7 +10,9 @@ DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence
 CHECKPOINT_PATH=/workspace/checkpoints
 TENSORBOARD_DIR=/workspace/logs
-srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
+echo 'Running tests using $PYTORCH_IMAGE image'
+srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
 ls
 cd /workspace/megatron-lm
 ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE"
\ No newline at end of file

tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh
@@ -17,10 +17,10 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 # Runs the "345M" parameter model
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES"
 # Run for 100 iterations and save checkpoint at 50
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+torchrun $DISTRIBUTED_ARGS \
     pretrain_gpt.py \
     --use-checkpoint-args \
     --use-checkpoint-opt_param-scheduler \
@@ -65,7 +65,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
 echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt
 # Resume from 50th iteration ckpt and continue to 100 iterations
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+torchrun $DISTRIBUTED_ARGS \
     pretrain_gpt.py \
     --use-checkpoint-args \
     --use-checkpoint-opt_param-scheduler \
@@ -105,4 +105,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
     --tensor-model-parallel-size $TP_SIZE \
     --pipeline-model-parallel-size $PP_SIZE \
     --no-gradient-accumulation-fusion \
     --fp16
\ No newline at end of file

tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh
 #! /bin/bash
+set -x
 DATA_PATH=$1
 CHECKPOINT_PATH=$2
 TENSORBOARD_DIR=$3
-TP_SIZE=$4
-PP_SIZE=$5
-NNODES=$6
-MAX_STEPS=$7
-VP_SIZE=$8
-MBS=$9
-GBS=${10}
+USE_TE=$4
+TP_SIZE=$5
+PP_SIZE=$6
+NNODES=$7
+MAX_STEPS=$8
+USE_CORE=$9
+VP_SIZE=${10}
+MBS=${11}
+GBS=${12}
+ADDITIONAL_PARAMS=${13}
 GPUS_PER_NODE=8
 # Change for multinode config
 MASTER_ADDR=localhost
@@ -18,12 +22,31 @@ NODE_RANK=0
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
 export CUDA_DEVICE_MAX_CONNECTIONS=1
+TRANSFORMER_IMPL=local
+TRAINING_DTYPE=fp16
+CALLING_SCRIPT=pretrain_gpt.py
+if [[ $USE_CORE -eq 1 ]]; then
+    echo "Running using megatron core"
+    TRANSFORMER_IMPL=local
+    TRAINING_DTYPE=bf16
+    CALLING_SCRIPT=pretrain_gpt_core.py
+    export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
+fi
+if [[ $USE_TE -eq 1 ]]; then
+    echo "Running with TransformerEngine ..."
+    TRANSFORMER_IMPL=transformer_engine
+    TRAINING_DTYPE=bf16
+else
+    echo "Running with local transformer implementation ..."
+fi
 # Runs the "345M" parameter model
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES"
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+torchrun $DISTRIBUTED_ARGS \
-    pretrain_gpt.py \
+    $CALLING_SCRIPT \
     --num-layers 12 \
     --hidden-size 512 \
     --num-attention-heads 8 \
@@ -57,8 +80,10 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
     --save-interval 10000 \
     --eval-interval 1000 \
     --eval-iters 10 \
+    --transformer-impl $TRANSFORMER_IMPL \
     --tensor-model-parallel-size $TP_SIZE \
     --pipeline-model-parallel-size $PP_SIZE \
     ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \
+    ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \
     --no-gradient-accumulation-fusion \
-    --fp16
+    --${TRAINING_DTYPE}
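
With this change the script takes up to thirteen positional arguments instead of ten: USE_TE moves in at $4 and USE_CORE at $9, shifting everything else right. An empty $VP_SIZE or $ADDITIONAL_PARAMS is harmless because the ${VAR:+...} expansions above emit nothing for unset or empty values. A sketch of the new calling convention follows; every concrete value and path here is a placeholder, not taken from this repository:

import subprocess

args = [
    "./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh",
    "/data/my-gpt3_00_text_document",  # $1  DATA_PATH
    "/tmp/checkpoints",                # $2  CHECKPOINT_PATH
    "/tmp/logs",                       # $3  TENSORBOARD_DIR
    "0",                               # $4  USE_TE
    "2",                               # $5  TP_SIZE
    "2",                               # $6  PP_SIZE
    "1",                               # $7  NNODES
    "50",                              # $8  MAX_STEPS
    "1",                               # $9  USE_CORE
    "",                                # $10 VP_SIZE (empty disables virtual pipeline)
    "4",                               # $11 MBS
    "32",                              # $12 GBS
    "",                                # $13 ADDITIONAL_PARAMS
]
subprocess.run(args, check=True)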

tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh
 #!/bin/bash
 # Parameters
-#SBATCH --account=adlr
+#SBATCH --account=adlr_nlp_llmnext
-#SBATCH --job-name=adlr-ci:megatron-job
+#SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job
 #SBATCH --nodes=1
 #SBATCH --partition=luna
@@ -10,7 +10,9 @@ DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document
 CHECKPOINT_PATH=/workspace/checkpoints
 TENSORBOARD_DIR=/workspace/logs
-srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
+echo 'Running tests using $PYTORCH_IMAGE image'
+srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
 ls
 cd /workspace/megatron-lm
 ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES"
\ No newline at end of file

tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh
 #!/bin/bash
 # Parameters
-#SBATCH --account=adlr
+#SBATCH --account=adlr_nlp_llmnext
-#SBATCH --job-name=adlr-ci:megatron-job
+#SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job
 #SBATCH --nodes=1
 #SBATCH --partition=luna
@@ -10,7 +10,14 @@ DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document
 CHECKPOINT_PATH=/workspace/checkpoints
 TENSORBOARD_DIR=/workspace/logs
-srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
+if [[ -n $MBS ]]; then MBS=4; fi
+if [[ -n $GBS ]]; then GBS=32; fi
+if [[ -n $VP_SIZE ]]; then VP_SIZE=""; fi
+echo 'Running tests using $PYTORCH_IMAGE image'
+srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
 ls
 cd /workspace/megatron-lm
-./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE $MBS $GBS"
+./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $USE_TE $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $USE_CORE \"$VP_SIZE\" \"$MBS\" \"$GBS\" \"$ADDITIONAL_PARAMS\""

tests/unit_tests/models/__init__.py (new empty file, 0 → 100644)

tests/unit_tests/models/test_gpt_embedding.py (new file, 0 → 100644)
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import pytest
import torch

from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_embedding import GPTEmbedding
from tests.unit_tests.test_utilities import Utils


class TestGPTEmbedding:

    def setup_method(self, method):
        Utils.initialize_model_parallel(1, 1)
        transformer_config = TransformerConfig(
            num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True
        )
        self.gpt_embedding = GPTEmbedding(
            config=transformer_config,
            vocab_size=100,
            max_sequence_length=4,
            add_position_embedding=True,
        )

    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    def test_constructor(self):
        assert isinstance(self.gpt_embedding, GPTEmbedding)
        num_weights = sum([p.numel() for p in self.gpt_embedding.parameters()])
        assert num_weights == 1248

    def test_zero_parameters(self):
        sum_weights = sum([p.sum() for p in self.gpt_embedding.parameters()])
        assert sum_weights != 0
        self.gpt_embedding.zero_parameters()
        sum_weights = sum([p.sum() for p in self.gpt_embedding.parameters()])
        assert sum_weights == 0

    def test_cpu_forward(self):
        input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1))
        position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1))
        embeddings = self.gpt_embedding(input_ids, position_ids)
        assert embeddings.device.type == 'cpu'
        assert embeddings.shape[0] == self.gpt_embedding.max_sequence_length
        assert embeddings.shape[1] == input_ids.shape[0]
        assert embeddings.shape[2] == self.gpt_embedding.config.hidden_size

    def test_gpu_forward(self):
        self.gpt_embedding.cuda()
        input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda()
        position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda()
        embeddings = self.gpt_embedding(input_ids, position_ids)
        assert embeddings.device.type == 'cuda'
        assert embeddings.shape[0] == self.gpt_embedding.max_sequence_length
        assert embeddings.shape[1] == input_ids.shape[0]
        assert embeddings.shape[2] == self.gpt_embedding.config.hidden_size
\ No newline at end of file
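
These unit tests call Utils.initialize_model_parallel(1, 1), so a torch.distributed process group has to exist before the test body runs; for the TP=1, PP=1 configuration a single-process env:// rendezvous should suffice. A hedged sketch for running the file standalone; the environment defaults below are assumptions about the env:// init method, not behavior documented by this repository:

import os
import pytest

# Minimal single-process rendezvous for init_method="env://" (assumed defaults).
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("LOCAL_RANK", "0")

raise SystemExit(pytest.main(["-q", "tests/unit_tests/models/test_gpt_embedding.py"]))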

tests/unit_tests/models/test_gpt_model.py (new file, 0 → 100644)
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import pytest
import torch

from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from tests.unit_tests.test_utilities import Utils
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed


class TestGPTModel:

    def setup_method(self, method):
        Utils.initialize_model_parallel(1, 1)
        model_parallel_cuda_manual_seed(123)
        transformer_config = TransformerConfig(
            num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True
        )
        self.gpt_model = GPTModel(
            config=transformer_config, vocab_size=100, max_sequence_length=4
        )

    def teardown_method(self, method):
        Utils.destroy_model_parallel()

    def test_constructor(self):
        assert isinstance(self.gpt_model, GPTModel)
        assert self.gpt_model.max_sequence_length == 4
        num_weights = sum([p.numel() for p in self.gpt_model.parameters()])
        assert num_weights == 6240

    def test_set_input_tensor(self):
        config: TransformerConfig = self.gpt_model.config
        sequence_length = self.gpt_model.max_sequence_length
        micro_batch_size = 2

        # [sequence length, batch size, hidden size]
        input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size))

        self.gpt_model.set_input_tensor(input_tensor)

        assert self.gpt_model.decoder.input_tensor.shape[0] == sequence_length
        assert self.gpt_model.decoder.input_tensor.shape[1] == micro_batch_size
        assert self.gpt_model.decoder.input_tensor.shape[2] == config.hidden_size

    def test_post_process_forward(self):
        config: TransformerConfig = self.gpt_model.config
        sequence_length = self.gpt_model.max_sequence_length
        micro_batch_size = 2

        self.gpt_model.cuda()

        data = list(range(sequence_length))
        input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
        position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
        attention_mask = torch.ones(
            (1, 1, sequence_length, sequence_length), dtype=bool
        ).cuda()

        logits = self.gpt_model.forward(
            input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask
        )

        assert logits.shape[0] == micro_batch_size
        assert logits.shape[1] == sequence_length
        assert logits.shape[2] == self.gpt_model.vocab_size

    def test_no_post_process_forward(self):
        pass

    def test_no_preprocess_forward(self):
        pass

    def test_state_dict_for_save_checkpoint(self):
        pass

    def test_load_state_dict(self):
        pass

tests/unit_tests/pipeline_parallel/__init__.py (new empty file, 0 → 100644)