Commit ff7b28f4 authored by Jared Casper
Browse files

Fix bug in core pipeline schedule

parent 2c493fb3
...@@ -87,6 +87,7 @@ unit_tests: ...@@ -87,6 +87,7 @@ unit_tests:
- export BUILD_DIR=`pwd` - export BUILD_DIR=`pwd`
- export RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps - export RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
- export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE
- export MBS GBS
- export DATA_DIR=$DATA_DIR - export DATA_DIR=$DATA_DIR
- echo "Run name is $RUN_NAME" - echo "Run name is $RUN_NAME"
- mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
...@@ -100,7 +101,7 @@ unit_tests: ...@@ -100,7 +101,7 @@ unit_tests:
- export RESULTS_DIR=$BASE_DIR/results - export RESULTS_DIR=$BASE_DIR/results
- export CHECKPOINTS_DIR=$BASE_DIR/checkpoints - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
- echo "Submitting job" - echo "Submitting job"
- sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE` - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS`
- export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }'); - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
- bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
- \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n" - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
...@@ -167,6 +168,19 @@ train.gpt3.345m_tp1_pp2_1node_50steps: ...@@ -167,6 +168,19 @@ train.gpt3.345m_tp1_pp2_1node_50steps:
TIME_LIMIT: "20:00" TIME_LIMIT: "20:00"
TEST_LEVEL: L0 TEST_LEVEL: L0
train.gpt3.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: L0
resume.checkpoint.gpt3.345m_tp1_pp2_1node: resume.checkpoint.gpt3.345m_tp1_pp2_1node:
<<: *selene-test-resume-checkpoint-launcher <<: *selene-test-resume-checkpoint-launcher
variables: variables:
......
...@@ -593,7 +593,7 @@ def forward_backward_pipelining_with_interleaving(*, ...@@ -593,7 +593,7 @@ def forward_backward_pipelining_with_interleaving(*,
if not forward_only: if not forward_only:
if all_warmup_microbatches: if all_warmup_microbatches:
output_tensor_grads[num_model_chunks-1].append( output_tensor_grads[num_model_chunks-1].append(
p2p_communication.recv_backward(tensor_shape, timers=timers)) p2p_communication.recv_backward(tensor_shape, dtype=dtype, timers=timers))
for k in range(num_microbatches_remaining, total_num_microbatches): for k in range(num_microbatches_remaining, total_num_microbatches):
input_tensor_grad = backward_step_helper(k) input_tensor_grad = backward_step_helper(k)
next_backward_model_chunk_id = get_model_chunk_id(k+1, forward=False) next_backward_model_chunk_id = get_model_chunk_id(k+1, forward=False)
......
...@@ -53,9 +53,11 @@ class TestCIPipeline: ...@@ -53,9 +53,11 @@ class TestCIPipeline:
raise FileNotFoundError("Expected data is none") raise FileNotFoundError("Expected data is none")
expected = self.expected[loss_type] expected = self.expected[loss_type]
expected_list = expected["values"] expected_list = expected["values"]
print(expected_list)
actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type) actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type)
assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}." assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}."
for i, step in enumerate(range(expected["start_step"], expected["end_step"], expected["step_interval"])): for i, step in enumerate(range(expected["start_step"], expected["end_step"], expected["step_interval"])):
print(f"Checking step {step} against expected {i}")
if test_type == TypeOfTest.APPROX: if test_type == TypeOfTest.APPROX:
assert actual_list[step] == pytest.approx(expected=expected_list[i], rel=self.margin_loss), f"{self.job_name} : The loss at step {step} should be approximately {expected_list[i]} but it is {actual_list[step]}." assert actual_list[step] == pytest.approx(expected=expected_list[i], rel=self.margin_loss), f"{self.job_name} : The loss at step {step} should be approximately {expected_list[i]} but it is {actual_list[step]}."
else: else:
......
{"lm loss": {"start_step": 0, "end_step": 45, "step_interval": 5, "values": [10.7947, 10.85294, 10.87058, 10.83388, 10.83012, 10.78726, 10.56378, 10.57311, 10.48692]}, "num-zeros": {"start_step": 0, "end_step": 29, "step_interval": 5, "values": [2452.0, 2818.0, 2036.0, 2662.0, 2651.0, 2422.0]}, "iteration_timing_avg": 0.1187023333333333}
...@@ -7,7 +7,9 @@ TP_SIZE=$4 ...@@ -7,7 +7,9 @@ TP_SIZE=$4
PP_SIZE=$5 PP_SIZE=$5
NNODES=$6 NNODES=$6
MAX_STEPS=$7 MAX_STEPS=$7
VP_SIZE=$8
MBS=$9
GBS=${10}
GPUS_PER_NODE=8 GPUS_PER_NODE=8
# Change for multinode config # Change for multinode config
MASTER_ADDR=localhost MASTER_ADDR=localhost
...@@ -30,8 +32,8 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \ ...@@ -30,8 +32,8 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--log-validation-ppl-to-tensorboard \ --log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \ --log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \ --tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \ --micro-batch-size ${MBS:-4} \
--global-batch-size 32 \ --global-batch-size ${GBS:-32} \
--seq-length 1024 \ --seq-length 1024 \
--max-position-embeddings 1024 \ --max-position-embeddings 1024 \
--train-iters $MAX_STEPS \ --train-iters $MAX_STEPS \
...@@ -57,5 +59,6 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \ ...@@ -57,5 +59,6 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--eval-iters 10 \ --eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \ --tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \ --pipeline-model-parallel-size $PP_SIZE \
${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \
--no-gradient-accumulation-fusion \ --no-gradient-accumulation-fusion \
--fp16 --fp16
...@@ -13,4 +13,4 @@ TENSORBOARD_DIR=/workspace/logs ...@@ -13,4 +13,4 @@ TENSORBOARD_DIR=/workspace/logs
srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls ls
cd /workspace/megatron-lm cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS" ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE $MBS $GBS"
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment