Commit f5cf2e42 authored by Jared Casper

Merge branch 'citest' into 'main'

Testing infrastructure for Megatron core

See merge request ADLR/megatron-lm!514
parents 95f872f5 8e6fa622
#!/bin/bash
# Parameters
#SBATCH --account=adlr
#SBATCH --job-name=adlr-ci:megatron-job
#SBATCH --nodes=1
#SBATCH --partition=luna
DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document
CHECKPOINT_PATH=/workspace/checkpoints
TENSORBOARD_DIR=/workspace/logs
srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES"
\ No newline at end of file
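The sbatch script above pulls everything else from the submitting environment: BASE_DIR (host directory holding results, logs, and checkpoints), BUILD_DIR (the checked-out source tree), DATA_DIR (the host dataset path mounted at /workspace/data), and the parallelism sizes TP_SIZE, PP_SIZE, and NUM_NODES. A minimal sketch of how a CI job might submit it; every value below is illustrative, and sbatch_resume_checkpoint_test.sh is a hypothetical stand-in for this file's actual name:

# Hypothetical CI wrapper; all paths and sizes below are illustrative only.
export BASE_DIR=/lustre/ci/run-$CI_PIPELINE_ID   # slurm output lands in $BASE_DIR/results
export BUILD_DIR=$BASE_DIR/megatron-lm           # mounted at /workspace/megatron-lm
export DATA_DIR=/lustre/datasets/gpt3            # mounted at /workspace/data
export TP_SIZE=2 PP_SIZE=2 NUM_NODES=1           # tensor/pipeline parallel degrees
mkdir -p $BASE_DIR/results $BASE_DIR/logs $BASE_DIR/checkpoints
sbatch --export=ALL sbatch_resume_checkpoint_test.sh   # hypothetical filename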
#!/bin/bash
# Parameters
#SBATCH --account=adlr
#SBATCH --job-name=adlr-ci:megatron-job
#SBATCH --nodes=1
#SBATCH --partition=luna
DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document
CHECKPOINT_PATH=/workspace/checkpoints
TENSORBOARD_DIR=/workspace/logs
srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS"
\ No newline at end of file
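This second script differs from the first only in the test it runs: pretrain_gpt3_distributed_test.sh instead of the resume-from-checkpoint variant, with an additional $MAX_STEPS argument, so a submission wrapper like the sketch above would also export MAX_STEPS.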
from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
import torch
-from tests.test_utilities import Utils
+from tests.unit_tests.test_utilities import Utils
import numpy as np
def test_vocab_parallel_cross_entropy():
...
from megatron.core.tensor_parallel.data import broadcast_data
import torch
-from tests.test_utilities import Utils
+from tests.unit_tests.test_utilities import Utils
def test_broadcast_data():
    Utils.initialize_model_parallel(2,4)
...
from megatron.core.tensor_parallel import mappings
-from tests.test_utilities import Utils
+from tests.unit_tests.test_utilities import Utils
import torch
def test_CopyToModelParallelRegion():
...
@@ -2,7 +2,7 @@ from megatron.core.tensor_parallel.random import CudaRNGStatesTracker
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER
from megatron.core.tensor_parallel.random import checkpoint
-from tests.test_utilities import Utils
+from tests.unit_tests.test_utilities import Utils
import pytest
import torch
...
import torch
import megatron.core.tensor_parallel.utils as util
import megatron.core.parallel_state as ps
-from tests.test_utilities import Utils
+from tests.unit_tests.test_utilities import Utils
rank = Utils.rank
...
import torch
import megatron.core.parallel_state as ps
import pytest
-from tests.test_utilities import Utils
+from tests.unit_tests.test_utilities import Utils
import os
rank = Utils.rank
...
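Each test file above changes only its import of the shared Utils helper, which moves from tests/test_utilities.py to tests/unit_tests/test_utilities.py. The call sites visible here use just Utils.rank and Utils.initialize_model_parallel(tp, pp). A minimal sketch of what such a helper could look like, reconstructed from those call sites; the exact signatures and environment variables are assumptions, not the file's actual contents:

# Hypothetical reconstruction of tests/unit_tests/test_utilities.py, based only
# on the call sites above (Utils.rank, Utils.initialize_model_parallel(2, 4)).
import os
import torch
import megatron.core.parallel_state as ps

class Utils:
    # torchrun (or srun) exports RANK/WORLD_SIZE for each worker process.
    rank = int(os.environ.get('RANK', '0'))
    world_size = int(os.environ.get('WORLD_SIZE', '1'))

    @staticmethod
    def initialize_distributed():
        if not torch.distributed.is_initialized():
            torch.cuda.set_device(Utils.rank % torch.cuda.device_count())
            torch.distributed.init_process_group(backend='nccl',
                                                 world_size=Utils.world_size,
                                                 rank=Utils.rank)

    @staticmethod
    def initialize_model_parallel(tensor_model_parallel_size=1,
                                  pipeline_model_parallel_size=1):
        # initialize_model_parallel(2, 4) carves an 8-GPU world into
        # tensor-parallel groups of 2 and pipeline-parallel groups of 4.
        Utils.initialize_distributed()
        ps.initialize_model_parallel(tensor_model_parallel_size,
                                     pipeline_model_parallel_size)

    @staticmethod
    def destroy_model_parallel():
        ps.destroy_model_parallel()
        torch.distributed.barrier()

With a helper like this, every unit test follows the same pattern: set up a model-parallel grid, exercise one tensor_parallel primitive, and tear the grid down before the next test.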