"vscode:/vscode.git/clone" did not exist on "8db0b9f261cee4c99a0def09a26b8e7dbadf8f5c"
Commit f5cf2e42 authored by Jared Casper
Browse files

Merge branch 'citest' into 'main'

Testing infrastructure for Megatron core

See merge request ADLR/megatron-lm!514
parents 95f872f5 8e6fa622
#!/bin/bash
# SLURM batch script: runs the GPT-3 distributed resume-from-checkpoint
# functional test inside a PyTorch container on a single node.
# Parameters
#SBATCH --account=adlr
#SBATCH --job-name=adlr-ci:megatron-job
#SBATCH --nodes=1
#SBATCH --partition=luna
# Container-internal paths; the host directories are bind-mounted to these
# locations by the --container-mounts flag of the srun command below.
DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document
CHECKPOINT_PATH=/workspace/checkpoints
TENSORBOARD_DIR=/workspace/logs
# NOTE(review): $BASE_DIR, $BUILD_DIR, $DATA_DIR, $TP_SIZE, $PP_SIZE and
# $NUM_NODES are expected to be exported by the submitting CI environment —
# confirm against the pipeline configuration; none are defined in this file.
# Launch the test script inside the container; stdout and stderr both go to
# the same slurm-%j.out file under $BASE_DIR/results.
srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES"
\ No newline at end of file
#!/bin/bash
# SLURM batch script: runs the GPT-3 distributed pretraining functional test
# inside a PyTorch container on a single node.
# Parameters
#SBATCH --account=adlr
#SBATCH --job-name=adlr-ci:megatron-job
#SBATCH --nodes=1
#SBATCH --partition=luna
# Container-internal paths; the host directories are bind-mounted to these
# locations by the --container-mounts flag of the srun command below.
DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document
CHECKPOINT_PATH=/workspace/checkpoints
TENSORBOARD_DIR=/workspace/logs
# NOTE(review): $BASE_DIR, $BUILD_DIR, $DATA_DIR, $TP_SIZE, $PP_SIZE,
# $NUM_NODES and $MAX_STEPS are expected to be exported by the submitting CI
# environment — confirm against the pipeline configuration; none are defined
# in this file.
# Launch the test script inside the container; stdout and stderr both go to
# the same slurm-%j.out file under $BASE_DIR/results.
srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS"
\ No newline at end of file
from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
import torch
from tests.test_utilities import Utils
from tests.unit_tests.test_utilities import Utils
import numpy as np
def test_vocab_parallel_cross_entropy():
......
from megatron.core.tensor_parallel.data import broadcast_data
import torch
from tests.test_utilities import Utils
from tests.unit_tests.test_utilities import Utils
def test_broadcast_data():
Utils.initialize_model_parallel(2,4)
......
from megatron.core.tensor_parallel import mappings
from tests.test_utilities import Utils
from tests.unit_tests.test_utilities import Utils
import torch
def test_CopyToModelParallelRegion():
......
......@@ -2,7 +2,7 @@ from megatron.core.tensor_parallel.random import CudaRNGStatesTracker
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER
from megatron.core.tensor_parallel.random import checkpoint
from tests.test_utilities import Utils
from tests.unit_tests.test_utilities import Utils
import pytest
import torch
......
import torch
import megatron.core.tensor_parallel.utils as util
import megatron.core.parallel_state as ps
from tests.test_utilities import Utils
from tests.unit_tests.test_utilities import Utils
rank = Utils.rank
......
import torch
import megatron.core.parallel_state as ps
import pytest
from tests.test_utilities import Utils
from tests.unit_tests.test_utilities import Utils
import os
rank = Utils.rank
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment