arguments.sh 2.18 KB
Newer Older
jerrrrry's avatar
jerrrrry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# The first positional argument names the model config; it is later used to
# build the path of the "<name>.sh" file that gets sourced.
MLM_MODEL_CFG="${1}"

# ANSI color sequences. They are stored as literal backslash text and only
# become real escapes when printf sees them inside its format string.
WHITE="\033[0;37m"
PURPLE="\033[0;35m"
BLUE="\033[0;34m"
GREEN="\033[0;32m"
YELLOW="\033[0;33m"
RED="\033[0;31m"

# Ready-made message prefixes: colored tag followed by a reset to white.
MLM_WARNING=${YELLOW}'WARNING:'${WHITE}
MLM_ERROR=${RED}'ERROR:  '${WHITE}

# Optional environment hook: when SANDBOX_ENV_SETUP names a file, source it
# before validating anything else so it can define the variables checked below.
# All expansions are quoted so that paths containing whitespace neither break
# the `[` test nor get word-split when passed to `source`.
if [ -z "${SANDBOX_ENV_SETUP:-}" ]; then
    printf "${MLM_WARNING} ${PURPLE}SANDBOX_ENV_SETUP${WHITE} is not set!\n"
else
    source "${SANDBOX_ENV_SETUP}"
fi

# SCRIPT_DIR is mandatory: requirements.txt and (without SANDBOX_ROOT) the
# model configs are resolved relative to it.
if [ -z "${SCRIPT_DIR:-}" ]; then
    printf "${MLM_ERROR} Variable ${PURPLE}SCRIPT_DIR${WHITE} must be set!\n"
    exit 1
fi

# MLM_MODEL_CFG (taken from the first positional argument) is mandatory: it
# selects which model config file is sourced at the end of this script.
if [ -z "${MLM_MODEL_CFG:-}" ]; then
    printf "${MLM_ERROR} Variable ${PURPLE}MLM_MODEL_CFG${WHITE} must be set!\n"
    exit 1
fi

# MLM_EXTRA_ARGS is optional; only print a hint when it is empty. The
# expansion must be quoted: the variable is meant to carry several arguments,
# and an unquoted multi-word value is word-split and makes `[` fail with an
# error such as "binary operator expected".
if [ -z "${MLM_EXTRA_ARGS:-}" ]; then
    printf "${MLM_WARNING} Use ${PURPLE}MLM_EXTRA_ARGS${WHITE} to provide additional arguments!\n"
fi

# Fall back to a default workspace directory and export it so that child
# processes see the same value. The path is passed as a printf argument so
# it can never be interpreted as part of the format string.
if [ -z "${MLM_WORK_DIR:-}" ]; then
    export MLM_WORK_DIR=/tmp/megatron_workspace
    printf "${MLM_WARNING} Variable ${PURPLE}MLM_WORK_DIR${WHITE} is set (default: %s)!\n" "${MLM_WORK_DIR}"
fi

# TP, EP and PP each default to 1 when not provided. Their product is later
# used as the number of processes per node for the default launcher.
# NOTE(review): presumably tensor / expert / pipeline parallel sizes - confirm.
if [ -z "${TP:-}" ]; then
    TP=1
    printf "${MLM_WARNING} Variable ${PURPLE}TP${WHITE} not set! (default: %s)\n" "${TP}"
fi

if [ -z "${EP:-}" ]; then
    EP=1
    printf "${MLM_WARNING} Variable ${PURPLE}EP${WHITE} not set! (default: %s)\n" "${EP}"
fi

if [ -z "${PP:-}" ]; then
    PP=1
    printf "${MLM_WARNING} Variable ${PURPLE}PP${WHITE} not set! (default: %s)\n" "${PP}"
fi


# Default launcher: torchrun with TP * EP * PP processes per node. A
# caller-provided LAUNCH_SCRIPT is kept untouched. The test is quoted because
# a launcher command line normally contains spaces (the default itself does),
# which would otherwise be word-split and make `[` report an error.
if [ -z "${LAUNCH_SCRIPT:-}" ]; then
    LAUNCH_SCRIPT="torchrun --nproc_per_node=$((TP * EP * PP))"
fi

# Install the Python requirements shipped next to the scripts (per the
# original note, this provides TensorRT Model Optimizer) unless
# MLM_SKIP_INSTALL is set to a non-empty value. The path is quoted so a
# SCRIPT_DIR containing spaces is passed to pip as a single argument.
if [ -z "${MLM_SKIP_INSTALL:-}" ]; then
    pip install -r "${SCRIPT_DIR}/requirements.txt"
fi

# Fixed runtime settings: assign the values first, then mark all of them for
# export in a single statement so child processes inherit them.
TOKENIZERS_PARALLELISM=False
OMP_NUM_THREADS=1
NCCL_IB_SL=1
NCCL_IB_TIMEOUT=22
CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM OMP_NUM_THREADS \
    NCCL_IB_SL NCCL_IB_TIMEOUT CUDA_DEVICE_MAX_CONNECTIONS

# TE-specific hint, always printed: points at the flag to try when the
# "_extra_state missing" error shows up.
printf "${MLM_WARNING}"' If you see core_attention  _extra_state missing error, use --export-force-local-attention\n'

# Base model specific arguments: source "<MLM_MODEL_CFG>.sh" from
# ${SCRIPT_DIR}/conf when SANDBOX_ROOT is empty, otherwise from
# ${SANDBOX_ROOT}/conf/model. The test is quoted so a SANDBOX_ROOT path
# containing whitespace is evaluated as one word instead of breaking `[`.
if [ -z "${SANDBOX_ROOT:-}" ]; then
    source "${SCRIPT_DIR}/conf/${MLM_MODEL_CFG}.sh"
else
    source "${SANDBOX_ROOT}/conf/model/${MLM_MODEL_CFG}.sh"
fi