run_BingBertSquad.sh 3.96 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash

usage() {
  echo """
Usage: $0 [defined arguments...] [other arguments...]

[defined]
	-g, --num_gpus          num gpus per node to use
	-h, --help              this help text
	-n, --num_nodes         num nodes to use
	-e, --epochs		        num of training epochs
	-b, --batch_size 	      training batch size
  -p, --port              master port for nccl

[other arguments]
	all undefined arguments will be passed to the user's application
  """
}

validate_folder() {
	dir=$1
	dir_name=$2

	if [[ -d ${dir} ]]; then
		echo "Using ${dir_name}: ${dir}"
	else
		echo "${dir} folder not found"
		exit 1
	fi
}

remove_folder() {
	dir=$1
	dir_name=$2

	if [[ -d ${dir} ]]; then
		echo "The variable ${dir_name} is set to ${dir} which already exists, so removing and creating a fresh one"
    rm -rvf ${dir}
	fi
}

num_nodes=1
num_gpus=8
epochs=2
batch_size=24
enable_deepspeed=false
master_port=$((20000+RANDOM%5000))
LR=3e-5

while [[ $# -gt 0 ]]
do
key="$1"
case $key in
    -g|--num_gpus)
    num_gpus="$2"
    shift
    shift
    ;;
    -n|--num_nodes)
    num_nodes="$2"
    shift
    shift
    ;;
    -e|--epochs)
    epochs="$2"
    shift
    shift
    ;;
    -b|--batch_size)
    batch_size="$2"
    shift
    shift
    ;;
    -p|--master_port)
    master_port="$2"
    shift
    shift
    ;;
    -d|--deepspeed)
    enable_deepspeed=true
    shift
    ;;
    -h|--help)
    usage
    exit 0
    ;;
    *) # other arguments
    other_args="${other_args} $1"
    shift
    ;;
esac
done

# Validate path to BingBertSquad script
if [ -z "${BingBertSquad_DIR+x}" ]; then
  export BingBertSquad_DIR=../../../DeepSpeedExamples/BingBertSquad
  echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}"
fi
validate_folder ${BingBertSquad_DIR} "BingBertSquad_DIR"

# Validate path to processed Squad data
if [ -z "${SQUAD_DIR+x}" ]; then
  export SQUAD_DIR=/data/BingBertSquad
  echo "SQUAD_DIR environment variable not set; trying default: ${SQUAD_DIR}"
fi
validate_folder ${SQUAD_DIR} "SQUAD_DIR"

# Set output path
if [ -z "${OUTPUT_DIR+x}" ]; then
  export OUTPUT_DIR=/tmp/BingBertSquad-Output
  echo "OUTPUT_DIR environment variable not set; trying default: ${OUTPUT_DIR}"
fi
remove_folder ${OUTPUT_DIR} "OUTPUT_DIR"

echo "num_nodes: ${num_nodes}"
echo "num_gpus: ${num_gpus}"
echo "epochs: ${epochs}"
echo "batch_size: ${batch_size}"
echo "master_port: ${master_port}"
echo "deepspeed: ${enable_deepspeed}"
echo "other_args: ${other_args}"

EFFECTIVE_BATCH_SIZE=${batch_size}
124
MAX_GPU_BATCH_SIZE=3
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/num_gpus))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
       GRAD_ACCUM_STEPS=1
else
       GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi

if [[ ${enable_deepspeed} == true ]]; then
  BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_deepspeed.py
else
  BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_baseline.py
fi

JOB_NAME="BingBertSquad_ds-${enable_deepspeed}_${num_gpus}-gpu"

#            --do_predict \
squad_args="--bert_model bert-large-uncased \
            --do_train \
            --do_lower_case \
            --train_file ${SQUAD_DIR}/train-v1.1.json \
            --predict_file ${SQUAD_DIR}/dev-v1.1.json \
            --train_batch_size ${PER_GPU_BATCH_SIZE} \
            --learning_rate ${LR} \
            --num_train_epochs ${epochs} \
            --max_seq_length 384 \
            --doc_stride 128 \
            --output_dir ${OUTPUT_DIR} \
            --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
		      	--job_name ${JOB_NAME} \
            --model_file ${SQUAD_DIR}/training_state_checkpoint_162.tar
            "

run_cmd="deepspeed.pt \
      --num_nodes ${num_nodes} \
      --num_gpus ${num_gpus} \
      --master_port ${master_port}
      ${BingBertSquad_script} ${other_args} ${squad_args}"

echo ${run_cmd}
eval ${run_cmd}

set +x

#python ${BingBertSquad_DIR}/evaluate-v1.1.py ${SQUAD_DIR}/dev-v1.1.json ${OUTPUT_DIR}/predictions.json > ${OUTPUT_DIR}/CorrectnessScores.txt