#!/bin/bash

usage() {
  echo """
Usage: $0 [defined arguments...] [other arguments...]

[defined]
  -g, --num_gpus           num gpus per node to use
  -h, --help               this help text
  -n, --num_nodes          num nodes to use
  -e, --epochs             num of training epochs
  -b, --batch_size         training batch size
  -p, --master_port        master port for nccl
  -d, --deepspeed          use the DeepSpeed training script

[other arguments]
  all undefined arguments will be passed to the user's application
  """
}
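
# Example (illustrative invocation): single node, 4 GPUs, 2 epochs,
# effective batch size 24, using the DeepSpeed code path:
#   bash run_BingBertSquad.sh -n 1 -g 4 -e 2 -b 24 -d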

validate_folder() {
	dir=$1
	dir_name=$2

	if [[ -d ${dir} ]]; then
		echo "Using ${dir_name}: ${dir}"
	else
		echo "${dir} folder not found"
		exit 1
	fi
}

remove_folder() {
	dir=$1
	dir_name=$2

	if [[ -d ${dir} ]]; then
		echo "The variable ${dir_name} is set to ${dir} which already exists, so removing and creating a fresh one"
		rm -rvf ${dir}
	fi
}

num_nodes=1
num_gpus=8
epochs=2
batch_size=24
enable_deepspeed=false
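# Default master port: a random value in [20000, 24999] so that concurrent
# runs on the same machine are less likely to collide.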
master_port=$((20000+RANDOM%5000))
LR=3e-5

while [[ $# -gt 0 ]]
do
key="$1"
case $key in
    -g|--num_gpus)
    num_gpus="$2"
    shift
    shift
    ;;
    -n|--num_nodes)
    num_nodes="$2"
    shift
    shift
    ;;
    -e|--epochs)
    epochs="$2"
    shift
    shift
    ;;
    -b|--batch_size)
    batch_size="$2"
    shift
    shift
    ;;
    -p|--master_port)
    master_port="$2"
    shift
    shift
    ;;
    -d|--deepspeed)
    enable_deepspeed=true
    shift
    ;;
    -h|--help)
    usage
    exit 0
    ;;
    *) # other arguments
    other_args="${other_args} $1"
    shift
    ;;
esac
done

# Validate path to BingBertSquad script
if [ -z "${BingBertSquad_DIR+x}" ]; then
  export BingBertSquad_DIR=../../../../DeepSpeedExamples/BingBertSquad
  echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}"
fi
validate_folder ${BingBertSquad_DIR} "BingBertSquad_DIR"

# Validate path to processed Squad data
if [ -z "${SQUAD_DIR+x}" ]; then
  export SQUAD_DIR=/data/BingBertSquad
  echo "SQUAD_DIR environment variable not set; trying default: ${SQUAD_DIR}"
fi
validate_folder ${SQUAD_DIR} "SQUAD_DIR"

# Set output path
if [ -z "${OUTPUT_DIR+x}" ]; then
  export OUTPUT_DIR=/tmp/BingBertSquad-Output
  echo "OUTPUT_DIR environment variable not set; trying default: ${OUTPUT_DIR}"
fi
remove_folder ${OUTPUT_DIR} "OUTPUT_DIR"
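
# Example (illustrative paths): override the defaults above by exporting the
# variables before invoking this script:
#   export BingBertSquad_DIR=/path/to/DeepSpeedExamples/BingBertSquad
#   export SQUAD_DIR=/path/to/squad-data
#   export OUTPUT_DIR=/path/to/output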

echo "num_nodes: ${num_nodes}"
echo "num_gpus: ${num_gpus}"
echo "epochs: ${epochs}"
echo "batch_size: ${batch_size}"
echo "master_port: ${master_port}"
echo "deepspeed: ${enable_deepspeed}"
echo "other_args: ${other_args}"

EFFECTIVE_BATCH_SIZE=${batch_size}
MAX_GPU_BATCH_SIZE=3
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/num_gpus))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
       GRAD_ACCUM_STEPS=1
else
       GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi
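
# Worked example with the defaults (batch_size=24, num_gpus=8):
# PER_GPU_BATCH_SIZE = 24/8 = 3, which is not below MAX_GPU_BATCH_SIZE=3,
# so GRAD_ACCUM_STEPS = 3/3 = 1 (no gradient accumulation needed).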

if [[ ${enable_deepspeed} == true ]]; then
  BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_deepspeed.py
else
  BingBertSquad_script=${BingBertSquad_DIR}/nvidia_run_squad_baseline.py
fi

JOB_NAME="BingBertSquad_ds-${enable_deepspeed}_${num_gpus}-gpu"

squad_args="--bert_model bert-large-uncased \
            --do_train \
            --do_lower_case \
            --train_file ${SQUAD_DIR}/train-v1.1.json \
            --predict_file ${SQUAD_DIR}/dev-v1.1.json \
            --train_batch_size ${PER_GPU_BATCH_SIZE} \
            --learning_rate ${LR} \
            --num_train_epochs ${epochs} \
            --max_seq_length 384 \
            --doc_stride 128 \
            --do_predict \
            --output_dir ${OUTPUT_DIR} \
            --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
            --job_name ${JOB_NAME} \
            --model_file ${SQUAD_DIR}/training_state_checkpoint_162.tar
            "

run_cmd="deepspeed.pt \
      --num_nodes ${num_nodes} \
      --num_gpus ${num_gpus} \
      --master_port ${master_port} \
      ${BingBertSquad_script} ${other_args} ${squad_args}"
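
# The assembled command expands to roughly (illustrative values only):
#   deepspeed.pt --num_nodes 1 --num_gpus 8 --master_port 23456 \
#     ${BingBertSquad_DIR}/nvidia_run_squad_deepspeed.py ... --job_name ${JOB_NAME}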

echo ${run_cmd}
eval ${run_cmd}

set +x

#python ${BingBertSquad_DIR}/evaluate-v1.1.py ${SQUAD_DIR}/dev-v1.1.json ${OUTPUT_DIR}/predictions.json > ${OUTPUT_DIR}/CorrectnessScores.txt