install.sh 6.58 KB
Newer Older
Jeff Rasley's avatar
Jeff Rasley committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#!/bin/bash

# Abort on the first failing command; the ERR trap below reports the line.
set -o errexit

# Print a failure notice. $1 is the line number on which the error occurred.
err_report() {
    printf '%s\n' "Error on line $1" "Fail to install deepspeed"
}
trap 'err_report "$LINENO"' ERR

# Print the command-line help text to stdout.
# The heredoc delimiter is quoted so the text is emitted literally.
usage() {
  cat <<'EOF'

Usage: install.sh [options...]

By default will install deepspeed and all third party dependencies across all machines listed in
hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally

[optional]
    -d, --deepspeed_only    Install only deepspeed and no third party dependencies
    -t, --third_party_only  Install only third party dependencies and not deepspeed
    -l, --local_only        Install only on local machine
    -s, --pip_sudo          Run pip install with sudo (default: no sudo)
    -r, --allow_sudo        Allow script to be run by root (probably don't want this, instead use --pip_sudo)
    -n, --no_clean          Do not clean prior build state, by default prior build files are removed before building wheels
    -m, --pip_mirror        Use the specified pip mirror (default: the default pip mirror)
    -H, --hostfile          Path to MPI-style hostfile (default: /job/hostfile)
    -a, --apex_commit       Install a specific commit hash of apex, instead of the one deepspeed points to
    -k, --skip_requirements Skip installing DeepSpeed requirements
    -h, --help              This help text

EOF
}

# Default value for every command-line option; the argument parser below
# overrides these when the matching flag is given.
ds_only=0               # 1 when -d/--deepspeed_only was given
tp_only=0               # 1 when -t/--third_party_only was given
deepspeed_install=1     # build and install the deepspeed wheel
third_party_install=1   # build and install the apex wheel
local_only=0            # 1 to install on this machine only
pip_sudo=0              # 1 to prefix pip commands with sudo
entire_dlts_job=1       # NOTE(review): not referenced elsewhere in the visible script
hostfile=/job/hostfile  # hostfile listing the machines to install on
pip_mirror=""           # pip index URL; empty means pip's default
apex_commit=""          # apex commit to check out; empty means the submodule's commit
skip_requirements=0     # 1 to skip installing requirements.txt
allow_sudo=0            # 1 to allow running the script as root
no_clean=0              # 1 to keep prior build artifacts
Jeff Rasley's avatar
Jeff Rasley committed
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

# Exit with the help text when a value-taking option was given without a value.
#   $1 - the option as typed by the user
#   $2 - number of arguments still unparsed, including the option itself
_require_option_value() {
    if [[ "$2" -lt 2 ]]; then
        echo "Option $1 requires an argument"
        usage
        exit 1
    fi
}

# Parse the command-line options passed as arguments into the global option
# variables (deepspeed_install, third_party_install, local_only, ...).
# Exits 0 after printing help for -h/--help; exits 1 on an unknown option,
# a missing option value, or a -H/--hostfile path that is not a file.
parse_args() {
    local key
    while [[ $# -gt 0 ]]; do
        key="$1"
        case "$key" in
            -d|--deepspeed_only)
                deepspeed_install=1
                third_party_install=0
                ds_only=1
                shift
                ;;
            -t|--third_party_only)
                deepspeed_install=0
                third_party_install=1
                tp_only=1
                shift
                ;;
            -l|--local_only)
                local_only=1
                shift
                ;;
            -s|--pip_sudo)
                pip_sudo=1
                shift
                ;;
            -m|--pip_mirror)
                _require_option_value "$key" "$#"
                pip_mirror="$2"
                shift 2
                ;;
            -a|--apex_commit)
                _require_option_value "$key" "$#"
                apex_commit="$2"
                shift 2
                ;;
            -k|--skip_requirements)
                skip_requirements=1
                shift
                ;;
            -r|--allow_sudo)
                allow_sudo=1
                shift
                ;;
            -n|--no_clean)
                no_clean=1
                shift
                ;;
            -H|--hostfile)
                _require_option_value "$key" "$#"
                hostfile="$2"
                # A hostfile named explicitly by the user must exist.
                if [[ ! -f "$hostfile" ]]; then
                    echo "User provided hostfile does not exist at $hostfile, exiting"
                    exit 1
                fi
                shift 2
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                echo "Unknown argument(s)"
                usage
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

114
115
116
117
118
119
120
121
122
# Refuse to run as root unless -r/--allow_sudo was given.
user=$(whoami)
if [[ "$allow_sudo" == "0" && "$user" == "root" ]]; then
    echo "WARNING: running as root, if you want to install DeepSpeed with sudo please use -s/--pip_sudo instead"
    usage
    exit 1
fi

Jeff Rasley's avatar
Jeff Rasley committed
123
124
125
126
127
128
# -d and -t each turn off what the other one installs, so both together is an error.
if [[ "$ds_only" == "1" && "$tp_only" == "1" ]]; then
    echo "-d and -t are mutually exclusive, only choose one or none"
    usage
    exit 1
fi

129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Remove the regular file or directory tree at path $1, if it exists.
# Always prints the path being considered; rm's verbose output follows for
# anything actually deleted. A path that is neither a regular file nor a
# directory is left alone.
rm_if_exist() {
    local target="$1"
    echo "Attempting to remove $target"
    # Quote the path and pass `--` so names with spaces or a leading dash work.
    if [[ -f "$target" ]]; then
        rm -v -- "$target"
    elif [[ -d "$target" ]]; then
        rm -vr -- "$target"
    fi
}

# Unless -n/--no_clean was given, delete artifacts left behind by earlier builds.
if [[ "$no_clean" == "0" ]]; then
    stale_paths=(
        # deepspeed build files
        deepspeed/git_version_info.py
        dist
        build
        deepspeed.egg-info
        # apex build files
        third_party/apex/dist
        third_party/apex/build
        third_party/apex/apex.egg-info
    )
    for stale_path in "${stale_paths[@]}"; do
        rm_if_exist "$stale_path"
    done
fi

Jeff Rasley's avatar
Jeff Rasley committed
150
# Write the current commit hash and branch name into
# deepspeed/git_version_info.py, then print the file.
echo "Updating git hash/branch info"
# Resolve both values before writing: a command substitution that fails inside
# an echo argument does not stop the script, so a broken file would be written.
git_hash="$(git rev-parse --short HEAD)"
git_branch="$(git rev-parse --abbrev-ref HEAD)"
{
    echo "git_hash = '${git_hash}'"
    echo "git_branch = '${git_branch}'"
} > deepspeed/git_version_info.py
cat deepspeed/git_version_info.py
Jeff Rasley's avatar
Jeff Rasley committed
154

155
# Prefix for pip invocations: "sudo -H" when -s/--pip_sudo was given, else empty.
# The script expands this unquoted so that it splits into separate words.
if [[ "$pip_sudo" == "1" ]]; then
    PIP_SUDO="sudo -H"
else
    PIP_SUDO=""
fi

# Base pip install command; adds "-i <mirror>" when -m/--pip_mirror was given.
if [[ -n "$pip_mirror" ]]; then
    PIP_INSTALL="pip install -v -i $pip_mirror"
else
    PIP_INSTALL="pip install -v"
fi

167
# Fall back to a local-only install when the hostfile is missing.
# The path is quoted so that an empty or space-containing value is still
# tested as a single path.
if [[ ! -f "$hostfile" ]]; then
    echo "No hostfile exists at $hostfile, installing locally"
    local_only=1
fi

Jeff Rasley's avatar
Jeff Rasley committed
172
173
174
175
# Install the packages listed in requirements.txt on this machine, unless
# -k/--skip_requirements was given.
if [[ "$skip_requirements" == "0" ]]; then
    # PIP_SUDO and PIP_INSTALL hold several words each and are expanded unquoted on purpose.
    $PIP_SUDO $PIP_INSTALL -r requirements.txt
fi
176

177
178
179
180
181
182
183
# Build wheels
# Third-party step: build the apex wheel from the third_party/apex submodule
# and install it on this machine.
if [[ "$third_party_install" == "1" ]]; then
    echo "Checking out sub-module(s)"
    git submodule update --init --recursive

    echo "Building apex wheel"
    cd third_party/apex

    # Optionally switch apex to the commit given with -a/--apex_commit.
    if [[ -n "$apex_commit" ]]; then
        echo "Installing a non-standard version of apex at commit: $apex_commit"
        git fetch
        git checkout "$apex_commit"
    fi

    python setup.py -v --cpp_ext --cuda_ext bdist_wheel
    # Return to the directory the script was in before entering third_party/apex.
    cd -

    echo "Installing apex locally so that deepspeed will build"
    # PIP_SUDO and PIP_INSTALL hold several words each and are expanded unquoted on purpose.
    $PIP_SUDO pip uninstall -y apex
    $PIP_SUDO $PIP_INSTALL third_party/apex/dist/apex*.whl
fi
# DeepSpeed step: build the deepspeed wheel (it is installed further below).
if [[ "$deepspeed_install" == "1" ]]; then
    echo "Building deepspeed wheel"
    python setup.py -v bdist_wheel
fi
Jeff Rasley's avatar
Jeff Rasley committed
202

203
# Install the built wheels: on this machine only when local_only is 1,
# otherwise on every host listed in the hostfile via pdsh/pdcp.
# PIP_SUDO and PIP_INSTALL hold several words each and are expanded unquoted on purpose.
if [[ "$local_only" == "1" ]]; then
    if [[ "$deepspeed_install" == "1" ]]; then
        echo "Installing deepspeed"
        $PIP_SUDO pip uninstall -y deepspeed
        $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
        # Run the smoke test as the `if` condition: under `set -e` a bare
        # failing command would abort the script before $? could be checked.
        if python basic_install_test.py; then
            echo "Installation is successful"
        else
            echo "Installation failed"
            exit 1
        fi
    fi
else
    # NOTE(review): local_path is not referenced again in the visible script.
    local_path=$(pwd)
    if [[ -f "$hostfile" ]]; then
        # First column of each hostfile line, joined into a comma-separated list.
        hosts=$(awk '{print $1}' "$hostfile" | paste -sd "," -)
    else
        echo "hostfile not found, cannot proceed"
        exit 1
    fi
    export PDSH_RCMD_TYPE=ssh
    tmp_wheel_path="/tmp/deepspeed_wheels"
    install_failed=0

    # Prepare the staging directory on every host: drop old wheels or create it.
    pdsh -w "$hosts" "if [ -d $tmp_wheel_path ]; then rm -f $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi"
    pdcp -w "$hosts" requirements.txt "${tmp_wheel_path}/"
    if [[ "$skip_requirements" == "0" ]]; then
        pdsh -w "$hosts" "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt"
    fi
    if [[ "$third_party_install" == "1" ]]; then
        pdsh -w "$hosts" "$PIP_SUDO pip uninstall -y apex"
        pdcp -w "$hosts" third_party/apex/dist/apex*.whl "$tmp_wheel_path/"
        pdsh -w "$hosts" "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/apex*.whl"
        pdsh -w "$hosts" 'python -c "import apex"'
    fi
    if [[ "$deepspeed_install" == "1" ]]; then
        echo "Installing deepspeed"
        pdsh -w "$hosts" "$PIP_SUDO pip uninstall -y deepspeed"
        pdcp -w "$hosts" dist/deepspeed*.whl "$tmp_wheel_path/"
        pdcp -w "$hosts" basic_install_test.py "$tmp_wheel_path/"
        pdsh -w "$hosts" "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl"
        # -S makes pdsh return the largest remote exit status, so a failing
        # smoke test on any host is detected instead of always reporting success.
        if pdsh -S -w "$hosts" "python $tmp_wheel_path/basic_install_test.py"; then
            echo "Installation is successful"
        else
            echo "Installation failed"
            install_failed=1
        fi
    fi
    # Clean up the staging directory on every host; -f because some of these
    # files are only copied when the matching install step ran.
    pdsh -w "$hosts" "if [ -d $tmp_wheel_path ]; then rm -f $tmp_wheel_path/*.whl $tmp_wheel_path/basic_install_test.py $tmp_wheel_path/requirements.txt; rmdir $tmp_wheel_path; fi"
    # Report the failure only after the remote cleanup has run.
    if [[ "$install_failed" == "1" ]]; then
        exit 1
    fi
fi