#!/usr/bin/env bash
#
# Launch train.py (expected next to this script) under
# `python3 -m torch.distributed.launch`.
#
# Usage:
#   [PORT=<port>] dist_train.sh CONFIG GPUS [TRAIN_ARGS...]
#
#   CONFIG      first positional argument handed to train.py
#   GPUS        value for --nproc_per_node
#   TRAIN_ARGS  everything from the third argument on, forwarded unchanged
#
# Environment:
#   PORT             value for --master_port (default: 28651)
#   PYTHONPATH       the parent of this script's directory is prepended
#   LD_LIBRARY_PATH  two hard-coded library directories are prepended

set -euo pipefail

# Print a one-line synopsis to stderr and abort with status 2.
usage() {
  printf 'Usage: [PORT=<port>] %s CONFIG GPUS [TRAIN_ARGS...]\n' "${0##*/}" >&2
  exit 2
}

# Fail early instead of handing empty values to the launcher.
if (( $# < 2 )) || [[ -z "$1" || -z "$2" ]]; then
  usage
fi

CONFIG=$1
GPUS=$2
PORT=${PORT:-28651}

# MIOpen-related runtime switches, values kept exactly as before.
# export GPU_FLUSH_ON_EXECUTION=1
export PYTORCH_MIOPEN_SUGGEST_NHWC=1  # original note: .to(memory_format=torch.channels_last)
export MIOPEN_FIND_MODE=1
export MIOPEN_PRECISION_FP32_FP32_FP32_TF32_FP32=1

# NOTE(review): these are absolute, machine-specific install paths - confirm
# they exist on the target host.
# The existing LD_LIBRARY_PATH is appended only when non-empty: a dangling ':'
# yields an empty entry, which the dynamic loader reads as the current
# directory. Resulting order is rocblas, miopen, then the previous value.
export LD_LIBRARY_PATH="/home/SparseDrive/package/miopen/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export LD_LIBRARY_PATH="/home/SparseDrive/rocblas-install/lib:${LD_LIBRARY_PATH}"

# Directory holding this script; train.py is looked up here and its parent is
# put on PYTHONPATH for the launched processes only.
script_dir=$(dirname -- "$0")
readonly script_dir

# "${@:3}" is quoted so forwarded arguments keep their word boundaries.
PYTHONPATH="${script_dir}/..${PYTHONPATH:+:${PYTHONPATH}}" \
python3 -m torch.distributed.launch \
  --nproc_per_node="${GPUS}" \
  --master_port="${PORT}" \
  "${script_dir}/train.py" "${CONFIG}" --launcher pytorch "${@:3}"  #--enable-profiler