dist_train.sh 580 Bytes
Newer Older
lishj6's avatar
init  
lishj6 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#!/usr/bin/env bash
#
# Launch train.py (located next to this script) through torchrun.
#
# Usage:
#   [PORT=<port>] dist_train.sh <config> <gpus> [extra train.py args...]
#
# Arguments:
#   $1   config path, passed to train.py as its first argument
#   $2   value for torchrun's --nproc_per_node
#   $3+  forwarded verbatim to train.py (placed before --deterministic)
#
# Environment:
#   PORT        value for torchrun's --master_port (default: 28509)
#   PYTHONPATH  the parent of this script's directory is prepended to it
#
# Exit status: 2 on a usage error, otherwise torchrun's exit status.

set -euo pipefail

if (( $# < 2 )); then
  printf 'Usage: %s <config> <gpus> [train.py args...]\n' "${0##*/}" >&2
  exit 2
fi

CONFIG=$1
GPUS=$2
PORT=${PORT:-28509}
# Drop the two positional arguments so "$@" holds only the extra train.py args.
shift 2

script_dir=$(dirname -- "$0")

# Master node address (can be omitted for single-machine runs).
export MASTER_ADDR="localhost"
# export GPU_FLUSH_ON_EXECUTION=1
# NOTE(review): presumably meant to pair with
# .to(memory_format=torch.channels_last) in the model code; confirm.
export PYTORCH_MIOPEN_SUGGEST_NHWC=1
export MIOPEN_FIND_MODE=1
# export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH

# Build the train.py argument list as an array so every argument survives
# intact (spaces, globs) and optional flags can be toggled by (un)commenting
# a single line without touching any line continuations.
train_args=(
  "$CONFIG"
  --launcher pytorch
  "$@"
  --deterministic
  # --enable-profiler
  # --to_channels_last
)

# ${PYTHONPATH:+...} avoids a trailing ':' (an empty path entry) when
# PYTHONPATH is unset or empty.
PYTHONPATH="${script_dir}/..${PYTHONPATH:+:${PYTHONPATH}}" \
  torchrun --nproc_per_node="$GPUS" --master_port="$PORT" \
  "${script_dir}/train.py" "${train_args[@]}"