#!/usr/bin/env bash

CONFIG=$1
GPUS=$2
PORT=${PORT:-28650}

export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_TOPO_FILE=null
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"

# PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
# torchrun --nproc_per_node=$GPUS --master_port=$PORT \
#     $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nnodes 2 --nproc_per_node 8 --node_rank 0 --master_addr=10.16.6.16 --master_port 4519 \
    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic




# 设置主节点地址（单机可省略）
# export MASTER_ADDR="30.149.248.7"
# export GPU_FLUSH_ON_EXECUTION=1
export PYTORCH_MIOPEN_SUGGEST_NHWC=1         #.to(memory_format=torch.channels_last)
export MIOPEN_FIND_MODE=1
export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
# export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
export NCCL_TOPO_FILE=null
# export NCCL_DEBUG=INFO
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
# PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
# torchrun --nnodes 2 --nproc_per_node 8 --node_rank 0 --master_addr=10.16.6.16 --master_port 9528 \
#     $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic 
    # --enable-profiler \
    #--to_channels_last \
