"src/vscode:/vscode.git/clone" did not exist on "64bd06519aa4ec57de2517fa46e1f8efb5ff6550"
Commit c6a6f638 authored by Rayyyyy's avatar Rayyyyy
Browse files

update multi_node shell.

parent 74d060ff
None LICENSE Curenty
\ No newline at end of file
...@@ -188,9 +188,9 @@ bash train.sh ...@@ -188,9 +188,9 @@ bash train.sh
### 多机多卡 ### 多机多卡
Tips: 作者使用8个节点, 每个节点8张卡 (total_bsz = 8x8x32 = 2048) 进行的训练; Tips: 作者使用8个节点, 每个节点8张卡 (total_bsz = 8x8x32 = 2048) 进行的训练;
如果需要设置其他卡数, 请修改--nproc_per_node参数。
```bash ```bash
bash train_painter_vit_large.sh bash run_train_multi.sh
``` ```
## 推理 ## 推理
......
...@@ -73,8 +73,8 @@ python data/ade20k/gen_json_ade20k_sem.py --split validation ...@@ -73,8 +73,8 @@ python data/ade20k/gen_json_ade20k_sem.py --split validation
6. 为了确认能通过 detectron2 进行验证, 创建 `$Painter_ROOT/datasets/ade20k` to `$Painter_ROOT/datasets/ADEChallengeData2016` 的软连接, 然后执行下面的操作: 6. 为了确认能通过 detectron2 进行验证, 创建 `$Painter_ROOT/datasets/ade20k` to `$Painter_ROOT/datasets/ADEChallengeData2016` 的软连接, 然后执行下面的操作:
```bash ```bash
# 创建软连接, 注意, 一定是datasets下面创建ADEChallengeData2016 # 很重要!!!!创建软连接, 注意, 一定是datasets下面创建ADEChallengeData2016!!
# ln -s $Painter_ROOT/datasets/ade20k datasets/ADEChallengeData2016 ln -s $Painter_ROOT/datasets/ade20k datasets/ADEChallengeData2016
# 执行 # 执行
python data/prepare_ade20k_sem_seg.py python data/prepare_ade20k_sem_seg.py
``` ```
......
...@@ -6,7 +6,7 @@ natsort # for denoising ...@@ -6,7 +6,7 @@ natsort # for denoising
wandb wandb
scikit-image==0.18.0 scikit-image==0.18.0
git+https://github.com/svenkreiss/poseval.git git+https://github.com/svenkreiss/poseval.git
tensorbord tensorboard
fvcore==0.1.5 fvcore
yapf==0.40.1 yapf==0.40.1
fairscale==0.4.13 fairscale==0.4.13
\ No newline at end of file
#!/usr/bin/env bash
# Multi-node training launcher: derives the MPI rank count from ./hostfile
# (4 ranks per unique host — one per NUMA node / NIC, see single_process.sh),
# picks the first host as the rendezvous address, and fans the job out with
# mpirun. Requires: ./hostfile with one hostname per line; mpirun on PATH.
set -euo pipefail

ulimit -u 200000
echo "START TIME: $(date)"

hostfile=./hostfile

# Total MPI ranks = unique hosts * 4 (NOT the GPU count; each rank drives
# 4 GPUs via HIP_VISIBLE_DEVICES in single_process.sh).
np=$(sort -u -- "$hostfile" | wc -l)
np=$(( np * 4 ))
echo "$np"

# First host in the hostfile acts as the distributed master address.
nodename=$(sed -n '1p' -- "$hostfile")
dist_url=$(echo "$nodename" | awk '{print $1}')

command -v mpirun

# Expose the conda env's native libs to the launched processes.
# NOTE(review): users must replace this placeholder with their own env path.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:/path/of/conda/envs/{env_name}/lib
export PYTHON=python3

# -np : total number of MPI ranks (computed above)
# -x  : forward the named environment variables into single_process.sh
mpirun -np "$np" --allow-run-as-root --hostfile "$hostfile" --bind-to none \
  -x dist_url -x PYTHON "$(pwd)/single_process.sh"

echo "END TIME: $(date)"
#!/bin/bash
# Per-rank environment setup for single_process.sh (run once per MPI rank,
# launched by run_train_multi.sh via mpirun).
# Enable NCCL debug logging.
export NCCL_DEBUG=INFO
export NCCL_NET_PLUGIN=none
# NOTE(review): the next two lines each contain a duplicated statement — an
# artifact of the side-by-side diff render; harmless but should be cleaned up.
export HSA_FORCE_FINE_GRAIN_PCIE=1 export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1 export USE_MIOPEN_BATCHNORM=1
export NCCL_P2P_LEVEL=5
# World size is provided by Open MPI's per-job environment.
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export NCCL_PLUGIN_P2P=ucx
export NCCL_SHM_DISABLE=1
export NCCL_IB_DISABLE=0  # 0 = InfiniBand ENABLED. NOTE(review): original comment said "do not use ib", which contradicts the value 0 — confirm intent.
export NCCL_IB_HCA=mlx5_0
export NCCL_CROSS_NIC=1
export RCCL_NCHANNELS=4
# dist_url is forwarded from the launcher via `mpirun -x dist_url`.
export MASTER_ADDR=$dist_url
export MASTER_PORT=4321
export RANK=$OMPI_COMM_WORLD_RANK
# Local (per-node) rank, 0..3 — selects the NUMA node / NIC binding below.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
DATA_PATH=/home/datasets DATA_PATH=/home/datasets
name=painter_vit_large name=painter_vit_large
python -m torch.distributed.launch --nproc_per_node=8 \ APP="python3 -u main_train.py \
--nnodes=${WORLD_SIZE} --node_rank=$RANK \
--master_addr=$MASTER_ADDR --master_port=12358 \
--use_env main_train.py \
--batch_size 2 \ --batch_size 2 \
--accum_iter 16 \ --accum_iter 16 \
--model painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1 \ --model painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1 \
...@@ -42,6 +57,32 @@ python -m torch.distributed.launch --nproc_per_node=8 \ ...@@ -42,6 +57,32 @@ python -m torch.distributed.launch --nproc_per_node=8 \
$DATA_PATH/light_enhance/enhance_lol_val.json \ $DATA_PATH/light_enhance/enhance_lol_val.json \
--output_dir models/$name \ --output_dir models/$name \
--log_dir models/$name/logs \ --log_dir models/$name/logs \
--finetune path/to/mae_pretrain_vit_large.pth \ --finetune path/to/mae_pretrain_vit_large.pth
# --log_wandb \ "
# Bind this local rank to one NUMA domain and its matching Mellanox NIC.
# Every rank sees all four GPUs (HIP_VISIBLE_DEVICES=0,1,2,3); rank i
# (i = lrank mod 4) uses NIC mlx5_i and pins CPU/memory to NUMA node i.
# Replaces four copy-pasted case arms (differing only in the index) and the
# deprecated `expr` with builtin arithmetic — identical resulting environment
# and command line for lrank in 0..3.
# NOTE(review): assumes 4 ranks per node and NICs named mlx5_0..mlx5_3 —
# confirm against the actual cluster topology.
numa=$(( lrank % 4 ))
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES="mlx5_${numa}:1"
export UCX_IB_PCI_BW="mlx5_${numa}:50Gbs"
# ${APP} is intentionally unquoted: it is a whitespace-separated command line
# assembled earlier in this script and must word-split into argv.
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="$numa" --membind="$numa" ${APP}
...@@ -43,4 +43,3 @@ python -m torch.distributed.launch --nproc_per_node=4 \ ...@@ -43,4 +43,3 @@ python -m torch.distributed.launch --nproc_per_node=4 \
--log_dir models/$name/logs \ --log_dir models/$name/logs \
--finetune path/to/mae_pretrain_vit_large.pth \ --finetune path/to/mae_pretrain_vit_large.pth \
# --log_wandb \ # --log_wandb \
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment