"src/vscode:/vscode.git/clone" did not exist on "64bd06519aa4ec57de2517fa46e1f8efb5ff6550"
Commit c6a6f638 authored by Rayyyyy's avatar Rayyyyy
Browse files

update multi_node shell.

parent 74d060ff
None LICENSE Curenty
\ No newline at end of file
...@@ -188,9 +188,9 @@ bash train.sh ...@@ -188,9 +188,9 @@ bash train.sh
### 多机多卡 ### 多机多卡
Tips: 作者使用8个节点, 每个节点8张卡 (total_bsz = 8x8x32 = 2048) 进行的训练; Tips: 作者使用8个节点, 每个节点8张卡 (total_bsz = 8x8x32 = 2048) 进行的训练;
如果需要设置其他卡数, 请修改--nproc_per_node参数。
```bash ```bash
bash train_painter_vit_large.sh bash run_train_multi.sh
``` ```
## 推理 ## 推理
......
...@@ -73,8 +73,8 @@ python data/ade20k/gen_json_ade20k_sem.py --split validation ...@@ -73,8 +73,8 @@ python data/ade20k/gen_json_ade20k_sem.py --split validation
6. 为了确认能通过 detectron2 进行验证, 创建 `$Painter_ROOT/datasets/ade20k` to `$Painter_ROOT/datasets/ADEChallengeData2016` 的软连接, 然后执行下面的操作: 6. 为了确认能通过 detectron2 进行验证, 创建 `$Painter_ROOT/datasets/ade20k` to `$Painter_ROOT/datasets/ADEChallengeData2016` 的软连接, 然后执行下面的操作:
```bash ```bash
# 创建软连接, 注意, 一定是datasets下面创建ADEChallengeData2016 # 很重要!!!!创建软连接, 注意, 一定是datasets下面创建ADEChallengeData2016!!
# ln -s $Painter_ROOT/datasets/ade20k datasets/ADEChallengeData2016 ln -s $Painter_ROOT/datasets/ade20k datasets/ADEChallengeData2016
# 执行 # 执行
python data/prepare_ade20k_sem_seg.py python data/prepare_ade20k_sem_seg.py
``` ```
......
...@@ -6,7 +6,7 @@ natsort # for denoising ...@@ -6,7 +6,7 @@ natsort # for denoising
wandb wandb
scikit-image==0.18.0 scikit-image==0.18.0
git+https://github.com/svenkreiss/poseval.git git+https://github.com/svenkreiss/poseval.git
tensorbord tensorboard
fvcore==0.1.5 fvcore
yapf==0.40.1 yapf==0.40.1
fairscale==0.4.13 fairscale==0.4.13
\ No newline at end of file
#!/usr/bin/env bash
# Multi-node training launcher: derives the MPI rank count from ./hostfile
# (4 ranks per unique host — one per NUMA node / NIC, see single_process.sh),
# picks the first host as the rendezvous address, and fans the job out with
# mpirun. Requires: ./hostfile with one hostname per line; mpirun on PATH.
set -euo pipefail

ulimit -u 200000
echo "START TIME: $(date)"

hostfile=./hostfile

# Total MPI ranks = unique hosts * 4 (NOT the GPU count; each rank drives
# 4 GPUs via HIP_VISIBLE_DEVICES in single_process.sh).
np=$(sort -u -- "$hostfile" | wc -l)
np=$(( np * 4 ))
echo "$np"

# First host in the hostfile acts as the distributed master address.
nodename=$(sed -n '1p' -- "$hostfile")
dist_url=$(echo "$nodename" | awk '{print $1}')

command -v mpirun

# Expose the conda env's native libs to the launched processes.
# NOTE(review): users must replace this placeholder with their own env path.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:/path/of/conda/envs/{env_name}/lib
export PYTHON=python3

# -np : total number of MPI ranks (computed above)
# -x  : forward the named environment variables into single_process.sh
mpirun -np "$np" --allow-run-as-root --hostfile "$hostfile" --bind-to none \
  -x dist_url -x PYTHON "$(pwd)/single_process.sh"

echo "END TIME: $(date)"
#!/bin/bash
# Per-rank environment setup for single_process.sh (run once per MPI rank,
# launched by run_train_multi.sh via mpirun).
# Enable NCCL debug logging.
export NCCL_DEBUG=INFO
export NCCL_NET_PLUGIN=none
# NOTE(review): the next two lines each contain a duplicated statement — an
# artifact of the side-by-side diff render; harmless but should be cleaned up.
export HSA_FORCE_FINE_GRAIN_PCIE=1 export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1 export USE_MIOPEN_BATCHNORM=1
export NCCL_P2P_LEVEL=5
# World size is provided by Open MPI's per-job environment.
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export NCCL_PLUGIN_P2P=ucx
export NCCL_SHM_DISABLE=1
export NCCL_IB_DISABLE=0  # 0 = InfiniBand ENABLED. NOTE(review): original comment said "do not use ib", which contradicts the value 0 — confirm intent.
export NCCL_IB_HCA=mlx5_0
export NCCL_CROSS_NIC=1
export RCCL_NCHANNELS=4
# dist_url is forwarded from the launcher via `mpirun -x dist_url`.
export MASTER_ADDR=$dist_url
export MASTER_PORT=4321
export RANK=$OMPI_COMM_WORLD_RANK
# Local (per-node) rank, 0..3 — selects the NUMA node / NIC binding below.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
DATA_PATH=/home/datasets DATA_PATH=/home/datasets
name=painter_vit_large name=painter_vit_large
python -m torch.distributed.launch --nproc_per_node=8 \ APP="python3 -u main_train.py \
--nnodes=${WORLD_SIZE} --node_rank=$RANK \
--master_addr=$MASTER_ADDR --master_port=12358 \
--use_env main_train.py \
--batch_size 2 \ --batch_size 2 \
--accum_iter 16 \ --accum_iter 16 \
--model painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1 \ --model painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1 \
...@@ -42,6 +57,32 @@ python -m torch.distributed.launch --nproc_per_node=8 \ ...@@ -42,6 +57,32 @@ python -m torch.distributed.launch --nproc_per_node=8 \
$DATA_PATH/light_enhance/enhance_lol_val.json \ $DATA_PATH/light_enhance/enhance_lol_val.json \
--output_dir models/$name \ --output_dir models/$name \
--log_dir models/$name/logs \ --log_dir models/$name/logs \
--finetune path/to/mae_pretrain_vit_large.pth \ --finetune path/to/mae_pretrain_vit_large.pth
# --log_wandb \ "
# Bind this local rank to one NUMA domain and its matching Mellanox NIC.
# Every rank sees all four GPUs (HIP_VISIBLE_DEVICES=0,1,2,3); rank i
# (i = lrank mod 4) uses NIC mlx5_i and pins CPU/memory to NUMA node i.
# Replaces four copy-pasted case arms (differing only in the index) and the
# deprecated `expr` with builtin arithmetic — identical resulting environment
# and command line for lrank in 0..3.
# NOTE(review): assumes 4 ranks per node and NICs named mlx5_0..mlx5_3 —
# confirm against the actual cluster topology.
numa=$(( lrank % 4 ))
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES="mlx5_${numa}:1"
export UCX_IB_PCI_BW="mlx5_${numa}:50Gbs"
# ${APP} is intentionally unquoted: it is a whitespace-separated command line
# assembled earlier in this script and must word-split into argv.
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="$numa" --membind="$numa" ${APP}
...@@ -43,4 +43,3 @@ python -m torch.distributed.launch --nproc_per_node=4 \ ...@@ -43,4 +43,3 @@ python -m torch.distributed.launch --nproc_per_node=4 \
--log_dir models/$name/logs \ --log_dir models/$name/logs \
--finetune path/to/mae_pretrain_vit_large.pth \ --finetune path/to/mae_pretrain_vit_large.pth \
# --log_wandb \ # --log_wandb \
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment