"vscode:/vscode.git/clone" did not exist on "3c85a57297b22df8921bae39c0a2e3982ee69de7"
Commit fdb051c1 authored by zhaoying1
Browse files

Update run_train_single.sh

parent 410643b6
...@@ -14,7 +14,7 @@ lrank=$OMPI_COMM_WORLD_LOCAL_RANK ...@@ -14,7 +14,7 @@ lrank=$OMPI_COMM_WORLD_LOCAL_RANK
echo "LRANK===============================$lrank" echo "LRANK===============================$lrank"
RANK=$OMPI_COMM_WORLD_RANK RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export HIP_VISIBLE_DEVICES=0,1,2,3
LR=1e-5 LR=1e-5
APP="python3 ../main.py \ APP="python3 ../main.py \
...@@ -44,50 +44,35 @@ APP="python3 ../main.py \ ...@@ -44,50 +44,35 @@ APP="python3 ../main.py \
case ${lrank} in case ${lrank} in
[0]) [0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP} numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_1:1 numactl --cpunodebind=1 --membind=1 ${APP}
export UCX_IB_PCI_BW=mlx5_1:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[2]) [2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_2:1 numactl --cpunodebind=2 --membind=2 ${APP}
export UCX_IB_PCI_BW=mlx5_2:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[3]) [3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_3:1 numactl --cpunodebind=3 --membind=3 ${APP}
export UCX_IB_PCI_BW=mlx5_3:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[4]) [4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_4:1 numactl --cpunodebind=4 --membind=4 ${APP}
export UCX_IB_PCI_BW=mlx5_4:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[5]) [5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_5:1 numactl --cpunodebind=5 --membind=5 ${APP}
export UCX_IB_PCI_BW=mlx5_5:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[6]) [6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_6:1 numactl --cpunodebind=6 --membind=6 ${APP}
export UCX_IB_PCI_BW=mlx5_6:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;; ;;
[7]) [7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_7:1 numactl --cpunodebind=7 --membind=7 ${APP}
export UCX_IB_PCI_BW=mlx5_7:50Gbs ;;
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac esac
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment