"vscode:/vscode.git/clone" did not exist on "85f30655283daef327f1e42fe6cdc4436eacf6c0"
Commit 30b15cb1 authored by silencealiang
Browse files

update nccl and launch with binding

parent cffad2c1
...@@ -73,27 +73,11 @@ def unpermute( ...@@ -73,27 +73,11 @@ def unpermute(
### 使用方式 ### 使用方式
在使用时,进入到examples目录下,有相关模型执行脚本,所用数据集请自行下载:https://r0ddbu55vzx.feishu.cn/drive/folder/ZxHHfCoX4lg75td2hTqcmiAin3g 在使用时,进入到examples目录下,有相关模型执行脚本,所用数据集请自行下载:https://r0ddbu55vzx.feishu.cn/drive/folder/ZxHHfCoX4lg75td2hTqcmiAin3g
``` ```
examples examples/
├── deepseek_v3
├── gpt3 ├── gpt3
│ ├── hostfile_gpt_567B ├── llama
│ ├── README.md ├── mixtral
│ ├── run_gpt_567B_1nodes.sh └── qwen
│ ├── run_gpt_567B_multinodes.sh
│ ├── topo-input.xml
│ ├── train_gpt_567B_1nodes.sh
│ └── train_gpt_567B_multinodes.sh
└── mixtral
├── hostfile_mixtral_8x22B
├── hostfile_mixtral_8x7B
├── README.md
├── run_mixtral_8x22B_1nodes.sh
├── run_mixtral_8x22B_multinodes.sh
├── run_mixtral_8x7B_1nodes.sh
├── run_mixtral_8x7B_multinodes.sh
├── topo-input.xml
├── train_mixtral_8x22B_1nodes.sh
├── train_mixtral_8x22B_multinodes.sh
├── train_mixtral_8x7B_1nodes.sh
└── train_mixtral_8x7B_multinodes.sh
``` ```
#!/bin/bash #!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numa_map=(0 1 2 3 4 5 6 7)
LOCAL_RANK=$1 LOCAL_RANK=$1
shift shift
numa_map=(0 1 2 3 4 5 6 7)
NUMA_ID=${numa_map[$LOCAL_RANK]} NUMA_ID=${numa_map[$LOCAL_RANK]}
numactl --cpunodebind=${NUMA_ID} --membind=${NUMA_ID} "$@" numactl --cpunodebind=${NUMA_ID} --membind=${NUMA_ID} "$@"
\ No newline at end of file
...@@ -2,15 +2,23 @@ ...@@ -2,15 +2,23 @@
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR})) MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export NCCL_ALGO=Ring export NCCL_ALGO=Ring
export NCCL_MAX_NCHANNELS=32 export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=32 export NCCL_MIN_NCHANNELS=16
export NCCL_NET_GDR_LEVEL=4 export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_P2P_NCHANNELS=16
export NCCL_MAX_P2P_NCHANNELS=16
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0 export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=shca_0:1,shca_1:1,shca_2:1,shca_3:1 export NCCL_IB_HCA=shca_0:1,shca_1:1,shca_2:1,shca_3:1
export NCCL_TOPO_FILE=${MEGATRON_PATH}/requirements/nccl_zz/topo-input.xml export NCCL_TOPO_FILE=${MEGATRON_PATH}/requirements/nccl_zz/topo-input.xml
export NCCL_IB_PCI_RELAXED_ORDERING=0 export NCCL_IB_PCI_RELAXED_ORDERING=0
export NCCL_PLUGIN_P2P=ucx export NCCL_PLUGIN_P2P=ucx
export NCCL_PXN_DISABLE=0
export NCCL_SOCKET_IFNAME=eno1 export NCCL_SOCKET_IFNAME=eno1
export SHCA_DEBUG_MASK=0
export SHCA_CMR_LOG_LEVEL=1
export SHCA_SHUT_UP_FWB=1
export SHCA_UCT_CQ_SIZE_INC=5
export UCX_RNDV_PUT_FORCE_FLUSH=y
export NCCL_PXN_DISABLE=0
export LD_LIBRARY_PATH=${MEGATRON_PATH}/requirements/nccl_zz/lib-v8:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=${MEGATRON_PATH}/requirements/nccl_zz/lib-v8:$LD_LIBRARY_PATH
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment