Commit 9e06ecd3 authored by silencealiang
Browse files

Update train_deepseekv3_671B_4nodes.sh

parent 3041681f
Pipeline #2558 failed with stages
in 0 seconds
......@@ -28,12 +28,12 @@ export NCCL_NET_GDR_LEVEL=7
# NCCL/RCCL communication settings exported to the environment of the
# processes launched later in this script.
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
# InfiniBand HCA list given to NCCL: eight mlx5 devices, port 1 on each.
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
# NOTE(review): NCCL_TOPO_FILE is assigned twice in this view; the second
# (relative) path overrides the absolute one. This looks like the old/new
# lines of a diff rendered without +/- markers -- confirm which assignment
# the real script contains.
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="./topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # whether to enable this depends on the actual situation
# Prepend a user-specific rocBLAS install to the dynamic-library search path.
export LD_LIBRARY_PATH=/public/home/xingjl/dependency/rocblas-install-0227/lib:$LD_LIBRARY_PATH
# Take rank and world-size values from the OMPI_COMM_WORLD_* variables
# (presumably set by the Open MPI launcher -- confirm how the script is started).
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -65,9 +65,9 @@ SFT=false
# Training-run options; how they are consumed lies outside this excerpt.
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
# NOTE(review): each of the three path variables below is assigned twice in
# this view, so the later quoted placeholder strings win. This looks like the
# old/new lines of a diff rendered without +/- markers -- confirm which
# assignments the real script contains. "patch to" is presumably a typo for
# "path to"; the placeholders are not usable paths and need replacing with
# real locations before running.
DATASET_PATH=./deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
VALID_DATASET_PATH=./deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
PRETRAIN_CHECKPOINT_PATH=./deepseekv3_dataset
DATASET_PATH="patch to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="patch to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="patch to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=100000000
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment