Modify DeepSpeed multi-node examples

cc7fcbbb · chenych · ab643c4f · cc7fcbbb · ab643c4f · cc7fcbbb
Commit cc7fcbbb authored Aug 28, 2024 by chenych
7 changed files
--- a/llama-factory/examples/full_multi_gpu/70B/.deepspeed_env
+++ b/llama-factory/examples/full_multi_gpu/70B/.deepspeed_env
--- a/llama-factory/examples/full_multi_gpu/70B/hostfile
+++ b/llama-factory/examples/full_multi_gpu/70B/hostfile
-10.5.32.245 slots=8
-10.5.32.246 slots=8
\ No newline at end of file
--- a/llama-factory/examples/full_multi_gpu/70B/multi_node_deepspeed.sh
+++ b/llama-factory/examples/full_multi_gpu/70B/multi_node_deepspeed.sh
@@ -2,8 +2,9 @@
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 MASTER_ADDR=''
 # 多机多卡+deepspeed
-deepspeed --hostfile=./hostfile \
+deepspeed --hostfile=/path/of/hostfile \
    --num_nodes 2 \
    --master_addr $MASTER_ADDR \
    --master_port 12345 \
@@ -20,10 +21,10 @@ deepspeed --hostfile=./hostfile \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 8192 \
-    --preprocessing_num_workers 1 \
+    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
+    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \

--- a/llama-factory/examples/lora_multi_gpu/70B/.deepspeed_env
+++ b/llama-factory/examples/lora_multi_gpu/70B/.deepspeed_env
-NCCL_SOCKET_IFNAME=ens38f0
-NCCL_IB_DISABLE=1
-HSA_FORCE_FINE_GRAIN_PCIE=1
-MIOPEN_COMPILE_PARALLEL_LEVEL=1
-NCCL_PATH=/opt/dtk/rccl
-NCCL_DEBUG=DEBUG
--- a/llama-factory/examples/lora_multi_gpu/70B/hostfile
+++ b/llama-factory/examples/lora_multi_gpu/70B/hostfile
-10.5.32.245 slots=8
-10.5.32.246 slots=8
\ No newline at end of file
--- a/llama-factory/examples/lora_multi_gpu/70B/multi_node_deepspeed.sh
+++ b/llama-factory/examples/lora_multi_gpu/70B/multi_node_deepspeed.sh
@@ -6,7 +6,7 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
 MASTER_ADDR=''
 # LoRA + 多机多卡 + deepspeed
-deepspeed --hostfile=./hostfile \
+deepspeed --hostfile=/path/of/hostfile \
    --num_nodes 2 \
    --master_addr $MASTER_ADDR \
    --master_port 12345 \

--- a/llama-factory/hostfile
+++ b/llama-factory/hostfile
+node1 slots=8
+node2 slots=8
\ No newline at end of file