Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
Baichuan_pytorch
Commits
1af723f8
Commit
1af723f8
authored
Feb 27, 2024
by
zhaoying1
Browse files
update multinode run
parent
9e75f8e2
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
59 additions
and
145 deletions
+59
-145
multi_node/env.sh
multi_node/env.sh
+0
-50
multi_node/run-13b-sft-single.sh
multi_node/run-13b-sft-single.sh
+16
-19
multi_node/run-13b-sft.sh
multi_node/run-13b-sft.sh
+3
-5
multi_node/run-7b-sft-lora-single.sh
multi_node/run-7b-sft-lora-single.sh
+17
-20
multi_node/run-7b-sft-lora.sh
multi_node/run-7b-sft-lora.sh
+3
-5
slurm_script/run-13b-sft-single.sh
slurm_script/run-13b-sft-single.sh
+6
-6
slurm_script/run-13b-sft.sh
slurm_script/run-13b-sft.sh
+1
-14
slurm_script/run-7b-sft-lora-single.sh
slurm_script/run-7b-sft-lora-single.sh
+12
-10
slurm_script/run-7b-sft-lora.sh
slurm_script/run-7b-sft-lora.sh
+1
-16
No files found.
multi_node/env.sh
deleted
100644 → 0
View file @
9e75f8e2
#!/bin/bash
export
ROCM_PATH
=
/opt/dtk-23.04
export
ROCM_SOURCE_DIR
=
${
ROCM_PATH
}
echo
$ROCM_PATH
export
HIP_PATH
=
${
ROCM_PATH
}
/hip
export
AMDGPU_TARGETS
=
"gfx900;gfx906"
export
PATH
=
${
ROCM_PATH
}
/bin:
${
ROCM_PATH
}
/llvm/bin:
${
ROCM_PATH
}
/hcc/bin:
${
ROCM_PATH
}
/hip/bin:
$PATH
export
LD_LIBRARY_PATH
=
${
ROCM_PATH
}
/lib:
${
ROCM_PATH
}
/lib64:
$LD_LIBRARY_PATH
export
LD_LIBRARY_PATH
=
${
ROCM_PATH
}
/hip/lib:
${
ROCM_PATH
}
/llvm/lib:
${
ROCM_PATH
}
/opencl/lib/x86_64:
$LD_LIBRARY_PATH
export
C_INCLUDE_PATH
=
${
ROCM_PATH
}
/include:
${
ROCM_PATH
}
/hip/include/hip:
${
ROCM_PATH
}
/llvm/include:/opencl/include:
${
ROCM_PATH
}
/include/rocrand:
${
ROCM_PATH
}
/include/hiprand
export
CPLUS_INCLUDE_PATH
=
${
ROCM_PATH
}
/include:
${
ROCM_PATH
}
/hip/include/hip:
${
ROCM_PATH
}
/llvm/include:/opencl/include:
${
ROCM_PATH
}
/include/rocrand:
${
ROCM_PATH
}
/include/hiprand
export
PATH
=
${
ROCM_PATH
}
/miopen/bin:
${
ROCM_PATH
}
/rocblas/bin:
${
ROCM_PATH
}
/hipsparse/bin:
$PATH
export
LD_LIBRARY_PATH
=
${
ROCM_PATH
}
/miopen/lib:
${
ROCM_PATH
}
/rocblas/lib:
$LD_LIBRARY_PATH
export
MIOPEN_SYSTEM_DB_PATH
=
${
ROCM_PATH
}
/miopen/share/miopen/db/
export
LD_LIBRARY_PATH
=
/usr/lib64:
$LD_LIBRARY_PATH
export
LIBRARY_PATH
=
/usr/lib64:
$LIBRARY_PATH
export
RCCL_PATH
=
$ROCM_PATH
/rccl
export
NCCL_PATH
=
$ROCM_PATH
/rccl
export
LD_LIBRARY_PATH
=
$RCCL_PATH
/lib:
$LD_LIBRARY_PATH
export
MIOPEN_FIND_MODE
=
3
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
NCCL_P2P_LEVEL
=
5
export
NCCL_GDR_FLUSH_DISABLE
=
1
export
NCCL_NET_GDR_LEVEL
=
SYS
export
RCCL_NCHANNELS
=
2
export
NCCL_IB_HCA
=
mlx5
export
NCCL_SOCKET_IFNAME
=
ib0
export
NCCL_DEBUG
=
INFO
export
MIOPEN_FIND_MODE
=
3
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
MIOPEN_COMPILE_PARALLEL_LEVEL
=
1
export
NCCL_PLUGIN_P2P
=
ucx
export
HIP_CLANG_PATH
=
/opt/dtk-23.04/llvm/bin
export
HSA_PATH
=
/opt/dtk-23.04/hsa
export
AOMP
=
/opt/dtk-23.04/llvm
export
LD_LIBRARY_PATH
=
/opt/dtk-23.04/rccl/lib:/usr/lib64:/opt/dtk-23.04/miopen/lib:/opt/dtk-23.04/rocblas/lib:/opt/dtk-23.04/hip/lib:/opt/dtk-23.04/llvm/lib:/opt/dtk-23.04/opencl/lib/x86_64:/opt/dtk-23.04/lib:/opt/dtk-23.04/lib64:/opt/dtk-23.04/rccl/lib:/usr/lib64:/opt/dtk-23.04/miopen/lib:/opt/dtk-23.04/rocblas/lib:/opt/dtk-23.04/hip/lib:/opt/dtk-23.04/llvm/lib:/opt/dtk-23.04/opencl/lib/x86_64:/opt/dtk-23.04/lib:/opt/dtk-23.04/lib64:/opt/dtk-23.04/roctracer/lib:/opt/dtk-23.04/rocthrust/lib:/opt/dtk-23.04/rocsparse/lib:/opt/dtk-23.04/rocsolver/lib:/opt/dtk-23.04/rocrand/lib:/opt/dtk-23.04/rocprofiler/lib:/opt/dtk-23.04/rocprim/lib:/opt/dtk-23.04/dtk-23.04_smi/lib:/opt/dtk-23.04/rocfft/lib:/opt/dtk-23.04/rocblas/lib:/opt/dtk-23.04/rocalution/lib:/opt/dtk-23.04/rccl/lib:/opt/dtk-23.04/opencl/lib:/opt/dtk-23.04/oam/lib:/opt/dtk-23.04/migraphx/lib:/opt/dtk-23.04/miopengemm/lib:/opt/dtk-23.04/miopen/lib:/opt/dtk-23.04/llvm/lib-debug/src/openmp/libomptarget/plugins/remote/lib:/opt/dtk-23.04/llvm/lib/clang/14.0.0/lib:/opt/dtk-23.04/llvm/lib:/opt/dtk-23.04/hsa/lib:/opt/dtk-23.04/hipsparse/lib:/opt/dtk-23.04/hipsolver/lib:/opt/dtk-23.04/hiprand/lib:/opt/dtk-23.04/hipfft/lib:/opt/dtk-23.04/hipcub/lib:/opt/dtk-23.04/hipblas-clients/lib:/opt/dtk-23.04/hipblas/lib:/opt/dtk-23.04/hip/lib:/opt/dtk-23.04/lib:/opt/dtk-23.04/lib64:/opt/mpi/lib:/usr/local/lib/:/usr/local/lib64/:/usr/lib64/
export
PATH
=
/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/rocblas/bin:/opt/dtk-23.04/hipsparse/bin:/opt/dtk-23.04/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/hcc/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/rocblas/bin:/opt/dtk-23.04/hipsparse/bin:/opt/dtk-23.04/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/hcc/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/libexec/rocprofiler:/opt/dtk-23.04/libexec/dtk-23.04_smi:/opt/dtk-23.04/rocprofiler/bin:/opt/dtk-23.04/opencl/bin:/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/llvm/lib/clang/14.0.0/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/bin:/opt/mpi/bin:/root/anaconda3/bin:/root/anaconda3/condabin:/usr/lib64/qt-3.3/bin:/root/perl5/bin:/opt/dtk-23.04/bin:/opt/dtk-23.04/hip/bin:/opt/dtk-23.04/llvm/bin:/opt/dtk-23.04/llvm/lib/clang/14.0.0/bin:/opt/dtk-23.04/miopen/bin:/opt/dtk-23.04/opencl/bin:/opt/dtk-23.04/rocprofiler/bin:/opt/dtk-23.04/libexec/dtk-23.04_smi:/opt/dtk-23.04/libexec/rocprofiler:/opt/rh/devtoolset-7/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin
export
ROCM_ROOT
=
/opt/dtk-23.04
export
ROCBLAS_TENSILE_LIBPATH
=
/opt/dtk-23.04/lib/rocblas/library
export
HIP_ROCCLR_HOME
=
/opt/dtk-23.04/hip
export
HIP_LIB_PATH
=
/opt/dtk-23.04/hip/lib
export
DEVICE_LIB_PATH
=
/opt/dtk-23.04/amdgcn/bitcode
multi_node/run-13b-sft-single.sh
View file @
1af723f8
#!/bin/bash
source
env.sh
GPUS
=
$1
string
=
""
...
...
@@ -8,8 +7,14 @@ for ((i=0; i<$GPUS; i++)); do
string
=
"
$string$i
,"
done
string
=
${
string
%
","
}
export
HIP_VISIBLE_DEVICES
=
$string
# echo "$HIP_VISIBLE_DEVICES"
export
MASTER_ADDR
=
${
2
}
export
WORLD_SIZE
=
$OMPI_COMM_WORLD_SIZE
export
RANK
=
$OMPI_COMM_WORLD_RANK
local_rank
=
$OMPI_COMM_WORLD_LOCAL_RANK
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
LOCAL_RANK
=
$OMPI_COMM_WORLD_LOCAL_RANK
export
MASTER_PORT
=
12365
export
OMP_NUM_THREADS
=
1
APP
=
"python3 ../src/train_bash.py --stage sft
\
...
...
@@ -31,46 +36,38 @@ APP="python3 ../src/train_bash.py --stage sft \
--fp16
\
--deepspeed deepspeed.json
"
local_rank
=
$OMPI_COMM_WORLD_LOCAL_RANK
case
${
local_rank
}
in
[
0]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
;;
[
1]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
1
--membind
=
1
${
APP
}
;;
[
2]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
2
--membind
=
2
${
APP
}
;;
[
3]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
;;
[
4]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
4
--membind
=
4
${
APP
}
;;
[
5]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
5
--membind
=
5
${
APP
}
;;
[
6]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
6
--membind
=
6
${
APP
}
;;
[
7]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
7
--membind
=
7
${
APP
}
;;
esac
multi_node/run-13b-sft.sh
View file @
1af723f8
source
env.sh
echo
"START TIME:
$(
date
)
"
hostfile
=
./hostfile
np
=
$(
cat
$hostfile
|sort|uniq |wc
-l
)
np
=
$((
$np
*
8
))
nodename
=
$(
cat
$hostfile
|sed
-n
"1p"
)
dist_url
=
`
echo
$nodename
|
awk
'{print $1}'
`
which mpirun
mpirun
-np
$np
--allow-run-as-root
--hostfile
hostfile
--bind-to
none
--mca
btl_tcp_if_include enp97s0f1 run-13b-sft-single.sh 8
mpirun
-np
$np
--allow-run-as-root
--hostfile
./hostfile1
-mca
plm_rsh_args
"-p 2345"
-mca
btl ^openib run-13b-sft-single.sh 8
$dist_url
echo
"END TIME:
$(
date
)
"
...
...
multi_node/run-7b-sft-lora-single.sh
View file @
1af723f8
#!/bin/bash
source
env.sh
GPUS
=
$1
string
=
""
...
...
@@ -8,8 +7,14 @@ for ((i=0; i<$GPUS; i++)); do
string
=
"
$string$i
,"
done
string
=
${
string
%
","
}
export
HIP_VISIBLE_DEVICES
=
$string
# echo "$HIP_VISIBLE_DEVICES"\
export
MASTER_ADDR
=
${
2
}
export
WORLD_SIZE
=
$OMPI_COMM_WORLD_SIZE
export
RANK
=
$OMPI_COMM_WORLD_RANK
local_rank
=
$OMPI_COMM_WORLD_LOCAL_RANK
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
LOCAL_RANK
=
$OMPI_COMM_WORLD_LOCAL_RANK
export
MASTER_PORT
=
12365
export
OMP_NUM_THREADS
=
1
...
...
@@ -40,48 +45,40 @@ APP="python3 ../src/train_bash.py --stage sft \
--fp16
\
--deepspeed deepspeed.json
"
local_rank
=
$OMPI_COMM_WORLD_LOCAL_RANK
echo
$local_rank
case
${
local_rank
}
in
[
0]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
;;
[
1]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
1
--membind
=
1
${
APP
}
;;
[
2]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
2
--membind
=
2
${
APP
}
;;
[
3]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
;;
[
4]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
4
--membind
=
4
${
APP
}
;;
[
5]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
5
--membind
=
5
${
APP
}
;;
[
6]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
6
--membind
=
6
${
APP
}
;;
[
7]
)
export
HIP_VISIBLE_DEVICES
=
$string
echo
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
7
--membind
=
7
${
APP
}
;;
esac
multi_node/run-7b-sft-lora.sh
View file @
1af723f8
source
env.sh
echo
"START TIME:
$(
date
)
"
hostfile
=
./hostfile
np
=
$(
cat
$hostfile
|sort|uniq |wc
-l
)
np
=
$((
$np
*
8
))
nodename
=
$(
cat
$hostfile
|sed
-n
"1p"
)
dist_url
=
`
echo
$nodename
|
awk
'{print $1}'
`
which mpirun
mpirun
-np
$np
--allow-run-as-root
--hostfile
hostfile
--bind-to
none
--mca
btl_tcp_if_include enp97s0f1
`
pwd
`
/run-7b-single-lora.sh 8
mpirun
-np
$np
--allow-run-as-root
--hostfile
./hostfile1
-mca
plm_rsh_args
"-p 2345"
-mca
btl ^openib run-7b-single-lora.sh 8
$dist_url
echo
"END TIME:
$(
date
)
"
slurm_script/run-13b-sft-single.sh
View file @
1af723f8
...
...
@@ -5,17 +5,17 @@ export MIOPEN_FIND_MODE=3
export
MIOPEN_COMPILE_PARALLEL_LEVEL
=
1
export
NCCL_PLUGIN_P2P
=
ucx
export
RCCL_NCHANNELS
=
2
export
NCCL_SOCKET_IFNAME
=
ib0
export
NCCL_P2P_LEVEL
=
5
export
NCCL_IB_HCA
=
mlx5_0
#0号网卡
export
MASTER_ADDR
=
${
1
}
lrank
=
$OMPI_COMM_WORLD_LOCAL_RANK
echo
"LRANK===============================
$lrank
"
RANK
=
$OMPI_COMM_WORLD_RANK
WORLD_SIZE
=
$OMPI_COMM_WORLD_SIZE
export
LOCAL_RANK
=
$OMPI_COMM_WORLD_LOCAL_RANK
export
RANK
=
$OMPI_COMM_WORLD_RANK
export
WORLD_SIZE
=
$OMPI_COMM_WORLD_SIZE
export
MASTER_PORT
=
12365
export
NCCL_IB_HCA
=
mlx5_0
#0号网卡
APP
=
"python3 ../src/train_bash.py --stage sft
\
...
...
slurm_script/run-13b-sft.sh
View file @
1af723f8
...
...
@@ -8,19 +8,6 @@
#SBATCH -o logs-13B/baichuan-ft-%j.out
#SBATCH -e logs-13B/baichuan-ft-%j.err
ulimit
-u
200000
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
OMP_NUM_THREADS
=
1
export
NCCL_DEBUG
=
INFO
# export NCCL_DEBUG_SUBSYS=ALL
export
MIOPEN_FIND_MODE
=
3
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
MIOPEN_COMPILE_PARALLEL_LEVEL
=
1
export
NCCL_PLUGIN_P2P
=
ucx
export
NCCL_SOCKET_IFNAME
=
ib0
export
NCCL_P2P_LEVEL
=
5
echo
"START TIME:
$(
date
)
"
hostfile
=
./hostfile/
$SLURM_JOB_ID
...
...
@@ -35,5 +22,5 @@ np=$(($np*4))
nodename
=
$(
cat
$hostfile
|sed
-n
"1p"
)
dist_url
=
`
echo
$nodename
|
awk
'{print $1}'
`
mpirun
-np
$np
--allow-run-as-root
--hostfile
hostfile/hostfile-dl-
$SLURM_JOB_ID
--bind-to
none
`
pwd
`
/run-13b-sft-single.sh
mpirun
-np
$np
--allow-run-as-root
--hostfile
hostfile/hostfile-dl-
$SLURM_JOB_ID
--bind-to
none
`
pwd
`
/run-13b-sft-single.sh
$dist_url
slurm_script/run-7b-sft-lora-single.sh
View file @
1af723f8
#!/bin/bash
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
MIOPEN_FIND_MODE
=
3
export
GPU_MAX_HW_QUEUES
=
16
export
MIOPEN_COMPILE_PARALLEL_LEVEL
=
1
export
NCCL_PLUGIN_P2P
=
ucx
export
RCCL_NCHANNELS
=
2
export
NCCL_SOCKET_IFNAME
=
ib0
export
NCCL_P2P_LEVEL
=
5
export
NCCL_IB_HCA
=
mlx5_0
#0号网卡
export
MASTER_ADDR
=
${
1
}
lrank
=
$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank
=
$OMPI_COMM_WORLD_RANK
comm_size
=
$OMPI_COMM_WORLD_SIZE
export
LOCAL_RANK
=
$OMPI_COMM_WORLD_LOCAL_RANK
export
RANK
=
$comm_rank
export
WORLD_SIZE
=
$comm_size
export
MASTER_ADDR
=
$1
export
MASTER_PORT
=
29500
export
NCCL_IB_HCA
=
mlx5
export
NCCL_SOCKET_IFNAME
=
ib0
export
HIP_DIRECT_DISPATCH
=
0
export
RANK
=
$OMPI_COMM_WORLD_RANK
export
WORLD_SIZE
=
$OMPI_COMM_WORLD_SIZE
export
MASTER_PORT
=
12365
APP
=
"python3 ../src/train_bash.py --stage sft
\
...
...
slurm_script/run-7b-sft-lora.sh
View file @
1af723f8
...
...
@@ -10,21 +10,6 @@
#SBATCH --exclusive
ulimit
-s
unlimited
export
HIP_VISIBLE_DEVICES
=
0,1,2,3
export
MIOPEN_FIND_MODE
=
3
export
MIOPEN_DEBUG_CONV_IMPLICIT_GEMM
=
0
export
MIOPEN_USER_DB_PATH
=
/tmp/miopen-udb
export
MIOPEN_CUSTOM_CACHE_DIR
=
/tmp/miopen-cache
export
NCCL_SOCKET_IFNAME
=
ib0
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
OMP_NUM_THREADS
=
1
export
NCCL_IB_HCA
=
mlx5
export
NCCL_DEBUG
=
INFO
export
MIOPEN_COMPILE_PARALLEL_LEVEL
=
1
export
NCCL_PLUGIN_P2P
=
ucx
export
NCCL_P2P_LEVEL
=
5
echo
"START TIME:
$(
date
)
"
...
...
@@ -50,5 +35,5 @@ np=$(($np*4))
nodename
=
$(
cat
$hostfile
|sed
-n
"1p"
)
#读取每行节点 第一个是主节点
dist_url
=
`
echo
$nodename
|
awk
'{print $1}'
`
mpirun
-np
$np
--allow-run-as-root
--hostfile
hostfile/hostfile-dl-
$SLURM_JOB_ID
--bind-to
none
`
pwd
`
/run-7b-single-lora.sh
mpirun
-np
$np
--allow-run-as-root
--hostfile
hostfile/hostfile-dl-
$SLURM_JOB_ID
--bind-to
none
`
pwd
`
/run-7b-single-lora.sh
$dist_url
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment