Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ChatGLM-6B_pytorch
Commits
eed209e7
Commit eed209e7 authored Sep 21, 2023 by zhaoying1
Browse files
调整为标准格式
parent
4fae534d
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing 4 changed files with 59 additions and 19 deletions
+59
-19
README.md
README.md
+1
-1
ptuning/multi_nodel/run_train.sh
ptuning/multi_nodel/run_train.sh
+25
-0
ptuning/multi_nodel/run_train_single.sh
ptuning/multi_nodel/run_train_single.sh
+33
-10
ptuning/slurm_scripts/run.sh
ptuning/slurm_scripts/run.sh
+0
-8
No files found.
README.md
View file @
eed209e7
...
@@ -118,7 +118,7 @@ Hugging Face模型下载地址:
...
@@ -118,7 +118,7 @@ Hugging Face模型下载地址:
#### 集群训练
#### 集群训练
```
```
cd ptuning/slurm_scripts
cd ptuning/slurm_scripts
bash run.sh
bash run_train.sh
```
```
注意:请根据自己的需求配置其中的模型路径、数据集路径、batchsize、学习率等参数;
注意:请根据自己的需求配置其中的模型路径、数据集路径、batchsize、学习率等参数;
...
...
ptuning/slurm_scripts/run_train.sh → ptuning/multi_nodel/run_train.sh
View file @
eed209e7
#!/bin/bash
#SBATCH -p kshdnormal01
#SBATCH -N 4
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --mem 100G
#SBATCH --gres=dcu:4
#SBATCH -J chatglm
#SBATCH -o logs/pt-%j.out
#SBATCH -e logs/pt-%j.err
ulimit
-u
200000
ulimit
-u
200000
export
OMP_NUM_THREADS
=
1
export
OMP_NUM_THREADS
=
1
...
@@ -19,23 +9,17 @@ export NCCL_PLUGIN_P2P=ucx
...
@@ -19,23 +9,17 @@ export NCCL_PLUGIN_P2P=ucx
export
NCCL_SOCKET_IFNAME
=
ib0
export
NCCL_SOCKET_IFNAME
=
ib0
export
NCCL_P2P_LEVEL
=
5
export
NCCL_P2P_LEVEL
=
5
export
NCCL_NET_PLUGIN
=
none
export
NCCL_NET_PLUGIN
=
none
unset
RCCL_NCHANNELS
unset
NCCL_NET_GDR_LEVEL
rm
-rf
./hostfile/
*
echo
"START TIME:
$(
date
)
"
hostfile
=
./hostfile/
$SLURM_JOB_ID
scontrol show hostnames
$SLURM_JOB_NODELIST
>
${
hostfile
}
for
i
in
`
cat
$hostfile
`
echo
"START TIME:
$(
date
)
"
do
hostfile
=
./hostfile
echo
${
i
}
slots
=
4
>>
`
pwd
`
/hostfile/hostfile-dl-
$SLURM_JOB_ID
done
np
=
$(
cat
$hostfile
|sort|uniq |wc
-l
)
np
=
$(
cat
$hostfile
|sort|uniq |wc
-l
)
np
=
$((
$np
*
4
))
np
=
$((
$np
*
8
))
nodename
=
$(
cat
$hostfile
|sed
-n
"1p"
)
nodename
=
$(
cat
$hostfile
|sed
-n
"1p"
)
dist_url
=
`
echo
$nodename
|
awk
'{print $1}'
`
dist_url
=
`
echo
$nodename
|
awk
'{print $1}'
`
echo
${
dist_url
}
which mpirun
mpirun
-np
$np
--
hostfile
hostfile
/
hostfile
-dl-
$SLURM_JOB_ID
--bind-to
none
`
pwd
`
/
run_train_single.sh
$dist_url
mpirun
-np
$np
--
allow-run-as-root
--
hostfile
hostfile
--bind-to
none
--mca
btl_tcp_if_include
$dist_url
run_train_single.sh
echo
"END TIME:
$(
date
)
"
ptuning/slurm_scripts/run_train_single.sh → ptuning/multi_nodel/run_train_single.sh
View file @
eed209e7
...
@@ -4,15 +4,12 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
...
@@ -4,15 +4,12 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export
MIOPEN_FIND_MODE
=
3
export
MIOPEN_FIND_MODE
=
3
export
MIOPEN_COMPILE_PARALLEL_LEVEL
=
1
export
MIOPEN_COMPILE_PARALLEL_LEVEL
=
1
export
NCCL_PLUGIN_P2P
=
ucx
export
NCCL_PLUGIN_P2P
=
ucx
export
RCCL_NCHANNELS
=
2
export
NCCL_SOCKET_IFNAME
=
ib0
export
NCCL_SOCKET_IFNAME
=
ib0
export
NCCL_P2P_LEVEL
=
5
export
NCCL_P2P_LEVEL
=
5
export
NCCL_IB_HCA
=
mlx5_0
export
NCCL_IB_HCA
=
mlx5_0
export
NCCL_DEBUG
=
INFO
export
NCCL_DEBUG
=
INFO
export
NCCL_NET_GDR_LEVEL
=
SYS
export
NCCL_NET_PLUGIN
=
none
export
NCCL_NET_PLUGIN
=
none
unset
RCCL_NCHANNELS
unset
NCCL_NET_GDR_LEVEL
lrank
=
$OMPI_COMM_WORLD_LOCAL_RANK
lrank
=
$OMPI_COMM_WORLD_LOCAL_RANK
echo
"LRANK===============================
$lrank
"
echo
"LRANK===============================
$lrank
"
RANK
=
$OMPI_COMM_WORLD_RANK
RANK
=
$OMPI_COMM_WORLD_RANK
...
@@ -42,29 +39,55 @@ APP="python3 ../main.py \
...
@@ -42,29 +39,55 @@ APP="python3 ../main.py \
--fp16
\
--fp16
\
--local_rank
$lrank
"
--local_rank
$lrank
"
case
${
lrank
}
in
case
${
lrank
}
in
[
0]
)
[
0]
)
export
HIP_VISIBLE_DEVICES
=
0,1,2,3
export
HIP_VISIBLE_DEVICES
=
0,1,2,3
,4,5,6,7
export
UCX_NET_DEVICES
=
mlx5_0:1
export
UCX_NET_DEVICES
=
mlx5_0:1
export
UCX_IB_PCI_BW
=
mlx5_0:50Gbs
export
UCX_IB_PCI_BW
=
mlx5_0:50Gbs
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
;;
;;
[
1]
)
[
1]
)
export
HIP_VISIBLE_DEVICES
=
0,1,2,3
export
HIP_VISIBLE_DEVICES
=
0,1,2,3
,4,5,6,7
export
UCX_NET_DEVICES
=
mlx5_1:1
export
UCX_NET_DEVICES
=
mlx5_1:1
export
UCX_IB_PCI_BW
=
mlx5_1:50Gbs
export
UCX_IB_PCI_BW
=
mlx5_1:50Gbs
numactl
--cpunodebind
=
1
--membind
=
1
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
;;
;;
[
2]
)
[
2]
)
export
HIP_VISIBLE_DEVICES
=
0,1,2,3
export
HIP_VISIBLE_DEVICES
=
0,1,2,3
,4,5,6,7
export
UCX_NET_DEVICES
=
mlx5_2:1
export
UCX_NET_DEVICES
=
mlx5_2:1
export
UCX_IB_PCI_BW
=
mlx5_2:50Gbs
export
UCX_IB_PCI_BW
=
mlx5_2:50Gbs
numactl
--cpunodebind
=
2
--membind
=
2
${
APP
}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
;;
;;
[
3]
)
[
3]
)
export
HIP_VISIBLE_DEVICES
=
0,1,2,3
export
HIP_VISIBLE_DEVICES
=
0,1,2,3
,4,5,6,7
export
UCX_NET_DEVICES
=
mlx5_3:1
export
UCX_NET_DEVICES
=
mlx5_3:1
export
UCX_IB_PCI_BW
=
mlx5_3:50Gbs
export
UCX_IB_PCI_BW
=
mlx5_3:50Gbs
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
;;
[
4]
)
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
UCX_NET_DEVICES
=
mlx5_4:1
export
UCX_IB_PCI_BW
=
mlx5_4:50Gbs
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
;;
[
5]
)
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
UCX_NET_DEVICES
=
mlx5_5:1
export
UCX_IB_PCI_BW
=
mlx5_5:50Gbs
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
;;
;;
[
6]
)
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
UCX_NET_DEVICES
=
mlx5_6:1
export
UCX_IB_PCI_BW
=
mlx5_6:50Gbs
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
;;
[
7]
)
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
UCX_NET_DEVICES
=
mlx5_7:1
export
UCX_IB_PCI_BW
=
mlx5_7:50Gbs
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
;;
esac
esac
ptuning/slurm_scripts/run.sh
deleted
100644 → 0
View file @
4fae534d
#/bin/bash
mkdir
-p
logs
#rm -rf log/*
mkdir
-p
pt_output
mkdir
-p
hostfile
sbatch run_train.sh
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment