Commit a540af5e authored by Rayyyyy

update README to dtk2404

parent a990c13a
@@ -9,30 +9,19 @@ Llama-3 uses a fairly standard decoder-only transformer architecture. Compared with Ll
- Uses grouped-query attention (GQA), masking, and related techniques to help developers obtain excellent performance at minimal energy cost.
- Trains on sequences of 8,192 tokens, with a mask ensuring that self-attention never crosses document boundaries (a minimal sketch of both techniques follows this list).
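Both bullets translate directly into a few lines of attention code. The sketch below is an illustration only, not this repository's implementation: the head counts, shapes, and the `doc_ids` packing scheme are invented for the example.

```python
# Illustrative sketch: grouped-query attention plus a block-diagonal mask
# that stops self-attention at document boundaries. Not the repo's code.
import torch
import torch.nn.functional as F

def gqa_with_doc_mask(q, k, v, doc_ids):
    # q: (B, Hq, T, D); k, v: (B, Hkv, T, D), with Hq a multiple of Hkv
    B, Hq, T, D = q.shape
    group = Hq // k.shape[1]
    # Each group of query heads shares one key/value head (the GQA idea).
    k = k.repeat_interleave(group, dim=1)        # (B, Hq, T, D)
    v = v.repeat_interleave(group, dim=1)
    scores = q @ k.transpose(-2, -1) / D ** 0.5  # (B, Hq, T, T)
    causal = torch.tril(torch.ones(T, T, dtype=torch.bool, device=q.device))
    same_doc = doc_ids[:, :, None] == doc_ids[:, None, :]  # (B, T, T)
    mask = causal[None, None] & same_doc[:, None]          # causal AND same document
    scores = scores.masked_fill(~mask, float("-inf"))
    return F.softmax(scores, dim=-1) @ v

# Toy check: 8 query heads sharing 2 KV heads over two packed documents.
q = torch.randn(1, 8, 6, 16); k = torch.randn(1, 2, 6, 16); v = torch.randn(1, 2, 6, 16)
doc_ids = torch.tensor([[0, 0, 0, 1, 1, 1]])  # tokens 0-2 and 3-5 are different docs
print(gqa_with_doc_mask(q, k, v, doc_ids).shape)  # torch.Size([1, 8, 6, 16])
```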
## Algorithm
<div align=center>
<img src="./doc/method.png"/>
</div>
## Environment Setup
Adjust the `-v` mount paths, `docker_name`, and `imageID` below to match your environment.
-**Note**: the bitsandbytes library is not fully functional on this platform; 4-bit is not yet supported.
+**Note**: the bitsandbytes library is not fully functional on this platform; quantization is not yet supported.
### Docker (Method 1)
```bash
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu22.04-dtk23.10.1-py310
-docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=32G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
+docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
cd /your_code_path/llama3_pytorch
pip install -e .
-pip install deepspeed-0.12.3+gitfe61783.abi0.dtk2310.torch2.1.0a0-cp310-cp310-manylinux2014_x86_64.whl
-pip install bitsandbytes-0.43.0-py3-none-any.whl
-pip install -U xtuner # 0.1.18
-pip install mmengine==0.10.3
```
### Dockerfile (Method 2)
@@ -40,22 +29,16 @@ pip install mmengine==0.10.3
```bash
cd docker
docker build --no-cache -t llama3:latest .
-docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=32G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
+docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
cd /your_code_path/llama3_pytorch
pip install -e .
-pip install deepspeed-0.12.3+gitfe61783.abi0.dtk2310.torch2.1.0a0-cp310-cp310-manylinux2014_x86_64.whl
-pip install bitsandbytes-0.43.0-py3-none-any.whl
-pip install -U xtuner # 0.1.18
-pip install mmengine==0.10.3
```
### Anaconda (Method 3)
The DCU-specific deep-learning libraries this project requires can be downloaded and installed from the [光合](https://developer.hpccube.com/tool/) developer community.
```bash
-DTK driver: dtk23.10.1
+DTK driver: dtk24.04
python: python3.10
torch: 2.1.0
xtuner: 0.1.18
@@ -65,11 +48,6 @@ xtuner: 0.1.18
Other, non-deep-learning dependencies are installed as follows:
```bash
pip install -e .
-pip install deepspeed-0.12.3+gitfe61783.abi0.dtk2310.torch2.1.0a0-cp310-cp310-manylinux2014_x86_64.whl
-pip install bitsandbytes-0.43.0-py3-none-any.whl
-pip install -U xtuner # 0.1.18
-pip install mmengine==0.10.3
```
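Whichever method you use, a quick sanity check (a suggestion beyond the original instructions; on DCU the ROCm/HIP build of PyTorch is exposed through the usual `torch.cuda` API) confirms the environment is usable:

```python
import torch

print(torch.__version__)          # expect 2.1.0, per the version list above
print(torch.cuda.is_available())  # True if the DCU runtime is visible
print(torch.cuda.device_count())  # number of visible DCUs
```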
## Dataset
@@ -82,12 +60,18 @@ pip install mmengine==0.10.3
## Training
### Fine-tuning with xtuner
-1. Download the pretrained model; edit `download_models.py` to select the specific model
+1. Install the training libraries, paying attention to the required versions
+```bash
+pip install deepspeed-0.12.3+das1.0+gita724046.abi0.dtk2404.torch2.1.0-cp310-cp310-manylinux2014_x86_64.whl
+pip install -U xtuner # 0.1.18
+pip install mmengine==0.10.3
+```
+2. Download the pretrained model; edit `download_models.py` to select the specific model
```bash
cd /your_code_path/llama3_pytorch
pip install modelscope
python download_models.py
mv ~/.cache/modelscope/hub/LLM-Research ./
mv ./LLM-Research/* ./
```
2. In [llama3_8b_instruct_qlora_alpaca_e3_M.py](./llama3_8b_instruct_qlora_alpaca_e3_M.py), set `pretrained_model_name_or_path` and `data_path` to the corresponding local paths;
3. Adjust `max_length`, `batch_size`, `accumulative_counts`, `max_epochs`, `lr`, `save_steps`, `evaluation_freq`, and the `r` and `lora_alpha` fields under `model.lora` to fit your hardware and training needs; the default settings are sized for 4×32G DCUs (an illustrative excerpt follows below);
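For orientation, the fields named in steps 2-3 typically sit near the top of the xtuner config file as plain Python variables. The excerpt below is an illustration only: the names follow the steps above, but every value is a placeholder, not this repository's actual defaults.

```python
# Illustrative excerpt of an xtuner config's settings section
# (placeholder values; check llama3_8b_instruct_qlora_alpaca_e3_M.py
# for the real defaults).
pretrained_model_name_or_path = '/your_code_path/llama3_pytorch/Meta-Llama-3-8B-Instruct'
data_path = '/your_data_path/alpaca_data.json'

max_length = 2048          # longest packed sample, in tokens
batch_size = 1             # per-device batch size
accumulative_counts = 16   # gradient accumulation steps
max_epochs = 3
lr = 2e-4
save_steps = 500           # checkpoint interval
evaluation_freq = 500      # how often sample generations are run

# Inside model = dict(..., lora=dict(...)): the LoRA rank and scaling
# r = 64
# lora_alpha = 16
```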
@@ -279,12 +263,7 @@ huggingface-cli download meta-llama/Meta-Llama-3-70B-Instruct --include "origina
│ ├── Meta-Llama-3-70B
│ ├── original
│ ├── consolidated.00.pth
-│ ├── consolidated.01.pth
-│ ├── consolidated.02.pth
-│ ├── consolidated.03.pth
-│ ├── consolidated.04.pth
-│ ├── consolidated.05.pth
-│ ├── consolidated.06.pth
+│ ...
│ ├── consolidated.07.pth
│ ├── params.json
│ └── tokenizer.model
@@ -306,12 +285,7 @@ huggingface-cli download meta-llama/Meta-Llama-3-70B-Instruct --include "origina
│ └── Meta-Llama-3-70B-Instruct
│ ├── original
│ ├── consolidated.00.pth
-│ ├── consolidated.01.pth
-│ ├── consolidated.02.pth
-│ ├── consolidated.03.pth
-│ ├── consolidated.04.pth
-│ ├── consolidated.05.pth
-│ ├── consolidated.06.pth
+│ ...
│ ├── consolidated.07.pth
│ ├── params.json
│ └── tokenizer.model
@@ -338,5 +312,4 @@ huggingface-cli download meta-llama/Meta-Llama-3-70B-Instruct --include "origina
## References
- https://github.com/meta-llama/llama3
- https://github.com/InternLM/xtuner
-- https://github.com/SmartFlowAI/EmoLLM
- https://github.com/meta-llama/llama-recipes
-FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu22.04-dtk23.10.1-py310
\ No newline at end of file
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
\ No newline at end of file
from modelscope import snapshot_download
-model_dir = snapshot_download('LLM-Research/Meta-Llama-3-8B-Instruct')
+# Models available for download:
+# LLM-Research/Meta-Llama-3-8B, LLM-Research/Meta-Llama-3-8B-Instruct
+# LLM-Research/Meta-Llama-3-70B, LLM-Research/Meta-Llama-3-70B-Instruct
+# The example below uses LLM-Research/Meta-Llama-3-8B-Instruct
+model_dir = snapshot_download('LLM-Research/Meta-Llama-3-8B-Instruct', cache_dir="/your_model_save_path/")
+print(model_dir)
\ No newline at end of file
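After `download_models.py` finishes, a quick load check (an optional suggestion beyond the original text; the path is whatever the script printed) verifies the downloaded weights are complete and readable:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "./Meta-Llama-3-8B-Instruct"  # path printed by download_models.py
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype="auto")
print(model.config.num_key_value_heads)  # 8 KV heads -> GQA, as described above
```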
ulimit -u 200000
export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO
export MIOPEN_FIND_MODE=3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export NCCL_NET_PLUGIN=none
echo "START TIME: $(date)"
hostfile=./hostfile
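# The hostfile lists one node per line; `sort | uniq` below collapses duplicates.
# Hypothetical two-node example (hostnames are placeholders):
#   node01
#   node02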
np=$(cat $hostfile | sort | uniq | wc -l)   # number of unique nodes in the hostfile
np=$(($np*8))                               # 8 ranks per node, one per DCU
nodename=$(cat $hostfile | sed -n "1p")     # first node in the hostfile
dist_url=$(echo $nodename | awk '{print $1}')
which mpirun
mpirun -np $np --allow-run-as-root --hostfile $hostfile --bind-to none --mca btl_tcp_if_include $dist_url ./run_train_single.sh
echo "END TIME: $(date)"
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export NCCL_IB_HCA=mlx5_0
export NCCL_DEBUG=INFO
export NCCL_NET_PLUGIN=none
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
echo "LRANK===============================$lrank"
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export HIP_VISIBLE_DEVICES=0,1,2,3  # default; re-exported per local rank in the case block below
LR=1e-5
APP="python3 ../main.py \
--deepspeed ../deepspeed.json \
--do_train \
--train_file AdvertiseGen/train.json \
--prompt_column content \
--response_column summary \
--model_name_or_path THUDM/chatglm-6b \
--output_dir ./output_ft/pretrain \
--overwrite_output_dir \
--max_source_length 64 \
--max_target_length 64 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 1 \
--predict_with_generate \
--max_steps 2000 \
--logging_steps 5 \
--save_steps 1000 \
--learning_rate $LR \
--fp16 \
--local_rank $lrank "
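# Per-local-rank binding: each rank is given its own IB HCA (mlx5_N) and pinned
# with numactl to a NUMA node near its DCUs (ranks 0-3 -> node 0, ranks 4-7 -> node 3).
# The HCA names and NUMA layout here are machine-specific assumptions.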
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_4:1
export UCX_IB_PCI_BW=mlx5_4:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_5:1
export UCX_IB_PCI_BW=mlx5_5:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_6:1
export UCX_IB_PCI_BW=mlx5_6:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_7:1
export UCX_IB_PCI_BW=mlx5_7:50Gbs
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac