Commit 511b03fa authored by hepj987's avatar hepj987
Browse files

去掉多余依赖包

parent 97c4ff1b
# Base image: DCU build of PyTorch 1.13.1 (CentOS 7.6, DTK 23.04, Python 3.9).
# NOTE(review): the tag ends in "-latest" and may move; pin by digest if
# reproducible builds are required.
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py39-latest

# Use a UTF-8 locale so Python and pip handle non-ASCII text correctly.
ENV LANG=C.UTF-8

# Copy only the dependency manifest so this layer stays cached until it changes.
COPY requirements.txt requirements.txt

# Variables set by sourcing a script only live for the duration of one RUN
# shell, so the DTK environment must be loaded in the same RUN instruction as
# the install that should see it. The index is fetched over HTTPS (matching the
# README instructions) and the pip download cache is not kept in the layer.
RUN . /opt/dtk-23.04/env.sh \
    && pip install --no-cache-dir -r requirements.txt \
        -i https://mirrors.aliyun.com/pypi/simple/ \
        --trusted-host mirrors.aliyun.com
...@@ -30,20 +30,46 @@ https://arxiv.org/pdf/2308.12966.pdf ...@@ -30,20 +30,46 @@ https://arxiv.org/pdf/2308.12966.pdf
## 环境配置 ## 环境配置
### Docker(方式一)
推荐使用docker方式运行,提供[光源](https://www.sourcefind.cn/#/main-page)拉取的docker镜像: 推荐使用docker方式运行,提供[光源](https://www.sourcefind.cn/#/main-page)拉取的docker镜像:
``` ```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py39-latest docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py39-latest
docker run -dit --network=host --name=qwen_pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py39-latest docker run -dit --network=host --name=qwen_pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py39-latest
docker exec -it qwen_pytorch /bin/bash
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### Dockerfile(方式二)
```
docker build -t qwen:latest .
docker run -dit --network=host --name=qwen_pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 qwen:latest
docker exec -it qwen_pytorch /bin/bash
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
``` ```
进入docker ### conda(方式三)
``` ```
conda create -n qwen python=3.9
conda activate qwen
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
[torch1.13-dtk23.04](https://cancon.hpccube.com:65024/directlink/4/pytorch/dtk23.04/torch-1.13.1+git55d300e.abi0.dtk2304-cp39-cp39-manylinux2014_x86_64.whl)
[deepspeed0.9.2-dtk23.04](https://cancon.hpccube.com:65024/directlink/4/deepspeed/dtk23.04/deepspeed-0.9.2+git25d5540.abi0.dtk2304.torch1.13.1-cp39-cp39-manylinux2014_x86_64.whl)
Tips:以上dtk驱动、python、deepspeed等工具版本需要严格一一对应。
#注释掉一些版本判断 ### 注意
由于dtk版本的deepspeed目前最高是0.9.2,因此需要进入虚拟环境修改一些版本判断:
```
#到虚拟环境下对应的python/site-packages注释掉一些版本判断
site-packages/accelerate/accelerator.py 文件 site-packages/accelerate/accelerator.py 文件
287 #if not is_deepspeed_available(): 287 #if not is_deepspeed_available():
...@@ -51,7 +77,7 @@ site-packages/accelerate/accelerator.py 文件 ...@@ -51,7 +77,7 @@ site-packages/accelerate/accelerator.py 文件
289 #if compare_versions("deepspeed", "<", "0.9.3"): 289 #if compare_versions("deepspeed", "<", "0.9.3"):
290 # raise ImportError("DeepSpeed version must be >= 0.9.3. Please update DeepSpeed.") 290 # raise ImportError("DeepSpeed version must be >= 0.9.3. Please update DeepSpeed.")
site-packages/transformers/utils/versions.py 文件 site-packages/transformers/utils/versions.py 文件
43 #if not ops[op](version.parse(got_ver), version.parse(want_ver)): 43 #if not ops[op](version.parse(got_ver), version.parse(want_ver)):
44 # raise ImportError( 44 # raise ImportError(
45 # f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}" 45 # f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}"
...@@ -110,11 +136,11 @@ sh mpirun-nodes.sh ...@@ -110,11 +136,11 @@ sh mpirun-nodes.sh
### 算法类别 ### 算法类别
`自然语言处理` `对话问答`
### 热点应用行业 ### 热点应用行业
`nlp,智能聊天助手` `科研`
## 源码仓库及问题反馈 ## 源码仓库及问题反馈
......
{ {
"train_micro_batch_size_per_gpu": "auto", "train_micro_batch_size_per_gpu": "auto",
"zero_allow_untested_optimizer": true, "zero_allow_untested_optimizer": true,
"fp16": { "fp16": {
"enabled": true, "enabled": true,
"loss_scale": 0, "loss_scale": 0,
"initial_scale_power": 11, "initial_scale_power": 11,
"loss_scale_window": 1000, "loss_scale_window": 1000,
"hysteresis": 2, "hysteresis": 2,
"min_loss_scale": 1 "min_loss_scale": 1
},
"zero_optimization": {
"stage": 3,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients" : true,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
}, },
"zero_optimization": { "offload_param": {
"stage": 3, "device": "cpu",
"allgather_partitions": true, "pin_memory": true
"allgather_bucket_size": 5e8, }
"overlap_comm": false, },
"reduce_scatter": true, "gradient_accumulation_steps": "auto",
"reduce_bucket_size": 5e8, "gradient_clipping": "auto",
"contiguous_gradients" : true "steps_per_print": 2000,
}, "train_batch_size": "auto",
"gradient_accumulation_steps": "auto", "train_micro_batch_size_per_gpu": 1,
"gradient_clipping": "auto", "wall_clock_breakdown": false
"steps_per_print": 2000, }
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": false
}
...@@ -5,6 +5,6 @@ modelName=qwen_pytorch ...@@ -5,6 +5,6 @@ modelName=qwen_pytorch
# 模型描述 # 模型描述
modelDescription=Qwen是阿里开源的预训练语言表征模型。 modelDescription=Qwen是阿里开源的预训练语言表征模型。
# 应用场景 # 应用场景
appScenario=训练,NLP,文本问答 appScenario=训练,NLP,对话问答
# 框架类型 # 框架类型
frameType=Pytorch frameType=Pytorch
...@@ -6,6 +6,6 @@ np=$(($np*8)) ...@@ -6,6 +6,6 @@ np=$(($np*8))
nodename=$(cat $hostfile |sed -n "1p") nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'` dist_url=`echo $nodename | awk '{print $1}'`
which mpirun which mpirun
mpirun -np $np --allow-run-as-root --hostfile $hostfile --bind-to none --mca btl_tcp_if_include $dist_url single-16B.sh $dist_url mpirun -np $np --allow-run-as-root --hostfile $hostfile --bind-to none --mca btl_tcp_if_include $dist_url single_ddp.sh $dist_url
echo "END TIME: $(date)" echo "END TIME: $(date)"
...@@ -12,7 +12,6 @@ click==8.1.7 ...@@ -12,7 +12,6 @@ click==8.1.7
contourpy==1.1.1 contourpy==1.1.1
cycler==0.12.1 cycler==0.12.1
datasets==2.14.5 datasets==2.14.5
deepspeed @ file:///work/home/hepj/DTK-whl/dtk23.04/deepspeed-0.9.2%2Bgit25d5540.abi0.dtk2304.torch1.13.1-cp39-cp39-manylinux2014_x86_64.whl
dill==0.3.7 dill==0.3.7
docstring-parser==0.15 docstring-parser==0.15
einops==0.6.1 einops==0.6.1
...@@ -83,7 +82,6 @@ starlette==0.26.1 ...@@ -83,7 +82,6 @@ starlette==0.26.1
tiktoken==0.5.1 tiktoken==0.5.1
tokenizers==0.13.3 tokenizers==0.13.3
toolz==0.12.0 toolz==0.12.0
torch @ file:///work/home/hepj/DTK-whl/dtk23.04/torch-1.13.1%2Bgit55d300e.abi0.dtk2304-cp39-cp39-manylinux2014_x86_64.whl
tqdm==4.66.1 tqdm==4.66.1
transformers==4.31.0 transformers==4.31.0
transformers-stream-generator==0.0.4 transformers-stream-generator==0.0.4
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment