Commit 511b03fa authored by hepj987's avatar hepj987
Browse files

去掉多余依赖包

parent 97c4ff1b
# Image for running Qwen on DCU: DTK 23.04 / PyTorch 1.13.1 base image plus the
# Python packages listed in requirements.txt.
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py39-latest
# key=value form; the space-separated `ENV key value` form is deprecated.
ENV LANG=C.UTF-8
COPY requirements.txt requirements.txt
# Every RUN starts a fresh shell, so env.sh has to be sourced in the same RUN
# as the command that should see it; a standalone `RUN source ...` has no
# effect on later instructions. `.` is the POSIX spelling of `source`, so this
# also works when /bin/sh is not bash.
# The index is fetched over HTTPS instead of plain HTTP, and --no-cache-dir
# keeps pip's download cache out of the image layer.
RUN . /opt/dtk-23.04/env.sh \
    && pip install --no-cache-dir -r requirements.txt \
        -i https://mirrors.aliyun.com/pypi/simple/ \
        --trusted-host mirrors.aliyun.com
......@@ -30,20 +30,46 @@ https://arxiv.org/pdf/2308.12966.pdf
## 环境配置
### Docker(方式一)
推荐使用docker方式运行,提供[光源](https://www.sourcefind.cn/#/main-page)拉取的docker镜像:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py39-latest
docker run -dit --network=host --name=qwen_pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py39-latest
docker exec -it qwen_pytorch /bin/bash
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### Dockerfile(方式二)
```
docker build -t qwen:latest .
docker run -dit --network=host --name=qwen_pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 qwen:latest
docker exec -it qwen_pytorch /bin/bash
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
进入docker
### conda(方式三)
```
conda create -n qwen python=3.9
conda activate qwen
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
[torch1.13-dtk23.04](https://cancon.hpccube.com:65024/directlink/4/pytorch/dtk23.04/torch-1.13.1+git55d300e.abi0.dtk2304-cp39-cp39-manylinux2014_x86_64.whl)
[deepspeed0.9.2-dtk23.04](https://cancon.hpccube.com:65024/directlink/4/deepspeed/dtk23.04/deepspeed-0.9.2+git25d5540.abi0.dtk2304.torch1.13.1-cp39-cp39-manylinux2014_x86_64.whl)
Tips:以上dtk驱动、python、deepspeed等工具版本需要严格一一对应。
#注释掉一些版本判断
### 注意
由于dtk版本的deepspeed目前最高是0.9.2,因此需要进入虚拟环境,手动注释掉以下文件中的版本判断:
```
#到虚拟环境下对应的python/site-packages注释掉一些版本判断
site-packages/accelerate/accelerator.py 文件
287 #if not is_deepspeed_available():
......@@ -51,7 +77,7 @@ site-packages/accelerate/accelerator.py 文件
289 #if compare_versions("deepspeed", "<", "0.9.3"):
290 # raise ImportError("DeepSpeed version must be >= 0.9.3. Please update DeepSpeed.")
site-packages/transformers/utils/versions.py 文件
site-packages/transformers/utils/versions.py 文件
43 #if not ops[op](version.parse(got_ver), version.parse(want_ver)):
44 # raise ImportError(
45 # f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}"
......@@ -110,11 +136,11 @@ sh mpirun-nodes.sh
### 算法类别
`自然语言处理`
`对话问答`
### 热点应用行业
`nlp,智能聊天助手`
`科研`
## 源码仓库及问题反馈
......
{
"train_micro_batch_size_per_gpu": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": true,
"loss_scale": 0,
"initial_scale_power": 11,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
"train_micro_batch_size_per_gpu": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": true,
"loss_scale": 0,
"initial_scale_power": 11,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 3,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients" : true,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"zero_optimization": {
"stage": 3,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients" : true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": false
}
"offload_param": {
"device": "cpu",
"pin_memory": true
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": false
}
......@@ -5,6 +5,6 @@ modelName=qwen_pytorch
# 模型描述
modelDescription=Qwen是阿里开源的预训练语言表征模型。
# 应用场景
appScenario=训练,NLP,文本问答
appScenario=训练,NLP,对话问答
# 框架类型
frameType=Pytorch
......@@ -6,6 +6,6 @@ np=$(($np*8))
# Read the first line of the hostfile and keep its first whitespace-separated
# field; that value is what the mpirun command below receives as $dist_url.
# Expansions are quoted so a hostfile path or entry containing spaces or glob
# characters is not word-split or expanded by the shell.
nodename=$(sed -n "1p" "$hostfile")
dist_url=$(printf '%s\n' "$nodename" | awk '{print $1}')
which mpirun
mpirun -np $np --allow-run-as-root --hostfile $hostfile --bind-to none --mca btl_tcp_if_include $dist_url single-16B.sh $dist_url
mpirun -np $np --allow-run-as-root --hostfile $hostfile --bind-to none --mca btl_tcp_if_include $dist_url single_ddp.sh $dist_url
echo "END TIME: $(date)"
......@@ -12,7 +12,6 @@ click==8.1.7
contourpy==1.1.1
cycler==0.12.1
datasets==2.14.5
deepspeed @ file:///work/home/hepj/DTK-whl/dtk23.04/deepspeed-0.9.2%2Bgit25d5540.abi0.dtk2304.torch1.13.1-cp39-cp39-manylinux2014_x86_64.whl
dill==0.3.7
docstring-parser==0.15
einops==0.6.1
......@@ -83,7 +82,6 @@ starlette==0.26.1
tiktoken==0.5.1
tokenizers==0.13.3
toolz==0.12.0
torch @ file:///work/home/hepj/DTK-whl/dtk23.04/torch-1.13.1%2Bgit55d300e.abi0.dtk2304-cp39-cp39-manylinux2014_x86_64.whl
tqdm==4.66.1
transformers==4.31.0
transformers-stream-generator==0.0.4
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment