"csrc/vscode:/vscode.git/clone" did not exist on "9dd24ecd724301361a479658e50b5a05537def53"
Commit 60a524ee authored by chenych's avatar chenych
Browse files

Delete infer_vllm.py and update README

parent 86b279b8
@@ -29,7 +29,6 @@ docker run -it --shm-size 200g --network=host --name {docker_name} --privileged
cd /your_code_path/llama4_pytorch
pip install git+https://github.com/hiyouga/transformers.git@llama4_train
pip install -r requirements.txt
```
### Dockerfile (Method 2)
@@ -40,7 +39,6 @@ docker run -it --shm-size 200g --network=host --name {docker_name} --privileged
cd /your_code_path/llama4_pytorch
pip install git+https://github.com/hiyouga/transformers.git@llama4_train
pip install -r requirements.txt
```
### Anaconda (Method 3)
@@ -50,14 +48,13 @@ DTK: 25.04
python: 3.10
torch: 2.4.1
deepspeed: 0.14.2+das.opt2.dtk2504
vllm: 0.6.2+das.opt3.dtk2504
```
`Tips: the DTK driver, python, torch, and other DCU-related tool versions listed above must correspond exactly one-to-one.`
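As a quick sanity check (a minimal sketch, not part of the original instructions), the installed versions can be printed and compared against the list above:
```python
# Minimal sketch: print the installed versions so they can be compared against
# the environment list above (python 3.10, torch 2.4.1, deepspeed 0.14.2+das.opt2.dtk2504).
import sys

import deepspeed
import torch

print("python   :", sys.version.split()[0])
print("torch    :", torch.__version__)
print("deepspeed:", deepspeed.__version__)
```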
## Training
### Llama Factory fine-tuning (recommended)
1. Install the training library (**outside the llama4_pytorch directory**): install a version of `Llama-Factory` greater than **v0.9.2**; for the detailed installation steps, refer to that repository's README.
1. Install the training library (**outside the llama4_pytorch directory**): install a version of `Llama-Factory` **greater than v0.9.2**; for the detailed installation steps, refer to that repository's README.
```
git clone https://developer.sourcefind.cn/codes/OpenDAS/llama-factory
```
@@ -81,35 +78,6 @@ Example SFT training scripts: see the corresponding yaml files under `llama-factory/train_lora`.
Parameter explanations are the same as in [Full-parameter fine-tuning](#全参微调).
## Inference
### vLLM inference
#### Launching the OpenAI-compatible server
**Parameter descriptions:**
- MODEL_PATH: path to the model to be served
- tp: tensor parallel size, set according to the model size
- PORT: port number
- MAX_MODEL_LEN: maximum model context length, set according to the model size
- MODEL_NAME: model name
```bash
vllm serve ${MODEL_PATH} --trust-remote-code --enforce-eager --tensor-parallel-size ${tp} --port ${PORT} --max-model-len ${MAX_MODEL_LEN} --served-model-name ${MODEL_NAME} &
```
How to access the server:
```bash
curl http://localhost:${PORT}/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "'"${MODEL_NAME}"'",
        "messages": [
            {"role": "user", "content": "你好"}
        ]
    }'
```
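The same server can also be queried from Python with the `openai` client, as the deleted `infer_vllm.py` did. A minimal sketch, assuming the server above is listening on `localhost:${PORT}` and was launched with the same `--served-model-name`:
```python
# Minimal sketch: query the OpenAI-compatible server started by `vllm serve`.
# The port (8000) and model name are placeholders; they must match ${PORT}
# and ${MODEL_NAME} used when launching the server.
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

response = client.chat.completions.create(
    model="Llama-4-Scout-17B-16E-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "你好"},
    ],
    stream=False,
)
print(response.choices[0].message.content)
```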
#### Running the local Python script
```bash
python infer_vllm.py
```
### transformers inference
```bash
## The HF_ENDPOINT environment variable must be set
@@ -119,6 +87,9 @@ python infer_transformers.py --model_id /path_of/model_id
```
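The export line for `HF_ENDPOINT` is elided in this diff, so its value is not shown here. As an illustration only, the variable can also be set inside Python before `transformers` triggers any download (the `hf-mirror.com` endpoint below is an assumption, not necessarily the one the original README uses):
```python
# Illustration only: make sure HF_ENDPOINT is set before any Hugging Face
# download is triggered. The mirror URL below is an assumption; substitute
# the endpoint from the original README.
import os

os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com")

from transformers import AutoProcessor, Llama4ForConditionalGeneration  # noqa: E402
```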
## Result
<div align=center>
<img src="./doc/transformers_results.jpg"/>
</div>
### Accuracy
...

infer_transformers.py
import argparse

import torch
from transformers import AutoProcessor, Llama4ForConditionalGeneration
@@ -18,7 +18,7 @@ if __name__ == "__main__":
    processor = AutoProcessor.from_pretrained(args.model_id)
    model = Llama4ForConditionalGeneration.from_pretrained(
        args.model_id,
        attn_implementation="flex_attention",
        #attn_implementation="flex_attention", # torch>2.5
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
@@ -42,7 +42,7 @@ if __name__ == "__main__":
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    ## generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
...

infer_vllm.py (deleted)
import time

from openai import OpenAI
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


def infer_llama4_vllm(model_path, message, tp_size=1, max_model_len=4096):
    '''Run llama4 inference locally with vLLM.'''
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # message = MARKDOWN_TEMPLATE.format(query)
    messages = [{"role": "user", "content": message}]
    print(f"Prompt: {messages!r}")
    sampling_params = SamplingParams(temperature=0.3,
                                     top_p=0.9,
                                     max_tokens=4096,
                                     stop_token_ids=[tokenizer.eos_token_id])
    llm = LLM(model=model_path,
              max_model_len=max_model_len,
              trust_remote_code=True,
              enforce_eager=True,
              dtype="float16",
              tensor_parallel_size=tp_size)
    # generate answer
    prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True)]
    start_time = time.time()
    outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)
    print("total infer time", time.time() - start_time)
    # print results
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")


def infer_llama4_client(client, messages, model_name='Llama-4-Scout-17B-16E-Instruct'):
    '''Query an OpenAI-compatible server (e.g. one started with `vllm serve`).'''
    print(f"Prompt: {messages!r}")
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": str(messages)}
        ],
        model=model_name,
        stream=False
    )
    print(f"Response text: {response!r}")
    return response


if __name__ == "__main__":
    # Local vLLM inference
    infer_llama4_vllm(model_path="meta-llama/Llama-4-Scout-17B-16E-Instruct",
                      message="你好",
                      tp_size=1,
                      max_model_len=4096)

    # OpenAI API inference
    # url = "127.0.0.1:8000"  # adjust to the actual deployment
    # client = OpenAI(api_key="EMPTY", base_url=f"http://{url}/v1")
    # infer_llama4_client(client=client,
    #                     messages="你好",
    #                     model_name='Llama-4-Scout-17B-16E-Instruct')
\ No newline at end of file
requirements.txt

# This file was autogenerated by uv via the following command:
# uv export --frozen --no-hashes --no-emit-project --output-file=requirements.txt
annotated-types==0.7.0
certifi==2025.1.31
charset-normalizer==3.4.1
idna==3.10
jinja2==3.1.6
markupsafe==3.0.2
pillow==11.1.0
pydantic==2.10.6
pydantic-core==2.27.2
pyyaml==6.0.2
regex==2024.11.6
requests==2.32.3
tiktoken==0.8.0
typing-extensions==4.12.2
urllib3==2.3.0
\ No newline at end of file