Update README.md

94dca700 · laibao · 9919820f · 94dca700
Commit 94dca700 authored Oct 12, 2024 by laibao
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 5 deletions

README.md README.md +5 -5

No files found.
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ python examples/offline_inference.py
 ### 离线批量推理性能测试
 1、指定输入输出
 ```bash
-python benchmarks/benchmark_throughput.py --num-prompts 1 --input-len 32 --output-len 128 --model Qwen/Qwen2.5-7B-Chat -tp 1 --trust-remote-code --enforce-eager --dtype float16
+python benchmarks/benchmark_throughput.py --num-prompts 1 --input-len 32 --output-len 128 --model Qwen/Qwen2.5-7B-instruct -tp 1 --trust-remote-code --enforce-eager --dtype float16
 ```
 其中`--num-prompts`是batch数，`--input-len`是输入seqlen，`--output-len`是输出token长度，`--model`为模型路径，`-tp`为使用卡数，`--dtype float16`为推理数据类型，如果模型权重是bfloat16，需要修改为float16推理。若指定`--output-len 1`即为首字延迟。`-q gptq`为使用gptq量化模型进行推理。
@@ -103,7 +103,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 ```
 ```bash
-python benchmarks/benchmark_throughput.py --num-prompts 1 --model Qwen/Qwen2.5-7B-Chat --dataset ShareGPT_V3_unfiltered_cleaned_split.json -tp 1 --trust-remote-code --enforce-eager --dtype float16
+python benchmarks/benchmark_throughput.py --num-prompts 1 --model Qwen/Qwen2.5-7B-instruct --dataset ShareGPT_V3_unfiltered_cleaned_split.json -tp 1 --trust-remote-code --enforce-eager --dtype float16
 ```
 其中`--num-prompts`是batch数，`--model`为模型路径，`--dataset`为使用的数据集，`-tp`为使用卡数，`--dtype float16`为推理数据类型，如果模型权重是bfloat16，需要修改为float16推理。`-q gptq`为使用gptq量化模型进行推理。
@@ -113,7 +113,7 @@ python benchmarks/benchmark_throughput.py --num-prompts 1 --model Qwen/Qwen2.5-7
 ### OpenAI兼容服务
 启动服务：
 ```bash
-python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-7B-Chat --enforce-eager --dtype float16 --trust-remote-code
+python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-7B-instruct --enforce-eager --dtype float16 --trust-remote-code
 ```
 这里`--model`为加载模型路径，`--dtype`为数据类型：float16。默认情况使用tokenizer中的预定义聊天模板，`--chat-template`可以添加新模板覆盖默认模板。`-q gptq`为使用gptq量化模型进行推理，`-q awq`为使用awq量化模型进行推理。
@@ -127,7 +127,7 @@ curl http://localhost:8000/v1/models
 curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
-        "model": "Qwen/Qwen2.5-7B",
+        "model": "Qwen/Qwen2.5-7B-instruct",
        "prompt": "What is deep learning?",
        "max_tokens": 7,
        "temperature": 0
@@ -141,7 +141,7 @@ curl http://localhost:8000/v1/completions \
 curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
-        "model": "Qwen/Qwen2.5-7B-Chat",
+        "model": "Qwen/Qwen2.5-7B-instruct",
        "messages": [
            {"role": "system", "content": "What is deep learning?"},
            {"role": "user", "content": "What is deep learning?"}