# 0.6.2
1. Offline inference
benchmark_throughput_0.6.2.py
Use the script below to avoid repeatedly reloading the model when benchmarking with different parameter combinations.
batch, prompt_tokens, and completion_tokens can each be passed as a space-separated string.
All other arguments are the same as in the standard script.
bash
<pre>
export HIP_VISIBLE_DEVICES=1
tp=1
model_path=/llm-models/qwen1.5/Qwen1.5-0.5B-Chat
...
prompt_tokens="16 64"
completion_tokens="128 256"
python benchmark_throughput_0.6.2.py --model ${model_path} --tensor-parallel-size ${tp} --num-prompts ${batch} --input-len ${prompt_tokens} --output-len ${completion_tokens} \
--dtype float16 --trust-remote-code --max-model-len 32768 --output-json ./test_0.5B-0.6.2.txt
</pre>
With the arguments above, the benchmarked scenarios are as follows:
bs input output
1 16 128
...
bs_in_out,elapsed_time,Throughput,total_tokens,output_tokens,ttft_mean,ttft_medi...
2_16_128,3.62,0.55,79.56,70.72,0.04829,0.04829,0.04893,0.028,0.02801,0.02801,35.51,35.51,35.51,39.94,39.94,39.95
2_64_256,7.31,0.27,87.55,70.04,0.04697,0.04697,0.04764,0.0284,0.02836,0.02836,35.17,35.17,35.18,43.97,43.97,43.97
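To skim the results, the scenario label and throughput columns can be pulled out with a one-liner. This is a minimal sketch that assumes the CSV rows above end up in ./test_0.5B-0.6.2.txt (the file passed to --output-json); adjust the path and column index if your output layout differs.
bash
<pre>
# Print the scenario (bs_in_out) and Throughput columns from the benchmark output.
# Assumes the CSV rows shown above are stored in ./test_0.5B-0.6.2.txt.
awk -F, 'NF > 3 {print $1, $3}' ./test_0.5B-0.6.2.txt
</pre>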
2. Server inference
benchmark_servein_0.6.2.py
backend_request_func.py
Using this method keeps the length the server actually generates from deviating too far from the requested length.
bash
<pre>
# Run the test with the provided scripts
# Start the server
...
vllm serve $MODEL_PATH --trust-remote-code --dtype $dtype --max-model-len $max_len ...
# Add other options such as --distributed-executor-backend ray as needed
# Send requests the same way as usual; just add --ignore-eos
python benchmark_servein_0.6.2.py --backend vllm --ignore-eos --dataset-name random --random-input-len $input_len --random-output-len $output_len --model $MODEL_PATH --num-prompts $num_prompts --endpoint /v1/completions
</pre>
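The serve and benchmark commands above rely on shell variables that are defined in the elided part of the script. Below is a minimal sketch of one way to set them; the concrete values are only illustrative (they reuse the model and lengths from the offline example above).
bash
<pre>
# Illustrative values only -- substitute your own model and workload.
MODEL_PATH=/llm-models/qwen1.5/Qwen1.5-0.5B-Chat
dtype=float16
max_len=32768
tp=1
input_len=64
output_len=256
num_prompts=2
</pre>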
prof
offline_prof
hipprof
prof.py
benchmark_throughput_0.6.2_hipprof.py
bash
<pre>
# Usage example:
# (In the original document the yellow-highlighted parts mark what was added on top of the standard script.)
SGLANG_PROF_ROCTX=1 hipprof --trace-off python benchmark_throughput_0.6.2_hipprof.py --num-prompts 1 --input-len 2000 --output-len 1 --model /models/Llama-2-7b-hf --trust-remote-code --enforce-eager --dtype float16 > 7b-prefill-2000-test.log 2>&1
</pre>
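The run above profiles a prefill-heavy case (2000 input tokens, 1 output token). A decode-focused trace can be captured the same way by flipping the lengths; the sketch below keeps the same flags, and only the lengths and log name are illustrative.
bash
<pre>
# Decode-focused profile: short prompt, longer generation (illustrative lengths)
SGLANG_PROF_ROCTX=1 hipprof --trace-off python benchmark_throughput_0.6.2_hipprof.py --num-prompts 1 --input-len 16 --output-len 128 --model /models/Llama-2-7b-hf --trust-remote-code --enforce-eager --dtype float16 > 7b-decode-128-test.log 2>&1
</pre>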
torchprof
benchmark_throughput_0.6.2_torchprof.py
bash
<pre>
# Launch the same way as usual
python benchmark_throughput_0.6.2_torchprof.py --num-prompts 1 --input-len 2000 --output-len 1 --model /models/Llama-2-7b-hf --trust-remote-code --enforce-eager --dtype float16 > 7b-prefill-2000-test.log 2>&1
</pre>
The profiling information is printed, and the saved JSON file is named:
{args.num_prompts}-{args.input_len}-{args.output_len}-{args.tensor_parallel_size}_dcu.json
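For the command above, --tensor-parallel-size is not set and so falls back to the default of 1, which by that pattern would give the file name shown below; the name is derived from the pattern, not from an actual run.
bash
<pre>
# Expected profile file for the run above (num_prompts=1, input_len=2000, output_len=1, tp=1)
# (assuming it is written to the working directory)
ls ./1-2000-1-1_dcu.json
</pre>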
server_prof
worker.py
bash
<pre>
# Replace /usr/local/lib/python3.10/site-packages/vllm/worker/worker.py with the provided worker.py
# Start the service
# loca_path is the absolute path where the JSON profiles will be saved
export VLLM_TORCH_PROFILER_DIR=$loca_path
vllm serve $MODEL_PATH --trust-remote-code --dtype $dtype --max-model-len $max_len -tp $tp --gpu-memory-utilization 0.97
# Send requests
# Add other options such as --distributed-executor-backend ray as needed
python benchmark_servein_0.6.2.py --backend vllm --ignore-eos --profile --dataset-name random --random-input-len $input_len --random-output-len $output_len --model $MODEL_PATH --num-prompts $num_prompts --endpoint /v1/completions
</pre>
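The preparation that precedes the block above can be scripted as below; the backup location and profile directory are hypothetical, and the copy step assumes the provided worker.py sits in the current directory.
bash
<pre>
# Hypothetical preparation steps; paths are illustrative.
cp /usr/local/lib/python3.10/site-packages/vllm/worker/worker.py /tmp/worker.py.orig   # back up the stock file
cp ./worker.py /usr/local/lib/python3.10/site-packages/vllm/worker/worker.py           # install the provided worker.py
loca_path=/data/vllm_profiles   # absolute path for the saved JSON profiles
mkdir -p $loca_path
# then export VLLM_TORCH_PROFILER_DIR and start the server as shown above
</pre>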