Commit f6a338d7 authored by jerrrrry

Initial commit
# Use the official SourceFind (光源) base image
FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250711
# Install basic tools
RUN apt-get update && \
apt-get install -y --no-install-recommends \
iproute2 \
dmidecode \
ipmitool \
git \
curl \
jq \
lshw \
iputils-ping \
pciutils \
sysstat \
locate \
&& rm -rf /var/lib/apt/lists/*
# Create the directory layout
RUN mkdir -p /workspace/scripts && \
mkdir -p /workspace/configs && \
mkdir -p /workspace/test/env_check_outputs && \
mkdir -p /workspace/test/inference_outputs && \
mkdir -p /workspace/test/models && \
mkdir -p /workspace/test/env_check_tools
# Copy scripts, configs, and env-check tools
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/
COPY ./env_check_tools/dcu_env_check.zip /workspace/test/env_check_tools/
COPY ./env_check_tools/rccl-tests.zip /workspace/test/env_check_tools/
RUN chmod +x /workspace/scripts/*
RUN chmod +x /workspace/configs/*
# Set the working directory (recommended: the scripts directory)
WORKDIR /workspace/scripts/
# Run the entrypoint script directly (no cd needed)
CMD ["bash", "entrypoint.sh"]
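# Model test matrix (configs/model_to_test.cfg), parsed by run_benchmark.sh.
# Semicolon-separated fields:
# model_name;model_path;tensor_parallel_size;dtype;"batch_sizes";(prompt_len completion_len, ...);max_model_len;gpu_memory_utilization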
Qwen2.5-VL-32B;/workspace/llms/qwen2.5/Qwen2.5-VL-32B-Instruct/;4;float16;"1 ";(512 512);32768;0.95
Qwen2.5-VL-7B;/workspace/llms/qwen2.5/Qwen2.5-VL-7B-Instruct/;1;float16;"1 ";(512 512);32768;0.95
#Qwen2.5-VL-32B;/workspace/llms/qwen2.5/Qwen2.5-VL-32B-Instruct/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-32B;/workspace/llms/qwen3/Qwen3-32B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-32B;/workspace/llms/qwen3/Qwen3-32B/;4;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-30B-A3B;/workspace/llms/qwen3/Qwen3-30B-A3B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-30B-A3B;/workspace/llms/qwen3/Qwen3-30B-A3B/;4;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-4B;/workspace/llms/qwen3/Qwen3-4B/;1;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-235B-A22B;/workspace/llms/qwen3/Qwen3-235B-A22B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024);20000;0.95
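# For example, the first active line above makes run_benchmark.sh launch
# (abridged):
#   vllm serve /workspace/llms/qwen2.5/Qwen2.5-VL-32B-Instruct/ \
#     --tensor-parallel-size 4 --dtype float16 --max-model-len 32768 \
#     --gpu-memory-utilization 0.95 --port 8001
# and benchmark batch size 1 with 512-token prompts and 512-token completions.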
# SPDX-License-Identifier: Apache-2.0
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union
import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
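    """Inputs for a single benchmark request to a serving backend."""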
prompt: str
api_url: str
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
@dataclass
class RequestFuncOutput:
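    """Per-request result: generated text, success flag, and timing metrics."""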
generated_text: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
itl: list[float] = field(
default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
"ignore_eos_token": request_func_input.ignore_eos,
}
payload = {
"inputs": request_func_input.prompt,
"parameters": params,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
output.output_tokens = request_func_input.output_len
else:
output.output_tokens = None
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
"temperature": 0.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"prompt": request_func_input.prompt,
"max_tokens": request_func_input.output_len,
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
st = time.perf_counter()
try:
async with session.post(url=request_func_input.api_url,
json=payload) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
"stream_options": {
"include_usage": True,
},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
("chat/completions", "profile")
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"messages": [
{
"role": "user",
"content": content
},
],
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"stream_options": {
"include_usage": True,
},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
from vllm.model_executor.model_loader.weight_utils import get_lock
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with get_lock(pretrained_model_name_or_path):
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
return model_path
return pretrained_model_name_or_path
def get_tokenizer(
pretrained_model_name_or_path: str,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path):
pretrained_model_name_or_path = get_model(
pretrained_model_name_or_path)
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError(
"Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
from vllm.transformers_utils.tokenizer import MistralTokenizer
except ImportError as e:
raise ImportError("MistralTokenizer requires vllm package.\n"
"Please install it with `pip install vllm` "
"to use mistral tokenizer mode.") from e
return MistralTokenizer.from_pretrained(
str(pretrained_model_name_or_path))
else:
return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
**kwargs,
)
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
"lmdeploy": async_request_openai_completions,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_openai_chat_completions)
]
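A minimal driver sketch for these request functions (hypothetical, not part of this commit; it assumes the module above is saved as backend_request_func.py, as in vLLM's benchmarks/, and that an OpenAI-compatible vLLM server is already listening on localhost:8000 with "Qwen3-4B" as the served model name):
# sketch_driver.py -- hypothetical example, not part of this commit.
import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput


async def main() -> None:
    request_func = ASYNC_REQUEST_FUNCS["vllm"]  # OpenAI completions backend
    result = await request_func(
        RequestFuncInput(
            prompt="Hello, world",
            api_url="http://localhost:8000/v1/completions",
            prompt_len=3,
            output_len=16,
            model="Qwen3-4B",  # assumed served model name
            ignore_eos=True,
        ))
    print(result.success, result.ttft, result.latency, result.generated_text)


if __name__ == "__main__":
    asyncio.run(main())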
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import math
import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
"""
    Save the benchmark results in the format used by PyTorch OSS benchmark,
    with one metric per record:
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any):
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)
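A minimal usage sketch (hypothetical values, assuming the module above is saved as benchmark_utils.py; the converter returns an empty list unless SAVE_TO_PYTORCH_BENCHMARK_FORMAT is set in the environment):
# Hypothetical example, not part of this commit.
import argparse

from benchmark_utils import (convert_to_pytorch_benchmark_format,
                             write_to_json)

args = argparse.Namespace(model="Qwen3-4B", tensor_parallel_size=1)
records = convert_to_pytorch_benchmark_format(
    args=args,
    metrics={"total_token_throughput": [1234.5]},  # illustrative number
    extra_info={"tensor_parallel_size": 1},
)
if records:  # empty unless SAVE_TO_PYTORCH_BENCHMARK_FORMAT is set
    write_to_json("benchmark_records.json", records)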
#!/bin/bash
# Run the environment check
echo "==================== Starting system environment check ===================="
#/workspace/scripts/run_envcheck.sh
# Run the performance benchmarks
echo "==================== Starting performance benchmarks ===================="
/workspace/scripts/run_benchmark.sh
echo "==================== All tests completed ===================="
#!/bin/bash
# Initialize output directories
mkdir -p /workspace/test/inference_outputs/results
mkdir -p /workspace/test/inference_outputs/logs/server
mkdir -p /workspace/test/inference_outputs/logs/models
# Base port
BASE_PORT=8001
# Read the config file (semicolon-separated)
while IFS=';' read -r model_name model_path tp data_type batch_list prompt_pairs max_model_len gpu_mem_util; do
    # Clean up fields (strip whitespace and quotes)
model_name=$(echo "$model_name" | xargs)
model_path=$(echo "$model_path" | xargs)
tp=$(echo "$tp" | xargs)
data_type=$(echo "$data_type" | xargs)
batch_list=$(echo "$batch_list" | tr -d '"' | xargs)
prompt_pairs=$(echo "$prompt_pairs" | tr -d '()"' | xargs)
max_model_len=$(echo "$max_model_len" | xargs)
gpu_mem_util=$(echo "$gpu_mem_util" | xargs)
    # Allocate a port dynamically
port=$((BASE_PORT++))
    # Generate the per-model server start script
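    # Note: the heredoc delimiter below is unquoted, so $model_path, $data_type,
    # $tp, $max_model_len, $port, and $gpu_mem_util are expanded now, at
    # generation time; the generated script hard-codes their values.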
cat > "/workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh" <<EOF
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
vllm serve "$model_path" --trust-remote-code \\
--enable-prefix-caching \\
--dtype $data_type \\
--tensor-parallel-size $tp \\
--max-model-len $max_model_len \\
--port $port \\
--gpu-memory-utilization $gpu_mem_util
EOF
    # Make it executable
chmod +x "/workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh"
echo "Generated server script for ${model_name}_tp${tp} at /workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh"
    # 1. Start the vLLM server, logging to server.log
/workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh > "/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log" 2>&1 &
SERVER_PID=$!
    # 2. Improved log-detection function
check_server_status() {
local log_file=$1
local server_pid=$2
local success_msg="Starting vLLM API server on http://0.0.0.0"
local error_patterns=("RuntimeError" "ValueError" "segmentation fault" "core dumped" )
        # Check for the success message
if grep -q "$success_msg" "$log_file"; then
echo "✅ Server started successfully!"
return 0
fi
        # Check for error messages
for pattern in "${error_patterns[@]}"; do
if grep -i -q "$pattern" "$log_file"; then
echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Error detected in logs (pattern: $pattern)!"
echo "===== ERROR CONTEXT ====="
grep -i -A 5 -B 5 "$pattern" "$log_file" | tail -n 20
echo "========================="
return 1
fi
done
        # Check whether the process is still alive
if ! kill -0 $server_pid 2>/dev/null; then
echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Server process died unexpectedly!"
echo "===== LAST LOG LINES ====="
tail -n 20 "$log_file"
echo "========================="
return 1
fi
        # Default: keep waiting
return 2
}
    # 3. Wait for the server to start or fail
echo -e "\n🔍 [$(date '+%Y-%m-%d %H:%M:%S')] Starting monitoring for ${model_name}_tp${tp} (PID: $SERVER_PID)"
max_wait_seconds=20000
start_time=$(date +%s)
log_file="/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log"
while true; do
        sleep 20  # check every 20 seconds
check_server_status "$log_file" "$SERVER_PID"
status=$?
        # Success
if [ $status -eq 0 ]; then
break
fi
        # Failure
if [ $status -eq 1 ]; then
            # Clean up resources
kill $SERVER_PID 2>/dev/null
pkill -f "vllm serve" 2>/dev/null
echo "🛑 Cleaned up resources after failure"
            # Move straight on to the next model
continue 2
fi
        # Check for timeout
current_time=$(date +%s)
elapsed=$((current_time - start_time))
if [ $elapsed -ge $max_wait_seconds ]; then
echo -e "\n⏰ [$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for server to start!"
            # Clean up resources
kill $SERVER_PID 2>/dev/null
pkill -f "vllm serve" 2>/dev/null
echo "🛑 Cleaned up resources after timeout"
            # Move straight on to the next model
continue 2
fi
echo "Waiting... (${elapsed}s elapsed)"
done
    # 4. Run the tests only after a successful startup
echo -e "\n🚀 [$(date '+%Y-%m-%d %H:%M:%S')] Running tests for ${model_name}_tp${tp}..."
    # Export the test parameters as environment variables
export MODEL_NAME="$model_name"
export MODEL_PATH="$model_path"
export TP="$tp"
export DATA_TYPE="$data_type"
export BATCH_LIST="$batch_list"
export PROMPT_PAIRS="$prompt_pairs"
export PORT="$port"
    # Run the benchmark
./test.sh
    # 5. Clean up after the test
kill $SERVER_PID
pkill -f "vllm serve" 2>/dev/null
echo "✅ [$(date '+%Y-%m-%d %H:%M:%S')] ${model_name}_tp${tp} test completed and cleaned up"
done < <(grep -v '^#' ../configs/model_to_test.cfg | grep -v '^$')
echo -e "\n📊 [$(date '+%Y-%m-%d %H:%M:%S')] All tests completed. Results saved to results/"
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
# Read parameters from environment variables
model_name=${MODEL_NAME}
model_path=${MODEL_PATH}
tp=${TP}
data_type=${DATA_TYPE}
batch_list=${BATCH_LIST}
prompt_pairs=${PROMPT_PAIRS}
port=${PORT}
# Build the result file name
result_file="/workspace/test/inference_outputs/results/${model_name}_tp${tp}.csv"
echo "tp,data_type,batch,prompt_tokens,completion_tokens,TOTAL_THROUGHPUT(toks/s),generate_throughput(toks/s),TTFT(ms),TPOT(ms),ITL(ms)" > "$result_file"
# Convert the delimited strings to arrays
IFS=' ' read -ra batches <<< "$batch_list"
IFS=',' read -ra pairs <<< "$prompt_pairs"
# Run the tests
for batch in "${batches[@]}"; do
for pair in "${pairs[@]}"; do
IFS=' ' read -r prompt_tokens completion_tokens <<< "$pair"
log_file="/workspace/test/inference_outputs/logs/models/${model_name}_${tp}/batch_${batch}_prompt_${prompt_tokens}_completion_${completion_tokens}.log"
mkdir -p "$(dirname "$log_file")"
echo "Running: batch=$batch, prompt=$prompt_tokens, completion=$completion_tokens"
python benchmark_serving.py \
--backend openai \
--port "$port" \
--model "$model_path" \
--trust-remote-code \
--dataset-name random \
--ignore-eos \
--random-input-len "$prompt_tokens" \
--random-output-len "$completion_tokens" \
--num-prompts "$batch" \
2>&1 | tee "$log_file"
        # Extract metrics
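        # These greps assume benchmark_serving.py's summary output, whose
        # lines begin with "Total Token throughput", "Output token throughput",
        # "Mean TTFT", "Mean TPOT", and "Mean ITL"; awk picks the numeric field.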
TOTAL_THROUGHPUT=$(grep "^Total Token" "$log_file" | awk '{print $5}')
GEN_THROUGHPUT=$(grep "^Output token" "$log_file" | awk '{print $5}')
TTFT=$(grep "^Mean TTFT" "$log_file" | awk '{print $4}')
TPOT=$(grep "^Mean TPOT" "$log_file" | awk '{print $4}')
ITL=$(grep "^Mean ITL" "$log_file" | awk '{print $4}')
echo "$tp,$data_type,$batch,$prompt_tokens,$completion_tokens,$TOTAL_THROUGHPUT,$GEN_THROUGHPUT,$TTFT,$TPOT,$ITL" >> "$result_file"
done
done
docker build -t vllm-test1 . && \
docker run \
-v /usr/local/hyhal:/usr/local/hyhal:ro \
-v /opt/hyhal:/opt/hyhal:ro \
-v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
-v /public/opendas/DL_DATA/llm-models:/workspace/llms/:ro \
-v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/ \
--ipc=host \
--network=host \
--cap-add=SYS_PTRACE \
--group-add video \
--ulimit memlock=-1:-1 \
--privileged \
--device=/dev/kfd \
--device=/dev/mkfd \
--device=/dev/dri \
--shm-size=500G \
-u root \
--security-opt seccomp=unconfined \
    vllm-test1