jerrrrry / easystart_v0.2 · Commits

Commit f6a338d7, authored Jul 16, 2025 by jerrrrry

Initial commit
Showing 13 changed files with 905 additions and 0 deletions
3_env_check-batch_onlinetests/Dockerfile (+42, -0)
3_env_check-batch_onlinetests/configs/model_to_test.cfg (+26, -0)
3_env_check-batch_onlinetests/env_check_tools/dcu_env_check.zip (+0, -0)
3_env_check-batch_onlinetests/env_check_tools/rccl-tests.zip (+0, -0)
3_env_check-batch_onlinetests/scripts/backend_request_func.py (+505, -0)
3_env_check-batch_onlinetests/scripts/benchmark_dataset.py (+0, -0)
3_env_check-batch_onlinetests/scripts/benchmark_serving.py (+0, -0)
3_env_check-batch_onlinetests/scripts/benchmark_utils.py (+69, -0)
3_env_check-batch_onlinetests/scripts/entrypoint.sh (+12, -0)
3_env_check-batch_onlinetests/scripts/run_benchmark.sh (+166, -0)
3_env_check-batch_onlinetests/scripts/run_envcheck.sh (+0, -0)
3_env_check-batch_onlinetests/scripts/test.sh (+64, -0)
3_env_check-batch_onlinetests/start.sh (+21, -0)
3_env_check-batch_onlinetests/Dockerfile
0 → 100644
# Use the official SourceFind (光源) base image
FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250711

# Install basic tools
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        iproute2 \
        dmidecode \
        ipmitool \
        git \
        curl \
        jq \
        lshw \
        iputils-ping \
        pciutils \
        sysstat \
        locate \
    && rm -rf /var/lib/apt/lists/*

# Create the directory structure
RUN mkdir -p /workspace/scripts && \
    mkdir -p /workspace/configs && \
    mkdir -p /workspace/test/env_check_outputs && \
    mkdir -p /workspace/test/inference_outputs && \
    mkdir -p /workspace/test/models && \
    mkdir -p /workspace/test/env_check_tools

# Copy scripts, configs, and environment-check tools
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/
COPY ./env_check_tools/dcu_env_check.zip /workspace/test/env_check_tools/
COPY ./env_check_tools/rccl-tests.zip /workspace/test/env_check_tools/

RUN chmod +x /workspace/scripts/*
RUN chmod +x /workspace/configs/*

# Set the working directory (recommended: the scripts directory itself)
WORKDIR /workspace/scripts/

# Run the entrypoint script directly (no cd needed)
CMD bash -c "bash entrypoint.sh"
\ No newline at end of file
3_env_check-batch_onlinetests/configs/model_to_test.cfg
0 → 100644
Qwen2.5-VL-32B;/workspace/llms/qwen2.5/Qwen2.5-VL-32B-Instruct/;4;float16;"1 ";(512 512);32768;0.95
Qwen2.5-VL-7B;/workspace/llms/qwen2.5/Qwen2.5-VL-7B-Instruct/;1;float16;"1 ";(512 512);32768;0.95
#Qwen2.5-VL-32B;/workspace/llms/qwen2.5/Qwen2.5-VL-32B-Instruct/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-32B;/workspace/llms/qwen3/Qwen3-32B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-32B;/workspace/llms//qwen3/Qwen3-32B/;4;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-30B-A3B;/workspace/llms/qwen3/Qwen3-30B-A3B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-30B-A3B;/workspace/llms/qwen3/Qwen3-30B-A3B/;4;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-4B;/workspace/llms/qwen3/Qwen3-4B/;1;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-235B-A22B;/workspace/llms/qwen3/Qwen3-235B-A22B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024);20000;0.95
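Each line above is one semicolon-separated test record that run_benchmark.sh reads as model_name;model_path;tp;data_type;"batch list";(prompt pairs);max_model_len;gpu_mem_util. The sketch below shows the same decomposition in Python; the parse_cfg_line helper is hypothetical and only illustrates the field layout, it is not part of this repository.

# Minimal, hypothetical sketch of how a model_to_test.cfg line decomposes;
# field names follow run_benchmark.sh, the helper itself is not in the repo.
def parse_cfg_line(line: str) -> dict:
    fields = [f.strip() for f in line.split(";")]
    (model_name, model_path, tp, data_type,
     batch_list, prompt_pairs, max_model_len, gpu_mem_util) = fields
    return {
        "model_name": model_name,
        "model_path": model_path,
        "tp": int(tp),                       # tensor-parallel size
        "data_type": data_type,              # e.g. float16
        "batches": [int(b) for b in batch_list.strip('"').split()],
        # "(512 512,512 1024)" -> [(512, 512), (512, 1024)] as (input, output) token pairs
        "prompt_pairs": [tuple(map(int, p.split()))
                         for p in prompt_pairs.strip('()"').split(",")],
        "max_model_len": int(max_model_len),
        "gpu_memory_utilization": float(gpu_mem_util),
    }

print(parse_cfg_line('Qwen2.5-VL-7B;/workspace/llms/qwen2.5/Qwen2.5-VL-7B-Instruct/;1;float16;"1 ";(512 512);32768;0.95'))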
3_env_check-batch_onlinetests/env_check_tools/dcu_env_check.zip
0 → 100644
File added
3_env_check-batch_onlinetests/env_check_tools/rccl-tests.zip
0 → 100644
File added
3_env_check-batch_onlinetests/scripts/backend_request_func.py
0 → 100644
# SPDX-License-Identifier: Apache-2.0
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union

import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False


@dataclass
class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    itl: list[float] = field(
        default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        params = {
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        if request_func_input.ignore_eos:
            output.output_tokens = request_func_input.output_len
        else:
            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        chunk_bytes = chunk_bytes.decode("utf-8")

                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
                        chunk = chunk_bytes.removeprefix("data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True
                    output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        if request_func_input.ignore_eos:
            payload["min_length"] = request_func_input.output_len
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data:")

                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = timestamp - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True

                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:

        payload = {
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    if "choices" in parsed_resp:
                        output.generated_text = parsed_resp["choices"][0][
                            "text"]
                    elif "text" in parsed_resp:
                        output.generated_text = parsed_resp["text"][0]
                    else:
                        output.error = ("Unexpected response format: "
                                        "neither 'choices' nor 'text' found")
                        output.success = False
                    output.success = True
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
                            if choices := data.get("choices"):
                                # Note that text could be empty here
                                # e.g. for special tokens
                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
                                    first_chunk_received = True
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += text or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT."
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("chat/completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "messages": [
                {
                    "role": "user",
                    "content": content
                },
            ],
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                content = choices[0]["delta"].get("content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                generated_text += content or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

        from vllm.model_executor.model_loader.weight_utils import get_lock

        # Use file lock to prevent multiple processes from
        # downloading the same model weights at the same time.
        with get_lock(pretrained_model_name_or_path):
            model_path = snapshot_download(
                model_id=pretrained_model_name_or_path,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

            return model_path
    return pretrained_model_name_or_path


def get_tokenizer(
    pretrained_model_name_or_path: str,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if pretrained_model_name_or_path is not None and not os.path.exists(
            pretrained_model_name_or_path):
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False
    if tokenizer_mode == "mistral":
        try:
            from vllm.transformers_utils.tokenizer import MistralTokenizer
        except ImportError as e:
            raise ImportError("MistralTokenizer requires vllm package.\n"
                              "Please install it with `pip install vllm` "
                              "to use mistral tokenizer mode.") from e
        return MistralTokenizer.from_pretrained(
            str(pretrained_model_name_or_path))
    else:
        return AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
}

OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_openai_chat_completions)
]
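Callers select a backend through the ASYNC_REQUEST_FUNCS registry above. The sketch below is a hypothetical, minimal driver showing one request through the OpenAI-compatible path; the URL, model path, and token counts are placeholders, not values taken from this repository.

# Hypothetical driver sketch: pick a request function from the registry and await it.
# The URL, model path, and token counts are placeholders.
import asyncio
from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput

async def main():
    request_func = ASYNC_REQUEST_FUNCS["openai"]  # -> async_request_openai_completions
    result = await request_func(
        RequestFuncInput(
            prompt="Hello, world",
            api_url="http://0.0.0.0:8001/v1/completions",
            prompt_len=3,
            output_len=16,
            model="/workspace/llms/qwen2.5/Qwen2.5-VL-7B-Instruct/",
            ignore_eos=True,
        ))
    print(result.success, result.ttft, result.latency, len(result.itl))

asyncio.run(main())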
3_env_check-batch_onlinetests/scripts/benchmark_dataset.py
0 → 100644
This diff is collapsed.
3_env_check-batch_onlinetests/scripts/benchmark_serving.py
0 → 100644
This diff is collapsed.
3_env_check-batch_onlinetests/scripts/benchmark_utils.py
0 → 100644
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import math
import os
from typing import Any


def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    one metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get(
            "tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"][
                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]

        records.append(record)

    return records


class InfEncoder(json.JSONEncoder):

    def clear_inf(self, o: Any):
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)
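A short, hypothetical usage sketch for the helpers above: convert_to_pytorch_benchmark_format only emits records when SAVE_TO_PYTORCH_BENCHMARK_FORMAT is set, and write_to_json serializes them through InfEncoder so infinite floats are stored as the string "inf". The argparse namespace and metric values below are made up.

# Hypothetical usage sketch for benchmark_utils; the namespace and numbers are made up.
import argparse
import os
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

os.environ["SAVE_TO_PYTORCH_BENCHMARK_FORMAT"] = "1"  # otherwise no records are produced
args = argparse.Namespace(model="Qwen2.5-VL-7B", tensor_parallel_size=1)
records = convert_to_pytorch_benchmark_format(
    args=args,
    metrics={"mean_ttft_ms": [123.4], "total_token_throughput": [float("inf")]},
    extra_info={"tensor_parallel_size": 1},
)
write_to_json("benchmark_records.json", records)  # inf values are written as "inf"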
3_env_check-batch_onlinetests/scripts/entrypoint.sh
0 → 100644
#!/bin/bash

# Run the environment check
echo "==================== Starting system environment check ===================="
#/workspace/scripts/run_envcheck.sh

# Run the performance tests
echo "==================== Starting performance tests ===================="
/workspace/scripts/run_benchmark.sh

echo "==================== All tests completed ===================="
\ No newline at end of file
3_env_check-batch_onlinetests/scripts/run_benchmark.sh
0 → 100644
#!/bin/bash

# Initialize output directories
mkdir -p /workspace/test/inference_outputs/results
mkdir -p /workspace/test/inference_outputs/logs/server
mkdir -p /workspace/test/inference_outputs/logs/models

# Base port
BASE_PORT=8001

# Read the config file (semicolon-separated fields)
while IFS=';' read -r model_name model_path tp data_type batch_list prompt_pairs max_model_len gpu_mem_util; do
    # Clean up fields (strip whitespace and quotes)
    model_name=$(echo "$model_name" | xargs)
    model_path=$(echo "$model_path" | xargs)
    tp=$(echo "$tp" | xargs)
    data_type=$(echo "$data_type" | xargs)
    batch_list=$(echo "$batch_list" | tr -d '"' | xargs)
    prompt_pairs=$(echo "$prompt_pairs" | tr -d '()"' | xargs)
    max_model_len=$(echo "$max_model_len" | xargs)
    gpu_mem_util=$(echo "$gpu_mem_util" | xargs)

    # Allocate a port dynamically
    port=$((BASE_PORT++))

    # Generate server.sh
    cat > "/workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh" << EOF
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1

vllm serve "$model_path" --trust-remote-code \\
    --enable-prefix-caching \\
    --dtype $data_type \\
    --tensor-parallel-size $tp \\
    --max-model-len $max_model_len \\
    --port $port \\
    --gpu-memory-utilization $gpu_mem_util
EOF

    # Make it executable
    chmod +x "/workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh"
    echo "Generated server script for ${model_name}_tp${tp} at /workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh"

    # 1. Start the vLLM server and write its log to server.log
    /workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh > "/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log" 2>&1 &
    SERVER_PID=$!

    # 2. Improved log-based status check
    check_server_status() {
        local log_file=$1
        local server_pid=$2
        local success_msg="Starting vLLM API server on http://0.0.0.0"
        local error_patterns=("RuntimeError" "ValueError" "segmentation fault" "core dumped")

        # Check for the success message
        if grep -q "$success_msg" "$log_file"; then
            echo "✅ Server started successfully!"
            return 0
        fi

        # Check for error messages
        for pattern in "${error_patterns[@]}"; do
            if grep -i -q "$pattern" "$log_file"; then
                echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Error detected in logs (pattern: $pattern)!"
                echo "===== ERROR CONTEXT ====="
                grep -i -A 5 -B 5 "$pattern" "$log_file" | tail -n 20
                echo "========================="
                return 1
            fi
        done

        # Check whether the process is still alive
        if ! kill -0 $server_pid 2>/dev/null; then
            echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Server process died unexpectedly!"
            echo "===== LAST LOG LINES ====="
            tail -n 20 "$log_file"
            echo "========================="
            return 1
        fi

        # Default: keep waiting
        return 2
    }

    # 3. Wait for the server to start or fail
    echo -e "\n🔍 [$(date '+%Y-%m-%d %H:%M:%S')] Starting monitoring for ${model_name}_tp${tp} (PID: $SERVER_PID)"
    max_wait_seconds=20000
    start_time=$(date +%s)
    log_file="/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log"

    while true; do
        sleep 20  # check every 20 seconds

        check_server_status "$log_file" "$SERVER_PID"
        status=$?

        # Success
        if [ $status -eq 0 ]; then
            break
        fi

        # Failure
        if [ $status -eq 1 ]; then
            # Clean up resources
            kill $SERVER_PID 2>/dev/null
            pkill -f "vllm serve" 2>/dev/null
            echo "🛑 Cleaned up resources after failure"
            # Skip straight to the next model
            continue 2
        fi

        # Check for timeout
        current_time=$(date +%s)
        elapsed=$((current_time - start_time))
        if [ $elapsed -ge $max_wait_seconds ]; then
            echo -e "\n⏰ [$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for server to start!"
            # Clean up resources
            kill $SERVER_PID 2>/dev/null
            pkill -f "vllm serve" 2>/dev/null
            echo "🛑 Cleaned up resources after timeout"
            # Skip straight to the next model
            continue 2
        fi

        echo "Waiting... (${elapsed}s elapsed)"
    done

    # 4. Run the tests only after a successful start
    echo -e "\n🚀 [$(date '+%Y-%m-%d %H:%M:%S')] Running tests for ${model_name}_tp${tp}..."

    # Export the test environment variables
    export MODEL_NAME="$model_name"
    export MODEL_PATH="$model_path"
    export TP="$tp"
    export DATA_TYPE="$data_type"
    export BATCH_LIST="$batch_list"
    export PROMPT_PAIRS="$prompt_pairs"
    export PORT="$port"

    # Run the tests
    ./test.sh

    # 5. Clean up after the tests
    kill $SERVER_PID
    pkill -f "vllm serve" 2>/dev/null
    echo "✅ [$(date '+%Y-%m-%d %H:%M:%S')] ${model_name}_tp${tp} test completed and cleaned up"

done < <(grep -v '^#' ../configs/model_to_test.cfg | grep -v '^$')

echo -e "\n📊 [$(date '+%Y-%m-%d %H:%M:%S')] All tests completed. Results saved to results/"
\ No newline at end of file
3_env_check-batch_onlinetests/scripts/run_envcheck.sh
0 → 100644
This diff is collapsed.
3_env_check-batch_onlinetests/scripts/test.sh
0 → 100644
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7

# Read parameters from environment variables
model_name=${MODEL_NAME}
model_path=${MODEL_PATH}
tp=${TP}
data_type=${DATA_TYPE}
batch_list=${BATCH_LIST}
prompt_pairs=${PROMPT_PAIRS}
port=${PORT}

# Build the result file name
result_file="/workspace/test/inference_outputs/results/${model_name}_tp${tp}.csv"
echo "tp,data_type,batch,prompt_tokens,completion_tokens,TOTAL_THROUGHPUT(toks/s),generate_throughput(toks/s),TTFT(ms),TPOT(ms),ITL(ms)" > "$result_file"

# Convert the strings into arrays
IFS=' ' read -ra batches <<< "$batch_list"
IFS=',' read -ra pairs <<< "$prompt_pairs"

# Run the tests
for batch in "${batches[@]}"; do
    for pair in "${pairs[@]}"; do
        IFS=' ' read -r prompt_tokens completion_tokens <<< "$pair"

        log_file="/workspace/test/inference_outputs/logs/models/${model_name}_${tp}/batch_${batch}_prompt_${prompt_tokens}_completion_${completion_tokens}.log"
        mkdir -p "$(dirname "$log_file")"

        echo "Running: batch=$batch, prompt=$prompt_tokens, completion=$completion_tokens"

        python benchmark_serving.py \
            --backend openai \
            --port "$port" \
            --model "$model_path" \
            --trust-remote-code \
            --dataset-name random \
            --ignore-eos \
            --random-input-len "$prompt_tokens" \
            --random-output-len "$completion_tokens" \
            --num-prompts "$batch" \
            2>&1 | tee "$log_file"

        # Extract metrics
        TOTAL_THROUGHPUT=$(grep "^Total Token" "$log_file" | awk '{print $5}')
        GEN_THROUGHPUT=$(grep "^Output token" "$log_file" | awk '{print $5}')
        TTFT=$(grep "^Mean TTFT" "$log_file" | awk '{print $4}')
        TPOT=$(grep "^Mean TPOT" "$log_file" | awk '{print $4}')
        ITL=$(grep "^Mean ITL" "$log_file" | awk '{print $4}')

        echo "$tp,$data_type,$batch,$prompt_tokens,$completion_tokens,$TOTAL_THROUGHPUT,$GEN_THROUGHPUT,$TTFT,$TPOT,$ITL" >> "$result_file"
    done
done
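The CSV row above is filled by scraping the benchmark_serving.py console output with grep/awk. The sketch below is a hypothetical Python equivalent of that scrape; it assumes the same log-line prefixes and column positions that test.sh greps for, and is offered only as a reference if the parsing ever moves out of the shell.

# Hypothetical Python equivalent of the grep/awk metric scrape in test.sh.
# Assumes benchmark_serving.py log lines start with the same prefixes test.sh
# greps for; the field indices mirror the awk column numbers.
def scrape_metrics(log_path: str) -> dict:
    # prefix -> (metric name, 1-based whitespace-separated field index, as in awk)
    wanted = {
        "Total Token": ("total_throughput", 5),
        "Output token": ("generate_throughput", 5),
        "Mean TTFT": ("ttft_ms", 4),
        "Mean TPOT": ("tpot_ms", 4),
        "Mean ITL": ("itl_ms", 4),
    }
    metrics = {}
    with open(log_path) as f:
        for line in f:
            for prefix, (name, field_idx) in wanted.items():
                if line.startswith(prefix):
                    parts = line.split()
                    if len(parts) >= field_idx:
                        metrics[name] = parts[field_idx - 1]
    return metrics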
3_env_check-batch_onlinetests/start.sh
0 → 100644
docker build -t vllm-test1 . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
    -v /public/opendas/DL_DATA/llm-models:/workspace/llms/:ro \
    -v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/ \
    --ipc=host \
    --network=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    vllm-test1
\ No newline at end of file