Initial commit

8d4db4be · jerrrrry · 8d4db4be · 8d4db4be · 8d4db4be · 8d4db4be
Commit 8d4db4be authored Jun 05, 2025 by jerrrrry
19 changed files
--- a/1_env_check/Dockerfile
+++ b/1_env_check/Dockerfile
+# Use the official SourceFind base image.
+FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2
+
+# Install the basic diagnostic tools.
+# `file` is listed explicitly because the build-time verification step below
+# calls it; without the package that RUN step (and the whole build) fails.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    iproute2 \
+    dmidecode \
+    ipmitool \
+    git \
+    curl \
+    jq \
+    lshw \
+    iputils-ping \
+    pciutils \
+    file \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create the directory layout.
+RUN mkdir -p /workspace/scripts && \
+    mkdir -p /workspace/test/env_check_outputs
+
+# Copy the check script. No chmod is needed: it is always started via `bash`.
+COPY ./scripts/run_envcheck.sh /workspace/scripts/
+
+# Build-time sanity check: list the script, report its file type and print
+# its first line (the shebang).
+RUN ls -l /workspace/scripts/ && \
+    file /workspace/scripts/run_envcheck.sh && \
+    head -n 1 /workspace/scripts/run_envcheck.sh
+
+# Work from the script directory so the script can be started by name.
+WORKDIR /workspace/scripts/
+
+# Exec form: bash runs as the container's main process instead of being
+# wrapped in `/bin/sh -c "bash -c ..."`, so it receives stop signals directly.
+CMD ["bash", "run_envcheck.sh"]
\ No newline at end of file
--- a/1_env_check/scripts/run_envcheck.sh
+++ b/1_env_check/scripts/run_envcheck.sh
+#!/bin/bash
+set -eo pipefail  # 严格错误处理
+
+log_dir="/workspace/test/env_check_outputs"
+mkdir -p "$log_dir"
+
+echo "==================== 开始系统环境检查 ===================="
+
+# 基础检查函数
+run_test() {
+  local name=$1
+  shift
+  echo "[RUN] $name"
+  "$@" 2>&1 | tee "$log_dir/${name}.log" || {
+    echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
+    return 1
+  }
+}
+
+run_pipe_test() {
+  local name=$1
+  local cmd=$2
+  echo "[RUN] $name"
+  
+  bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log" || {
+    echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
+    return 1
+  }
+}
+
+# 系统基础检查
+run_test rocm_bandwidth_test rocm-bandwidth-test
+run_test hy_smi hy-smi
+run_test hy_smi_config hy-smi -c
+run_test pip_list pip list
+run_test cpu_info lscpu
+run_test cpu_cores nproc
+run_test memory_usage free -h
+run_test disk_usage df -h
+run_test hardware_info lshw -short || true
+run_test network_interfaces ip a
+run_test ibstat ibstat
+run_test ibdev2netdev ibdev2netdev
+run_pipe_test ACS_stat "lspci -vvv | grep -i acsct"
+run_test rocm_info rocminfo || true
+
+echo "==================== RCCL-TEST ===================="
+# Work inside the output directory; the rccl-tests checkout is created here.
+cd /workspace/test/env_check_outputs
+
+# The RCCL test needs both git (to fetch the sources) and make (to build them);
+# when either is missing the section is skipped and a marker log is written.
+if command -v git &>/dev/null && command -v make &>/dev/null; then
+  # Reuse an existing checkout; otherwise shallow-clone the master branch.
+  if [ ! -d rccl-tests ]; then
+    git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
+  fi
+  
+  cd rccl-tests || exit 1
+  # Sourced into the current shell, so whatever env.sh exports stays in effect
+  # for the rest of the script (presumably the DTK toolchain paths - confirm).
+  source /opt/dtk/env.sh
+  
+  # Build, then run all_reduce_perf twice (-g 8 and -g 4), one log per run.
+  # NOTE(review): with `set -eo pipefail` a failing all_reduce_perf run aborts
+  # the whole script here - confirm that this is intended.
+  if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
+       CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
+    ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/all_reduce_perf_8.log"
+    ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/all_reduce_perf_4.log"
+  else
+    # A build failure is recorded but does not stop the script.
+    echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
+  fi
+  # Return to the output directory for the next section.
+  cd ..
+else
+  echo "[WARN] 缺少git或make，跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
+fi
+
+echo "==================== DCU-ENV-CHECK ===================="
+# Fetch the DCU environment check tool unless a checkout already exists.
+if [ ! -d dcu_env_check ]; then
+  git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
+    echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
+    exit 1
+  }
+fi
+
+# Explicit if-blocks instead of `cd X && { ...; cd ..; } || { ... }`: in that
+# form the group's status is the status of the trailing `cd ..`, so a failing
+# system_check.sh was never reported.
+if ! cd dcu_env_check; then
+  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
+  exit 1
+fi
+
+dcu_status=0
+if ! bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"; then
+  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
+  dcu_status=1
+fi
+cd ..
+
+# The summary is printed even when the DCU check failed, so the collected
+# logs are still listed; the failure is reflected in the exit status instead.
+echo "==================== 检查完成 ===================="
+echo "所有日志已保存至: $log_dir"
+ls -lh "$log_dir"
+exit "$dcu_status"
\ No newline at end of file
--- a/1_env_check/start.sh
+++ b/1_env_check/start.sh
+docker build -t env_check . && \
+docker run \
+-v /usr/local/hyhal:/usr/local/hyhal:ro \
+-v /opt/hyhal:/opt/hyhal:ro   \
+-v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/  \
+--ipc=host \
+--cap-add=SYS_PTRACE \
+--group-add video \
+--ulimit memlock=-1:-1 \
+--privileged \
+--device=/dev/kfd \
+--device=/dev/mkfd \
+--device=/dev/dri \
+--shm-size=500G \
+-u root \
+--security-opt seccomp=unconfined \
+env_check \
\ No newline at end of file
--- a/2_env_check&model_download&llm_inference/Dockerfile
+++ b/2_env_check&model_download&llm_inference/Dockerfile
+# Use the official SourceFind base image.
+FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2
+
+# Install the basic tools.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    iproute2 \
+    dmidecode \
+    ipmitool \
+    git \
+    curl \
+    jq \
+    lshw \
+    iputils-ping \
+    pciutils \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create the directory layout (a single mkdir -p handles all paths).
+RUN mkdir -p \
+    /workspace/scripts \
+    /workspace/configs \
+    /workspace/test/env_check_outputs \
+    /workspace/test/inference_outputs \
+    /workspace/test/models
+
+# Copy the scripts and configuration files.
+COPY ./scripts/* /workspace/scripts/
+COPY ./configs/* /workspace/configs/
+RUN chmod +x /workspace/scripts/*
+# NOTE(review): this glob has no slash before `*`, so it matches the
+# /workspace/configs directory itself rather than the files inside it.
+# Confirm whether `/workspace/configs/*` was intended (or whether the step
+# is needed at all).
+RUN chmod +x /workspace/configs*
+
+# Work from the script directory so the entrypoint can be started by name.
+WORKDIR /workspace/scripts/
+
+# Exec form: bash runs as the container's main process instead of being
+# wrapped in `/bin/sh -c "bash -c ..."`, so it receives stop signals directly.
+CMD ["bash", "entrypoint.sh"]
\ No newline at end of file
--- a/2_env_check&model_download&llm_inference/configs/download-list.cfg
+++ b/2_env_check&model_download&llm_inference/configs/download-list.cfg
+# 格式: 模型ID;本地保存路径
+#模型ID为modelscope官网指定的id
+Qwen/Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B
\ No newline at end of file
--- a/2_env_check&model_download&llm_inference/configs/model_to_test.cfg
+++ b/2_env_check&model_download&llm_inference/configs/model_to_test.cfg
+# 格式说明:
+# 模型名称;模型路径;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
+#模型路径为docker容器内的路径
+# 多个值用逗号分隔
+
+
+Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1;512;512;float16;32768;0.95
--- a/2_env_check&model_download&llm_inference/scripts/benchmark_throughput.py
+++ b/2_env_check&model_download&llm_inference/scripts/benchmark_throughput.py
+# SPDX-License-Identifier: Apache-2.0
+"""Benchmark offline inference throughput."""
+import argparse
+import dataclasses
+import json
+import random
+import time
+from functools import cache
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import uvloop
+from PIL import Image
+from tqdm import tqdm
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          PreTrainedTokenizerBase)
+
+
+from vllm.inputs import PromptType
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
+from vllm.inputs import TextPrompt
+from vllm.lora.request import LoRARequest
+from vllm.lora.utils import get_adapter_absolute_path
+from vllm.multimodal import MultiModalDataDict
+from vllm.sampling_params import BeamSearchParams
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+
+
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Instances are plain dataclass objects: fields are read by attribute
+    name; the class defines no iteration or indexing support.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images). Defaults to None.
+        lora_request: Optional LoRARequest specifying the LoRA to use.
+            Defaults to None.
+    """
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    multi_modal_data: Optional[MultiModalDataDict] = None
+    lora_request: Optional[LoRARequest] = None
+
+
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+    """Prepend and append special tokens around the question to form a prompt.
+
+    Args:
+        question: The input question text to wrap with special tokens
+        model: The name of the model being used, to determine which special
+            tokens to add
+
+    Returns:
+        The formatted prompt string with appropriate special tokens for the
+            model
+
+    Raises:
+        ValueError: If an unsupported model name is provided
+    """
+    model = model.lower()
+    if "pixtral" in model:
+        return f"<s>[INST]{question}\n[IMG][/INST]"
+    raise ValueError(f"Unsupported model {model}")
+
+
+@cache
+def lora_path_on_disk(lora_path: str) -> str:
+    """Return ``get_adapter_absolute_path(lora_path)``.
+
+    The result is memoised per ``lora_path`` by ``functools.cache``, so the
+    underlying lookup runs once for each distinct argument.
+    """
+    return get_adapter_absolute_path(lora_path)
+
+
+lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
+
+
+def get_random_lora_request(
+        args: argparse.Namespace
+) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
+    global lora_tokenizer_cache
+    lora_id = random.randint(1, args.max_loras)
+    lora_request = LoRARequest(lora_name=str(lora_id),
+                               lora_int_id=lora_id,
+                               lora_path=lora_path_on_disk(args.lora_path))
+    if lora_id not in lora_tokenizer_cache:
+        lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
+    return lora_request, lora_tokenizer_cache[lora_id]
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+
+    dataset_path: str = args.dataset
+    num_requests: int = args.num_prompts
+    fixed_output_len: Optional[int] = args.output_len
+    model: str = args.model
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[SampleRequest] = []
+    for data in tqdm(dataset,
+                     total=len(filtered_dataset),
+                     desc="sampling requests"):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Only keep the first two turns of each conversation.
+        prompt = data["conversations"][0]["value"]
+        completion = data["conversations"][1]["value"]
+
+        multi_modal_data: Optional[MultiModalDataDict] = None
+        if "image" in data:
+            multi_modal_data = multi_modal_data or {}
+            image_path = data["image"]
+            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+            assert isinstance(image_path,
+                              str), "Only support single image input"
+            try:
+                multi_modal_data["image"] = Image.open(image_path).convert(
+                    "RGB")
+            except FileNotFoundError:
+                # Ignore datapoint where asset is missing
+                continue
+            prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
+        request_tokenizer = tokenizer
+        lora_request: Optional[LoRARequest] = None
+        if args.enable_lora:
+            lora_request, lora_tokenizer = get_random_lora_request(args)
+            if lora_tokenizer:
+                request_tokenizer = lora_tokenizer
+
+        # Tokenize the prompts and completions.
+        prompt_token_ids = request_tokenizer(prompt).input_ids
+        completion_token_ids = request_tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+        filtered_dataset.append(
+            SampleRequest(prompt=prompt,
+                          prompt_len=prompt_len,
+                          expected_output_len=output_len,
+                          multi_modal_data=multi_modal_data,
+                          lora_request=lora_request))
+
+    return filtered_dataset
+
+
+def run_vllm(
+    requests_json: List[SampleRequest],
+    n: int,
+    num_iters_warmup: int,
+    engine_args: EngineArgs,
+
+) -> float:
+    from vllm import LLM, SamplingParams
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    # warmup
+    warmup_sampling_params = SamplingParams(
+        n=args.n,
+        temperature=1.0,
+        top_p=1.0,
+        ignore_eos=True,
+        max_tokens=10,
+    )
+    dummy_prompt_token_ids = np.random.randint(10000, size=(1,10))
+    dummy_prompts: List[PromptType] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+    
+    print("Warming up...")
+    for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
+        llm.generate(dummy_prompts,
+                        sampling_params=warmup_sampling_params,
+                        use_tqdm=False)
+
+    info_json={}
+    for ELEprompt in args.num_prompts:
+        for ELEinput,ELEoutput  in zip(args.input_len,args.output_len):
+            info={}
+            requests=requests_json["{}_{}_{}".format(ELEprompt,ELEinput,ELEoutput)]
+
+            # Add the requests to the engine.
+            prompts: List[TextPrompt] = []
+            sampling_params: List[SamplingParams] = []
+            for request in requests:
+                prompts.append(
+                    TextPrompt(prompt=request.prompt,
+                            multi_modal_data=request.multi_modal_data))
+                sampling_params.append(
+                    SamplingParams(
+                        n=n,
+                        temperature=1.0,
+                        top_p=1.0,
+                        ignore_eos=True,
+                        max_tokens=request.expected_output_len,
+                    ))
+            lora_requests: Optional[List[LoRARequest]] = None
+            if engine_args.enable_lora:
+                lora_requests = [request.lora_request for request in requests]
+
+
+
+            use_beam_search = False
+
+            if not use_beam_search:
+                start = time.perf_counter()
+                real_output=llm.generate(prompts,
+                            sampling_params,
+                            lora_request=lora_requests,
+                            use_tqdm=True)
+                end = time.perf_counter()
+            else:
+                assert lora_requests is None, "BeamSearch API does not support LoRA"
+                prompts = [request.prompt for request in requests]
+                # output_len should be the same for all requests.
+                output_len = requests[0][2]
+                for request in requests:
+                    assert request.expected_output_len == output_len
+
+                start = time.perf_counter()
+                real_output = llm.beam_search(
+                    prompts,
+                    BeamSearchParams(
+                        beam_width=n,
+                        max_tokens=output_len,
+                        ignore_eos=True,
+                    ))
+                end = time.perf_counter()
+            total_ttfts = []
+            total_tpops = []
+            total_output_token_throughput = []
+            total_inout_token_throughput = []
+
+            for output in real_output:
+
+                ttft_ = output.metrics.first_token_time - output.metrics.arrival_time
+                tpop_ = (output.metrics.finished_time - output.metrics.arrival_time - ttft_) / (ELEoutput-1)
+                output_token_throughput = (ELEoutput) / (output.metrics.finished_time - output.metrics.arrival_time)
+                inout_token_throughput = (ELEoutput + ELEinput) / (output.metrics.finished_time - output.metrics.arrival_time)
+                total_ttfts.append(ttft_)
+                total_tpops.append(tpop_)
+                total_output_token_throughput.append(output_token_throughput)
+                total_inout_token_throughput.append(inout_token_throughput)
+
+
+            total_num_tokens = sum(request.prompt_len + request.expected_output_len
+                for request in requests)
+            total_output_tokens = sum(request.expected_output_len
+                for request in requests)
+            # ttft_mean = np.mean(total_ttfts)
+            # ttft_median = np.median(total_ttfts or 0)
+            # ttft_p99 = np.percentile(total_ttfts or 0, 99)
+
+            # tpop_mean = np.mean(total_tpops)
+            # tpop_median = np.median(total_tpops or 0)
+            # tpop_p99 = np.percentile(total_tpops or 0, 99)
+
+            # output_token_throughput_mean = np.mean(total_output_token_throughput)
+            # output_token_throughput_median = np.median(total_output_token_throughput or 0)
+            # output_token_throughput_p99 = np.percentile(total_output_token_throughput or 0, 99)
+
+            # inout_token_throughput_mean = np.mean(total_inout_token_throughput)
+            # inout_token_throughput_median = np.median(total_inout_token_throughput or 0)
+            # inout_token_throughput_p99 = np.percentile(total_inout_token_throughput or 0, 99)
+
+            info["elapsed_time"] = np.around(end - start,2)
+            info["Throughput"] = np.around(len(requests) / info['elapsed_time'],2)
+            info["total_tokens"] = np.around(total_num_tokens / info['elapsed_time'],2)
+            info["output_tokens"] = np.around(total_output_tokens / info['elapsed_time'],2)
+
+
+            info["ttft_mean"] = np.around(np.mean(total_ttfts),5)
+            info["ttft_median"] = np.around(np.median(total_ttfts or 0),5)
+            info["ttft_p99"] = np.around(np.percentile(total_ttfts or 0, 99),5)
+
+            info["tpop_mean"] = np.around(np.mean(total_tpops),4)
+            info["tpop_median"] = np.around(np.median(total_tpops or 0),5)
+            info["tpop_p99"] = np.around(np.percentile(total_tpops or 0, 99),5)
+
+            info["output_token_throughput_mean"]  = np.around(np.mean(total_output_token_throughput),2)
+            info["output_token_throughput_median"]  = np.around(np.median(total_output_token_throughput or 0),2)
+            info["output_token_throughput_p99"]  = np.around(np.percentile(total_output_token_throughput or 0, 99),2)
+
+            info["inout_token_throughput_mean"] = np.around(np.mean(total_inout_token_throughput),2)
+            info["inout_token_throughput_median"] = np.around(np.median(total_inout_token_throughput or 0),2)
+            info["inout_token_throughput_p99"] = np.around(np.percentile(total_inout_token_throughput or 0, 99),2)
+
+            
+            info_json["{}_{}_{}".format(ELEprompt,ELEinput,ELEoutput)] = info
+            print("promt:{},input:{},output:{}".format(ELEprompt,ELEinput,ELEoutput))
+            print(f"Latency: {info['elapsed_time']:.2f} s")
+            print(f"Throughput: {len(requests) / info['elapsed_time']:.2f} requests/s, "
+                f"{total_num_tokens / info['elapsed_time']:.2f} total tokens/s, "
+                f"{total_output_tokens / info['elapsed_time']:.2f} output tokens/s")
+            print("==============================================")
+            print(f"total_out_tokens: {total_output_tokens: .2f} tokens")
+            print(f"elapsed_time: {info['elapsed_time']: .2f} s")      # 总耗时
+            print(f"TTFT_mean: {info['ttft_mean']*1000: .2f} ms")       # 首字延时
+            print(f"ttft_p99: {info['ttft_p99']*1000: .2f} ms")
+            print(f"ttft_median: {info['ttft_median']*1000: .2f} ms")
+            print(f"TPOP_mean: {info['tpop_mean']*1000: .2f} ms")              # 单字decode时间
+            print(f"tpop_median: {info['tpop_median']*1000: .2f} ms")
+            print(f"tpop_p99: {info['tpop_p99']*1000: .2f} ms")
+            print(f"output_token_throughput_mean: {info['output_token_throughput_mean']:.2f} tokens/s")           # 单路生成吞吐
+            print(f"output_token_throughput_median: {info['output_token_throughput_median']:.2f} tokens/s")
+            print(f"output_token_throughput_p99: {info['output_token_throughput_p99']:.2f} tokens/s")
+            print(f"inout_token_throughput_mean: {info['inout_token_throughput_mean']:.2f} tokens/s")           # 单路总吞吐
+            print(f"tinout_token_throughput_median: {info['inout_token_throughput_median']:.2f} tokens/s")
+            print(f"inout_token_throughput_p99: {info['inout_token_throughput_p99']:.2f} tokens/s")
+            print("==============================================")
+            print("\n")
+
+    return info_json
+
+async def run_vllm_async(
+    requests: List[SampleRequest],
+    n: int,
+    engine_args: AsyncEngineArgs,
+    disable_frontend_multiprocessing: bool = False,
+) -> float:
+    from vllm import SamplingParams
+
+    async with build_async_engine_client_from_engine_args(
+            engine_args, disable_frontend_multiprocessing) as llm:
+
+        # Add the requests to the engine.
+        prompts: List[TextPrompt] = []
+        sampling_params: List[SamplingParams] = []
+        lora_requests: List[Optional[LoRARequest]] = []
+        for request in requests:
+            prompts.append(
+                TextPrompt(prompt=request.prompt,
+                           multi_modal_data=request.multi_modal_data))
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=1.0,
+                    top_p=1.0,
+                    ignore_eos=True,
+                    max_tokens=request.expected_output_len,
+                ))
+            lora_requests.append(request.lora_request)
+
+        generators = []
+        start = time.perf_counter()
+        for i, (prompt, sp,
+                lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
+            generator = llm.generate(prompt,
+                                     sp,
+                                     lora_request=lr,
+                                     request_id=f"test{i}")
+            generators.append(generator)
+        all_gens = merge_async_iterators(*generators)
+        async for i, res in all_gens:
+            pass
+        end = time.perf_counter()
+        return end - start
+
+
+def run_hf(
+    requests: List[SampleRequest],
+    model: str,
+    tokenizer: PreTrainedTokenizerBase,
+    n: int,
+    max_batch_size: int,
+    trust_remote_code: bool,
+) -> float:
+    llm = AutoModelForCausalLM.from_pretrained(
+        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    if llm.config.model_type == "llama":
+        # To enable padding in the HF backend.
+        tokenizer.pad_token = tokenizer.eos_token
+    llm = llm.cuda()
+
+    pbar = tqdm(total=len(requests))
+    start = time.perf_counter()
+    batch: List[str] = []
+    max_prompt_len = 0
+    max_output_len = 0
+    for i in range(len(requests)):
+        prompt, prompt_len, output_len = requests[i]
+        # Add the prompt to the batch.
+        batch.append(prompt)
+        max_prompt_len = max(max_prompt_len, prompt_len)
+        max_output_len = max(max_output_len, output_len)
+        if len(batch) < max_batch_size and i != len(requests) - 1:
+            # Check if we can add more requests to the batch.
+            _, next_prompt_len, next_output_len = requests[i + 1]
+            if (max(max_prompt_len, next_prompt_len) +
+                    max(max_output_len, next_output_len)) <= 2048:
+                # We can add more requests to the batch.
+                continue
+
+        # Generate the sequences.
+        input_ids = tokenizer(batch, return_tensors="pt",
+                              padding=True).input_ids
+        llm_outputs = llm.generate(
+            input_ids=input_ids.cuda(),
+            do_sample=True,
+            num_return_sequences=n,
+            temperature=1.0,
+            top_p=1.0,
+            use_cache=True,
+            max_new_tokens=max_output_len,
+        )
+        # Include the decoding time.
+        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+        pbar.update(len(batch))
+
+        # Clear the batch.
+        batch = []
+        max_prompt_len = 0
+        max_output_len = 0
+    end = time.perf_counter()
+    return end - start
+
+
+def run_mii(
+    requests: List[SampleRequest],
+    model: str,
+    tensor_parallel_size: int,
+    output_len: int,
+) -> float:
+    from mii import client, serve
+    llm = serve(model, tensor_parallel=tensor_parallel_size)
+    prompts = [request.prompt for request in requests]
+
+    start = time.perf_counter()
+    llm.generate(prompts, max_new_tokens=output_len)
+    end = time.perf_counter()
+    client = client(model)
+    client.terminate_server()
+    return end - start
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    if args.dataset is None:
+        vocab_size = tokenizer.vocab_size
+        requests_json={}
+        for ELEprompt in args.num_prompts:
+            for ELEinput,ELEoutput  in zip(args.input_len,args.output_len):
+                requests = []
+                for _ in range(ELEprompt):
+
+                    request_tokenizer = tokenizer
+                    lora_request: Optional[LoRARequest] = None
+                    if args.enable_lora:
+                        lora_request, lora_tokenizer = get_random_lora_request(args)
+                        if lora_tokenizer:
+                            request_tokenizer = lora_tokenizer
+
+                    # Synthesize a prompt with the given input length.
+                    candidate_ids = [
+                        random.randint(0, vocab_size - 1)
+                        for _ in range(ELEinput)
+                    ]
+                    # As tokenizer may add additional tokens like BOS, we need to try
+                    # different lengths to get the desired input length.
+                    for _ in range(5):  # Max attempts to correct
+                        candidate_prompt = request_tokenizer.decode(candidate_ids)
+                        tokenized_len = len(request_tokenizer.encode(candidate_prompt))
+
+                        if tokenized_len == ELEinput:
+                            break
+
+                        # Adjust length based on difference
+                        diff = ELEinput - tokenized_len
+                        if diff > 0:
+                            candidate_ids.extend([
+                                random.randint(100, vocab_size - 100)
+                                for _ in range(diff)
+                            ])
+                        else:
+                            candidate_ids = candidate_ids[:diff]
+                    requests.append(
+                        SampleRequest(prompt=candidate_prompt,
+                                    prompt_len=ELEinput,
+                                    expected_output_len=ELEoutput,
+                                    lora_request=lora_request))
+                requests_json["{}_{}_{}".format(ELEprompt,ELEinput,ELEoutput)]=requests
+    else:
+        requests = sample_requests(tokenizer, args)
+
+    is_multi_modal = any(request.multi_modal_data is not None
+                         for request in requests)
+    if args.backend == "vllm":
+        if args.async_engine:
+            elapsed_time = uvloop.run(
+                run_vllm_async(
+                    requests,
+                    args.n,
+                    AsyncEngineArgs.from_cli_args(args),
+                    args.disable_frontend_multiprocessing,
+                ))
+        else:
+            info_json = run_vllm(requests_json, args.n, args.num_iters_warmup,
+                                    EngineArgs.from_cli_args(args))
+    elif args.backend == "hf":
+        assert args.tensor_parallel_size == 1
+        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
+                              args.hf_max_batch_size, args.trust_remote_code)
+    elif args.backend == "mii":
+        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
+                               args.output_len)
+    else:
+        raise ValueError(f"Unknown backend: {args.backend}")
+
+    # file_name=args.model.rsplit("/")[-1]+"-tp"+str(args.tensor_parallel_size)+".txt"
+
+    if is_multi_modal:
+        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+                "following metrics are not accurate because image tokens are not"
+                " counted. See vllm-project/vllm/issues/9778 for details.")    
+    with open(args.output_json,"w") as f:
+        title="bs_in_out"
+        data_keys=info_json[list(info_json.keys())[0]].keys()
+        keys_string = ','.join(data_keys)
+        title=title+","+keys_string
+        f.write(title)
+        f.write("\n")
+        for key, value in info_json.items():
+            values_as_strings = [str(value) for value in info_json[key].values()]
+            values_string = ','.join(values_as_strings)
+            key=key+","+values_string
+            f.writelines(key)
+            f.write("\n")
+        # json.dump(info_json, f, indent=4)
+    # Output JSON results if specified
+    # if args.output_json:
+    #     results = {
+    #         "elapsed_time": elapsed_time,
+    #         "num_requests": len(requests),
+    #         "total_num_tokens": total_num_tokens,
+    #         "requests_per_second": len(requests) / elapsed_time,
+    #         "tokens_per_second": total_num_tokens / elapsed_time,
+    #     }
+    #     with open(args.output_json, "w") as f:
+    #         json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii"],
+                        default="vllm")
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset. The dataset is expected to "
+                        "be a json in form of List[Dict[..., conversations: "
+                        "List[Dict[..., value: <prompt_or_response>]]]]")
+    parser.add_argument("--input-len",
+                        type=int,
+                        nargs="*",
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        nargs="*",
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=1,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        nargs="*",
+                        default=1000,
+                        help="Number of prompts to process.")
+    parser.add_argument("--hf-max-batch-size",
+                        type=int,
+                        default=None,
+                        help="Maximum batch size for HF backend.")
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument("--async-engine",
+                        action='store_true',
+                        default=False,
+                        help="Use vLLM async engine rather than LLM class.")
+    parser.add_argument("--disable-frontend-multiprocessing",
+                        action='store_true',
+                        default=False,
+                        help="Disable decoupled async engine frontend.")
+    # LoRA
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        default=None,
+        help="Path to the lora adapters to use. This can be an absolute path, "
+        "a relative path, or a Hugging Face model identifier.")
+
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    if args.dataset is None:
+        assert args.input_len is not None
+        assert args.output_len is not None
+    else:
+        assert args.input_len is None
+    if args.enable_lora:
+        assert args.lora_path is not None
+
+    if args.backend == "vllm":
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+    elif args.backend == "hf":
+        if args.hf_max_batch_size is None:
+            raise ValueError("HF max batch size is required for HF backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
+    elif args.backend == "mii":
+        if args.dtype != "auto":
+            raise ValueError("dtype must be auto for MII backend.")
+        if args.n != 1:
+            raise ValueError("n must be 1 for MII backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+        if args.tokenizer != args.model:
+            raise ValueError("Tokenizer must be the same as the model for MII "
+                             "backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
+    main(args)
+
+
--- a/2_env_check&model_download&llm_inference/scripts/download_model.sh
+++ b/2_env_check&model_download&llm_inference/scripts/download_model.sh
+#!/bin/bash
+
+# ModelScope CLI批量下载脚本
+# 使用说明: ./ms_download.sh -f 模型列表.cfg [-F 强制重新下载]
+pip install modelscope -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+# 参数解析
+CONFIG_FILE=""
+FORCE_DOWNLOAD=false
+MODELSCOPE_CMD="modelscope download"
+
+while getopts "f:F" opt; do
+  case $opt in
+    f) CONFIG_FILE="$OPTARG" ;;
+    F) FORCE_DOWNLOAD=true ;;
+    *) echo "Usage: $0 -f config.cfg [-F]" >&2
+       exit 1
+  esac
+done
+
+# 检查配置文件
+if [ ! -f "$CONFIG_FILE" ]; then
+  echo "Error: Config file $CONFIG_FILE not found!" >&2
+  exit 1
+fi
+
+# 检查modelscope是否安装
+if ! command -v modelscope &> /dev/null; then
+  echo "Error: modelscope CLI not installed. Please install with: pip install modelscope" >&2
+  exit 1
+fi
+
+# 读取配置文件
+TOTAL=0
+SUCCESS=0
+FAILED=0
+
+echo "=== Starting batch download ==="
+
+while IFS=';' read -r model_id local_dir || [[ -n "$model_id" ]]; do
+  # 跳过空行和注释
+  [[ -z "$model_id" || "$model_id" =~ ^# ]] && continue
+  
+  ((TOTAL++))
+  
+  # 清理变量
+  model_id=$(echo "$model_id" | xargs)
+  local_dir=$(echo "$local_dir" | xargs)
+  
+  echo -e "\n[Progress] $TOTAL. Downloading $model_id"
+  echo "[Location] $local_dir"
+
+  # 检查目录是否存在
+  if [ "$FORCE_DOWNLOAD" = false ] && [ -d "$local_dir" ]; then
+    echo "[Status] Skipped (already exists)"
+    ((SUCCESS++))
+    continue
+  fi
+
+  # 创建目录
+  mkdir -p "$local_dir" || {
+    echo "[Error] Failed to create directory $local_dir" >&2
+    ((FAILED++))
+    continue
+  }
+
+  # 执行下载命令
+  if $MODELSCOPE_CMD --model "$model_id" --local_dir "$local_dir"; then
+    echo "[Status] Download successful"
+    ((SUCCESS++))
+  else
+    echo "[Error] Download failed" >&2
+    ((FAILED++))
+    # 删除空目录防止残留
+    rmdir "$local_dir" 2>/dev/null
+  fi
+
+done < "$CONFIG_FILE"
+
+# 结果统计
+echo -e "\n=== Download summary ==="
+echo "Total:    $TOTAL"
+echo "Success:  $SUCCESS"
+echo "Failed:   $FAILED"
+
+# 退出状态
+if [ "$FAILED" -gt 0 ]; then
+  exit 1
+else
+  exit 0
+fi
\ No newline at end of file
--- a/2_env_check&model_download&llm_inference/scripts/entrypoint.sh
+++ b/2_env_check&model_download&llm_inference/scripts/entrypoint.sh
+#!/bin/bash
+
+# 执行环境检查
+echo "==================== 开始系统环境检查 ===================="
+/workspace/scripts/run_envcheck.sh
+
+# 下载模型
+echo "==================== 开始模型下载 ===================="
+/workspace/scripts/download_model.sh -f /workspace/configs/download-list.cfg
+
+# 运行性能测试
+echo "==================== 开始性能测试 ===================="
+/workspace/scripts/run_benchmark.sh 
+
+echo "==================== 所有测试完成 ===================="
\ No newline at end of file
--- a/2_env_check&model_download&llm_inference/scripts/run_benchmark.sh
+++ b/2_env_check&model_download&llm_inference/scripts/run_benchmark.sh
+export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export ROCBLAS_COMPUTETYPE_FP16R=0
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_LAUNCH_MODE=GROUP
+export NCCL_NCHANNELS_PER_PEER=16
+export NCCL_MAX_NCHANNELS=16
+export NCCL_MIN_NCHANNELS=16
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_P2P_LEVEL=SYS
+export NCCL_NET_GDR_LEVEL=7
+export NCCL_NET_GDR_READ=1
+export RCCL_SDMA_COPY_ENABLE=0
+export SENDRECV_STREAM_WITH_COMPUTE=1
+export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
+export ALLREDUCE_STREAM_WITH_COMPUTE=1
+export ALLREDUCE_STREAM_WITH_COMPUTE=1
+export NCCL_MIN_NCHANNELS=16
+export NCCL_MAX_NCHANNELS=16
+export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
+
+export VLLM_RPC_TIMEOUT=100000
+
+
+
+#!/bin/bash
+
+# 模型配置文件路径
+MODELS_CONFIG="/workspace/configs/model_to_test.cfg"
+
+# 结果目录
+RESULTS_DIR="/workspace/test/inference_outputs"
+
+# 读取配置文件，跳过注释和空行
+while IFS= read -r line || [[ -n "$line" ]]; do
+    # 跳过注释行和空行
+    if [[ "$line" =~ ^# ]] || [[ -z "$line" ]]; then
+        continue
+    fi
+    
+    # 解析配置行
+    IFS=';' read -ra CONFIG <<< "$line"
+    
+    model_name="${CONFIG[0]}"
+    model_path="${CONFIG[1]}"
+    tp="${CONFIG[2]}"
+    batch="${CONFIG[3]//,/ }"        # 将逗号替换为空格
+    prompt_tokens="${CONFIG[4]//,/ }"
+    completion_tokens="${CONFIG[5]//,/ }"
+    dtype="${CONFIG[6]}"
+    max_model_len="${CONFIG[7]}"
+    gpu_memory_utilization="${CONFIG[8]}"
+    
+    echo "开始测试模型: $model_name"
+    echo "模型路径: $model_path"
+    echo "参数配置:"
+    echo "  tensor_parallel_size: $tp"
+    echo "  batch_sizes: $batch"
+    echo "  prompt_tokens: $prompt_tokens"
+    echo "  completion_tokens: $completion_tokens"
+    echo "  dtype: $dtype"
+    echo "  max_model_len: $max_model_len"
+    echo "  gpu_memory_utilization: $gpu_memory_utilization"
+    
+    # 创建模型专属结果目录
+    model_result_dir="${RESULTS_DIR}/${model_name}"
+    mkdir -p "$model_result_dir"
+    
+    # 运行基准测试
+    python /workspace/scripts/benchmark_throughput.py \
+        --model "$model_path" \
+        --tensor-parallel-size "$tp" \
+        --num-prompts $batch \
+        --input-len $prompt_tokens \
+        --output-len $completion_tokens \
+        --dtype "$dtype" \
+        --trust-remote-code \
+        --max-model-len "$max_model_len" \
+        --gpu-memory-utilization "$gpu_memory_utilization" \
+        --output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
+        2>&1 | tee "${model_result_dir}/${model_name}_tp${tp}.log"
+    
+    echo "完成测试模型: $model_name"
+    echo "结果保存在: $model_result_dir"
+    echo "----------------------------------------"
+done < "$MODELS_CONFIG"
\ No newline at end of file
--- a/2_env_check&model_download&llm_inference/scripts/run_envcheck.sh
+++ b/2_env_check&model_download&llm_inference/scripts/run_envcheck.sh
+#!/bin/bash
+set -eo pipefail  # 严格错误处理
+
+log_dir="/workspace/test/env_check_outputs"
+mkdir -p "$log_dir"
+
+echo "==================== 开始系统环境检查 ===================="
+
+# 基础检查函数
+run_test() {
+  local name=$1
+  shift
+  echo "[RUN] $name"
+  "$@" 2>&1 | tee "$log_dir/${name}.log" || {
+    echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
+    return 1
+  }
+}
+
+run_pipe_test() {
+  local name=$1
+  local cmd=$2
+  echo "[RUN] $name"
+  
+  bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log" || {
+    echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
+    return 1
+  }
+}
+
+# 系统基础检查
+run_test rocm_bandwidth_test rocm-bandwidth-test
+run_test hy_smi hy-smi
+run_test hy_smi_config hy-smi -c
+run_test pip_list pip list
+run_test cpu_info lscpu
+run_test cpu_cores nproc
+run_test memory_usage free -h
+run_test disk_usage df -h
+run_test hardware_info lshw -short || true
+run_test network_interfaces ip a
+run_test ibstat ibstat
+run_test ibdev2netdev ibdev2netdev
+run_pipe_test ACS_stat "lspci -vvv | grep -i acsct"
+run_test rocm_info rocminfo || true
+
+echo "==================== RCCL-TEST ===================="
+cd /workspace/test/env_check_outputs
+
+if command -v git &>/dev/null && command -v make &>/dev/null; then
+  if [ ! -d rccl-tests ]; then
+    git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
+  fi
+  
+  cd rccl-tests || exit 1
+  source /opt/dtk/env.sh
+  
+  if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
+       CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
+    ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/all_reduce_perf_8.log"
+    ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/all_reduce_perf_4.log"
+  else
+    echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
+  fi
+  cd ..
+else
+  echo "[WARN] 缺少git或make，跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
+fi
+
+echo "==================== DCU-ENV-CHECK ===================="
+if [ ! -d dcu_env_check ]; then
+  git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
+    echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
+    exit 1
+  }
+fi
+
+cd dcu_env_check && {
+  bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"
+  cd ..
+} || {
+  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
+  exit 1
+}
+
+echo "==================== 检查完成 ===================="
+echo "所有日志已保存至: $log_dir"
+ls -lh "$log_dir"
\ No newline at end of file
--- a/2_env_check&model_download&llm_inference/start.sh
+++ b/2_env_check&model_download&llm_inference/start.sh
+docker build -t vllm-test1 . && \
+docker run \
+-v /usr/local/hyhal:/usr/local/hyhal:ro \
+-v /opt/hyhal:/opt/hyhal:ro   \
+-v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/  \
+-v $PWD/outputs/models:/workspace/test/models/  \
+-v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/  \
+--ipc=host \
+--cap-add=SYS_PTRACE \
+--group-add video \
+--ulimit memlock=-1:-1 \
+--privileged \
+--device=/dev/kfd \
+--device=/dev/mkfd \
+--device=/dev/dri \
+--shm-size=500G \
+-u root \
+--security-opt seccomp=unconfined \
+vllm-test1 \
\ No newline at end of file
--- a/3_env_check&batches_llm_inference/Dockerfile
+++ b/3_env_check&batches_llm_inference/Dockerfile
+# Base image from the SourceFind registry (tag: vLLM 0.8.5, Ubuntu 22.04,
+# DTK 25.04).
+FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2
+
+# Install the basic system tools.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    iproute2 \
+    dmidecode \
+    ipmitool \
+    git \
+    curl \
+    jq \
+    lshw \
+    iputils-ping \
+    pciutils \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create the directory layout.
+RUN mkdir -p /workspace/scripts && \
+    mkdir -p /workspace/configs && \
+    mkdir -p /workspace/test/env_check_outputs && \
+    mkdir -p /workspace/test/inference_outputs
+
+# Copy scripts and configs.  Only the scripts are made executable: the
+# config files are read as data, and the pattern `/workspace/configs*`
+# matches the directory itself rather than the files inside it.
+COPY ./scripts/* /workspace/scripts/
+COPY ./configs/* /workspace/configs/
+RUN chmod +x /workspace/scripts/*
+
+# Work from the script directory so the entrypoint is found without cd.
+WORKDIR /workspace/scripts/
+
+# Exec form: run the entrypoint directly, without nested shells.
+CMD ["bash", "entrypoint.sh"]
\ No newline at end of file
--- a/3_env_check&batches_llm_inference/configs/model_to_test.cfg
+++ b/3_env_check&batches_llm_inference/configs/model_to_test.cfg
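+# Format (one model per line, fields separated by ';'):
+# model_name;model_path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
+# model_path is the path inside the Docker container.
+# batch, prompt_tokens and completion_tokens may each hold several comma-separated values.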
+
+
+Qwen3-4B;/workspace/test/models/Qwen/Qwen3-4B;1;1;512;512;float16;32768;0.95
+Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1;512;512;float16;32768;0.95
+Qwen3-1.7B;/workspace/test/models/Qwen/Qwen3-1.7B;1;1;512;512;float16;32768;0.95
\ No newline at end of file
--- a/3_env_check&batches_llm_inference/scripts/benchmark_throughput.py
+++ b/3_env_check&batches_llm_inference/scripts/benchmark_throughput.py
+# SPDX-License-Identifier: Apache-2.0
+"""Benchmark offline inference throughput."""
+import argparse
+import dataclasses
+import json
+import random
+import time
+from functools import cache
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import uvloop
+from PIL import Image
+from tqdm import tqdm
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          PreTrainedTokenizerBase)
+
+
+from vllm.inputs import PromptType
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
+from vllm.inputs import TextPrompt
+from vllm.lora.request import LoRARequest
+from vllm.lora.utils import get_adapter_absolute_path
+from vllm.multimodal import MultiModalDataDict
+from vllm.sampling_params import BeamSearchParams
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+
+
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images).
+        lora_request: Optional LoRARequest specifying the LoRA to use. 
+    """
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    multi_modal_data: Optional[MultiModalDataDict] = None
+    lora_request: Optional[LoRARequest] = None
+
+
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+    """Prepend and append special tokens around the question to form a prompt.
+
+    Args:
+        question: The input question text to wrap with special tokens
+        model: The name of the model being used, to determine which special
+            tokens to add
+
+    Returns:
+        The formatted prompt string with appropriate special tokens for the
+            model
+
+    Raises:
+        ValueError: If an unsupported model name is provided
+    """
+    model = model.lower()
+    if "pixtral" in model:
+        return f"<s>[INST]{question}\n[IMG][/INST]"
+    raise ValueError(f"Unsupported model {model}")
+
+
+@cache
+def lora_path_on_disk(lora_path: str) -> str:
+    return get_adapter_absolute_path(lora_path)
+
+
+lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
+
+
+def get_random_lora_request(
+        args: argparse.Namespace
+) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
+    global lora_tokenizer_cache
+    lora_id = random.randint(1, args.max_loras)
+    lora_request = LoRARequest(lora_name=str(lora_id),
+                               lora_int_id=lora_id,
+                               lora_path=lora_path_on_disk(args.lora_path))
+    if lora_id not in lora_tokenizer_cache:
+        lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
+    return lora_request, lora_tokenizer_cache[lora_id]
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+
+    dataset_path: str = args.dataset
+    num_requests: int = args.num_prompts
+    fixed_output_len: Optional[int] = args.output_len
+    model: str = args.model
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[SampleRequest] = []
+    for data in tqdm(dataset,
+                     total=len(filtered_dataset),
+                     desc="sampling requests"):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Only keep the first two turns of each conversation.
+        prompt = data["conversations"][0]["value"]
+        completion = data["conversations"][1]["value"]
+
+        multi_modal_data: Optional[MultiModalDataDict] = None
+        if "image" in data:
+            multi_modal_data = multi_modal_data or {}
+            image_path = data["image"]
+            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+            assert isinstance(image_path,
+                              str), "Only support single image input"
+            try:
+                multi_modal_data["image"] = Image.open(image_path).convert(
+                    "RGB")
+            except FileNotFoundError:
+                # Ignore datapoint where asset is missing
+                continue
+            prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
+        request_tokenizer = tokenizer
+        lora_request: Optional[LoRARequest] = None
+        if args.enable_lora:
+            lora_request, lora_tokenizer = get_random_lora_request(args)
+            if lora_tokenizer:
+                request_tokenizer = lora_tokenizer
+
+        # Tokenize the prompts and completions.
+        prompt_token_ids = request_tokenizer(prompt).input_ids
+        completion_token_ids = request_tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+        filtered_dataset.append(
+            SampleRequest(prompt=prompt,
+                          prompt_len=prompt_len,
+                          expected_output_len=output_len,
+                          multi_modal_data=multi_modal_data,
+                          lora_request=lora_request))
+
+    return filtered_dataset
+
+
+def run_vllm(
+    requests_json: List[SampleRequest],
+    n: int,
+    num_iters_warmup: int,
+    engine_args: EngineArgs,
+
+) -> float:
+    from vllm import LLM, SamplingParams
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    # warmup
+    warmup_sampling_params = SamplingParams(
+        n=args.n,
+        temperature=1.0,
+        top_p=1.0,
+        ignore_eos=True,
+        max_tokens=10,
+    )
+    dummy_prompt_token_ids = np.random.randint(10000, size=(1,10))
+    dummy_prompts: List[PromptType] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+    
+    print("Warming up...")
+    for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
+        llm.generate(dummy_prompts,
+                        sampling_params=warmup_sampling_params,
+                        use_tqdm=False)
+
+    info_json={}
+    for ELEprompt in args.num_prompts:
+        for ELEinput,ELEoutput  in zip(args.input_len,args.output_len):
+            info={}
+            requests=requests_json["{}_{}_{}".format(ELEprompt,ELEinput,ELEoutput)]
+
+            # Add the requests to the engine.
+            prompts: List[TextPrompt] = []
+            sampling_params: List[SamplingParams] = []
+            for request in requests:
+                prompts.append(
+                    TextPrompt(prompt=request.prompt,
+                            multi_modal_data=request.multi_modal_data))
+                sampling_params.append(
+                    SamplingParams(
+                        n=n,
+                        temperature=1.0,
+                        top_p=1.0,
+                        ignore_eos=True,
+                        max_tokens=request.expected_output_len,
+                    ))
+            lora_requests: Optional[List[LoRARequest]] = None
+            if engine_args.enable_lora:
+                lora_requests = [request.lora_request for request in requests]
+
+
+
+            use_beam_search = False
+
+            if not use_beam_search:
+                start = time.perf_counter()
+                real_output=llm.generate(prompts,
+                            sampling_params,
+                            lora_request=lora_requests,
+                            use_tqdm=True)
+                end = time.perf_counter()
+            else:
+                assert lora_requests is None, "BeamSearch API does not support LoRA"
+                prompts = [request.prompt for request in requests]
+                # output_len should be the same for all requests.
+                output_len = requests[0][2]
+                for request in requests:
+                    assert request.expected_output_len == output_len
+
+                start = time.perf_counter()
+                real_output = llm.beam_search(
+                    prompts,
+                    BeamSearchParams(
+                        beam_width=n,
+                        max_tokens=output_len,
+                        ignore_eos=True,
+                    ))
+                end = time.perf_counter()
+            total_ttfts = []
+            total_tpops = []
+            total_output_token_throughput = []
+            total_inout_token_throughput = []
+
+            for output in real_output:
+
+                ttft_ = output.metrics.first_token_time - output.metrics.arrival_time
+                tpop_ = (output.metrics.finished_time - output.metrics.arrival_time - ttft_) / (ELEoutput-1)
+                output_token_throughput = (ELEoutput) / (output.metrics.finished_time - output.metrics.arrival_time)
+                inout_token_throughput = (ELEoutput + ELEinput) / (output.metrics.finished_time - output.metrics.arrival_time)
+                total_ttfts.append(ttft_)
+                total_tpops.append(tpop_)
+                total_output_token_throughput.append(output_token_throughput)
+                total_inout_token_throughput.append(inout_token_throughput)
+
+
+            total_num_tokens = sum(request.prompt_len + request.expected_output_len
+                for request in requests)
+            total_output_tokens = sum(request.expected_output_len
+                for request in requests)
+            # ttft_mean = np.mean(total_ttfts)
+            # ttft_median = np.median(total_ttfts or 0)
+            # ttft_p99 = np.percentile(total_ttfts or 0, 99)
+
+            # tpop_mean = np.mean(total_tpops)
+            # tpop_median = np.median(total_tpops or 0)
+            # tpop_p99 = np.percentile(total_tpops or 0, 99)
+
+            # output_token_throughput_mean = np.mean(total_output_token_throughput)
+            # output_token_throughput_median = np.median(total_output_token_throughput or 0)
+            # output_token_throughput_p99 = np.percentile(total_output_token_throughput or 0, 99)
+
+            # inout_token_throughput_mean = np.mean(total_inout_token_throughput)
+            # inout_token_throughput_median = np.median(total_inout_token_throughput or 0)
+            # inout_token_throughput_p99 = np.percentile(total_inout_token_throughput or 0, 99)
+
+            info["elapsed_time"] = np.around(end - start,2)
+            info["Throughput"] = np.around(len(requests) / info['elapsed_time'],2)
+            info["total_tokens"] = np.around(total_num_tokens / info['elapsed_time'],2)
+            info["output_tokens"] = np.around(total_output_tokens / info['elapsed_time'],2)
+
+
+            info["ttft_mean"] = np.around(np.mean(total_ttfts),5)
+            info["ttft_median"] = np.around(np.median(total_ttfts or 0),5)
+            info["ttft_p99"] = np.around(np.percentile(total_ttfts or 0, 99),5)
+
+            info["tpop_mean"] = np.around(np.mean(total_tpops),4)
+            info["tpop_median"] = np.around(np.median(total_tpops or 0),5)
+            info["tpop_p99"] = np.around(np.percentile(total_tpops or 0, 99),5)
+
+            info["output_token_throughput_mean"]  = np.around(np.mean(total_output_token_throughput),2)
+            info["output_token_throughput_median"]  = np.around(np.median(total_output_token_throughput or 0),2)
+            info["output_token_throughput_p99"]  = np.around(np.percentile(total_output_token_throughput or 0, 99),2)
+
+            info["inout_token_throughput_mean"] = np.around(np.mean(total_inout_token_throughput),2)
+            info["inout_token_throughput_median"] = np.around(np.median(total_inout_token_throughput or 0),2)
+            info["inout_token_throughput_p99"] = np.around(np.percentile(total_inout_token_throughput or 0, 99),2)
+
+            
+            info_json["{}_{}_{}".format(ELEprompt,ELEinput,ELEoutput)] = info
+            print("promt:{},input:{},output:{}".format(ELEprompt,ELEinput,ELEoutput))
+            print(f"Latency: {info['elapsed_time']:.2f} s")
+            print(f"Throughput: {len(requests) / info['elapsed_time']:.2f} requests/s, "
+                f"{total_num_tokens / info['elapsed_time']:.2f} total tokens/s, "
+                f"{total_output_tokens / info['elapsed_time']:.2f} output tokens/s")
+            print("==============================================")
+            print(f"total_out_tokens: {total_output_tokens: .2f} tokens")
+            print(f"elapsed_time: {info['elapsed_time']: .2f} s")      # 总耗时
+            print(f"TTFT_mean: {info['ttft_mean']*1000: .2f} ms")       # 首字延时
+            print(f"ttft_p99: {info['ttft_p99']*1000: .2f} ms")
+            print(f"ttft_median: {info['ttft_median']*1000: .2f} ms")
+            print(f"TPOP_mean: {info['tpop_mean']*1000: .2f} ms")              # 单字decode时间
+            print(f"tpop_median: {info['tpop_median']*1000: .2f} ms")
+            print(f"tpop_p99: {info['tpop_p99']*1000: .2f} ms")
+            print(f"output_token_throughput_mean: {info['output_token_throughput_mean']:.2f} tokens/s")           # 单路生成吞吐
+            print(f"output_token_throughput_median: {info['output_token_throughput_median']:.2f} tokens/s")
+            print(f"output_token_throughput_p99: {info['output_token_throughput_p99']:.2f} tokens/s")
+            print(f"inout_token_throughput_mean: {info['inout_token_throughput_mean']:.2f} tokens/s")           # 单路总吞吐
+            print(f"tinout_token_throughput_median: {info['inout_token_throughput_median']:.2f} tokens/s")
+            print(f"inout_token_throughput_p99: {info['inout_token_throughput_p99']:.2f} tokens/s")
+            print("==============================================")
+            print("\n")
+
+    return info_json
+
+async def run_vllm_async(
+    requests: List[SampleRequest],
+    n: int,
+    engine_args: AsyncEngineArgs,
+    disable_frontend_multiprocessing: bool = False,
+) -> float:
+    from vllm import SamplingParams
+
+    async with build_async_engine_client_from_engine_args(
+            engine_args, disable_frontend_multiprocessing) as llm:
+
+        # Add the requests to the engine.
+        prompts: List[TextPrompt] = []
+        sampling_params: List[SamplingParams] = []
+        lora_requests: List[Optional[LoRARequest]] = []
+        for request in requests:
+            prompts.append(
+                TextPrompt(prompt=request.prompt,
+                           multi_modal_data=request.multi_modal_data))
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=1.0,
+                    top_p=1.0,
+                    ignore_eos=True,
+                    max_tokens=request.expected_output_len,
+                ))
+            lora_requests.append(request.lora_request)
+
+        generators = []
+        start = time.perf_counter()
+        for i, (prompt, sp,
+                lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
+            generator = llm.generate(prompt,
+                                     sp,
+                                     lora_request=lr,
+                                     request_id=f"test{i}")
+            generators.append(generator)
+        all_gens = merge_async_iterators(*generators)
+        async for i, res in all_gens:
+            pass
+        end = time.perf_counter()
+        return end - start
+
+
+def run_hf(
+    requests: List[SampleRequest],
+    model: str,
+    tokenizer: PreTrainedTokenizerBase,
+    n: int,
+    max_batch_size: int,
+    trust_remote_code: bool,
+) -> float:
+    llm = AutoModelForCausalLM.from_pretrained(
+        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    if llm.config.model_type == "llama":
+        # To enable padding in the HF backend.
+        tokenizer.pad_token = tokenizer.eos_token
+    llm = llm.cuda()
+
+    pbar = tqdm(total=len(requests))
+    start = time.perf_counter()
+    batch: List[str] = []
+    max_prompt_len = 0
+    max_output_len = 0
+    for i in range(len(requests)):
+        prompt, prompt_len, output_len = requests[i]
+        # Add the prompt to the batch.
+        batch.append(prompt)
+        max_prompt_len = max(max_prompt_len, prompt_len)
+        max_output_len = max(max_output_len, output_len)
+        if len(batch) < max_batch_size and i != len(requests) - 1:
+            # Check if we can add more requests to the batch.
+            _, next_prompt_len, next_output_len = requests[i + 1]
+            if (max(max_prompt_len, next_prompt_len) +
+                    max(max_output_len, next_output_len)) <= 2048:
+                # We can add more requests to the batch.
+                continue
+
+        # Generate the sequences.
+        input_ids = tokenizer(batch, return_tensors="pt",
+                              padding=True).input_ids
+        llm_outputs = llm.generate(
+            input_ids=input_ids.cuda(),
+            do_sample=True,
+            num_return_sequences=n,
+            temperature=1.0,
+            top_p=1.0,
+            use_cache=True,
+            max_new_tokens=max_output_len,
+        )
+        # Include the decoding time.
+        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+        pbar.update(len(batch))
+
+        # Clear the batch.
+        batch = []
+        max_prompt_len = 0
+        max_output_len = 0
+    end = time.perf_counter()
+    return end - start
+
+
+def run_mii(
+    requests: List[SampleRequest],
+    model: str,
+    tensor_parallel_size: int,
+    output_len: int,
+) -> float:
+    from mii import client, serve
+    llm = serve(model, tensor_parallel=tensor_parallel_size)
+    prompts = [request.prompt for request in requests]
+
+    start = time.perf_counter()
+    llm.generate(prompts, max_new_tokens=output_len)
+    end = time.perf_counter()
+    client = client(model)
+    client.terminate_server()
+    return end - start
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    if args.dataset is None:
+        vocab_size = tokenizer.vocab_size
+        requests_json={}
+        for ELEprompt in args.num_prompts:
+            for ELEinput,ELEoutput  in zip(args.input_len,args.output_len):
+                requests = []
+                for _ in range(ELEprompt):
+
+                    request_tokenizer = tokenizer
+                    lora_request: Optional[LoRARequest] = None
+                    if args.enable_lora:
+                        lora_request, lora_tokenizer = get_random_lora_request(args)
+                        if lora_tokenizer:
+                            request_tokenizer = lora_tokenizer
+
+                    # Synthesize a prompt with the given input length.
+                    candidate_ids = [
+                        random.randint(0, vocab_size - 1)
+                        for _ in range(ELEinput)
+                    ]
+                    # As tokenizer may add additional tokens like BOS, we need to try
+                    # different lengths to get the desired input length.
+                    for _ in range(5):  # Max attempts to correct
+                        candidate_prompt = request_tokenizer.decode(candidate_ids)
+                        tokenized_len = len(request_tokenizer.encode(candidate_prompt))
+
+                        if tokenized_len == ELEinput:
+                            break
+
+                        # Adjust length based on difference
+                        diff = ELEinput - tokenized_len
+                        if diff > 0:
+                            candidate_ids.extend([
+                                random.randint(100, vocab_size - 100)
+                                for _ in range(diff)
+                            ])
+                        else:
+                            candidate_ids = candidate_ids[:diff]
+                    requests.append(
+                        SampleRequest(prompt=candidate_prompt,
+                                    prompt_len=ELEinput,
+                                    expected_output_len=ELEoutput,
+                                    lora_request=lora_request))
+                requests_json["{}_{}_{}".format(ELEprompt,ELEinput,ELEoutput)]=requests
+    else:
+        requests = sample_requests(tokenizer, args)
+
+    is_multi_modal = any(request.multi_modal_data is not None
+                         for request in requests)
+    if args.backend == "vllm":
+        if args.async_engine:
+            elapsed_time = uvloop.run(
+                run_vllm_async(
+                    requests,
+                    args.n,
+                    AsyncEngineArgs.from_cli_args(args),
+                    args.disable_frontend_multiprocessing,
+                ))
+        else:
+            info_json = run_vllm(requests_json, args.n, args.num_iters_warmup,
+                                    EngineArgs.from_cli_args(args))
+    elif args.backend == "hf":
+        assert args.tensor_parallel_size == 1
+        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
+                              args.hf_max_batch_size, args.trust_remote_code)
+    elif args.backend == "mii":
+        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
+                               args.output_len)
+    else:
+        raise ValueError(f"Unknown backend: {args.backend}")
+
+    # file_name=args.model.rsplit("/")[-1]+"-tp"+str(args.tensor_parallel_size)+".txt"
+
+    if is_multi_modal:
+        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+                "following metrics are not accurate because image tokens are not"
+                " counted. See vllm-project/vllm/issues/9778 for details.")    
+    with open(args.output_json,"w") as f:
+        title="bs_in_out"
+        data_keys=info_json[list(info_json.keys())[0]].keys()
+        keys_string = ','.join(data_keys)
+        title=title+","+keys_string
+        f.write(title)
+        f.write("\n")
+        for key, value in info_json.items():
+            values_as_strings = [str(value) for value in info_json[key].values()]
+            values_string = ','.join(values_as_strings)
+            key=key+","+values_string
+            f.writelines(key)
+            f.write("\n")
+        # json.dump(info_json, f, indent=4)
+    # Output JSON results if specified
+    # if args.output_json:
+    #     results = {
+    #         "elapsed_time": elapsed_time,
+    #         "num_requests": len(requests),
+    #         "total_num_tokens": total_num_tokens,
+    #         "requests_per_second": len(requests) / elapsed_time,
+    #         "tokens_per_second": total_num_tokens / elapsed_time,
+    #     }
+    #     with open(args.output_json, "w") as f:
+    #         json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii"],
+                        default="vllm")
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset. The dataset is expected to "
+                        "be a json in form of List[Dict[..., conversations: "
+                        "List[Dict[..., value: <prompt_or_response>]]]]")
+    parser.add_argument("--input-len",
+                        type=int,
+                        nargs="*",
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        nargs="*",
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=1,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        nargs="*",
+                        default=1000,
+                        help="Number of prompts to process.")
+    parser.add_argument("--hf-max-batch-size",
+                        type=int,
+                        default=None,
+                        help="Maximum batch size for HF backend.")
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument("--async-engine",
+                        action='store_true',
+                        default=False,
+                        help="Use vLLM async engine rather than LLM class.")
+    parser.add_argument("--disable-frontend-multiprocessing",
+                        action='store_true',
+                        default=False,
+                        help="Disable decoupled async engine frontend.")
+    # LoRA
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        default=None,
+        help="Path to the lora adapters to use. This can be an absolute path, "
+        "a relative path, or a Hugging Face model identifier.")
+
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    if args.dataset is None:
+        assert args.input_len is not None
+        assert args.output_len is not None
+    else:
+        assert args.input_len is None
+    if args.enable_lora:
+        assert args.lora_path is not None
+
+    if args.backend == "vllm":
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+    elif args.backend == "hf":
+        if args.hf_max_batch_size is None:
+            raise ValueError("HF max batch size is required for HF backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
+    elif args.backend == "mii":
+        if args.dtype != "auto":
+            raise ValueError("dtype must be auto for MII backend.")
+        if args.n != 1:
+            raise ValueError("n must be 1 for MII backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+        if args.tokenizer != args.model:
+            raise ValueError("Tokenizer must be the same as the model for MII "
+                             "backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
+    main(args)
+
+
--- a/3_env_check&batches_llm_inference/scripts/entrypoint.sh
+++ b/3_env_check&batches_llm_inference/scripts/entrypoint.sh
+#!/bin/bash
+# Runs the environment check and then the benchmark, one after the other.
+# There is no `set -e` here, so the benchmark still starts when the
+# environment check exits with a non-zero status.
+
+# Step 1: environment check
+printf '%s\n' "==================== 开始系统环境检查 ===================="
+/workspace/scripts/run_envcheck.sh
+
+# Step 2: performance benchmark
+printf '%s\n' "==================== 开始性能测试 ===================="
+/workspace/scripts/run_benchmark.sh
+
+printf '%s\n' "==================== 所有测试完成 ===================="
\ No newline at end of file
--- a/3_env_check&batches_llm_inference/scripts/run_benchmark.sh
+++ b/3_env_check&batches_llm_inference/scripts/run_benchmark.sh
+export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export ROCBLAS_COMPUTETYPE_FP16R=0
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_LAUNCH_MODE=GROUP
+export NCCL_NCHANNELS_PER_PEER=16
+export NCCL_MAX_NCHANNELS=16
+export NCCL_MIN_NCHANNELS=16
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_P2P_LEVEL=SYS
+export NCCL_NET_GDR_LEVEL=7
+export NCCL_NET_GDR_READ=1
+export RCCL_SDMA_COPY_ENABLE=0
+export SENDRECV_STREAM_WITH_COMPUTE=1
+export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
+export ALLREDUCE_STREAM_WITH_COMPUTE=1
+export ALLREDUCE_STREAM_WITH_COMPUTE=1
+export NCCL_MIN_NCHANNELS=16
+export NCCL_MAX_NCHANNELS=16
+export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
+
+export VLLM_RPC_TIMEOUT=100000
+
+
+
+#!/bin/bash
+
+# 模型配置文件路径
+MODELS_CONFIG="/workspace/configs/model_to_test.cfg"
+
+# 结果目录
+RESULTS_DIR="/workspace/test/inference_outputs"
+
+# 读取配置文件，跳过注释和空行
+while IFS= read -r line || [[ -n "$line" ]]; do
+    # 跳过注释行和空行
+    if [[ "$line" =~ ^# ]] || [[ -z "$line" ]]; then
+        continue
+    fi
+    
+    # 解析配置行
+    IFS=';' read -ra CONFIG <<< "$line"
+    
+    model_name="${CONFIG[0]}"
+    model_path="${CONFIG[1]}"
+    tp="${CONFIG[2]}"
+    batch="${CONFIG[3]//,/ }"        # 将逗号替换为空格
+    prompt_tokens="${CONFIG[4]//,/ }"
+    completion_tokens="${CONFIG[5]//,/ }"
+    dtype="${CONFIG[6]}"
+    max_model_len="${CONFIG[7]}"
+    gpu_memory_utilization="${CONFIG[8]}"
+    
+    echo "开始测试模型: $model_name"
+    echo "模型路径: $model_path"
+    echo "参数配置:"
+    echo "  tensor_parallel_size: $tp"
+    echo "  batch_sizes: $batch"
+    echo "  prompt_tokens: $prompt_tokens"
+    echo "  completion_tokens: $completion_tokens"
+    echo "  dtype: $dtype"
+    echo "  max_model_len: $max_model_len"
+    echo "  gpu_memory_utilization: $gpu_memory_utilization"
+    
+    # 创建模型专属结果目录
+    model_result_dir="${RESULTS_DIR}/${model_name}"
+    mkdir -p "$model_result_dir"
+    
+    # 运行基准测试
+    python /workspace/scripts/benchmark_throughput.py \
+        --model "$model_path" \
+        --tensor-parallel-size "$tp" \
+        --num-prompts $batch \
+        --input-len $prompt_tokens \
+        --output-len $completion_tokens \
+        --dtype "$dtype" \
+        --trust-remote-code \
+        --max-model-len "$max_model_len" \
+        --gpu-memory-utilization "$gpu_memory_utilization" \
+        --output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
+        2>&1 | tee "${model_result_dir}/${model_name}_tp${tp}.log"
+    
+    echo "完成测试模型: $model_name"
+    echo "结果保存在: $model_result_dir"
+    echo "----------------------------------------"
+done < "$MODELS_CONFIG"
\ No newline at end of file
--- a/3_env_check&batches_llm_inference/scripts/run_envcheck.sh
+++ b/3_env_check&batches_llm_inference/scripts/run_envcheck.sh
+#!/bin/bash
+# Strict error handling: stop at the first unhandled failure, and let a
+# pipeline fail when any of its stages fails.
+set -e -o pipefail
+
+# Every check writes its log below this directory.
+log_dir="/workspace/test/env_check_outputs"
+mkdir -p "${log_dir}"
+
+printf '%s\n' "==================== 开始系统环境检查 ===================="
+
+# 基础检查函数
+run_test() {
+  local name=$1
+  shift
+  echo "[RUN] $name"
+  "$@" 2>&1 | tee "$log_dir/${name}.log" || {
+    echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
+    return 1
+  }
+}
+
+run_pipe_test() {
+  local name=$1
+  local cmd=$2
+  echo "[RUN] $name"
+  
+  bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log" || {
+    echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
+    return 1
+  }
+}
+
+# Basic system checks. The script runs with `set -e`, so a failing check
+# stops the script unless its call is followed by `|| true`.
+run_test rocm_bandwidth_test rocm-bandwidth-test
+run_test hy_smi hy-smi
+run_test hy_smi_config hy-smi -c
+run_test pip_list pip list
+run_test cpu_info lscpu
+run_test cpu_cores nproc
+run_test memory_usage free -h
+run_test disk_usage df -h
+run_test hardware_info lshw -short || true
+run_test network_interfaces ip a
+run_test ibstat ibstat
+run_test ibdev2netdev ibdev2netdev
+run_pipe_test ACS_stat "lspci -vvv | grep -i acsct"
+run_test rocm_info rocminfo || true
+
+echo "==================== RCCL-TEST ===================="
+# The test sources are cloned and built inside the log directory.
+cd "$log_dir"
+
+if command -v git &>/dev/null && command -v make &>/dev/null; then
+  if [ ! -d rccl-tests ]; then
+    git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
+  fi
+
+  cd rccl-tests || exit 1
+  source /opt/dtk/env.sh
+
+  if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
+       CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
+    ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/all_reduce_perf_8.log"
+    ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/all_reduce_perf_4.log"
+  else
+    echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
+  fi
+  cd ..
+else
+  echo "[WARN] 缺少git或make，跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
+fi
+
+echo "==================== DCU-ENV-CHECK ===================="
+if [ ! -d dcu_env_check ]; then
+  git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
+    echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
+    exit 1
+  }
+fi
+
+# Handle the directory change and the check run as two separate steps.
+# With a single `cd && { ...; cd ..; } || { ... }` list, the group's status
+# is that of the final `cd ..`, so a failing system_check.sh would never
+# reach the error branch.
+if ! cd dcu_env_check; then
+  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
+  exit 1
+fi
+if ! bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"; then
+  echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
+  exit 1
+fi
+cd ..
+
+echo "==================== 检查完成 ===================="
+echo "所有日志已保存至: $log_dir"
+ls -lh "$log_dir"
\ No newline at end of file
--- a/3_env_check&batches_llm_inference/start.sh
+++ b/3_env_check&batches_llm_inference/start.sh
+docker build -t vllm-test1 . && \
+docker run \
+-v /usr/local/hyhal:/usr/local/hyhal:ro \
+-v /opt/hyhal:/opt/hyhal:ro   \
+-v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/  \
+-v /public/models:/workspace/test/models/  \
+-v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/  \
+--ipc=host \
+--cap-add=SYS_PTRACE \
+--group-add video \
+--ulimit memlock=-1:-1 \
+--privileged \
+--device=/dev/kfd \
+--device=/dev/mkfd \
+--device=/dev/dri \
+--shm-size=500G \
+-u root \
+--security-opt seccomp=unconfined \
+vllm-test1 \
\ No newline at end of file