Commit f6a338d7 — "Initial commit", authored by jerrrrry.
# Official SourceFind (DCU) base image with vLLM + DTK toolchain
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04.1-rc4-das1.6-py3.10-20250620-fixpy

# Install base diagnostic tools; clean the apt lists in the same layer to keep
# the image small. `file` is added because the verification RUN below uses it.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        dmidecode \
        file \
        git \
        ipmitool \
        iproute2 \
        iputils-ping \
        jq \
        locate \
        lshw \
        pciutils \
        sysstat \
    && rm -rf /var/lib/apt/lists/*

# Create the directory layout expected by the check script.
RUN mkdir -p /workspace/scripts \
             /workspace/test/env_check_outputs \
             /workspace/test/env_check_tools

# Copy the check script and the tool archives.
COPY ./scripts/run_envcheck.sh /workspace/scripts/
COPY ./env_check_tools/dcu_env_check.zip /workspace/test/env_check_tools/
COPY ./env_check_tools/rccl-tests.zip /workspace/test/env_check_tools/

# Sanity-check the script (listing, file type, shebang line).
RUN ls -l /workspace/scripts/ && \
    file /workspace/scripts/run_envcheck.sh && \
    head -n 1 /workspace/scripts/run_envcheck.sh

WORKDIR /workspace/scripts/

# Exec-form CMD so the process receives container signals directly
# (the original nested shell-form `bash -c "bash …"` added a useless layer
# of shells and broke signal delivery).
CMD ["bash", "run_envcheck.sh"]
\ No newline at end of file
#!/bin/bash
set -eo pipefail # strict error handling
# All per-check logs are written under this directory.
log_dir="/workspace/test/env_check_outputs/"
mkdir -p "$log_dir"
echo "==================== 开始系统环境检查 ===================="
# Enhanced check helpers - continue on errors instead of aborting
run_test() {
# Run a plain command, teeing combined stdout/stderr to "$log_dir/<name>.log".
#   $1 = short English id (documentation only)
#   $2 = display/log-file name
#   remaining args = the command and its arguments
local name=$1
local chinese_name=$2
shift 2
echo "[RUN] $chinese_name"
if ! "$@" 2>&1 | tee "$log_dir/${chinese_name}.log"; then
echo "[WARN] $chinese_name 检查失败" | tee -a "$log_dir/${chinese_name}.log"
return 1 # non-zero status, but do not exit the script
fi
return 0
}
run_pipe_test() {
# Like run_test, but $3 is a full shell command string (may contain pipes
# or multiple statements) executed via `bash -c`.
local name=$1
local chinese_name=$2
local cmd=$3
echo "[RUN] $chinese_name"
if ! bash -c "$cmd" 2>&1 | tee "$log_dir/${chinese_name}.log"; then
echo "[WARN] $chinese_name 检查失败" | tee -a "$log_dir/${chinese_name}.log"
return 1
fi
return 0
}
# Safe executor - log a warning instead of aborting when a command fails
safe_run() {
# $1 = section banner; remaining args = command strings run via eval.
local section=$1
shift
echo "==================== $section ===================="
for cmd in "$@"; do
# eval so that quoted command strings are parsed correctly
if ! eval "$cmd"; then
echo "[WARN] 命令执行失败: $cmd" | tee -a "$log_dir/error.log"
fi
done
}
# ------------------------- 1. System basics -------------------------
safe_run "1.系统基础信息检查" \
'run_test uname "01_系统内核信息" uname -a' \
'run_test os_release "02_操作系统版本" cat /etc/os-release' \
'run_test locale "03_系统语言环境" locale'
# ------------------------- 2. CPU & memory -------------------------
safe_run "2.CPU_内存检查" \
'run_test cpu_info "04_CPU详细信息" lscpu' \
'run_test cpu_cores "05_CPU核心数" nproc' \
'run_pipe_test cpu_freq "06_CPU频率" "cat /proc/cpuinfo | grep \"MHz\" | sort -u"' \
'run_test memory_usage "07_内存使用情况" free -h' \
'run_test vm_stat "08_系统整体CPU和内存使用情况" vmstat 1 10' \
'run_test numa_nodes "09_NUMA节点信息" numactl --hardware || true' \
'run_pipe_test cpu_usage "10_CPU利用率检查" "mpstat -P ALL 1 5"' \
'run_pipe_test cpu_top_usage "11_CPU占用最高进程检查" "ps -eo pid,%cpu,cmd --sort=-%cpu | head -n 10"'
# ------------------------- 3. Storage devices -------------------------
safe_run "3.存储设备检查" \
'run_test disk_usage "12_磁盘使用情况" df -hT' \
'run_test mount_info "13_挂载信息" mount | column -t' \
'run_test block_devices "14_块设备信息" lsblk -o NAME,SIZE,TYPE,MOUNTPOINT'
# ------------------------- 4. Network -------------------------
safe_run "4.网络检查" \
'run_test netstat "15_网络连接状态" ss -tulnp' \
'run_test network_interfaces "16_网络接口信息" ip -br a' \
'run_test routing_table "17_路由表信息" ip route' \
'run_test arp_table "18_ARP表信息" ip neigh' \
'run_test ibdev2netdev "19_InfiniBand设备映射" ibdev2netdev' \
'run_test topo "20_网卡-dcu-topo" lspci -vt '
# ------------------------- 5. DCU, kernel & driver -------------------------
safe_run "5.DCU_内核_驱动检查" \
'run_test hy_smi "21_DCU设备状态" hy-smi' \
'run_test clock_level "22_DCU时钟级别" hy-smi -g' \
'run_test driverversion "23_DCU驱动版本" hy-smi --showdriverversion' \
'run_test rocminfo "24_ROCM信息" rocminfo' \
'run_test kernel_modules "25_已加载内核模块" lsmod' \
'run_test kernel_version "26_内核版本" uname -r'
# ------------------------- 6. Software stack -------------------------
safe_run "6.软件栈检查" \
'run_test pip_list "27_Python包列表" pip list' \
'run_test glibc_version "28_GLIBC版本" ldd --version'
# ------------------------- 7. Other hardware status -------------------------
safe_run "7.其他硬件状态检查" \
'run_test lspci "29_PCI设备列表" lspci' \
'run_test iostat "30_IO统计信息" iostat' \
'run_test hardware_info "31_硬件摘要信息" lshw -short || true' \
'run_pipe_test ACS_stat "32_ACS状态检查" "lspci -vvv | grep -i acsct"' \
'run_test dmesg "33_内核日志" dmesg' \
'run_pipe_test pcie_topology "34_PCIe拓扑结构" "echo \"====== PCIe 桥接器 ======\"; lspci -vvv | grep -E \"PCI bridge|Root port\" -A 20 | grep -E \"Device|Vendor|LnkSta:|LnkCap:|Secondary bus\"; echo \"\"; echo \"====== PCIe 带宽汇总 ======\"; lspci -vvv | grep \"LnkSta:\" | sort | uniq -c"' \
'run_pipe_test storage_details "35_存储控制器详情" "echo \"====== 存储控制器 ======\"; lspci -vvv | grep -E \"NVMe|SATA|RAID|Storage controller\" -A 30 | grep -E \"Device|Vendor|Kernel driver|LnkSta:|Speed|Width|MSI|Bar Memory\""' \
'run_pipe_test nic_details "36_网卡详细信息" "echo \"====== 网卡详细信息 ======\"; lspci -vvv | grep -E \"Ethernet controller|Network controller|InfiniBand\" -A 50 | grep -E \"Device|Vendor|Subsystem|Kernel driver|Kernel modules|LnkSta:|LnkCap:|NUMA node|Speed|Width\""' \
'run_pipe_test iommu_stat "37_IOMMU状态" "dmesg | grep IOMMU"' \
'run_pipe_test SELinux_stat "38_SELinux状态" "dmesg | grep SELinux"'
# ------------------------- 8. Bandwidth checks -------------------------
# Load the DTK toolchain environment (provides BandwidthTest and rccl paths).
source /opt/dtk/env.sh
# NOTE(review): the `exit 1` statements inside the eval'd strings below run in
# the current shell, so a failure here terminates the whole script despite
# safe_run's continue-on-error intent -- confirm whether that is desired.
safe_run "8.带宽检查" \
'run_test D2D-a_test "39_D2D单向带宽测试" /opt/dtk/bin/BandwidthTest -a -s 512MB ' \
'run_test D2D-A_test "40_D2D双向带宽测试" /opt/dtk/bin/BandwidthTest -A -s 512MB ' \
'run_test D2H-H2D_test "41_D2H和H2D带宽测试" /opt/dtk/bin/BandwidthTest -t 3 ' \
'cd /workspace/test/env_check_tools/ || { echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"; exit 1; }' \
'if [ -f "rccl-tests.zip" ]; then
echo "[INFO] 发现 rccl-tests.zip,开始解压..."
unzip -o rccl-tests.zip -d rccl-tests || {
echo "[ERROR] rccl-tests.zip 解压失败" | tee "$log_dir/42_RCCL测试解压失败.log"
exit 1
}
cd rccl-tests/rccl-tests || { echo "[ERROR] 无法进入rccl-tests目录"; exit 1; }
if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/43_RCCL_all_reduce_8卡测试.log" || true
./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/44_RCCL_all_reduce_4卡测试.log" || true
else
echo "[ERROR] RCCL编译失败" | tee "$log_dir/45_RCCL编译失败.log"
fi
cd ../..
else
echo "[WARN] 未找到 rccl-tests.zip,跳过 RCCL 测试" | tee "$log_dir/46_RCCL测试跳过.log"
fi'
# ------------------------- 9. DCU environment check -------------------------
# NOTE(review): as in section 8, the `exit 1` statements inside the eval'd
# string run in the current shell and terminate the whole script on failure.
safe_run "9.DCU环境检查" \
'cd /workspace/test/env_check_tools/ || { echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"; exit 1; }' \
'if [ -f "dcu_env_check.zip" ]; then
echo "[INFO] 发现 dcu_env_check.zip,开始解压..."
unzip -o dcu_env_check.zip -d dcu_env_check || {
echo "[ERROR] dcu_env_check.zip 解压失败" | tee "$log_dir/47_DCU环境检查解压失败.log"
exit 1
}
chmod +x dcu_env_check/dcu_env_check-main/tools/*
cd dcu_env_check/dcu_env_check-main && {
bash system_check.sh 2>&1 | tee "$log_dir/48_DCU环境检查结果.log" || true
cp system_info* /workspace/test/env_check_outputs/ || true
cd ../..
} || {
echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/49_DCU环境检查执行失败.log"
}
else
echo "[WARN] 未找到 dcu_env_check.zip,跳过 DCU 环境检查" | tee "$log_dir/50_DCU环境检查跳过.log"
fi'
echo "==================== 检查完成 ===================="
echo "所有日志已保存至: $log_dir"
ls -lh "$log_dir"
\ No newline at end of file
# Build the env_check image, then run it with the host DCU (hyhal) stacks
# mounted read-only; check results land in ./outputs/env_check_outputs.
# NOTE(review): --privileged already grants all devices and capabilities, so
# the --cap-add/--device flags below are redundant -- confirm before trimming.
docker build -t env_check . && \
docker run \
-v /usr/local/hyhal:/usr/local/hyhal:ro \
-v /opt/hyhal:/opt/hyhal:ro \
-v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
--ipc=host \
--network=host \
--cap-add=SYS_PTRACE \
--group-add video \
--ulimit memlock=-1:-1 \
--privileged \
--device=/dev/kfd \
--device=/dev/mkfd \
--device=/dev/dri \
--shm-size=500G \
-u root \
--security-opt seccomp=unconfined \
env_check
\ No newline at end of file
# Official SourceFind (DCU) base image with vLLM + DTK toolchain
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04.1-rc4-das1.6-py3.10-20250620-fixpy

# Install base diagnostic tools; clean the apt lists in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        dmidecode \
        git \
        ipmitool \
        iproute2 \
        iputils-ping \
        jq \
        locate \
        lshw \
        pciutils \
        sysstat \
    && rm -rf /var/lib/apt/lists/*

# Create the directory layout expected by the pipeline scripts.
RUN mkdir -p /workspace/scripts \
             /workspace/configs \
             /workspace/test/env_check_outputs \
             /workspace/test/inference_outputs \
             /workspace/test/models \
             /workspace/test/env_check_tools

# Copy scripts, configs and tool archives.
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/
COPY ./env_check_tools/dcu_env_check.zip /workspace/test/env_check_tools/
COPY ./env_check_tools/rccl-tests.zip /workspace/test/env_check_tools/

# Only the scripts need the execute bit; configs are plain data files.
# (The original `chmod +x /workspace/configs*` used a broken glob that only
# touched the /workspace/configs directory itself, not its contents.)
RUN chmod +x /workspace/scripts/*

WORKDIR /workspace/scripts/

# Exec-form CMD so the shell process receives container signals directly.
CMD ["bash", "entrypoint.sh"]
\ No newline at end of file
# 格式: 模型ID;本地保存路径
#模型ID为modelscope官网指定的id
Qwen/Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B
\ No newline at end of file
# 格式:model_name;model_path;tp;data_type;batch_list;prompt_completion_pairs;max_model_len;gpu_memory_utilization
# 每行一个模型配置,字段用分号分隔
Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;float16;"1 ";(512 512,512 1024);40000;0.95
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union
import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.
# Generous overall client timeout (6 hours) so long benchmark runs don't abort.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
    """Input for a single benchmark request to an inference backend."""
    prompt: str
    api_url: str
    prompt_len: int  # prompt length in tokens
    output_len: int  # requested number of output tokens
    model: str
    model_name: Optional[str] = None  # overrides `model` in the payload when set
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None  # extra payload fields merged into the request
    multi_modal_content: Optional[dict] = None  # appended to chat message content
    ignore_eos: bool = False  # ask the server to generate exactly output_len tokens
@dataclass
class RequestFuncOutput:
    """Result of a single benchmark request."""
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0  # total request latency in seconds
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    itl: list[float] = field(
        default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""  # formatted traceback or HTTP reason when success is False
async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to a TGI server and collect latency stats.

    Args:
        request_func_input: Prompt, endpoint URL and generation parameters.
        pbar: Optional progress bar advanced by one on completion.

    Returns:
        RequestFuncOutput with TTFT, inter-token latencies, total latency and
        the generated text; ``success`` is False on HTTP/transport errors.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        params = {
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        if request_func_input.ignore_eos:
            # With ignore_eos the server generates exactly output_len tokens.
            output.output_tokens = request_func_input.output_len
        else:
            # NOTE(review): None conflicts with the declared int type of
            # RequestFuncOutput.output_tokens; presumably the caller fills
            # this in later by re-tokenizing -- confirm.
            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        chunk_bytes = chunk_bytes.decode("utf-8")

                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
                        chunk = chunk_bytes.removeprefix("data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True
                    # `data` holds the final streamed event at this point.
                    output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output
async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to a TensorRT-LLM (Triton) endpoint.

    Args:
        request_func_input: Prompt, endpoint URL and generation parameters.
        pbar: Optional progress bar advanced by one on completion.

    Returns:
        RequestFuncOutput with TTFT, inter-token latencies, total latency and
        the accumulated generated text; ``success`` is False on errors.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        if request_func_input.ignore_eos:
            # Pin the minimum length so exactly output_len tokens are produced.
            payload["min_length"] = request_func_input.output_len
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data:")

                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = timestamp - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True

                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output
async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one (non-streaming) request to a DeepSpeed-MII server.

    Args:
        request_func_input: Prompt, endpoint URL and generation parameters.
        pbar: Optional progress bar advanced by one on completion.

    Returns:
        RequestFuncOutput with total latency and generated text. TTFT is 0
        because MII does not stream. ``success`` is False on HTTP/transport
        errors or on an unrecognized response shape.
    """
    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    if "choices" in parsed_resp:
                        output.generated_text = parsed_resp["choices"][0][
                            "text"]
                        output.success = True
                    elif "text" in parsed_resp:
                        output.generated_text = parsed_resp["text"][0]
                        output.success = True
                    else:
                        # BUG FIX: success was previously set to True
                        # unconditionally after this branch, masking the
                        # unexpected-format error.
                        output.error = ("Unexpected response format: "
                                        "neither 'choices' nor 'text' found")
                        output.success = False
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output
async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to an OpenAI-compatible /completions API.

    Streams chunks to measure TTFT and inter-token latencies, and reads the
    final usage record (enabled via stream_options.include_usage) for the
    output token count. ``success`` is False if no token chunk arrived.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
                            if choices := data.get("choices"):
                                # Note that text could be empty here
                                # e.g. for special tokens
                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
                                    first_chunk_received = True
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += text or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT."
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))
    if pbar:
        pbar.update(1)
    return output
async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming request to an OpenAI-compatible /chat/completions API.

    Supports optional multi-modal message content. Measures TTFT and
    inter-token latencies from the streamed delta chunks and reads the usage
    record for the output token count.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("chat/completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "messages": [
                {
                    "role": "user",
                    "content": content
                },
            ],
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                content = choices[0]["delta"].get("content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                generated_text += content or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")

                            # NOTE(review): advanced for every non-[DONE]
                            # chunk, including usage-only chunks -- confirm
                            # that is intended for latency accounting.
                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output
def get_model(pretrained_model_name_or_path: str) -> str:
    """Resolve a model ID to a local path, downloading via ModelScope when
    the VLLM_USE_MODELSCOPE environment variable is set to 'true'.
    """
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() != 'true':
        return pretrained_model_name_or_path

    from modelscope import snapshot_download
    from vllm.model_executor.model_loader.weight_utils import get_lock

    # Use file lock to prevent multiple processes from
    # downloading the same model weights at the same time.
    with get_lock(pretrained_model_name_or_path):
        return snapshot_download(
            model_id=pretrained_model_name_or_path,
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
def get_tokenizer(
    pretrained_model_name_or_path: str,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Load a tokenizer, resolving remote model IDs to local paths first."""
    if pretrained_model_name_or_path is not None and not os.path.exists(
            pretrained_model_name_or_path):
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)

    if tokenizer_mode == "slow":
        # A fast tokenizer contradicts slow mode; reject the combination.
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    if tokenizer_mode != "mistral":
        return AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )

    # Mistral mode needs vLLM's tokenizer wrapper; import lazily so the
    # benchmark can run without vLLM for the other modes.
    try:
        from vllm.transformers_utils.tokenizer import MistralTokenizer
    except ImportError as e:
        raise ImportError("MistralTokenizer requires vllm package.\n"
                          "Please install it with `pip install vllm` "
                          "to use mistral tokenizer mode.") from e
    return MistralTokenizer.from_pretrained(
        str(pretrained_model_name_or_path))
# Registry mapping backend name -> async request function used by the
# benchmark driver.
ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
}

# Backend names that speak the OpenAI-compatible completions/chat API.
OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_openai_chat_completions)
]
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import math
import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    on metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    # Opt-in via environment variable; otherwise emit nothing.
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return []

    records = []
    for metric_name, values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": metric_name,
                "benchmark_values": values,
                "extra_info": extra_info,
            },
        }

        args_info = record["benchmark"]["extra_info"]["args"]
        # Save tensor_parallel_size parameter if it's part of the metadata
        # but missing (or falsy) in the CLI args.
        if not args_info.get("tensor_parallel_size") and \
                "tensor_parallel_size" in extra_info:
            args_info["tensor_parallel_size"] = (
                extra_info["tensor_parallel_size"])
        records.append(record)

    return records
class InfEncoder(json.JSONEncoder):
    """JSON encoder that serializes infinite floats as the string "inf"."""

    def clear_inf(self, o: Any):
        # Recursively walk containers, mapping any infinite float (positive
        # or negative) to the string "inf"; everything else passes through.
        if isinstance(o, float):
            return "inf" if math.isinf(o) else o
        if isinstance(o, dict):
            return {key: self.clear_inf(value) for key, value in o.items()}
        if isinstance(o, list):
            return [self.clear_inf(item) for item in o]
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        # Sanitize the whole object tree before delegating to the base encoder.
        return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
    """Write `records` to `filename` as JSON, encoding inf values as "inf"."""
    payload = json.dumps(records, cls=InfEncoder)
    with open(filename, "w") as out_file:
        out_file.write(payload)
#!/bin/bash
# ModelScope CLI batch download script
# Usage: ./ms_download.sh -f model_list.cfg [-F to force re-download]
pip install modelscope -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Argument parsing
CONFIG_FILE=""
FORCE_DOWNLOAD=false
MODELSCOPE_CMD="modelscope download"
while getopts "f:F" opt; do
case $opt in
f) CONFIG_FILE="$OPTARG" ;;
F) FORCE_DOWNLOAD=true ;;
*) echo "Usage: $0 -f config.cfg [-F]" >&2
exit 1
esac
done
# Validate the config file
if [ ! -f "$CONFIG_FILE" ]; then
echo "Error: Config file $CONFIG_FILE not found!" >&2
exit 1
fi
# Ensure the modelscope CLI is available
if ! command -v modelscope &> /dev/null; then
echo "Error: modelscope CLI not installed. Please install with: pip install modelscope" >&2
exit 1
fi
# 读取配置文件
TOTAL=0
SUCCESS=0
FAILED=0
echo "=== Starting batch download ==="
while IFS=';' read -r model_id local_dir || [[ -n "$model_id" ]]; do
# 跳过空行和注释
[[ -z "$model_id" || "$model_id" =~ ^# ]] && continue
((TOTAL++))
# 清理变量
model_id=$(echo "$model_id" | xargs)
local_dir=$(echo "$local_dir" | xargs)
echo -e "\n[Progress] $TOTAL. Downloading $model_id"
echo "[Location] $local_dir"
# 检查目录是否存在
if [ "$FORCE_DOWNLOAD" = false ] && [ -d "$local_dir" ]; then
echo "[Status] Skipped (already exists)"
((SUCCESS++))
continue
fi
# 创建目录
mkdir -p "$local_dir" || {
echo "[Error] Failed to create directory $local_dir" >&2
((FAILED++))
continue
}
# 执行下载命令
if $MODELSCOPE_CMD --model "$model_id" --local_dir "$local_dir"; then
echo "[Status] Download successful"
((SUCCESS++))
else
echo "[Error] Download failed" >&2
((FAILED++))
# 删除空目录防止残留
rmdir "$local_dir" 2>/dev/null
fi
done < "$CONFIG_FILE"
# 结果统计
echo -e "\n=== Download summary ==="
echo "Total: $TOTAL"
echo "Success: $SUCCESS"
echo "Failed: $FAILED"
# 退出状态
if [ "$FAILED" -gt 0 ]; then
exit 1
else
exit 0
fi
\ No newline at end of file
#!/bin/bash
# Pipeline entrypoint: environment check -> model download -> benchmark.
# Run the environment check
echo "==================== 开始系统环境检查 ===================="
/workspace/scripts/run_envcheck.sh
# Download the models listed in the config
echo "==================== 开始模型下载 ===================="
/workspace/scripts/download_model.sh -f /workspace/configs/download-list.cfg
# Run the performance tests
echo "==================== 开始性能测试 ===================="
/workspace/scripts/run_benchmark.sh
echo "==================== 所有测试完成 ===================="
\ No newline at end of file
#!/bin/bash
# Benchmark driver: for each model listed in ../configs/model_to_test.cfg,
# generate a vLLM server launch script, start the server, wait until it is
# healthy (or fails / times out), run ./test.sh against it, then tear it down.
# Initialize output directories
mkdir -p /workspace/test/inference_outputs/results
mkdir -p /workspace/test/inference_outputs/logs/server
mkdir -p /workspace/test/inference_outputs/logs/models
# Base port; each model gets the next port
BASE_PORT=8001
# Read the config file (semicolon-separated fields)
while IFS=';' read -r model_name model_path tp data_type batch_list prompt_pairs max_model_len gpu_mem_util; do
# Clean fields (strip whitespace and quotes)
model_name=$(echo "$model_name" | xargs)
model_path=$(echo "$model_path" | xargs)
tp=$(echo "$tp" | xargs)
data_type=$(echo "$data_type" | xargs)
batch_list=$(echo "$batch_list" | tr -d '"' | xargs)
prompt_pairs=$(echo "$prompt_pairs" | tr -d '()"' | xargs)
max_model_len=$(echo "$max_model_len" | xargs)
gpu_mem_util=$(echo "$gpu_mem_util" | xargs)
# Allocate a dedicated port for this model
port=$((BASE_PORT++))
# Generate the per-model server launch script (server.sh).
# NOTE(review): HIP_VISIBLE_DEVICES is hard-coded to 8 devices regardless of
# the configured tensor-parallel size -- confirm this is intended.
cat > "server_${model_name}_tp${tp}.sh" <<EOF
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
vllm serve "$model_path" --trust-remote-code \\
--enable-prefix-caching \\
--dtype $data_type \\
--tensor-parallel-size $tp \\
--max-model-len $max_model_len \\
--port $port \\
--gpu-memory-utilization $gpu_mem_util
EOF
# Make the generated script executable
chmod +x "server_${model_name}_tp${tp}.sh"
echo "Generated server script for ${model_name}_tp${tp} at server_${model_name}_tp${tp}.sh"
# 1. Start the vLLM server in the background, logging to server.log
./server_${model_name}_tp${tp}.sh > "/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log" 2>&1 &
SERVER_PID=$!
# 2. Log-based server status probe
check_server_status() {
# Returns 0 once the success banner appears in the log, 1 on a matched
# error pattern or a dead server process, 2 to keep waiting.
local log_file=$1
local server_pid=$2
local success_msg="Starting vLLM API server on http://0.0.0.0"
local error_patterns=("RuntimeError" "ValueError" "segmentation fault" "core dumped" )
# Look for the success banner
if grep -q "$success_msg" "$log_file"; then
echo "✅ Server started successfully!"
return 0
fi
# Look for known error patterns
for pattern in "${error_patterns[@]}"; do
if grep -i -q "$pattern" "$log_file"; then
echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Error detected in logs (pattern: $pattern)!"
echo "===== ERROR CONTEXT ====="
grep -i -A 5 -B 5 "$pattern" "$log_file" | tail -n 20
echo "========================="
return 1
fi
done
# Is the server process still alive?
if ! kill -0 $server_pid 2>/dev/null; then
echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Server process died unexpectedly!"
echo "===== LAST LOG LINES ====="
tail -n 20 "$log_file"
echo "========================="
return 1
fi
# Default: keep waiting
return 2
}
# 3. Wait for the server to start or fail
echo -e "\n🔍 [$(date '+%Y-%m-%d %H:%M:%S')] Starting monitoring for ${model_name}_tp${tp} (PID: $SERVER_PID)"
max_wait_seconds=20000
start_time=$(date +%s)
log_file="/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log"
while true; do
sleep 20 # poll every 20 seconds
check_server_status "$log_file" "$SERVER_PID"
status=$?
# Success
if [ $status -eq 0 ]; then
break
fi
# Failure
if [ $status -eq 1 ]; then
# Clean up resources
kill $SERVER_PID 2>/dev/null
pkill -f "vllm serve" 2>/dev/null
echo "🛑 Cleaned up resources after failure"
# Skip straight to the next model (continue the outer read loop)
continue 2
fi
# Timeout check
current_time=$(date +%s)
elapsed=$((current_time - start_time))
if [ $elapsed -ge $max_wait_seconds ]; then
echo -e "\n⏰ [$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for server to start!"
# Clean up resources
kill $SERVER_PID 2>/dev/null
pkill -f "vllm serve" 2>/dev/null
echo "🛑 Cleaned up resources after timeout"
# Skip straight to the next model (continue the outer read loop)
continue 2
fi
echo "Waiting... (${elapsed}s elapsed)"
done
# 4. Run the tests only after a successful start
echo -e "\n🚀 [$(date '+%Y-%m-%d %H:%M:%S')] Running tests for ${model_name}_tp${tp}..."
# Export the test parameters for test.sh
export MODEL_NAME="$model_name"
export MODEL_PATH="$model_path"
export TP="$tp"
export DATA_TYPE="$data_type"
export BATCH_LIST="$batch_list"
export PROMPT_PAIRS="$prompt_pairs"
export PORT="$port"
# Run the tests (expects cwd = /workspace/scripts)
./test.sh
# 5. Clean up after the tests
kill $SERVER_PID
pkill -f "vllm serve" 2>/dev/null
echo "✅ [$(date '+%Y-%m-%d %H:%M:%S')] ${model_name}_tp${tp} test completed and cleaned up"
done < <(grep -v '^#' ../configs/model_to_test.cfg | grep -v '^$')
echo -e "\n📊 [$(date '+%Y-%m-%d %H:%M:%S')] All tests completed. Results saved to results/"
\ No newline at end of file
#!/bin/bash
set -eo pipefail  # strict error handling: abort on unhandled errors and failed pipelines
# Every individual check writes one log file under this directory.
log_dir="/workspace/test/env_check_outputs/"
mkdir -p "$log_dir"
echo "==================== 开始系统环境检查 ===================="
# Enhanced check helpers -- log a failure and keep going instead of aborting
# Run a check command, mirroring its combined stdout/stderr to a per-check log.
#   $1 - short identifier (unused; kept for call-site compatibility)
#   $2 - base name of the log file written under $log_dir
#   $3.. - the command and its arguments
# Returns 0 when the command succeeds, 1 on failure (never aborts the script;
# with the global pipefail, a failing command is detected through the tee pipe).
run_test() {
    local name="$1" chinese_name="$2"
    shift 2
    echo "[RUN] $chinese_name"
    if "$@" 2>&1 | tee "$log_dir/${chinese_name}.log"; then
        return 0
    fi
    echo "[WARN] $chinese_name 检查失败" | tee -a "$log_dir/${chinese_name}.log"
    return 1
}
# Run a shell command *string* (may contain pipes/redirection) via `bash -c`,
# mirroring its combined output to a per-check log.
#   $1 - short identifier (unused; kept for call-site compatibility)
#   $2 - base name of the log file written under $log_dir
#   $3 - the command string to execute
# Returns 0 on success, 1 on failure (never aborts the script).
run_pipe_test() {
    local name="$1" chinese_name="$2" cmd="$3"
    echo "[RUN] $chinese_name"
    if bash -c "$cmd" 2>&1 | tee "$log_dir/${chinese_name}.log"; then
        return 0
    fi
    echo "[WARN] $chinese_name 检查失败" | tee -a "$log_dir/${chinese_name}.log"
    return 1
}
# Run each argument (a shell snippet) under one section banner.  A snippet
# that fails is recorded in error.log and the remaining snippets still run,
# so a single broken check cannot kill the whole section.
safe_run() {
    local section="$1"
    shift
    echo "==================== $section ===================="
    local cmd
    for cmd in "$@"; do
        # eval is needed so quoting inside each snippet is honored
        if eval "$cmd"; then
            continue
        fi
        echo "[WARN] 命令执行失败: $cmd" | tee -a "$log_dir/error.log"
    done
}
# ------------------------- 1. System basics -------------------------
safe_run "1.系统基础信息检查" \
'run_test uname "01_系统内核信息" uname -a' \
'run_test os_release "02_操作系统版本" cat /etc/os-release' \
'run_test locale "03_系统语言环境" locale'
# ------------------------- 2. CPU & memory -------------------------
safe_run "2.CPU_内存检查" \
'run_test cpu_info "04_CPU详细信息" lscpu' \
'run_test cpu_cores "05_CPU核心数" nproc' \
'run_pipe_test cpu_freq "06_CPU频率" "cat /proc/cpuinfo | grep \"MHz\" | sort -u"' \
'run_test memory_usage "07_内存使用情况" free -h' \
'run_test vm_stat "08_系统整体CPU和内存使用情况" vmstat 1 10' \
'run_test numa_nodes "09_NUMA节点信息" numactl --hardware || true' \
'run_pipe_test cpu_usage "10_CPU利用率检查" "mpstat -P ALL 1 5"' \
'run_pipe_test cpu_top_usage "11_CPU占用最高进程检查" "ps -eo pid,%cpu,cmd --sort=-%cpu | head -n 10"'
# ------------------------- 3. Storage devices -------------------------
# NOTE(review): check 13 pipes run_test's *own* output through `column -t`
# (the pipe is outside the run_test call once eval'd); run_pipe_test with the
# pipe inside the string was likely intended -- confirm.
safe_run "3.存储设备检查" \
'run_test disk_usage "12_磁盘使用情况" df -hT' \
'run_test mount_info "13_挂载信息" mount | column -t' \
'run_test block_devices "14_块设备信息" lsblk -o NAME,SIZE,TYPE,MOUNTPOINT'
# ------------------------- 4. Network -------------------------
safe_run "4.网络检查" \
'run_test netstat "15_网络连接状态" ss -tulnp' \
'run_test network_interfaces "16_网络接口信息" ip -br a' \
'run_test routing_table "17_路由表信息" ip route' \
'run_test arp_table "18_ARP表信息" ip neigh' \
'run_test ibdev2netdev "19_InfiniBand设备映射" ibdev2netdev' \
'run_test topo "20_网卡-dcu-topo" lspci -vt '
# ------------------------- 5. DCU / kernel / driver -------------------------
safe_run "5.DCU_内核_驱动检查" \
'run_test hy_smi "21_DCU设备状态" hy-smi' \
'run_test clock_level "22_DCU时钟级别" hy-smi -g' \
'run_test driverversion "23_DCU驱动版本" hy-smi --showdriverversion' \
'run_test rocminfo "24_ROCM信息" rocminfo' \
'run_test kernel_modules "25_已加载内核模块" lsmod' \
'run_test kernel_version "26_内核版本" uname -r'
# ------------------------- 6. Software stack -------------------------
safe_run "6.软件栈检查" \
'run_test pip_list "27_Python包列表" pip list' \
'run_test glibc_version "28_GLIBC版本" ldd --version'
# ------------------------- 7. Other hardware state -------------------------
safe_run "7.其他硬件状态检查" \
'run_test lspci "29_PCI设备列表" lspci' \
'run_test iostat "30_IO统计信息" iostat' \
'run_test hardware_info "31_硬件摘要信息" lshw -short || true' \
'run_pipe_test ACS_stat "32_ACS状态检查" "lspci -vvv | grep -i acsct"' \
'run_test dmesg "33_内核日志" dmesg' \
'run_pipe_test pcie_topology "34_PCIe拓扑结构" "echo \"====== PCIe 桥接器 ======\"; lspci -vvv | grep -E \"PCI bridge|Root port\" -A 20 | grep -E \"Device|Vendor|LnkSta:|LnkCap:|Secondary bus\"; echo \"\"; echo \"====== PCIe 带宽汇总 ======\"; lspci -vvv | grep \"LnkSta:\" | sort | uniq -c"' \
'run_pipe_test storage_details "35_存储控制器详情" "echo \"====== 存储控制器 ======\"; lspci -vvv | grep -E \"NVMe|SATA|RAID|Storage controller\" -A 30 | grep -E \"Device|Vendor|Kernel driver|LnkSta:|Speed|Width|MSI|Bar Memory\""' \
'run_pipe_test nic_details "36_网卡详细信息" "echo \"====== 网卡详细信息 ======\"; lspci -vvv | grep -E \"Ethernet controller|Network controller|InfiniBand\" -A 50 | grep -E \"Device|Vendor|Subsystem|Kernel driver|Kernel modules|LnkSta:|LnkCap:|NUMA node|Speed|Width\""' \
'run_pipe_test iommu_stat "37_IOMMU状态" "dmesg | grep IOMMU"' \
'run_pipe_test SELinux_stat "38_SELinux状态" "dmesg | grep SELinux"'
# ------------------------- 8. Bandwidth checks -------------------------
# DTK environment (BandwidthTest, rccl paths) must be sourced first.
source /opt/dtk/env.sh
# NOTE(review): the quoted snippets below execute `cd` and `exit 1` via
# safe_run's eval, i.e. in the *current* shell: a failing cd/unzip aborts the
# entire script (defeating safe_run's continue-on-error intent), and a
# successful cd changes the working directory for all later sections -- confirm
# this is intended.
safe_run "8.带宽检查" \
'run_test D2D-a_test "39_D2D单向带宽测试" /opt/dtk/bin/BandwidthTest -a -s 512MB ' \
'run_test D2D-A_test "40_D2D双向带宽测试" /opt/dtk/bin/BandwidthTest -A -s 512MB ' \
'run_test D2H-H2D_test "41_D2H和H2D带宽测试" /opt/dtk/bin/BandwidthTest -t 3 ' \
'cd /workspace/test/env_check_tools/ || { echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"; exit 1; }' \
'if [ -f "rccl-tests.zip" ]; then
echo "[INFO] 发现 rccl-tests.zip,开始解压..."
unzip -o rccl-tests.zip -d rccl-tests || {
echo "[ERROR] rccl-tests.zip 解压失败" | tee "$log_dir/42_RCCL测试解压失败.log"
exit 1
}
cd rccl-tests/rccl-tests || { echo "[ERROR] 无法进入rccl-tests目录"; exit 1; }
if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/43_RCCL_all_reduce_8卡测试.log" || true
./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/44_RCCL_all_reduce_4卡测试.log" || true
else
echo "[ERROR] RCCL编译失败" | tee "$log_dir/45_RCCL编译失败.log"
fi
cd ../..
else
echo "[WARN] 未找到 rccl-tests.zip,跳过 RCCL 测试" | tee "$log_dir/46_RCCL测试跳过.log"
fi'
# ------------------------- 9. DCU environment check -------------------------
safe_run "9.DCU环境检查" \
'cd /workspace/test/env_check_tools/ || { echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"; exit 1; }' \
'if [ -f "dcu_env_check.zip" ]; then
echo "[INFO] 发现 dcu_env_check.zip,开始解压..."
unzip -o dcu_env_check.zip -d dcu_env_check || {
echo "[ERROR] dcu_env_check.zip 解压失败" | tee "$log_dir/47_DCU环境检查解压失败.log"
exit 1
}
chmod +x dcu_env_check/dcu_env_check-main/tools/*
cd dcu_env_check/dcu_env_check-main && {
bash system_check.sh 2>&1 | tee "$log_dir/48_DCU环境检查结果.log" || true
cp system_info* /workspace/test/env_check_outputs/ || true
cd ../..
} || {
echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/49_DCU环境检查执行失败.log"
}
else
echo "[WARN] 未找到 dcu_env_check.zip,跳过 DCU 环境检查" | tee "$log_dir/50_DCU环境检查跳过.log"
fi'
# Final summary: show where every log landed.
echo "==================== 检查完成 ===================="
echo "所有日志已保存至: $log_dir"
ls -lh "$log_dir"
\ No newline at end of file
#!/bin/bash
# Benchmark driver: reads model/test parameters from environment variables
# (exported by the calling script), then runs benchmark_serving.py for every
# batch size x prompt/completion pair and records one CSV row per run.
# Pin all eight DCUs and the vLLM / NCCL tuning knobs for this benchmark run.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
# Bind each tensor-parallel rank to its own NUMA node (rank N -> node N).
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
# Read parameters from the environment (set by the calling script).
model_name=${MODEL_NAME}
model_path=${MODEL_PATH}
tp=${TP}
data_type=${DATA_TYPE}
batch_list=${BATCH_LIST}
prompt_pairs=${PROMPT_PAIRS}
port=${PORT}
# Build the result file path and write the CSV header (truncates old results).
result_file="/workspace/test/inference_outputs/results/${model_name}_tp${tp}.csv"
echo "tp,data_type,batch,prompt_tokens,completion_tokens,TOTAL_THROUGHPUT(toks/s),generate_throughput(toks/s),TTFT(ms),TPOT(ms),ITL(ms)" > "$result_file"
# Split the space-separated batch list and comma-separated prompt-pair list
# into arrays; each pair is "prompt_tokens completion_tokens".
IFS=' ' read -ra batches <<< "$batch_list"
IFS=',' read -ra pairs <<< "$prompt_pairs"
# Run one benchmark per (batch, prompt-pair) combination.
for batch in "${batches[@]}"; do
for pair in "${pairs[@]}"; do
IFS=' ' read -r prompt_tokens completion_tokens <<< "$pair"
log_file="/workspace/test/inference_outputs/logs/models/${model_name}_${tp}/batch_${batch}_prompt_${prompt_tokens}_completion_${completion_tokens}.log"
mkdir -p "$(dirname "$log_file")"
echo "Running: batch=$batch, prompt=$prompt_tokens, completion=$completion_tokens"
python benchmark_serving.py \
--backend openai \
--port "$port" \
--model "$model_path" \
--trust-remote-code \
--dataset-name random \
--ignore-eos \
--random-input-len "$prompt_tokens" \
--random-output-len "$completion_tokens" \
--num-prompts "$batch" \
2>&1 | tee "$log_file"
# Extract metrics from the benchmark log.
# NOTE(review): these greps/awk column indices assume the exact summary-line
# format printed by benchmark_serving.py ("Total Token ...", "Mean TTFT ...");
# a format change upstream silently yields empty CSV fields -- confirm.
TOTAL_THROUGHPUT=$(grep "^Total Token" "$log_file" | awk '{print $5}')
GEN_THROUGHPUT=$(grep "^Output token" "$log_file" | awk '{print $5}')
TTFT=$(grep "^Mean TTFT" "$log_file" | awk '{print $4}')
TPOT=$(grep "^Mean TPOT" "$log_file" | awk '{print $4}')
ITL=$(grep "^Mean ITL" "$log_file" | awk '{print $4}')
echo "$tp,$data_type,$batch,$prompt_tokens,$completion_tokens,$TOTAL_THROUGHPUT,$GEN_THROUGHPUT,$TTFT,$TPOT,$ITL" >> "$result_file"
done
done
# Build the test image, then run it with the host mounts and devices the DCU
# stack needs.  Fixes: "$PWD" is quoted so host paths containing spaces do not
# word-split, and the dangling trailing "\" after the image name is removed
# (it silently continued the command into whatever line came next).
# NOTE(review): --privileged already grants all devices and capabilities, so
# the explicit --device/--cap-add flags are redundant but kept for clarity.
docker build -t vllm-test1 . && \
docker run \
-v /usr/local/hyhal:/usr/local/hyhal:ro \
-v /opt/hyhal:/opt/hyhal:ro \
-v "$PWD/outputs/env_check_outputs":/workspace/test/env_check_outputs/ \
-v "$PWD/outputs/models":/workspace/test/models/ \
-v "$PWD/outputs/inference_outputs":/workspace/test/inference_outputs/ \
--ipc=host \
--network=host \
--cap-add=SYS_PTRACE \
--group-add video \
--ulimit memlock=-1:-1 \
--privileged \
--device=/dev/kfd \
--device=/dev/mkfd \
--device=/dev/dri \
--shm-size=500G \
-u root \
--security-opt seccomp=unconfined \
vllm-test1
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment