#!/bin/bash
#
# Sequentially benchmarks every model listed in ../configs/model_to_test.cfg.
#
# For each config line the script:
#   1. generates a per-model launcher script for the vLLM server,
#   2. starts the server in the background and polls its log until it reports
#      startup, logs an error, dies, or the wait times out,
#   3. on successful startup runs ./test.sh with the model parameters exported,
#   4. stops the server before moving on to the next model.
#
# Config format (one model per line, ';'-separated, '#' lines are comments):
#   model_name;model_path;tp;data_type;batch_list;prompt_pairs;max_model_len;gpu_mem_util
#
# The config path and ./test.sh are resolved relative to the current working
# directory, so run the script from the directory that contains test.sh.

# Root directory for generated launcher scripts, logs and results.
readonly OUTPUT_ROOT="/workspace/test/inference_outputs"
# Semicolon-separated model list.
readonly CONFIG_FILE="../configs/model_to_test.cfg"
# Give up on a server that has not reported startup after this many seconds.
readonly MAX_WAIT_SECONDS=20000
# Seconds between two inspections of the server log.
readonly POLL_INTERVAL_SECONDS=20
# Seconds a server gets to exit after SIGTERM before it is SIGKILLed.
readonly STOP_GRACE_SECONDS=60

# First port to hand out; each config line gets the next one.
BASE_PORT=8001

#######################################
# Prints the current local time as "YYYY-MM-DD HH:MM:SS".
#######################################
timestamp() {
  date '+%Y-%m-%d %H:%M:%S'
}

#######################################
# Normalises one config field by passing it through xargs, which strips
# leading/trailing whitespace and removes shell-style quotes.
# Arguments: $1 - raw field text
# Outputs:   cleaned field on stdout
#######################################
trim_field() {
  printf '%s\n' "$1" | xargs
}

#######################################
# Inspects the server log and process to classify the startup state.
# Arguments: $1 - path to the server log file
#            $2 - PID of the server process
# Outputs:   a status message (and log excerpt on failure) on stdout
# Returns:   0 - the log contains the startup message
#            1 - an error pattern was logged, or the process is gone
#            2 - neither yet; the caller should keep waiting
#######################################
check_server_status() {
  local log_file=$1
  local server_pid=$2
  local -r success_msg="Starting vLLM API server"
  local -r error_patterns=(
    "RuntimeError"
    "ValueError"
    "segmentation fault"
    "core dumped"
  )
  local pattern

  # The log may not exist yet; only grep it once it is readable.
  if [[ -r "$log_file" ]]; then
    # Success is checked first so that a started server wins over any
    # error-looking text logged earlier.
    if grep -q -- "$success_msg" "$log_file"; then
      echo "✅ Server started successfully!"
      return 0
    fi

    for pattern in "${error_patterns[@]}"; do
      if grep -i -q -- "$pattern" "$log_file"; then
        echo -e "\n❌ [$(timestamp)] Error detected in logs (pattern: $pattern)!"
        echo "===== ERROR CONTEXT ====="
        grep -i -A 5 -B 5 -- "$pattern" "$log_file" | tail -n 20
        echo "========================="
        return 1
      fi
    done
  fi

  # kill -0 sends no signal; it only tests whether the process still exists.
  if ! kill -0 "$server_pid" 2>/dev/null; then
    echo -e "\n❌ [$(timestamp)] Server process died unexpectedly!"
    echo "===== LAST LOG LINES ====="
    tail -n 20 "$log_file" 2>/dev/null
    echo "========================="
    return 1
  fi

  return 2
}

#######################################
# Stops the server and waits until it is really gone, so the next model does
# not start while the GPU memory or port is still held.
# Arguments: $1 - PID of the server process
# Returns:   always 0 (cleanup is best effort)
#######################################
stop_server() {
  local server_pid=$1
  local waited=0

  # Errors are silenced on purpose: the process may already have exited.
  kill "$server_pid" 2>/dev/null
  # Also catches server processes that are not the tracked PID itself.
  # Note: this matches every "vllm serve" process visible to this user.
  pkill -f "vllm serve" 2>/dev/null

  while kill -0 "$server_pid" 2>/dev/null \
    && (( waited < STOP_GRACE_SECONDS )); do
    sleep 1
    waited=$((waited + 1))
  done

  if kill -0 "$server_pid" 2>/dev/null; then
    kill -9 "$server_pid" 2>/dev/null
    pkill -9 -f "vllm serve" 2>/dev/null
  fi

  # Reap the child so its PID cannot be mistaken for a live server later.
  wait "$server_pid" 2>/dev/null
  return 0
}

#######################################
# Polls check_server_status until the server is up, has failed, or the
# startup timeout expires. On failure or timeout the server is stopped.
# Arguments: $1 - path to the server log file
#            $2 - PID of the server process
# Outputs:   progress and status messages on stdout
# Returns:   0 if the server started, 1 otherwise
#######################################
wait_for_server() {
  local log_file=$1
  local server_pid=$2
  local start_time current_time elapsed status

  start_time=$(date +%s)
  while true; do
    sleep "$POLL_INTERVAL_SECONDS"

    check_server_status "$log_file" "$server_pid"
    status=$?

    if (( status == 0 )); then
      return 0
    fi

    if (( status == 1 )); then
      stop_server "$server_pid"
      echo "🛑 Cleaned up resources after failure"
      return 1
    fi

    current_time=$(date +%s)
    elapsed=$((current_time - start_time))
    if (( elapsed >= MAX_WAIT_SECONDS )); then
      echo -e "\n⏰ [$(timestamp)] Timeout waiting for server to start!"
      stop_server "$server_pid"
      echo "🛑 Cleaned up resources after timeout"
      return 1
    fi

    echo "Waiting... (${elapsed}s elapsed)"
  done
}

#######################################
# Writes the per-model launcher script for the vLLM server.
#
# NOTE(review): the here-document that originally produced this launcher was
# lost from the copy of the file under review. The command below is
# reconstructed from the config fields this script parses; confirm the flag
# set (and any extra flags such as --served-model-name or
# --trust-remote-code) against the intended launcher before relying on it.
#
# Arguments: $1 - launcher script path to create
#            $2 - model path        $3 - tensor-parallel size
#            $4 - data type         $5 - max model length
#            $6 - GPU memory utilisation   $7 - port
#######################################
write_server_script() {
  local script_path=$1
  local model_path=$2
  local tp=$3
  local data_type=$4
  local max_model_len=$5
  local gpu_mem_util=$6
  local port=$7

  # 'exec' makes the launcher's PID the server's PID, so the liveness check
  # and kill in this script act on the server itself.
  cat > "$script_path" <<EOF
#!/bin/bash
exec vllm serve "${model_path}" \\
  --port "${port}" \\
  --tensor-parallel-size "${tp}" \\
  --dtype "${data_type}" \\
  --max-model-len "${max_model_len}" \\
  --gpu-memory-utilization "${gpu_mem_util}"
EOF
  chmod +x "$script_path"
}

#######################################
# Entry point: iterates over the config file and benchmarks each model.
# Globals:   BASE_PORT (read and incremented)
# Returns:   1 if the config file cannot be read, otherwise 0
#######################################
main() {
  local model_name model_path tp data_type batch_list prompt_pairs
  local max_model_len gpu_mem_util
  local port server_script log_file server_pid

  mkdir -p \
    "${OUTPUT_ROOT}/results" \
    "${OUTPUT_ROOT}/logs/server" \
    "${OUTPUT_ROOT}/logs/models"

  if [[ ! -r "$CONFIG_FILE" ]]; then
    echo "Config file not found or unreadable: ${CONFIG_FILE}" >&2
    return 1
  fi

  # The config is read on fd 3 rather than stdin so that the server and
  # test.sh, which inherit stdin, cannot swallow the remaining config lines.
  while IFS=';' read -r -u 3 model_name model_path tp data_type batch_list \
    prompt_pairs max_model_len gpu_mem_util; do
    # Clean the fields (strip whitespace and quotes).
    model_name=$(trim_field "$model_name")
    model_path=$(trim_field "$model_path")
    tp=$(trim_field "$tp")
    data_type=$(trim_field "$data_type")
    batch_list=$(printf '%s\n' "$batch_list" | tr -d '"' | xargs)
    prompt_pairs=$(printf '%s\n' "$prompt_pairs" | tr -d '()"' | xargs)
    max_model_len=$(trim_field "$max_model_len")
    gpu_mem_util=$(trim_field "$gpu_mem_util")

    if [[ -z "$model_name" || -z "$model_path" ]]; then
      echo "Skipping malformed config line (empty model name or path)" >&2
      continue
    fi

    # Allocate the next port.
    port=$((BASE_PORT++))

    server_script="${OUTPUT_ROOT}/server_${model_name}_tp${tp}.sh"
    log_file="${OUTPUT_ROOT}/logs/server/${model_name}_tp${tp}_server.log"

    # 1. Generate the launcher and start the server in the background.
    write_server_script "$server_script" "$model_path" "$tp" "$data_type" \
      "$max_model_len" "$gpu_mem_util" "$port"
    bash "$server_script" > "$log_file" 2>&1 < /dev/null &
    server_pid=$!

    # 2./3. Wait for the server to come up or fail.
    echo -e "\n🔍 [$(timestamp)] Starting monitoring for ${model_name}_tp${tp} (PID: $server_pid)"
    if ! wait_for_server "$log_file" "$server_pid"; then
      # Startup failed or timed out; move on to the next model.
      continue
    fi

    # 4. Run the tests only after a successful startup.
    echo -e "\n🚀 [$(timestamp)] Running tests for ${model_name}_tp${tp}..."

    export MODEL_NAME="$model_name"
    export MODEL_PATH="$model_path"
    export TP="$tp"
    export DATA_TYPE="$data_type"
    export BATCH_LIST="$batch_list"
    export PROMPT_PAIRS="$prompt_pairs"
    export PORT="$port"

    ./test.sh

    # 5. Clean up once the tests have finished.
    stop_server "$server_pid"
    echo "✅ [$(timestamp)] ${model_name}_tp${tp} test completed and cleaned up"
  done 3< <(grep -Ev '^(#|$)' "$CONFIG_FILE")

  echo -e "\n📊 [$(timestamp)] All tests completed. Results saved to results/"
}

main "$@"