#!/usr/bin/env bash
#
# step3p5_benchmark_test.sh

# Treat any expansion of an unset variable as a fatal error.
set -o nounset

# --- Benchmark target -------------------------------------------------------
# Path handed to `vllm bench serve --model`; the served model name uses the
# same string.
MODEL_PATH='/module/step3.5-fp8/'
SERVED_MODEL_NAME="$MODEL_PATH"
DATASET_NAME='random'

# --- Sweep defaults ---------------------------------------------------------
# Batch sizes used when no list is supplied on the command line.
DEFAULT_BATCH_SIZES=(1 8 16 32 64 128)
# Role selected when --role is not given.
DEFAULT_ROLE='decode'
# Output lengths applied per role when --output-len is not given.
DEFAULT_OUTPUT_LEN_PREFILL=1
DEFAULT_OUTPUT_LEN_DECODE=4096

# --- Runner settings --------------------------------------------------------
# Value forwarded to `--ready-check-timeout`.
READY_CHECK_TIMEOUT=3
# Parent directory for result files; a per-role subdirectory is created in it.
RESULT_DIR='benchmark_result'

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the usage/description text on stdout, one entry per line
#######################################
print_usage() {
  # Each argument below becomes exactly one output line; the empty string
  # produces the blank separator between the two sections.
  printf '%s\n' \
    'Usage:' \
    '  ./scripts/step3p5_benchmark_test.sh' \
    '  ./scripts/step3p5_benchmark_test.sh 1,8,16,32' \
    '  ./scripts/step3p5_benchmark_test.sh --role prefill' \
    '  ./scripts/step3p5_benchmark_test.sh --role decode' \
    '  ./scripts/step3p5_benchmark_test.sh --role both' \
    '  ./scripts/step3p5_benchmark_test.sh --role prefill --output-len 1' \
    '' \
    'Description:' \
    '  - No argument: use default batch sizes, role=decode, output-len=4096' \
    '  - Optional positional argument: batch size list (comma or space separated)' \
    '  - Optional flag: --role <prefill|decode|both>' \
    '  - Optional flag: --output-len <N> (must be positive integer)' \
    '  - role=both 时串行执行 prefill 再 decode' \
    '  - Result files are saved under:' \
    '      <result_dir>/prefill  (when role=prefill)' \
    '      <result_dir>/decode   (when role=decode)'
}

#######################################
# Parse a batch-size list into the global BATCH_SIZES array.
# Globals:   DEFAULT_BATCH_SIZES (read), BATCH_SIZES (written)
# Arguments: $1 - optional list of positive integers separated by commas
#                 and/or whitespace; empty or missing selects the defaults
# Outputs:   error diagnostics (and the usage text for an empty list) on
#            stderr, so stdout stays clean for callers that capture it
# Returns:   0 on success; exits the shell with status 1 on invalid input
#######################################
parse_batch_sizes() {
  local raw_input="${1:-}"

  if [[ -z "$raw_input" ]]; then
    BATCH_SIZES=("${DEFAULT_BATCH_SIZES[@]}")
    return
  fi

  # Turn commas into spaces so one word-split handles both separators.
  raw_input="${raw_input//,/ }"
  read -r -a BATCH_SIZES <<< "$raw_input"

  # Input made only of separators (e.g. "," or " ") yields no elements.
  if (( ${#BATCH_SIZES[@]} == 0 )); then
    echo "[ERROR] batch size 列表为空。" >&2
    print_usage >&2
    exit 1
  fi

  local batch_size
  for batch_size in "${BATCH_SIZES[@]}"; do
    # Positive decimal integer without leading zeros.
    if ! [[ "$batch_size" =~ ^[1-9][0-9]*$ ]]; then
      echo "[ERROR] 非法 batch size: $batch_size（必须是正整数）" >&2
      exit 1
    fi
  done
}

#######################################
# Validate the requested role and store it in the global ROLE.
# Globals:   DEFAULT_ROLE (read), ROLE (written)
# Arguments: $1 - optional role: prefill, decode or both; empty or missing
#                 falls back to DEFAULT_ROLE
# Outputs:   error diagnostic and usage text on stderr for an unknown role
# Returns:   0 on success; exits the shell with status 1 on an unknown role
#######################################
parse_role() {
  local role_input="${1:-$DEFAULT_ROLE}"

  case "$role_input" in
    prefill|decode|both)
      ROLE="$role_input"
      ;;
    *)
      # Diagnostics go to stderr so they never mix with captured stdout.
      echo "[ERROR] 非法 role: $role_input（必须是 prefill、decode 或 both）" >&2
      print_usage >&2
      exit 1
      ;;
  esac
}

#######################################
# Validate the output length and store it in the global RANDOM_OUTPUT_LEN.
# Globals:   RANDOM_OUTPUT_LEN (written)
# Arguments: $1 - output length; must be a positive decimal integer without
#                 leading zeros
# Outputs:   error diagnostic and usage text on stderr for invalid input
# Returns:   0 on success; exits the shell with status 1 on invalid input
#######################################
parse_output_len() {
  # Default to empty so a missing argument gets the regular validation error
  # instead of an "unbound variable" abort when nounset is active.
  local output_len_input="${1:-}"

  if ! [[ "$output_len_input" =~ ^[1-9][0-9]*$ ]]; then
    echo "[ERROR] 非法 output-len: $output_len_input（必须是正整数）" >&2
    print_usage >&2
    exit 1
  fi

  RANDOM_OUTPUT_LEN="$output_len_input"
}

#######################################
# Derive the number of prompts to send for one batch size.
# Arguments: $1 - batch size (integer)
# Outputs:   the prompt count on stdout: 1.5x the batch size (integer
#            division), clamped to the range 16..384, then raised to the
#            batch size itself if the clamp left it smaller
#######################################
calc_num_prompts() {
  local concurrency="$1"
  local prompts=$((concurrency + concurrency / 2))

  # Clamp into [16, 384].
  (( prompts >= 16 )) || prompts=16
  (( prompts <= 384 )) || prompts=384

  # Never send fewer prompts than there are concurrent slots.
  (( prompts >= concurrency )) || prompts=$concurrency

  printf '%s\n' "$prompts"
}

#######################################
# Run the `vllm bench serve` sweep over BATCH_SIZES for a single role.
# Globals:   BATCH_SIZES, RESULT_DIR, MODEL_PATH, SERVED_MODEL_NAME,
#            DATASET_NAME, READY_CHECK_TIMEOUT, DEFAULT_OUTPUT_LEN_PREFILL,
#            DEFAULT_OUTPUT_LEN_DECODE (read);
#            RANDOM_OUTPUT_LEN (written via parse_output_len),
#            RESULT_SUBDIR (written)
# Arguments: $1 - role: prefill or decode
#            $2 - requested output length; may be empty to use the role
#                 default (for prefill it is always overridden)
# Outputs:   progress on stdout; warnings and errors on stderr
# Returns:   0 when every batch size succeeded, 1 when the result directory
#            could not be created or at least one run failed. An invalid
#            output length exits the shell through parse_output_len.
#######################################
_run_role_benchmarks() {
  local role="$1"
  local output_len="${2:-}"

  if [[ "$role" == "prefill" ]]; then
    if [[ -n "$output_len" && "$output_len" != "$DEFAULT_OUTPUT_LEN_PREFILL" ]]; then
      echo "[WARN] role=prefill 时 output-len 必须为 1，已自动覆盖为 1。" >&2
    fi
    output_len="$DEFAULT_OUTPUT_LEN_PREFILL"
  elif [[ -z "$output_len" ]]; then
    output_len="$DEFAULT_OUTPUT_LEN_DECODE"
  fi

  parse_output_len "$output_len"

  RESULT_SUBDIR="$RESULT_DIR/$role"
  if ! mkdir -p -- "$RESULT_SUBDIR"; then
    echo "[ERROR] 无法创建结果目录: $RESULT_SUBDIR" >&2
    return 1
  fi

  echo "[INFO] 将执行 ${#BATCH_SIZES[@]} 组 benchmark"
  echo "[INFO] role: $role"
  echo "[INFO] random-output-len: $RANDOM_OUTPUT_LEN"
  echo "[INFO] batch size 列表: ${BATCH_SIZES[*]}"
  echo "[INFO] result_dir: $RESULT_SUBDIR"

  local batch_size
  local num_prompts
  local failed_count=0

  for batch_size in "${BATCH_SIZES[@]}"; do
    num_prompts="$(calc_num_prompts "$batch_size")"

    echo ""
    echo "[INFO] 开始测试: role=$role, batch_size=$batch_size, max_concurrency=$batch_size, num_prompts=$num_prompts, random_output_len=$RANDOM_OUTPUT_LEN"

    # A failing run is counted and skipped so the remaining batch sizes
    # still get measured.
    if ! vllm bench serve \
      --backend vllm \
      --model "$MODEL_PATH" \
      --served-model-name "$SERVED_MODEL_NAME" \
      --dataset-name "$DATASET_NAME" \
      --random-input-len 65536 \
      --random-output-len "$RANDOM_OUTPUT_LEN" \
      --num-prompts "$num_prompts" \
      --temperature 0 \
      --max-concurrency "$batch_size" \
      --ready-check-timeout "$READY_CHECK_TIMEOUT" \
      --result-dir "$RESULT_SUBDIR" \
      --port 8018 \
      --save-result; then
      echo "[WARN] batch_size=$batch_size 执行失败，继续下一个。" >&2
      failed_count=$((failed_count + 1))
    else
      echo "[INFO] batch_size=$batch_size 执行完成。"
    fi
  done

  echo ""
  if (( failed_count > 0 )); then
    echo "[WARN] 全部执行结束，但有 $failed_count 组失败。" >&2
    return 1
  fi

  echo "[INFO] 全部 benchmark 执行完成。"
}

#######################################
# Entry point: parse the command line and run the requested benchmark(s).
# Globals:   DEFAULT_ROLE (read); BATCH_SIZES and ROLE (written via the
#            parse_* helpers)
# Arguments: [batch-size-list] [--role|-r <prefill|decode|both>]
#            [--output-len|-o <N>] [-h|--help]
# Outputs:   progress on stdout; warnings and errors on stderr
# Returns:   0 on success; exits with status 1 on invalid arguments or when
#            any benchmark run failed (including either stage of role=both)
#######################################
main() {
  local batch_arg=""
  local role_arg="$DEFAULT_ROLE"
  local output_len_arg=""

  while (( $# > 0 )); do
    case "$1" in
      -h|--help)
        print_usage
        exit 0
        ;;
      --role|-r)
        if [[ -z "${2:-}" ]]; then
          echo "[ERROR] --role 缺少参数。" >&2
          print_usage >&2
          exit 1
        fi
        role_arg="$2"
        shift 2
        ;;
      --output-len|-o)
        if [[ -z "${2:-}" ]]; then
          echo "[ERROR] --output-len 缺少参数。" >&2
          print_usage >&2
          exit 1
        fi
        output_len_arg="$2"
        shift 2
        ;;
      -*)
        # Any other dash-prefixed word is an unknown option, never a
        # batch-size list.
        echo "[ERROR] 未知参数: $1" >&2
        print_usage >&2
        exit 1
        ;;
      *)
        if [[ -n "$batch_arg" ]]; then
          echo "[ERROR] 仅支持一个 batch size 列表参数。" >&2
          print_usage >&2
          exit 1
        fi
        batch_arg="$1"
        shift
        ;;
    esac
  done

  parse_batch_sizes "$batch_arg"
  parse_role "$role_arg"

  if [[ "$ROLE" != "both" ]]; then
    _run_role_benchmarks "$ROLE" "$output_len_arg" || exit 1
    return 0
  fi

  # role=both: reject a bad --output-len before the prefill sweep starts,
  # rather than discovering it only when the decode stage begins.
  if [[ -n "$output_len_arg" ]]; then
    parse_output_len "$output_len_arg"
  fi

  local failed_stages=0

  echo "[INFO] role=both: 将串行执行 prefill 和 decode"

  # Both stages run in this process, so the script does not need to be
  # re-executable through "$0". The prefill stage ignores --output-len.
  echo "[INFO] step1: role=prefill"
  _run_role_benchmarks "prefill" "" || failed_stages=$((failed_stages + 1))

  # The decode stage runs even if prefill failed; failures are reported below.
  echo "[INFO] step2: role=decode"
  _run_role_benchmarks "decode" "$output_len_arg" || failed_stages=$((failed_stages + 1))

  if (( failed_stages > 0 )); then
    echo "[WARN] role=both 执行结束，但有 $failed_stages 个阶段失败。" >&2
    exit 1
  fi

  echo "[INFO] role=both 执行完成。"
}

# Script entry point: forward all command-line arguments to main unchanged.
main "$@"