"vscode:/vscode.git/clone" did not exist on "c9415c19d3df26d8ede611abefba35c6837cd934"
Unverified commit c61a98f5, authored by junuxyz, committed by GitHub
Browse files

[CI][BugFix] ShellCheck cleanup to remove baseline and preserve runtime behavior (#34514)


Signed-off-by: junuxyz <216036880+junuxyz@users.noreply.github.com>
parent 28bffe94
...@@ -9,10 +9,11 @@ ENV_FILE=$1 ...@@ -9,10 +9,11 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables # For testing on local vm, use `set -a` to export all variables
source /etc/environment source /etc/environment
source $ENV_FILE # shellcheck source=/dev/null
source "$ENV_FILE"
remove_docker_container() { remove_docker_container() {
docker rm -f $CONTAINER_NAME || true; docker rm -f "$CONTAINER_NAME" || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
...@@ -41,13 +42,13 @@ echo ...@@ -41,13 +42,13 @@ echo
echo "starting docker...$CONTAINER_NAME" echo "starting docker...$CONTAINER_NAME"
echo echo
docker run \ docker run \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \ -v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
--env-file $ENV_FILE \ --env-file "$ENV_FILE" \
-e HF_TOKEN="$HF_TOKEN" \ -e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \ -e TARGET_COMMIT="$BUILDKITE_COMMIT" \
-e MODEL=$MODEL \ -e MODEL="$MODEL" \
-e WORKSPACE=/workspace \ -e WORKSPACE=/workspace \
--name $CONTAINER_NAME \ --name "$CONTAINER_NAME" \
-d \ -d \
--privileged \ --privileged \
--network host \ --network host \
......
...@@ -42,21 +42,21 @@ echo "lanching vllm..." ...@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG" echo "logging to $VLLM_LOG"
echo echo
vllm serve $MODEL \ vllm serve "$MODEL" \
--seed 42 \ --seed 42 \
--max-num-seqs $MAX_NUM_SEQS \ --max-num-seqs "$MAX_NUM_SEQS" \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \ --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
--no-enable-prefix-caching \ --no-enable-prefix-caching \
--download_dir $DOWNLOAD_DIR \ --download_dir "$DOWNLOAD_DIR" \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 & --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.." echo "wait for 20 minutes.."
echo echo
# sleep 1200 # sleep 1200
# wait for 10 minutes... # wait for 10 minutes...
for i in {1..120}; do for _ in {1..120}; do
# TODO: detect other type of errors. # TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting." echo "Detected RuntimeError, exiting."
...@@ -78,11 +78,11 @@ echo "logging to $BM_LOG" ...@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo echo
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model "$MODEL" \
--dataset-name sonnet \ --dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \ --dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \ --sonnet-input-len "$INPUT_LEN" \
--sonnet-output-len $OUTPUT_LEN \ --sonnet-output-len "$OUTPUT_LEN" \
--ignore-eos > "$BM_LOG" --ignore-eos > "$BM_LOG"
echo "completed..." echo "completed..."
......
...@@ -76,16 +76,15 @@ mkdir -p "$INDICES_OUTPUT_DIR" ...@@ -76,16 +76,15 @@ mkdir -p "$INDICES_OUTPUT_DIR"
# this indices have relative paths that could work as long as it is next to the wheel directory in s3 # this indices have relative paths that could work as long as it is next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/ # i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/ # and indices can be placed in /<commit>/, or /nightly/, or /<version>/
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then alias_args=()
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS" if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
else alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
alias_arg=""
fi fi
# HACK: we do not need regex module here, but it is required by pre-commit hook # HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module # To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
# copy indices to /<commit>/ unconditionally # copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX" echo "Uploading indices to $S3_COMMIT_PREFIX"
...@@ -100,9 +99,9 @@ fi ...@@ -100,9 +99,9 @@ fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version # re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/" echo "Re-generating indices for /$pure_version/"
rm -rf "$INDICES_OUTPUT_DIR/*" rm -rf "${INDICES_OUTPUT_DIR:?}/*"
mkdir -p "$INDICES_OUTPUT_DIR" mkdir -p "$INDICES_OUTPUT_DIR"
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi fi
...@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT ...@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version) RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null) GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION" echo "Release version from Buildkite: $RELEASE_VERSION"
...@@ -55,7 +55,7 @@ mkdir -p $DIST_DIR ...@@ -55,7 +55,7 @@ mkdir -p $DIST_DIR
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory" echo "Wheels copied to local directory"
# generate source tarball # generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
ls -la $DIST_DIR ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name) # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
...@@ -65,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then ...@@ -65,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1 exit 1
fi fi
python3 -m twine check $PYPI_WHEEL_FILES python3 -m twine check "$PYPI_WHEEL_FILES"
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES"
echo "Wheels uploaded to PyPI" echo "Wheels uploaded to PyPI"
...@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels ...@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l) WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT" echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then if [ "$WHEEL_COUNT" -eq 0 ]; then
...@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] | ...@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
fi fi
# Extract version from vLLM wheel and update version-specific index # Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1) VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION" echo "Version in wheel: $VERSION"
......
...@@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL" ...@@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT" echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ====================" echo "====================== AUTO TUNEPARAMETERS ===================="
rm -rf $LOG_FOLDER rm -rf "$LOG_FOLDER"
rm -rf $PROFILE_PATH rm -rf "$PROFILE_PATH"
mkdir -p $LOG_FOLDER mkdir -p "$LOG_FOLDER"
mkdir -p $PROFILE_PATH mkdir -p "$PROFILE_PATH"
cd "$BASE/vllm" cd "$BASE/vllm"
...@@ -114,7 +114,7 @@ start_server() { ...@@ -114,7 +114,7 @@ start_server() {
# wait for 10 minutes... # wait for 10 minutes...
server_started=0 server_started=0
for i in {1..60}; do for _ in {1..60}; do
# This line checks whether the server is still alive or not, # This line checks whether the server is still alive or not,
# since that we should always have permission to send signal to the server process. # since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break kill -0 $server_pid 2> /dev/null || break
...@@ -145,12 +145,12 @@ run_benchmark() { ...@@ -145,12 +145,12 @@ run_benchmark() {
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log" echo "vllm_log: $vllm_log"
echo echo
rm -f $vllm_log rm -f "$vllm_log"
pkill -if "vllm serve" || true pkill -if "vllm serve" || true
echo "starting server..." echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead # Call start_server without a profile_dir to avoid profiling overhead
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log "" start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
result=$? result=$?
if [[ "$result" -eq 1 ]]; then if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
...@@ -168,15 +168,15 @@ run_benchmark() { ...@@ -168,15 +168,15 @@ run_benchmark() {
# --profile flag is removed from this call # --profile flag is removed from this call
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model "$MODEL" \
--dataset-name random \ --dataset-name random \
--random-input-len $adjusted_input_len \ --random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \ --random-output-len "$OUTPUT_LEN" \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate inf \ --request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 1000 \ --num-prompts 1000 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--host "$HOSTNAME" \ --host "$HOSTNAME" \
...@@ -195,20 +195,20 @@ run_benchmark() { ...@@ -195,20 +195,20 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1)) request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do while ((request_rate > 0)); do
# clear prefix cache # clear prefix cache
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
sleep 5 sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model "$MODEL" \
--dataset-name random \ --dataset-name random \
--random-input-len $adjusted_input_len \ --random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \ --random-output-len "$OUTPUT_LEN" \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate $request_rate \ --request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--host "$HOSTNAME" \ --host "$HOSTNAME" \
...@@ -255,7 +255,7 @@ gpu_memory_utilization=0.98 ...@@ -255,7 +255,7 @@ gpu_memory_utilization=0.98
find_gpu_memory_utilization=0 find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
# Pass empty string for profile_dir argument # Pass empty string for profile_dir argument
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" "" start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$? result=$?
if [[ "$result" -eq 0 ]]; then if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1 find_gpu_memory_utilization=1
...@@ -274,7 +274,7 @@ fi ...@@ -274,7 +274,7 @@ fi
for num_seqs in "${num_seqs_list[@]}"; do for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
done done
done done
echo "finish permutations" echo "finish permutations"
...@@ -285,7 +285,7 @@ echo "finish permutations" ...@@ -285,7 +285,7 @@ echo "finish permutations"
if (( $(echo "$best_throughput > 0" | bc -l) )); then if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..." echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput" echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
echo echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt" vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
...@@ -293,7 +293,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then ...@@ -293,7 +293,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
# Start server with the best params and profiling ENABLED # Start server with the best params and profiling ENABLED
echo "Starting server for profiling..." echo "Starting server for profiling..."
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH" start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag # Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..." echo "Running benchmark with profiling..."
...@@ -301,15 +301,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then ...@@ -301,15 +301,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
adjusted_input_len=$(( INPUT_LEN - prefix_len )) adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model "$MODEL" \
--dataset-name random \ --dataset-name random \
--random-input-len $adjusted_input_len \ --random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \ --random-output-len "$OUTPUT_LEN" \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate $best_request_rate \ --request-rate "$best_request_rate" \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--host "$HOSTNAME" \ --host "$HOSTNAME" \
......
...@@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do ...@@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do
else else
STATUS="FAILURE" STATUS="FAILURE"
((FAILURE_COUNT++)) ((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)") FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
fi fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE") RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
......
...@@ -71,7 +71,7 @@ while [[ $# -gt 0 ]]; do ...@@ -71,7 +71,7 @@ while [[ $# -gt 0 ]]; do
usage usage
;; ;;
*) *)
echo "Unknown argument: $1\n" printf "Unknown argument: %s\n" "$1"
usage usage
;; ;;
esac esac
...@@ -84,15 +84,17 @@ mkdir -p "$OUTPUT_DIR" ...@@ -84,15 +84,17 @@ mkdir -p "$OUTPUT_DIR"
QPS_VALUES=(25 20 15 10 5 1) QPS_VALUES=(25 20 15 10 5 1)
# Common parameters # Common parameters
COMMON_PARAMS="--backend $BACKEND \ COMMON_PARAMS=(
--model $MODEL \ --backend "$BACKEND"
--dataset $DATASET \ --model "$MODEL"
--structured-output-ratio $STRUCTURED_OUTPUT_RATIO \ --dataset "$DATASET"
--save-results \ --structured-output-ratio "$STRUCTURED_OUTPUT_RATIO"
--result-dir $OUTPUT_DIR \ --save-results
--output-len $MAX_NEW_TOKENS \ --result-dir "$OUTPUT_DIR"
--port $PORT \ --output-len "$MAX_NEW_TOKENS"
--tokenizer-mode $TOKENIZER_MODE" --port "$PORT"
--tokenizer-mode "$TOKENIZER_MODE"
)
echo "Starting structured output benchmark with model: $MODEL" echo "Starting structured output benchmark with model: $MODEL"
echo "Backend: $BACKEND" echo "Backend: $BACKEND"
...@@ -109,17 +111,17 @@ for qps in "${QPS_VALUES[@]}"; do ...@@ -109,17 +111,17 @@ for qps in "${QPS_VALUES[@]}"; do
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
# Construct filename for this run # Construct filename for this run
FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json" FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH}.json"
NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc) NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part
echo "Running benchmark with $NUM_PROMPTS prompts" echo "Running benchmark with $NUM_PROMPTS prompts"
# Run the benchmark # Run the benchmark
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \
--request-rate $qps \ --request-rate "$qps" \
--result-filename "$FILENAME" \ --result-filename "$FILENAME" \
--num-prompts $NUM_PROMPTS --num-prompts "$NUM_PROMPTS"
echo "Completed benchmark with QPS: $qps" echo "Completed benchmark with QPS: $qps"
echo "----------------------------------------" echo "----------------------------------------"
......
...@@ -8,7 +8,7 @@ declare -a PIDS=() ...@@ -8,7 +8,7 @@ declare -a PIDS=()
############################################################################### ###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}" MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}" LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH mkdir -p "$LOG_PATH"
ENCODE_PORT="${ENCODE_PORT:-19534}" ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_PORT="${PREFILL_PORT:-19535}" PREFILL_PORT="${PREFILL_PORT:-19535}"
...@@ -84,10 +84,10 @@ trap cleanup TERM ...@@ -84,10 +84,10 @@ trap cleanup TERM
# clear previous cache # clear previous cache
echo "remove previous ec cache folder" echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH rm -rf "$EC_SHARED_STORAGE_PATH"
echo "make ec cache folder" echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH mkdir -p "$EC_SHARED_STORAGE_PATH"
############################################################################### ###############################################################################
# Encoder worker # Encoder worker
...@@ -100,7 +100,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ ...@@ -100,7 +100,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--no-enable-prefix-caching \ --no-enable-prefix-caching \
--max-num-batched-tokens 114688 \ --max-num-batched-tokens 114688 \
--max-num-seqs 128 \ --max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{ --ec-transfer-config '{
"ec_connector": "ECExampleConnector", "ec_connector": "ECExampleConnector",
"ec_role": "ec_producer", "ec_role": "ec_producer",
...@@ -124,7 +124,7 @@ vllm serve "$MODEL" \ ...@@ -124,7 +124,7 @@ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--enable-request-id-headers \ --enable-request-id-headers \
--max-num-seqs 128 \ --max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{ --ec-transfer-config '{
"ec_connector": "ECExampleConnector", "ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer", "ec_role": "ec_consumer",
...@@ -152,7 +152,7 @@ vllm serve "$MODEL" \ ...@@ -152,7 +152,7 @@ vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--enable-request-id-headers \ --enable-request-id-headers \
--max-num-seqs 128 \ --max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--kv-transfer-config '{ --kv-transfer-config '{
"kv_connector": "NixlConnector", "kv_connector": "NixlConnector",
"kv_role": "kv_consumer" "kv_role": "kv_consumer"
...@@ -162,9 +162,9 @@ vllm serve "$MODEL" \ ...@@ -162,9 +162,9 @@ vllm serve "$MODEL" \
PIDS+=($!) PIDS+=($!)
# Wait for workers # Wait for workers
wait_for_server $ENCODE_PORT wait_for_server "$ENCODE_PORT"
wait_for_server $PREFILL_PORT wait_for_server "$PREFILL_PORT"
wait_for_server $DECODE_PORT wait_for_server "$DECODE_PORT"
############################################################################### ###############################################################################
# Proxy # Proxy
...@@ -179,7 +179,7 @@ python disagg_epd_proxy.py \ ...@@ -179,7 +179,7 @@ python disagg_epd_proxy.py \
PIDS+=($!) PIDS+=($!)
wait_for_server $PROXY_PORT wait_for_server "$PROXY_PORT"
echo "All services are up!" echo "All services are up!"
############################################################################### ###############################################################################
...@@ -187,14 +187,14 @@ echo "All services are up!" ...@@ -187,14 +187,14 @@ echo "All services are up!"
############################################################################### ###############################################################################
echo "Running benchmark (stream)..." echo "Running benchmark (stream)..."
vllm bench serve \ vllm bench serve \
--model $MODEL \ --model "$MODEL" \
--backend openai-chat \ --backend openai-chat \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
--dataset-name hf \ --dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \ --dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \ --seed 0 \
--num-prompts $NUM_PROMPTS \ --num-prompts "$NUM_PROMPTS" \
--port $PROXY_PORT --port "$PROXY_PORT"
PIDS+=($!) PIDS+=($!)
...@@ -202,10 +202,10 @@ PIDS+=($!) ...@@ -202,10 +202,10 @@ PIDS+=($!)
# Single request with local image # Single request with local image
############################################################################### ###############################################################################
echo "Running single request with local image (non-stream)..." echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \ curl http://127.0.0.1:"${PROXY_PORT}"/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
"model": "'${MODEL}'", "model": "'"${MODEL}"'",
"messages": [ "messages": [
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [ {"role": "user", "content": [
......
...@@ -8,7 +8,7 @@ declare -a PIDS=() ...@@ -8,7 +8,7 @@ declare -a PIDS=()
############################################################################### ###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}" MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}" LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH mkdir -p "$LOG_PATH"
ENCODE_PORT="${ENCODE_PORT:-19534}" ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}" PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
...@@ -78,10 +78,10 @@ trap cleanup TERM ...@@ -78,10 +78,10 @@ trap cleanup TERM
# clear previous cache # clear previous cache
echo "remove previous ec cache folder" echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH rm -rf "$EC_SHARED_STORAGE_PATH"
echo "make ec cache folder" echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH mkdir -p "$EC_SHARED_STORAGE_PATH"
############################################################################### ###############################################################################
# Encoder worker # Encoder worker
...@@ -94,7 +94,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ ...@@ -94,7 +94,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--no-enable-prefix-caching \ --no-enable-prefix-caching \
--max-num-batched-tokens 114688 \ --max-num-batched-tokens 114688 \
--max-num-seqs 128 \ --max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{ --ec-transfer-config '{
"ec_connector": "ECExampleConnector", "ec_connector": "ECExampleConnector",
"ec_role": "ec_producer", "ec_role": "ec_producer",
...@@ -115,7 +115,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \ ...@@ -115,7 +115,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--enable-request-id-headers \ --enable-request-id-headers \
--max-num-seqs 128 \ --max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{ --ec-transfer-config '{
"ec_connector": "ECExampleConnector", "ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer", "ec_role": "ec_consumer",
...@@ -128,8 +128,8 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \ ...@@ -128,8 +128,8 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
PIDS+=($!) PIDS+=($!)
# Wait for workers # Wait for workers
wait_for_server $ENCODE_PORT wait_for_server "$ENCODE_PORT"
wait_for_server $PREFILL_DECODE_PORT wait_for_server "$PREFILL_DECODE_PORT"
############################################################################### ###############################################################################
# Proxy # Proxy
...@@ -144,7 +144,7 @@ python disagg_epd_proxy.py \ ...@@ -144,7 +144,7 @@ python disagg_epd_proxy.py \
PIDS+=($!) PIDS+=($!)
wait_for_server $PROXY_PORT wait_for_server "$PROXY_PORT"
echo "All services are up!" echo "All services are up!"
############################################################################### ###############################################################################
...@@ -152,14 +152,14 @@ echo "All services are up!" ...@@ -152,14 +152,14 @@ echo "All services are up!"
############################################################################### ###############################################################################
echo "Running benchmark (stream)..." echo "Running benchmark (stream)..."
vllm bench serve \ vllm bench serve \
--model $MODEL \ --model "$MODEL" \
--backend openai-chat \ --backend openai-chat \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
--dataset-name hf \ --dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \ --dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \ --seed 0 \
--num-prompts $NUM_PROMPTS \ --num-prompts "$NUM_PROMPTS" \
--port $PROXY_PORT --port "$PROXY_PORT"
PIDS+=($!) PIDS+=($!)
...@@ -167,10 +167,10 @@ PIDS+=($!) ...@@ -167,10 +167,10 @@ PIDS+=($!)
# Single request with local image # Single request with local image
############################################################################### ###############################################################################
echo "Running single request with local image (non-stream)..." echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \ curl http://127.0.0.1:"${PROXY_PORT}"/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
"model": "'${MODEL}'", "model": "'"${MODEL}"'",
"messages": [ "messages": [
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [ {"role": "user", "content": [
......
...@@ -54,7 +54,7 @@ wait_for_server() { ...@@ -54,7 +54,7 @@ wait_for_server() {
# You can also adjust --kv-ip and --kv-port for distributed inference. # You can also adjust --kv-ip and --kv-port for distributed inference.
# prefilling instance, which is the KV producer # prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ CUDA_VISIBLE_DEVICES=0 vllm serve "$MODEL_NAME" \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port 8100 \ --port 8100 \
--max-model-len 100 \ --max-model-len 100 \
...@@ -64,7 +64,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ ...@@ -64,7 +64,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' &
# decoding instance, which is the KV consumer # decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ CUDA_VISIBLE_DEVICES=1 vllm serve "$MODEL_NAME" \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port 8200 \ --port 8200 \
--max-model-len 100 \ --max-model-len 100 \
......
...@@ -34,7 +34,7 @@ wait_for_server() { ...@@ -34,7 +34,7 @@ wait_for_server() {
done" && return 0 || return 1 done" && return 0 || return 1
} }
vllm serve $MODEL_NAME \ vllm serve "$MODEL_NAME" \
--port 8100 \ --port 8100 \
--max-model-len 100 \ --max-model-len 100 \
--enforce-eager \ --enforce-eager \
......
...@@ -143,7 +143,7 @@ main() { ...@@ -143,7 +143,7 @@ main() {
IFS=',' read -ra BOOTSTRAP_PORT_ARRAY <<< "$BOOTSTRAP_PORTS" IFS=',' read -ra BOOTSTRAP_PORT_ARRAY <<< "$BOOTSTRAP_PORTS"
IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS" IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"
proxy_param="" proxy_args=()
# ============================================================================= # =============================================================================
# Launch Prefill Servers (X Producers) # Launch Prefill Servers (X Producers)
...@@ -156,12 +156,12 @@ main() { ...@@ -156,12 +156,12 @@ main() {
local bootstrap_port=${BOOTSTRAP_PORT_ARRAY[$i]} local bootstrap_port=${BOOTSTRAP_PORT_ARRAY[$i]}
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, Bootstrap Port $bootstrap_port" echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, Bootstrap Port $bootstrap_port"
VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \ VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--port $port \ --port "$port" \
--kv-transfer-config \ --kv-transfer-config \
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\"}" > prefill$((i+1)).log 2>&1 & "{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\"}" > prefill$((i+1)).log 2>&1 &
PIDS+=($!) PIDS+=($!)
proxy_param="${proxy_param} --prefill http://0.0.0.0:${port} $bootstrap_port" proxy_args+=(--prefill "http://0.0.0.0:${port}" "$bootstrap_port")
done done
# ============================================================================= # =============================================================================
...@@ -174,12 +174,12 @@ main() { ...@@ -174,12 +174,12 @@ main() {
local port=${DECODE_PORT_ARRAY[$i]} local port=${DECODE_PORT_ARRAY[$i]}
echo " Decode server $((i+1)): GPU $gpu_id, Port $port" echo " Decode server $((i+1)): GPU $gpu_id, Port $port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \ CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--port $port \ --port "$port" \
--kv-transfer-config \ --kv-transfer-config \
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\"}" > decode$((i+1)).log 2>&1 & "{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\"}" > decode$((i+1)).log 2>&1 &
PIDS+=($!) PIDS+=($!)
proxy_param="${proxy_param} --decode http://0.0.0.0:${port}" proxy_args+=(--decode "http://0.0.0.0:${port}")
done done
# ============================================================================= # =============================================================================
...@@ -187,7 +187,7 @@ main() { ...@@ -187,7 +187,7 @@ main() {
# ============================================================================= # =============================================================================
echo "" echo ""
echo "Starting proxy server on port $PROXY_PORT..." echo "Starting proxy server on port $PROXY_PORT..."
python3 mooncake_connector_proxy.py $proxy_param --port $PROXY_PORT > proxy.log 2>&1 & python3 mooncake_connector_proxy.py "${proxy_args[@]}" --port "$PROXY_PORT" > proxy.log 2>&1 &
PIDS+=($!) PIDS+=($!)
# ============================================================================= # =============================================================================
...@@ -196,9 +196,10 @@ main() { ...@@ -196,9 +196,10 @@ main() {
echo "" echo ""
echo "Waiting for all servers to start..." echo "Waiting for all servers to start..."
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
if ! wait_for_server $port; then if ! wait_for_server "$port"; then
echo "Failed to start server on port $port" echo "Failed to start server on port $port"
cleanup cleanup
# shellcheck disable=SC2317
exit 1 exit 1
fi fi
done done
...@@ -209,8 +210,8 @@ main() { ...@@ -209,8 +210,8 @@ main() {
# ============================================================================= # =============================================================================
# Run Benchmark # Run Benchmark
# ============================================================================= # =============================================================================
vllm bench serve --port $PROXY_PORT --seed $(date +%s) \ vllm bench serve --port "$PROXY_PORT" --seed "$(date +%s)" \
--backend vllm --model $MODEL \ --backend vllm --model "$MODEL" \
--dataset-name random --random-input-len 7500 --random-output-len 200 \ --dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
......
...@@ -166,10 +166,10 @@ main() { ...@@ -166,10 +166,10 @@ main() {
local kv_port=$((21001 + i)) local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \ CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port $port \ --port "$port" \
--tensor-parallel-size 1 \ --tensor-parallel-size 1 \
--seed 1024 \ --seed 1024 \
--dtype float16 \ --dtype float16 \
...@@ -194,10 +194,10 @@ main() { ...@@ -194,10 +194,10 @@ main() {
local kv_port=$((22001 + i)) local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port" echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \ CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--enforce-eager \ --enforce-eager \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port $port \ --port "$port" \
--tensor-parallel-size 1 \ --tensor-parallel-size 1 \
--seed 1024 \ --seed 1024 \
--dtype float16 \ --dtype float16 \
...@@ -217,9 +217,10 @@ main() { ...@@ -217,9 +217,10 @@ main() {
echo "" echo ""
echo "Waiting for all servers to start..." echo "Waiting for all servers to start..."
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
if ! wait_for_server $port; then if ! wait_for_server "$port"; then
echo "Failed to start server on port $port" echo "Failed to start server on port $port"
cleanup cleanup
# shellcheck disable=SC2317
exit 1 exit 1
fi fi
done done
...@@ -231,8 +232,8 @@ main() { ...@@ -231,8 +232,8 @@ main() {
# Run Benchmark # Run Benchmark
# ============================================================================= # =============================================================================
cd ../../../benchmarks/ cd ../../../benchmarks/
vllm bench serve --port 10001 --seed $(date +%s) \ vllm bench serve --port 10001 --seed "$(date +%s)" \
--model $MODEL \ --model "$MODEL" \
--dataset-name random --random-input-len 7500 --random-output-len 200 \ --dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
......
...@@ -50,8 +50,8 @@ while [[ $# -gt 0 ]]; do ...@@ -50,8 +50,8 @@ while [[ $# -gt 0 ]]; do
done done
vllm bench serve \ vllm bench serve \
--model $MODEL_NAME \ --model "$MODEL_NAME" \
--host $HOST \ --host "$HOST" \
--port $PORT \ --port "$PORT" \
--num-prompts $NUM_PROMPTS \ --num-prompts "$NUM_PROMPTS" \
--request-rate $REQUEST_RATE --request-rate "$REQUEST_RATE"
...@@ -57,15 +57,15 @@ echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALL ...@@ -57,15 +57,15 @@ echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALL
export RAY_DEDUP_LOGS=0 export RAY_DEDUP_LOGS=0
export VLLM_USE_DEEP_GEMM=1 export VLLM_USE_DEEP_GEMM=1
vllm serve $MODEL_NAME \ vllm serve "$MODEL_NAME" \
--data-parallel-size $DATA_PARALLEL_SIZE \ --data-parallel-size "$DATA_PARALLEL_SIZE" \
--data-parallel-size-local $DATA_PARALLEL_SIZE \ --data-parallel-size-local "$DATA_PARALLEL_SIZE" \
--data-parallel-backend ray \ --data-parallel-backend ray \
--enforce-eager \ --enforce-eager \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \ --enable-eplb \
--all2all-backend pplx \ --all2all-backend pplx \
--num-redundant-experts $REDUNDANT_EXPERTS \ --num-redundant-experts "$REDUNDANT_EXPERTS" \
--trust-remote-code \ --trust-remote-code \
--host $HOST \ --host "$HOST" \
--port $PORT --port "$PORT"
...@@ -57,8 +57,7 @@ case "$subcommand" in ...@@ -57,8 +57,7 @@ case "$subcommand" in
# Retry until the worker node connects to the head node or the timeout expires. # Retry until the worker node connects to the head node or the timeout expires.
for (( i=0; i < $ray_init_timeout; i+=5 )); do for (( i=0; i < $ray_init_timeout; i+=5 )); do
ray start --address=$ray_address:$ray_port --block "${start_params[@]}" if ray start --address="$ray_address":"$ray_port" --block "${start_params[@]}"; then
if [ $? -eq 0 ]; then
echo "Worker: Ray runtime started with head address $ray_address:$ray_port" echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
exit 0 exit 0
fi fi
...@@ -95,12 +94,12 @@ case "$subcommand" in ...@@ -95,12 +94,12 @@ case "$subcommand" in
fi fi
# Start the Ray head node. # Start the Ray head node.
ray start --head --port=$ray_port "${start_params[@]}" ray start --head --port="$ray_port" "${start_params[@]}"
# Poll Ray until every worker node is active. # Poll Ray until every worker node is active.
for (( i=0; i < $ray_init_timeout; i+=5 )); do for (( i=0; i < $ray_init_timeout; i+=5 )); do
active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'` active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
if [ $active_nodes -eq $ray_cluster_size ]; then if [ "$active_nodes" -eq "$ray_cluster_size" ]; then
echo "All ray workers are active and the ray cluster is initialized successfully." echo "All ray workers are active and the ray cluster is initialized successfully."
exit 0 exit 0
fi fi
......
...@@ -22,11 +22,10 @@ check_hf_token() { ...@@ -22,11 +22,10 @@ check_hf_token() {
check_num_gpus() { check_num_gpus() {
# can you check if the number of GPUs are >=2 via nvidia-smi/rocm-smi? # can you check if the number of GPUs are >=2 via nvidia-smi/rocm-smi?
which rocm-smi > /dev/null 2>&1 if ! which rocm-smi > /dev/null 2>&1; then
if [ $? -ne 0 ]; then
num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
else else
num_gpus=$(rocm-smi --showid | grep Instinct | wc -l) num_gpus=$(rocm-smi --showid | grep -c Instinct)
fi fi
if [ "$num_gpus" -lt 2 ]; then if [ "$num_gpus" -lt 2 ]; then
...@@ -39,8 +38,7 @@ check_num_gpus() { ...@@ -39,8 +38,7 @@ check_num_gpus() {
ensure_python_library_installed() { ensure_python_library_installed() {
echo "Checking if $1 is installed..." echo "Checking if $1 is installed..."
python3 -c "import $1" > /dev/null 2>&1 if ! python3 -c "import $1" > /dev/null 2>&1; then
if [ $? -ne 0 ]; then
if [ "$1" == "nixl" ]; then if [ "$1" == "nixl" ]; then
echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation." echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
else else
...@@ -102,12 +100,12 @@ main() { ...@@ -102,12 +100,12 @@ main() {
bash disagg_vllm_launcher.sh prefiller \ bash disagg_vllm_launcher.sh prefiller \
> >(tee prefiller.log) 2>&1 & > >(tee prefiller.log) 2>&1 &
prefiller_pid=$! prefiller_pid=$!
PIDS+=($prefiller_pid) PIDS+=("$prefiller_pid")
bash disagg_vllm_launcher.sh decoder \ bash disagg_vllm_launcher.sh decoder \
> >(tee decoder.log) 2>&1 & > >(tee decoder.log) 2>&1 &
decoder_pid=$! decoder_pid=$!
PIDS+=($decoder_pid) PIDS+=("$decoder_pid")
python3 disagg_proxy_server.py \ python3 disagg_proxy_server.py \
--host localhost \ --host localhost \
...@@ -118,7 +116,7 @@ main() { ...@@ -118,7 +116,7 @@ main() {
--decoder-port 8200 \ --decoder-port 8200 \
> >(tee proxy.log) 2>&1 & > >(tee proxy.log) 2>&1 &
proxy_pid=$! proxy_pid=$!
PIDS+=($proxy_pid) PIDS+=("$proxy_pid")
wait_for_server 8100 wait_for_server 8100
wait_for_server 8200 wait_for_server 8200
...@@ -128,7 +126,7 @@ main() { ...@@ -128,7 +126,7 @@ main() {
# begin benchmark # begin benchmark
cd ../../../../benchmarks/ cd ../../../../benchmarks/
vllm bench serve --port 9000 --seed $(date +%s) \ vllm bench serve --port 9000 --seed "$(date +%s)" \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--dataset-name random --random-input-len 7500 --random-output-len 200 \ --dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
......
...@@ -34,7 +34,7 @@ if [[ $1 == "prefiller" ]]; then ...@@ -34,7 +34,7 @@ if [[ $1 == "prefiller" ]]; then
VLLM_ENABLE_V1_MULTIPROCESSING=1 \ VLLM_ENABLE_V1_MULTIPROCESSING=1 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
vllm serve $MODEL \ vllm serve "$MODEL" \
--port 8100 \ --port 8100 \
--enforce-eager \ --enforce-eager \
--kv-transfer-config \ --kv-transfer-config \
...@@ -51,7 +51,7 @@ elif [[ $1 == "decoder" ]]; then ...@@ -51,7 +51,7 @@ elif [[ $1 == "decoder" ]]; then
VLLM_ENABLE_V1_MULTIPROCESSING=1 \ VLLM_ENABLE_V1_MULTIPROCESSING=1 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
CUDA_VISIBLE_DEVICES=1 \ CUDA_VISIBLE_DEVICES=1 \
vllm serve $MODEL \ vllm serve "$MODEL" \
--port 8200 \ --port 8200 \
--enforce-eager \ --enforce-eager \
--kv-transfer-config \ --kv-transfer-config \
......
...@@ -103,7 +103,7 @@ vllm serve "$MODEL_NAME" \ ...@@ -103,7 +103,7 @@ vllm serve "$MODEL_NAME" \
--tensor-parallel-size "$GPU_COUNT" \ --tensor-parallel-size "$GPU_COUNT" \
--enforce-eager \ --enforce-eager \
--pooler-config "$POOLER_CONFIG" \ --pooler-config "$POOLER_CONFIG" \
--served-model-name ${MODEL_CODE} \ --served-model-name "${MODEL_CODE}" \
--api-key "$API_KEY" \ --api-key "$API_KEY" \
--trust-remote-code \ --trust-remote-code \
--port "$PORT" \ --port "$PORT" \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment