Unverified commit c61a98f5, authored by junuxyz and committed by GitHub
Browse files

[CI][BugFix] ShellCheck cleanup to remove baseline and preserve runtime behavior (#34514)


Signed-off-by: junuxyz <216036880+junuxyz@users.noreply.github.com>
parent 28bffe94
......@@ -9,10 +9,11 @@ ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE
# shellcheck source=/dev/null
source "$ENV_FILE"
remove_docker_container() {
docker rm -f $CONTAINER_NAME || true;
docker rm -f "$CONTAINER_NAME" || true;
}
trap remove_docker_container EXIT
......@@ -41,13 +42,13 @@ echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-v "$DOWNLOAD_DIR":"$DOWNLOAD_DIR" \
--env-file "$ENV_FILE" \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e TARGET_COMMIT="$BUILDKITE_COMMIT" \
-e MODEL="$MODEL" \
-e WORKSPACE=/workspace \
--name $CONTAINER_NAME \
--name "$CONTAINER_NAME" \
-d \
--privileged \
--network host \
......
......@@ -42,21 +42,21 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
vllm serve $MODEL \
vllm serve "$MODEL" \
--seed 42 \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--max-num-seqs "$MAX_NUM_SEQS" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
--tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
--no-enable-prefix-caching \
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
--download_dir "$DOWNLOAD_DIR" \
--max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for i in {1..120}; do
for _ in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
......@@ -78,11 +78,11 @@ echo "logging to $BM_LOG"
echo
vllm bench serve \
--backend vllm \
--model $MODEL \
--model "$MODEL" \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--sonnet-input-len "$INPUT_LEN" \
--sonnet-output-len "$OUTPUT_LEN" \
--ignore-eos > "$BM_LOG"
echo "completed..."
......
......@@ -76,16 +76,15 @@ mkdir -p "$INDICES_OUTPUT_DIR"
# these indices use relative paths, which work as long as the indices sit next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
alias_arg=""
alias_args=()
if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then
alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS")
fi
# HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}"
# copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX"
......@@ -100,9 +99,9 @@ fi
# re-generate and copy to /<pure_version>/ only if it does not have "dev" in the version
if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/"
rm -rf "$INDICES_OUTPUT_DIR/*"
# Empty the output directory before regenerating the indices. The glob has to
# sit outside the quotes: inside them, rm looks for a file literally named "*"
# and removes nothing. ${VAR:?} aborts when the variable is unset or empty, so
# this can never expand to "/*".
rm -rf "${INDICES_OUTPUT_DIR:?}"/*
mkdir -p "$INDICES_OUTPUT_DIR"
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi
......@@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null)
echo "Release version from Buildkite: $RELEASE_VERSION"
......@@ -55,7 +55,7 @@ mkdir -p $DIST_DIR
aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR
echo "Wheels copied to local directory"
# generate source tarball
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" "$BUILDKITE_COMMIT"
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
......@@ -65,6 +65,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
# PYPI_WHEEL_FILES is expanded unquoted on purpose: it may hold several
# whitespace-separated wheel paths (its assignment is outside this view --
# confirm), and each path must reach twine as its own argument. Quoting it
# would pass the whole list as a single, nonexistent filename.
# shellcheck disable=SC2086
python3 -m twine check $PYPI_WHEEL_FILES
# shellcheck disable=SC2086
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"
......@@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then
......@@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] |
fi
# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION"
......
......@@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ===================="
rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH
rm -rf "$LOG_FOLDER"
rm -rf "$PROFILE_PATH"
mkdir -p "$LOG_FOLDER"
mkdir -p "$PROFILE_PATH"
cd "$BASE/vllm"
......@@ -114,7 +114,7 @@ start_server() {
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
for _ in {1..60}; do
# This line checks whether the server is still alive or not,
# since we should always have permission to send a signal to the server process.
kill -0 $server_pid 2> /dev/null || break
......@@ -145,12 +145,12 @@ run_benchmark() {
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
rm -f "$vllm_log"
pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
......@@ -168,15 +168,15 @@ run_benchmark() {
# --profile flag is removed from this call
vllm bench serve \
--backend vllm \
--model $MODEL \
--model "$MODEL" \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \
--random-output-len "$OUTPUT_LEN" \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
......@@ -195,20 +195,20 @@ run_benchmark() {
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
--backend vllm \
--model $MODEL \
--model "$MODEL" \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \
--random-output-len "$OUTPUT_LEN" \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
......@@ -255,7 +255,7 @@ gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
# Pass empty string for profile_dir argument
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
......@@ -274,7 +274,7 @@ fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
done
done
echo "finish permutations"
......@@ -285,7 +285,7 @@ echo "finish permutations"
if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
......@@ -293,7 +293,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
# Start server with the best params and profiling ENABLED
echo "Starting server for profiling..."
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..."
......@@ -301,15 +301,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \
--backend vllm \
--model $MODEL \
--model "$MODEL" \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \
--random-output-len "$OUTPUT_LEN" \
--ignore-eos \
--disable-tqdm \
--request-rate $best_request_rate \
--request-rate "$best_request_rate" \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
......
......@@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do
else
STATUS="FAILURE"
((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
......
......@@ -71,7 +71,7 @@ while [[ $# -gt 0 ]]; do
usage
;;
*)
echo "Unknown argument: $1\n"
printf "Unknown argument: %s\n" "$1"
usage
;;
esac
......@@ -84,15 +84,17 @@ mkdir -p "$OUTPUT_DIR"
QPS_VALUES=(25 20 15 10 5 1)
# Common parameters
COMMON_PARAMS="--backend $BACKEND \
--model $MODEL \
--dataset $DATASET \
--structured-output-ratio $STRUCTURED_OUTPUT_RATIO \
--save-results \
--result-dir $OUTPUT_DIR \
--output-len $MAX_NEW_TOKENS \
--port $PORT \
--tokenizer-mode $TOKENIZER_MODE"
COMMON_PARAMS=(
--backend "$BACKEND"
--model "$MODEL"
--dataset "$DATASET"
--structured-output-ratio "$STRUCTURED_OUTPUT_RATIO"
--save-results
--result-dir "$OUTPUT_DIR"
--output-len "$MAX_NEW_TOKENS"
--port "$PORT"
--tokenizer-mode "$TOKENIZER_MODE"
)
echo "Starting structured output benchmark with model: $MODEL"
echo "Backend: $BACKEND"
......@@ -109,17 +111,17 @@ for qps in "${QPS_VALUES[@]}"; do
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
# Construct filename for this run
FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
# Git branch names may contain "/" (e.g. "feature/foo"); replace each one with
# "-" so the branch cannot turn the result filename into a nested path.
FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH//\//-}.json"
NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part
echo "Running benchmark with $NUM_PROMPTS prompts"
# Run the benchmark
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
--request-rate $qps \
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \
--request-rate "$qps" \
--result-filename "$FILENAME" \
--num-prompts $NUM_PROMPTS
--num-prompts "$NUM_PROMPTS"
echo "Completed benchmark with QPS: $qps"
echo "----------------------------------------"
......
......@@ -8,7 +8,7 @@ declare -a PIDS=()
###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH
mkdir -p "$LOG_PATH"
ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_PORT="${PREFILL_PORT:-19535}"
......@@ -84,10 +84,10 @@ trap cleanup TERM
# clear previous cache
echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH
rm -rf "$EC_SHARED_STORAGE_PATH"
echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH
mkdir -p "$EC_SHARED_STORAGE_PATH"
###############################################################################
# Encoder worker
......@@ -100,7 +100,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
......@@ -124,7 +124,7 @@ vllm serve "$MODEL" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
......@@ -152,7 +152,7 @@ vllm serve "$MODEL" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--kv-transfer-config '{
"kv_connector": "NixlConnector",
"kv_role": "kv_consumer"
......@@ -162,9 +162,9 @@ vllm serve "$MODEL" \
PIDS+=($!)
# Wait for workers
wait_for_server $ENCODE_PORT
wait_for_server $PREFILL_PORT
wait_for_server $DECODE_PORT
wait_for_server "$ENCODE_PORT"
wait_for_server "$PREFILL_PORT"
wait_for_server "$DECODE_PORT"
###############################################################################
# Proxy
......@@ -179,7 +179,7 @@ python disagg_epd_proxy.py \
PIDS+=($!)
wait_for_server $PROXY_PORT
wait_for_server "$PROXY_PORT"
echo "All services are up!"
###############################################################################
......@@ -187,14 +187,14 @@ echo "All services are up!"
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--model "$MODEL" \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--port $PROXY_PORT
--num-prompts "$NUM_PROMPTS" \
--port "$PROXY_PORT"
PIDS+=($!)
......@@ -202,10 +202,10 @@ PIDS+=($!)
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
curl http://127.0.0.1:"${PROXY_PORT}"/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"model": "'"${MODEL}"'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
......
......@@ -8,7 +8,7 @@ declare -a PIDS=()
###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH
mkdir -p "$LOG_PATH"
ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
......@@ -78,10 +78,10 @@ trap cleanup TERM
# clear previous cache
echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH
rm -rf "$EC_SHARED_STORAGE_PATH"
echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH
mkdir -p "$EC_SHARED_STORAGE_PATH"
###############################################################################
# Encoder worker
......@@ -94,7 +94,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_producer",
......@@ -115,7 +115,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
"ec_role": "ec_consumer",
......@@ -128,8 +128,8 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
PIDS+=($!)
# Wait for workers
wait_for_server $ENCODE_PORT
wait_for_server $PREFILL_DECODE_PORT
wait_for_server "$ENCODE_PORT"
wait_for_server "$PREFILL_DECODE_PORT"
###############################################################################
# Proxy
......@@ -144,7 +144,7 @@ python disagg_epd_proxy.py \
PIDS+=($!)
wait_for_server $PROXY_PORT
wait_for_server "$PROXY_PORT"
echo "All services are up!"
###############################################################################
......@@ -152,14 +152,14 @@ echo "All services are up!"
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--model "$MODEL" \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--port $PROXY_PORT
--num-prompts "$NUM_PROMPTS" \
--port "$PROXY_PORT"
PIDS+=($!)
......@@ -167,10 +167,10 @@ PIDS+=($!)
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
curl http://127.0.0.1:"${PROXY_PORT}"/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"model": "'"${MODEL}"'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
......
......@@ -54,7 +54,7 @@ wait_for_server() {
# You can also adjust --kv-ip and --kv-port for distributed inference.
# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
CUDA_VISIBLE_DEVICES=0 vllm serve "$MODEL_NAME" \
--host 0.0.0.0 \
--port 8100 \
--max-model-len 100 \
......@@ -64,7 +64,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' &
# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
CUDA_VISIBLE_DEVICES=1 vllm serve "$MODEL_NAME" \
--host 0.0.0.0 \
--port 8200 \
--max-model-len 100 \
......
......@@ -34,7 +34,7 @@ wait_for_server() {
done" && return 0 || return 1
}
vllm serve $MODEL_NAME \
vllm serve "$MODEL_NAME" \
--port 8100 \
--max-model-len 100 \
--enforce-eager \
......
......@@ -143,7 +143,7 @@ main() {
IFS=',' read -ra BOOTSTRAP_PORT_ARRAY <<< "$BOOTSTRAP_PORTS"
IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"
proxy_param=""
proxy_args=()
# =============================================================================
# Launch Prefill Servers (X Producers)
......@@ -156,12 +156,12 @@ main() {
local bootstrap_port=${BOOTSTRAP_PORT_ARRAY[$i]}
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, Bootstrap Port $bootstrap_port"
VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--port $port \
VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--port "$port" \
--kv-transfer-config \
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_producer\"}" > prefill$((i+1)).log 2>&1 &
PIDS+=($!)
proxy_param="${proxy_param} --prefill http://0.0.0.0:${port} $bootstrap_port"
proxy_args+=(--prefill "http://0.0.0.0:${port}" "$bootstrap_port")
done
# =============================================================================
......@@ -174,12 +174,12 @@ main() {
local port=${DECODE_PORT_ARRAY[$i]}
echo " Decode server $((i+1)): GPU $gpu_id, Port $port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
--port $port \
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--port "$port" \
--kv-transfer-config \
"{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\"}" > decode$((i+1)).log 2>&1 &
PIDS+=($!)
proxy_param="${proxy_param} --decode http://0.0.0.0:${port}"
proxy_args+=(--decode "http://0.0.0.0:${port}")
done
# =============================================================================
......@@ -187,7 +187,7 @@ main() {
# =============================================================================
echo ""
echo "Starting proxy server on port $PROXY_PORT..."
python3 mooncake_connector_proxy.py $proxy_param --port $PROXY_PORT > proxy.log 2>&1 &
python3 mooncake_connector_proxy.py "${proxy_args[@]}" --port "$PROXY_PORT" > proxy.log 2>&1 &
PIDS+=($!)
# =============================================================================
......@@ -196,9 +196,10 @@ main() {
echo ""
echo "Waiting for all servers to start..."
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
if ! wait_for_server $port; then
if ! wait_for_server "$port"; then
echo "Failed to start server on port $port"
cleanup
# shellcheck disable=SC2317
exit 1
fi
done
......@@ -209,8 +210,8 @@ main() {
# =============================================================================
# Run Benchmark
# =============================================================================
vllm bench serve --port $PROXY_PORT --seed $(date +%s) \
--backend vllm --model $MODEL \
vllm bench serve --port "$PROXY_PORT" --seed "$(date +%s)" \
--backend vllm --model "$MODEL" \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
......
......@@ -166,10 +166,10 @@ main() {
local kv_port=$((21001 + i))
echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--port "$port" \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
......@@ -194,10 +194,10 @@ main() {
local kv_port=$((22001 + i))
echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
CUDA_VISIBLE_DEVICES=$gpu_id vllm serve "$MODEL" \
--enforce-eager \
--host 0.0.0.0 \
--port $port \
--port "$port" \
--tensor-parallel-size 1 \
--seed 1024 \
--dtype float16 \
......@@ -217,9 +217,10 @@ main() {
echo ""
echo "Waiting for all servers to start..."
for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
if ! wait_for_server $port; then
if ! wait_for_server "$port"; then
echo "Failed to start server on port $port"
cleanup
# shellcheck disable=SC2317
exit 1
fi
done
......@@ -231,8 +232,8 @@ main() {
# Run Benchmark
# =============================================================================
cd ../../../benchmarks/
vllm bench serve --port 10001 --seed $(date +%s) \
--model $MODEL \
vllm bench serve --port 10001 --seed "$(date +%s)" \
--model "$MODEL" \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
......
......@@ -50,8 +50,8 @@ while [[ $# -gt 0 ]]; do
done
vllm bench serve \
--model $MODEL_NAME \
--host $HOST \
--port $PORT \
--num-prompts $NUM_PROMPTS \
--request-rate $REQUEST_RATE
--model "$MODEL_NAME" \
--host "$HOST" \
--port "$PORT" \
--num-prompts "$NUM_PROMPTS" \
--request-rate "$REQUEST_RATE"
......@@ -57,15 +57,15 @@ echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALL
export RAY_DEDUP_LOGS=0
export VLLM_USE_DEEP_GEMM=1
vllm serve $MODEL_NAME \
--data-parallel-size $DATA_PARALLEL_SIZE \
--data-parallel-size-local $DATA_PARALLEL_SIZE \
vllm serve "$MODEL_NAME" \
--data-parallel-size "$DATA_PARALLEL_SIZE" \
--data-parallel-size-local "$DATA_PARALLEL_SIZE" \
--data-parallel-backend ray \
--enforce-eager \
--enable-expert-parallel \
--enable-eplb \
--all2all-backend pplx \
--num-redundant-experts $REDUNDANT_EXPERTS \
--num-redundant-experts "$REDUNDANT_EXPERTS" \
--trust-remote-code \
--host $HOST \
--port $PORT
--host "$HOST" \
--port "$PORT"
......@@ -57,8 +57,7 @@ case "$subcommand" in
# Retry until the worker node connects to the head node or the timeout expires.
for (( i=0; i < $ray_init_timeout; i+=5 )); do
ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
if [ $? -eq 0 ]; then
if ray start --address="$ray_address":"$ray_port" --block "${start_params[@]}"; then
echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
exit 0
fi
......@@ -95,12 +94,12 @@ case "$subcommand" in
fi
# Start the Ray head node.
ray start --head --port=$ray_port "${start_params[@]}"
ray start --head --port="$ray_port" "${start_params[@]}"
# Poll Ray until every worker node is active.
for (( i=0; i < $ray_init_timeout; i+=5 )); do
active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
if [ $active_nodes -eq $ray_cluster_size ]; then
active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
if [ "$active_nodes" -eq "$ray_cluster_size" ]; then
echo "All ray workers are active and the ray cluster is initialized successfully."
exit 0
fi
......
......@@ -22,11 +22,10 @@ check_hf_token() {
check_num_gpus() {
# Check that at least 2 GPUs are available, counting them via rocm-smi if present, otherwise nvidia-smi.
which rocm-smi > /dev/null 2>&1
if [ $? -ne 0 ]; then
if ! which rocm-smi > /dev/null 2>&1; then
num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
else
num_gpus=$(rocm-smi --showid | grep Instinct | wc -l)
num_gpus=$(rocm-smi --showid | grep -c Instinct)
fi
if [ "$num_gpus" -lt 2 ]; then
......@@ -39,8 +38,7 @@ check_num_gpus() {
ensure_python_library_installed() {
echo "Checking if $1 is installed..."
python3 -c "import $1" > /dev/null 2>&1
if [ $? -ne 0 ]; then
if ! python3 -c "import $1" > /dev/null 2>&1; then
if [ "$1" == "nixl" ]; then
echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
else
......@@ -102,12 +100,12 @@ main() {
bash disagg_vllm_launcher.sh prefiller \
> >(tee prefiller.log) 2>&1 &
prefiller_pid=$!
PIDS+=($prefiller_pid)
PIDS+=("$prefiller_pid")
bash disagg_vllm_launcher.sh decoder \
> >(tee decoder.log) 2>&1 &
decoder_pid=$!
PIDS+=($decoder_pid)
PIDS+=("$decoder_pid")
python3 disagg_proxy_server.py \
--host localhost \
......@@ -118,7 +116,7 @@ main() {
--decoder-port 8200 \
> >(tee proxy.log) 2>&1 &
proxy_pid=$!
PIDS+=($proxy_pid)
PIDS+=("$proxy_pid")
wait_for_server 8100
wait_for_server 8200
......@@ -128,7 +126,7 @@ main() {
# begin benchmark
cd ../../../../benchmarks/
vllm bench serve --port 9000 --seed $(date +%s) \
vllm bench serve --port 9000 --seed "$(date +%s)" \
--model meta-llama/Llama-3.1-8B-Instruct \
--dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
......
......@@ -34,7 +34,7 @@ if [[ $1 == "prefiller" ]]; then
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
CUDA_VISIBLE_DEVICES=0 \
vllm serve $MODEL \
vllm serve "$MODEL" \
--port 8100 \
--enforce-eager \
--kv-transfer-config \
......@@ -51,7 +51,7 @@ elif [[ $1 == "decoder" ]]; then
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
CUDA_VISIBLE_DEVICES=1 \
vllm serve $MODEL \
vllm serve "$MODEL" \
--port 8200 \
--enforce-eager \
--kv-transfer-config \
......
......@@ -103,7 +103,7 @@ vllm serve "$MODEL_NAME" \
--tensor-parallel-size "$GPU_COUNT" \
--enforce-eager \
--pooler-config "$POOLER_CONFIG" \
--served-model-name ${MODEL_CODE} \
--served-model-name "${MODEL_CODE}" \
--api-key "$API_KEY" \
--trust-remote-code \
--port "$PORT" \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment