#!/bin/bash

# This script runs tests inside the corresponding ROCm docker container.
# It handles both single-node and multi-node test configurations.
#
# Multi-node detection: Instead of matching on fragile group names, we detect
# multi-node jobs structurally by looking for the bracket command syntax
# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
#
###############################################################################
# QUOTING / COMMAND PASSING
#
# Passing commands as positional arguments ($*) is fragile when the command
# string itself contains double quotes, e.g.:
#
#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
#
# The outer shell resolves the nested quotes *before* this script runs, so
# the script receives mangled input it cannot fully recover.
#
# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
#
#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
#   bash run-amd-test.sh
#
# Single-quoted assignment preserves all inner double quotes verbatim.
# The $* path is kept for backward compatibility but callers should migrate.
###############################################################################
# Make a pipeline fail when any stage fails, not only the last one.
set -o pipefail

# Export Python path
export PYTHONPATH=".."

###############################################################################
# Helper Functions
###############################################################################

#######################################
# Block until the GPU state file contains the word "clean".
# The file is polled every 3 seconds.
# Arguments:
#   $1 - timeout in seconds (default: 300)
#   $2 - path of the GPU state file (default: /opt/amdgpu/etc/gpu_state)
# Outputs:
#   Progress messages on stdout; the timeout error on stderr.
# Returns:
#   0 once the file contains "clean". On timeout the whole script exits
#   with status 1.
#######################################
wait_for_clean_gpus() {
  local timeout=${1:-300}
  local state_file=${2:-/opt/amdgpu/etc/gpu_state}
  local start=$SECONDS
  echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
  while true; do
    if grep -q -- clean "$state_file"; then
      echo "GPUs state is \"clean\""
      return 0
    fi
    # Check the deadline before sleeping so a timeout of 0 fails immediately.
    if (( SECONDS - start >= timeout )); then
      echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
      exit 1
    fi
    sleep 3
  done
}

#######################################
# Prune Docker images and volumes when the filesystem that holds Docker's
# root directory is more than 70% full.
# Outputs:
#   Status messages on stdout; errors and warnings on stderr.
# Returns:
#   0 normally. Exits the whole script with status 1 if Docker's root
#   directory cannot be determined.
#######################################
cleanup_docker() {
  local docker_root disk_usage
  local -r threshold=70

  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [[ -z "$docker_root" ]]; then
    echo "Failed to determine Docker root directory." >&2
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # -P keeps every filesystem on a single line, so a long device name cannot
  # wrap and shift the "capacity" column out of field 5.
  disk_usage=$(df -P "$docker_root" \
    | awk 'NR > 1 { pct = $5 } END { sub(/%/, "", pct); print pct }')
  if ! [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
    echo "Warning: could not parse disk usage for $docker_root; skipping Docker cleanup." >&2
    return 0
  fi

  # 10# forces base 10 so a value such as "08" is not read as octal.
  if (( 10#$disk_usage > threshold )); then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    docker image prune -f
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}

#######################################
# Best-effort teardown of multi-node test resources: stops the containers
# named node0..node(N-1) that Docker lists, then removes the "docker-net"
# network if it is present. Failures of the stop/remove commands are ignored.
# Globals:
#   NUM_NODES (read) - number of node containers to look for (default: 2)
#######################################
cleanup_network() {
  local max_nodes=${NUM_NODES:-2}
  local node container_ids networks

  # Output is captured rather than piped into `grep -q`: under
  # `set -o pipefail` an early grep exit can SIGPIPE the producer and make
  # the check fail even though a match was found.
  for (( node = 0; node < max_nodes; node++ )); do
    container_ids=$(docker ps -a -q -f name="node${node}")
    if [[ -n "$container_ids" ]]; then
      docker stop "node${node}" || true
    fi
  done

  networks=$(docker network ls)
  if [[ "$networks" == *docker-net* ]]; then
    docker network rm docker-net || true
  fi
}

#######################################
# Decide whether the job should take the multi-node path.
# Globals:
#   NUM_NODES (read)
# Arguments:
#   $1 - the full command string
# Returns:
#   0 when the job is multi-node, non-zero otherwise.
#######################################
is_multi_node() {
  local command_string="$1"
  # Structural shape of the per-node syntax: [...] && [...]
  local bracket_pair_re='\[.*\].*&&.*\[.*\]'

  # The pipeline-provided node count decides whenever it asks for more than one.
  if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
    return 0
  fi

  # Otherwise fall back to looking for the bracket syntax in the commands.
  [[ "$command_string" =~ $bracket_pair_re ]]
}

#######################################
# Terminate the script with the test run's status, mapping pytest's
# "no tests collected" code (5) to success.
# Arguments:
#   $1 - exit code of the test command
# Returns:
#   Never returns; always exits the script.
#######################################
handle_pytest_exit() {
  local status=$1
  # Anything other than 5 is passed through unchanged.
  if ! [ "$status" -eq 5 ]; then
    exit "$status"
  fi
  echo "Pytest exit code 5 (no tests collected) - treating as success."
  exit 0
}

###############################################################################
# Pytest marker/keyword re-quoting
#
# When commands are passed through Buildkite -> shell -> $* -> bash -c,
# quotes around multi-word pytest -m/-k expressions get stripped:
#   pytest -v -s -m 'not cpu_test' v1/core
# becomes:
#   pytest -v -s -m not cpu_test v1/core
#
# pytest then interprets "cpu_test" as a file path, not part of the marker.
#
# This function detects unquoted expressions after -m/-k and re-quotes them
# by collecting tokens until a recognizable boundary is reached:
#   - test path (contains '/')
#   - test file (ends with '.py')
#   - another pytest flag (--xxx or -x single-char flags)
#   - command separator (&& || ; |)
#   - environment variable assignment (FOO=bar)
#
# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
# unquoted since they have no spaces and work fine.
#
# Already-quoted expressions (containing literal single quotes) are passed
# through untouched to avoid double-quoting values injected by
# apply_rocm_test_overrides.
#
# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
# double-quotes stripped by the calling shell (see header comment).
# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
###############################################################################
re_quote_pytest_markers() {
  # Arguments:
  #   $1 - raw command string (may contain newlines and backslash-newline
  #        continuations)
  # Outputs:
  #   The command string flattened to one line on stdout, with multi-word
  #   (or parenthesised) -m/-k expressions wrapped in single quotes.
  local input="$1"
  local output=""
  local collecting=false
  local marker_buf=""
  local word is_boundary

  # Strip backslash-newline continuations, then flatten remaining newlines
  local flat="${input//$'\\\n'/ }"
  flat="${flat//$'\n'/ }"

  # Tokenize with globbing disabled so tokens such as *.py stay literal.
  # The caller's noglob setting is remembered and put back afterwards.
  local had_noglob=false
  if [[ $- == *f* ]]; then
    had_noglob=true
  fi
  set -o noglob
  local -a words
  read -ra words <<< "$flat"
  if ! $had_noglob; then
    set +o noglob
  fi

  for word in "${words[@]}"; do
    if ! $collecting; then
      output+="${word} "
      if [[ "$word" == "-m" || "$word" == "-k" ]]; then
        collecting=true
        marker_buf=""
      fi
      continue
    fi

    # A token containing a literal single quote means the expression was
    # already quoted upstream: emit what we have verbatim and stop collecting.
    if [[ "$word" == *"'"* ]]; then
      if [[ -n "$marker_buf" ]]; then
        # Should not normally happen (partial buf + quote), flush raw
        output+="${marker_buf} "
        marker_buf=""
      fi
      output+="${word} "
      collecting=false
      continue
    fi

    is_boundary=false
    case "$word" in
      # Line-continuation artifact and command separators
      "\\" | "&&" | "||" | ";" | "|")
        is_boundary=true ;;
      # Long flags (--ignore, --shard-id, etc.)
      --*)
        is_boundary=true ;;
      # Single-letter flags (-v, -s, -x, ...). This includes -m/-k, which
      # end the current expression and start a new one (handled below).
      -[a-zA-Z])
        is_boundary=true ;;
      # Test path (contains /)
      */*)
        is_boundary=true ;;
      # Test file (ends with .py, possibly with ::method)
      *.py | *.py::*)
        is_boundary=true ;;
      # Environment variable assignment preceding a command (FOO=bar).
      # Only upper-case VAR=value counts; something like num_gpus=2 is
      # treated as part of the filter expression.
      *=*)
        if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
          is_boundary=true
        fi
        ;;
    esac

    if ! $is_boundary; then
      # Accumulate into the marker buffer, space-separated
      marker_buf+="${marker_buf:+ }${word}"
      continue
    fi

    # Flush the collected expression. An empty buffer emits nothing, so no
    # stray extra space ends up in the output.
    if [[ -n "$marker_buf" ]]; then
      output+="$(_quote_marker_expr "$marker_buf") "
    fi
    collecting=false
    marker_buf=""

    if [[ "$word" == "-m" || "$word" == "-k" ]]; then
      # The boundary itself starts a new expression
      output+="${word} "
      collecting=true
    elif [[ "$word" != "\\" ]]; then
      # Stray backslash tokens are dropped silently; everything else is kept
      output+="${word} "
    fi
  done

  # Flush any trailing marker expression (marker at end of command)
  if $collecting && [[ -n "$marker_buf" ]]; then
    output+="$(_quote_marker_expr "$marker_buf")"
  fi

  printf '%s\n' "${output% }"
}

# Private helper for re_quote_pytest_markers: print one collected -m/-k
# expression ($1), single-quoted when it contains a space or "(".
_quote_marker_expr() {
  local expr="$1"
  # Strip surrounding double quotes if present (from upstream
  # single-to-double conversion); otherwise the wrapping below would
  # produce '"expr"' with literal double-quote characters.
  if [[ "$expr" == '"'*'"' ]]; then
    expr="${expr#\"}"
    expr="${expr%\"}"
  fi
  if [[ "$expr" == *" "* || "$expr" == *"("* ]]; then
    printf "'%s'" "$expr"
  else
    printf '%s' "$expr"
  fi
}

###############################################################################
# ROCm-specific pytest command rewrites
#
# These apply ignore flags and environment overrides for tests that are not
# yet supported or behave differently on ROCm hardware. Kept as a single
# function so new exclusions are easy to add in one place.
###############################################################################

apply_rocm_test_overrides() {
  # Arguments:
  #   $1 - command string to rewrite
  # Outputs:
  #   The rewritten command string on stdout. Added --ignore flags are
  #   separated by single spaces.
  local cmds="$1"
  local flags=""

  # --- Model registry filter ---
  local registry_cmd="pytest -v -s models/test_registry.py"
  local registry_filter="-k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"
  cmds=${cmds//"$registry_cmd"/"$registry_cmd $registry_filter"}

  # --- LoRA: disable custom paged attention ---
  local lora_cmd="pytest -v -s lora"
  cmds=${cmds//"$lora_cmd"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 $lora_cmd"}

  # --- Kernel ignores ---
  # These are appended to the END of the whole command string, not directly
  # after the matching path. printf reuses its format for every argument, so
  # each file name becomes one " --ignore=<dir>/<file>" flag.
  if [[ $cmds == *" kernels/core"* ]]; then
    printf -v flags ' --ignore=kernels/core/%s' \
      test_fused_quant_layernorm.py \
      test_permute_cols.py
    cmds+="$flags"
  fi

  if [[ $cmds == *" kernels/attention"* ]]; then
    printf -v flags ' --ignore=kernels/attention/%s' \
      test_attention_selector.py \
      test_encoder_decoder_attn.py \
      test_flash_attn.py \
      test_flashinfer.py \
      test_prefix_prefill.py \
      test_cascade_flash_attn.py \
      test_mha_attn.py \
      test_lightning_attn.py \
      test_attention.py
    cmds+="$flags"
  fi

  if [[ $cmds == *" kernels/quantization"* ]]; then
    printf -v flags ' --ignore=kernels/quantization/%s' \
      test_int8_quant.py \
      test_machete_mm.py \
      test_block_fp8.py \
      test_block_int8.py \
      test_marlin_gemm.py \
      test_cutlass_scaled_mm.py \
      test_int8_kernel.py
    cmds+="$flags"
  fi

  if [[ $cmds == *" kernels/mamba"* ]]; then
    printf -v flags ' --ignore=kernels/mamba/%s' \
      test_mamba_mixer2.py \
      test_causal_conv1d.py \
      test_mamba_ssm_ssd.py
    cmds+="$flags"
  fi

  if [[ $cmds == *" kernels/moe"* ]]; then
    printf -v flags ' --ignore=kernels/moe/%s' \
      test_moe.py \
      test_cutlass_moe.py \
      test_triton_moe_ptpc_fp8.py
    cmds+="$flags"
  fi

  # --- Entrypoint ignores ---
  # Unlike the kernel ignores, these are inserted in place, right after the
  # directory argument, and only when it is followed by a space.
  printf -v flags ' --ignore=entrypoints/openai/%s' \
    chat_completion/test_audio.py \
    completion/test_shutdown.py \
    test_completion.py \
    test_models.py \
    test_lora_adapters.py \
    test_return_tokens_as_ids.py \
    chat_completion/test_root_path.py \
    test_tokenization.py \
    completion/test_prompt_validation.py
  cmds=${cmds//" entrypoints/openai "/" entrypoints/openai${flags} "}

  printf -v flags ' --ignore=entrypoints/llm/%s' \
    test_chat.py \
    test_accuracy.py \
    test_init.py \
    test_prompt_validation.py
  cmds=${cmds//" entrypoints/llm "/" entrypoints/llm${flags} "}

  # Collapse stray " \ " line-continuation leftovers into a single space
  cmds=${cmds//' \ '/' '}

  printf '%s\n' "$cmds"
}

###############################################################################
# Main
###############################################################################

# --- GPU initialization ---
echo "--- Confirming Clean Initial State"
wait_for_clean_gpus

echo "--- ROCm info"
rocminfo

# --- Docker housekeeping ---
cleanup_docker

echo "--- Resetting GPUs"
# Write "reset" into the GPU state file, then wait for it to read "clean"
# again. NOTE(review): presumably a host-side agent watches this file and
# performs the reset; that part is not visible in this script.
echo "reset" > /opt/amdgpu/etc/gpu_state
wait_for_clean_gpus

# --- Pull test image ---
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
# A random 10-character alphanumeric suffix makes a container-name collision
# unlikely. The trailing `echo` only terminates the substituted output.
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"

# Exit hook: force-remove the test container. Only if that fails, try to
# remove the image instead. Never reports failure.
# Globals: container_name (read), image_name (read)
remove_docker_container() {
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT

# --- Prepare commands ---
echo "--- Running container"

# Host-side Hugging Face cache, mounted into the container at HF_MOUNT
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

# ---- Command source selection ----
# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
# Fall back to $* for backward compatibility, but warn that inner
# double-quotes will have been stripped by the calling shell.
if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
  commands="${VLLM_TEST_COMMANDS}"
  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
else
  commands="$*"
  if [[ -z "$commands" ]]; then
    echo "Error: No test commands provided." >&2
    echo "Usage:" >&2
    echo "  Preferred:  VLLM_TEST_COMMANDS='...' bash $0" >&2
    echo "  Legacy:     bash $0 \"commands here\"" >&2
    exit 1
  fi
  echo "Commands sourced from positional args (legacy mode)"
  echo "WARNING: Inner double-quotes in the command string may have been"
  echo "  stripped by the calling shell. If you see syntax errors, switch to:"
  echo "  export VLLM_TEST_COMMANDS='your commands here'"
  echo "  bash $0"
fi

echo "Raw commands: $commands"

# Fix quoting before ROCm overrides (so overrides see correct structure)
commands=$(re_quote_pytest_markers "$commands")
echo "After re-quoting: $commands"

commands=$(apply_rocm_test_overrides "$commands")
echo "Final commands: $commands"

MYPYTHONPATH=".."

# Verify GPU access
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
fi

# --- RDMA device passthrough (conditional) ---
# If the host has RDMA devices, pass them through so tests like
# test_moriio_connector can access ibverbs. On hosts without RDMA
# hardware the tests will gracefully skip via _rdma_available().
# Kept as an array so each flag reaches docker as its own argument.
RDMA_FLAGS=()
if [[ -d /dev/infiniband ]]; then
  echo "RDMA devices detected on host, enabling passthrough"
  RDMA_FLAGS=(--device /dev/infiniband --cap-add=IPC_LOCK)
else
  echo "No RDMA devices found on host, RDMA tests will be skipped"
fi

# --- Route: multi-node vs single-node ---
if is_multi_node "$commands"; then
  echo "--- Multi-node job detected"
  # Assign first, export second, so `export` does not mask the status of the
  # command substitution.
  DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
  export DCKR_VER

  # Parse the bracket syntax:  prefix ; [node0_cmds] && [node1_cmds]
  #   BASH_REMATCH[1] = prefix (everything before first bracket)
  #   BASH_REMATCH[2] = comma-separated node0 commands
  #   BASH_REMATCH[3] = comma-separated node1 commands
  if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
    # Copy the captures immediately; any later =~ would overwrite BASH_REMATCH.
    prefix=${BASH_REMATCH[1]//;/}
    node0_csv=${BASH_REMATCH[2]}
    node1_csv=${BASH_REMATCH[3]}
    # The prefix is only logged; it is not added to the composite command.
    echo "PREFIX: ${prefix}"

    export composite_command="(command rocm-smi || true)"
    # IFS is changed for the two reads only, so the global IFS is untouched.
    IFS=',' read -ra node0 <<< "$node0_csv"
    IFS=',' read -ra node1 <<< "$node1_csv"

    if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
      echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
    fi

    for i in "${!node0[@]}"; do
      # Drop every double-quote character from the per-node commands
      command_node_0=${node0[i]//\"/}
      command_node_1=${node1[i]//\"/}

      # %q produces a shell-safe word even when the command contains single
      # quotes (e.g. a re-quoted -m 'not slow'); wrapping in '...' would be
      # broken by such a command.
      printf -v quoted_node_0 '%q' "$command_node_0"
      printf -v quoted_node_1 '%q' "$command_node_1"

      step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} ${quoted_node_0} ${quoted_node_1}"
      echo "COMMANDS: ${step_cmd}"
      composite_command="${composite_command} && ${step_cmd}"
    done

    /bin/bash -c "${composite_command}"
    exit_code=$?
    cleanup_network
    handle_pytest_exit "$exit_code"
  else
    echo "Multi-node job detected but failed to parse bracket command syntax."
    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
    echo "Got: $commands"
    cleanup_network
    exit 111
  fi
else
  echo "--- Single-node job"
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  # BUILDKITE_AGENT_META_DATA_RENDER_DEVICES is expanded unquoted on purpose:
  # it must word-split into separate docker arguments.
  # shellcheck disable=SC2086
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    "${RDMA_FLAGS[@]}" \
    --network=host \
    --shm-size=16gb \
    --group-add "$render_gid" \
    --rm \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -e BUILDKITE_PARALLEL_JOB \
    -e BUILDKITE_PARALLEL_JOB_COUNT \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"

  exit_code=$?
  handle_pytest_exit "$exit_code"
fi